# Import key libraries
The preferred library abbreviation is `n2m`.

In [1]:
import pandas as pd
import numpy as np
import nucleotides_to_matrices as n2m

# View encodings
Encodings follow the IUPAC nucleotide codes, with some additions. They include the RNA code U (uracil) and the special code I (inosine), as well as gap and degenerate codes.

In [2]:
# Show the table of supported codes; its A/C/G/T columns are the values
# that appear in the matrix row for each code.
n2m.encodings_df()

Unnamed: 0,Code,Name,DNA,RNA,Special,Degenerate,Matches,A,C,G,T
0,A,Adenine,True,True,False,False,A,1,0,0,0
1,C,Cytosine,True,True,False,False,C,0,1,0,0
2,G,Guanine,True,True,False,False,G,0,0,1,0
3,T,Thymine,True,False,False,False,T,0,0,0,1
4,W,Weak,True,True,False,True,A/T,1,0,0,1
5,S,Strong,True,True,False,True,C/G,0,1,1,0
6,M,Amino,True,True,False,True,A/C,1,1,0,0
7,K,Keto,True,True,False,True,G/T,0,0,1,1
8,R,Purine,True,True,False,True,A/G,1,0,1,0
9,Y,Pyrimidine,True,True,False,True,C/T,0,1,0,1


# Example usage

## Nucleotide code strings

In [3]:
# The same 9-character example written in both alphabets. Each ends with
# N (encoded below as all ones) followed by a character that is encoded
# as all zeros -- presumably the gap codes mentioned above.
example_dna_sequence = 'GATTACAN.'
# RNA variant: U in place of T, and I (inosine) in place of the final A.
example_rna_sequence = 'GAUUACIN-'

## Conversion of DNA and RNA strings to matrices
Matrices are "tall": sequence length is the first dimension, so each row encodes one character of the sequence. They are NumPy arrays containing only the integers 0 and 1.

In [4]:
# Encode the DNA string: one row per character, one column per base
# (the encodings table lists the columns in the order A, C, G, T).
dna_matrix = n2m.sequence_to_matrix(example_dna_sequence)
dna_matrix

array([[0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0]])

In [5]:
# Encode the RNA string with the same function. In the output below, U
# gets the same row as T in the DNA example, and I gets the same row as G.
rna_matrix = n2m.sequence_to_matrix(example_rna_sequence)
rna_matrix

array([[0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0]])

## Conversion of matrices back into sequences

In [6]:
# Decode without a `type` argument; the output below uses DNA letters and
# reproduces the original example string exactly.
dna_sequence = n2m.matrix_to_sequence(dna_matrix)
dna_sequence

'GATTACAN.'

In [7]:
# Decode as RNA so that U is emitted instead of T.
# The round trip is not exact here: the input ended in 'I' and '-', which
# share matrix rows with 'G' and '.', so they come back as 'G' and '.'.
rna_sequence = n2m.matrix_to_sequence(rna_matrix, type='rna')
rna_sequence

'GAUUACGN.'

In [8]:
# Encode, reverse both axes, and decode as RNA in a single expression.
n2m.matrix_to_sequence(
    n2m.sequence_to_matrix(example_dna_sequence)[::-1, ::-1],
    type='rna',
)

'.NUGUAAUC'

## Reverse complementation
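Reverse complements are found by reversing both axes of the encoded matrix: reversing the rows reverses the sequence, and reversing the columns (ordered A, C, G, T) swaps A with T and C with G.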

In [9]:
# Reverse the row order (sequence direction) and the column order
# (A<->T, C<->G) in one slicing step.
rc_dna_matrix = dna_matrix[::-1, ::-1]
rc_dna_matrix

array([[0, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0]])

In [10]:
# Decode the reverse-complement matrix using RNA letters (U rather than T).
# NOTE(review): the matrix was built from the DNA example -- confirm that
# decoding it as RNA is the intended demonstration.
rc_sequence = n2m.matrix_to_sequence(rc_dna_matrix, type='rna')
rc_sequence

'.NUGUAAUC'