In [None]:
import os

import pandas
import pygsi

# this gives us a neat progress bar
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

In [None]:
# Nucleotide sequence of the oxa1 gene, as specified on http://bigsi.io
oxa1_sequence="ATGAAAAACACAATACATATCAACTTCGCTATTTTTTTAATAATTGCAAATATTATCTACAGCAGCGCCAGTGCATCAACAGATATCTCTACTGTTGCATCTCCATTATTTGAAGGAACTGAAGGTTGTTTTTTACTTTACGATGCATCCACAAACGCTGAAATTGCTCAATTCAATAAAGCAAAGTGTGCAACGCAAATGGCACCAGATTCAACTTTCAAGATCGCATTATCACTTATGGCATTTGATGCGGAAATAATAGATCAGAAAACCATATTCAAATGGGATAAAACCCCCAAAGGAATGGAGATCTGGAACAGCAATCATACACCAAAGACGTGGATGCAATTTTCTGTTGTTTGGGTTTCGCAAGAAATAACCCAAAAAATTGGATTAAATAAAATCAAGAATTATCTCAAAGATTTTGATTATGGAAATCAAGACTTCTCTGGAGATAAAGAAAGAAACAACGGATTAACAGAAGCATGGCTCGAAAGTAGCTTAAAAATTTCACCAGAAGAACAAATTCAATTCCTGCGTAAAATTATTAATCACAATCTCCCAGTTAAAAACTCAGCCATAGAAAACACCATAGAGAACATGTATCTACAAGATCTGGATAATAGTACAAAACTGTATGGGAAAACTGGTGCAGGATTCACAGCAAATAGAACCTTACAAAACGGATGGTTTGAAGGGTTTATTATAAGCAAATCAGGACATAAATATGTTTTTGTGTCCGCACTTACAGGAAACTTGGGGTCGAATTTAACATCAAGCATAAAAGCCAAGAAAAATGCGATCACCATTCTAAACACACTAAATTTATAA"

# Report the length in bases and in codons (three bases per amino acid);
# the %i format prints whole numbers only.
print(
    "OXA1 is %i bases and %i amino acids long"
    % (len(oxa1_sequence), len(oxa1_sequence) // 3)
)

In [None]:
# Build a NucleotideStretch instance from the nucleotide sequence string.
# species_name is left as None, so there is no filtering based on species
# and all sequences will be considered.
oxa1 = pygsi.NucleotideStretch(
    nucleotide_sequence=oxa1_sequence,
    gene_name="oxa1",
    first_amino_acid_position=1,
    species_name=None,
)

In [None]:
# let's have a look at the summary
# (print() shows the string form of the NucleotideStretch object built above)
print(oxa1)

In [None]:
# Look at just a short stretch of the protein; `position` is an amino acid position.
# Remember that bigsi is queried with a triplet flanked by 30 bases on either side,
# and here only the gene itself is supplied, so the first amino acid we can consider
# is 11 and the last is N-10, which here is 266.

# Make sure the output directory exists before the first checkpoint is written:
# DataFrame.to_csv raises OSError when the parent directory is missing, which would
# otherwise abort the loop partway through on a fresh checkout.
os.makedirs("dat", exist_ok=True)

for position in tqdm(range(228,231)):

    # this form permutes each of the three positions in the triplet in turn,
    # with one query for the wildtype each time
    # 4x3x3=36 possible combinations

    for i in [1,2,3]:
        oxa1.permuate_position(position,triplet_position=i)

    # alternatively, comment out the above two lines, and uncomment this line which
    # instead considers ALL the possible triplets (incl. wt)
    # 4^3 = 64 combinations

    # oxa1.permuate_position(position)

    # checkpoint the results whenever the position is a multiple of 10,
    # i.e. every 10 amino acids
    if (position%10)==0:
        oxa1.save("dat/oxa1-whole-1.npy")
        oxa1.df.to_csv("dat/oxa1-whole-1.csv")

In [None]:
# the data is stored in the class as a Pandas dataframe, which we can simply access via
# the .df attribute; the .mutations attribute is printed alongside it
print(oxa1.df)
print(oxa1.mutations)

In [None]:
# Show the array stored under "number_nucleotide_changes" and the codons.
# Each is printed once; repeating the same array print adds no information.
print(oxa1.arrays["number_nucleotide_changes"])
print(oxa1.codons)

In [None]:
# save all the variables and arrays to a NPY file
oxa1.save("dat/oxa1-whole-1.npy")

# save the Pandas dataframe to a CSV and a Stata DTA file
oxa1.df.to_csv("dat/oxa1-whole-1.csv")
oxa1.df.to_stata("dat/oxa1-whole-1.dta")
# also save the mutations data, as both a pickle and a CSV file
oxa1.mutations.to_pickle("dat/oxa1-whole-1-mutations.pkl")
oxa1.mutations.to_csv("dat/oxa1-whole-1-mutations.csv")

In [None]:
# presumably derives summary metrics from the results gathered above — TODO confirm
# NOTE(review): the meaning of `threshold` is defined inside pygsi; check there before changing it
oxa1.calculate_metrics(threshold=1)