In [21]:
import pygsi, pandas

# this gives us a neat progress bar
from tqdm import tqdm

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. pygsi while it is being developed) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# the oxa1 gene as specified on http://bigsi.io
oxa1_sequence="ATGAAAAACACAATACATATCAACTTCGCTATTTTTTTAATAATTGCAAATATTATCTACAGCAGCGCCAGTGCATCAACAGATATCTCTACTGTTGCATCTCCATTATTTGAAGGAACTGAAGGTTGTTTTTTACTTTACGATGCATCCACAAACGCTGAAATTGCTCAATTCAATAAAGCAAAGTGTGCAACGCAAATGGCACCAGATTCAACTTTCAAGATCGCATTATCACTTATGGCATTTGATGCGGAAATAATAGATCAGAAAACCATATTCAAATGGGATAAAACCCCCAAAGGAATGGAGATCTGGAACAGCAATCATACACCAAAGACGTGGATGCAATTTTCTGTTGTTTGGGTTTCGCAAGAAATAACCCAAAAAATTGGATTAAATAAAATCAAGAATTATCTCAAAGATTTTGATTATGGAAATCAAGACTTCTCTGGAGATAAAGAAAGAAACAACGGATTAACAGAAGCATGGCTCGAAAGTAGCTTAAAAATTTCACCAGAAGAACAAATTCAATTCCTGCGTAAAATTATTAATCACAATCTCCCAGTTAAAAACTCAGCCATAGAAAACACCATAGAGAACATGTATCTACAAGATCTGGATAATAGTACAAAACTGTATGGGAAAACTGGTGCAGGATTCACAGCAAATAGAACCTTACAAAACGGATGGTTTGAAGGGTTTATTATAAGCAAATCAGGACATAAATATGTTTTTGTGTCCGCACTTACAGGAAACTTGGGGTCGAATTTAACATCAAGCATAAAAGCCAAGAAAAATGCGATCACCATTCTAAACACACTAAATTTATAA"

# Report the gene length in bases and in triplets. Floor division keeps the
# triplet count an integer instead of relying on %i truncating a float.
print("OXA1 is %i bases and %i amino acids long" % (len(oxa1_sequence), len(oxa1_sequence) // 3))

OXA1 is 831 bases and 277 amino acids long


In [43]:
# Build the pygsi object for oxa1 directly from the nucleotide string.
# Numbering of amino acids starts at 1 and no species is given.
oxa1 = pygsi.NucleotideStretch(
    nucleotide_sequence=oxa1_sequence,
    gene_name="oxa1",
    first_amino_acid_position=1,
    species_name=None,
)

# note that as species_name is None (i.e. no species is specified), there will be no
# filtering based on species and so all sequences will be considered

In [44]:
# Print the table of mutations recorded so far. No position has been permuted
# at this point, so the DataFrame is expected to be empty (column headers only).
print(oxa1.mutations)

Empty DataFrame
Columns: [ena_accession, mutation, amino_acid_position, new_triplet, original_triplet]
Index: []


In [45]:
# Scan only a short window of the protein; the range below holds amino acid
# positions (228, 229 and 230).
# Reminder: each BIGSI query is a triplet flanked by 30 bases on either side and
# only the gene itself is supplied here, so the first amino acid that can be
# considered is 11 and the last is N-10, which here is 266.

for position in tqdm(range(228, 231)):

    # Permute each of the three bases of the triplet in turn (wildtype included).
    # The original note counts this as 4x3x3=36 possible combinations -- TODO confirm.
    for triplet_position in (1, 2, 3):
        oxa1.permuate_position(position, triplet_position=triplet_position)

    # Alternative: replace the inner loop above with the single call below to
    # consider ALL possible triplets (incl. wt), i.e. 4^3 = 64 combinations.
    #
    # oxa1.permuate_position(position)

    # Checkpoint to disk whenever the position is a multiple of 10.
    if position % 10 == 0:
        oxa1.save("dat/oxa1-whole-1.npy")
        oxa1.df.to_csv("dat/oxa1-whole-1.csv")

100%|██████████| 3/3 [00:18<00:00,  6.19s/it]


In [49]:
# The results live on the object as pandas DataFrames: show the main table
# followed by the table of observed mutations, one after the other.
print(oxa1.df, oxa1.mutations, sep="\n")

   amino_acid_position mutation new_amino_acid new_triplet  non_synonymous  \
0                  230        -              W         tgg           False   
1                  228        -              N         aac           False   
2                  229    G229E              E         gaa            True   
3                  229        -              G         gga           False   

   number_genomes  number_nucleotide_changes original_amino_acid  \
0            7057                          0                   W   
1            7074                          0                   N   
2              14                          1                   G   
3            7071                          0                   G   

  original_triplet  synonymous  
0              tgg        True  
1              aac        True  
2              gga       False  
3              gga        True  
   ena_accession mutation amino_acid_position new_triplet original_triplet
0      ERR586863    G229E   

In [56]:
# Inspect the underlying arrays: the number_nucleotide_changes matrix and the
# list of codon labels (the 64 possible triplets).
print(oxa1.arrays["number_nucleotide_changes"])
print(oxa1.codons)


[[2. 3. 3. ... 2. 1. 2.]
 [2. 3. 2. ... 3. 1. 2.]
 [2. 2. 3. ... 3. 0. 1.]
 ...
 [3. 3. 2. ... 3. 3. 3.]
 [3. 2. 3. ... 3. 2. 2.]
 [2. 3. 3. ... 3. 3. 3.]]
['ttt' 'ttc' 'tta' 'ttg' 'tct' 'tcc' 'tca' 'tcg' 'tat' 'tac' 'taa' 'tag'
 'tgt' 'tgc' 'tga' 'tgg' 'ctt' 'ctc' 'cta' 'ctg' 'cct' 'ccc' 'cca' 'ccg'
 'cat' 'cac' 'caa' 'cag' 'cgt' 'cgc' 'cga' 'cgg' 'att' 'atc' 'ata' 'atg'
 'act' 'acc' 'aca' 'acg' 'aat' 'aac' 'aaa' 'aag' 'agt' 'agc' 'aga' 'agg'
 'gtt' 'gtc' 'gta' 'gtg' 'gct' 'gcc' 'gca' 'gcg' 'gat' 'gac' 'gaa' 'gag'
 'ggt' 'ggc' 'gga' 'ggg']


In [48]:
# Snapshot the whole object (all variables and arrays) as an NPY file.
oxa1.save("dat/oxa1-whole-1.npy")

# Export the main results DataFrame in CSV and Stata (DTA) formats.
results_table = oxa1.df
results_table.to_csv("dat/oxa1-whole-1.csv")
results_table.to_stata("dat/oxa1-whole-1.dta")

# Export the mutations DataFrame as a pickle and as a CSV.
mutations_table = oxa1.mutations
mutations_table.to_pickle("dat/oxa1-whole-1-mutations.pkl")
mutations_table.to_csv("dat/oxa1-whole-1-mutations.csv")


In [31]:
# Scratch check: start from an empty two-column frame, then fill it with rows.
# DataFrame.append was removed in pandas 2.0, so the rows are collected as a
# list of records and the frame is built from them in a single call instead of
# being grown one row at a time.
columns = ["A", "B"]
a = pandas.DataFrame(columns=columns)
print(a)

records = [{"A": 1, "B": 2}, {"A": 1, "B": 2}]
a = pandas.DataFrame(records, columns=columns)
print(a)

Empty DataFrame
Columns: [A, B]
Index: []
   A  B
0  1  2
1  1  2
