In [1]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio import SeqIO
import pandas as pd
import numpy as np
import sourmash
import time

In [2]:
# Record the installed sourmash version and its install path for provenance.
!sourmash info

[Ksourmash version 2.0.0
[K- loaded from path: /home/brian/anaconda3/lib/python3.7/site-packages/sourmash
[K


In [3]:
# Load every record of the influenza nucleotide FASTA as parallel
# title / sequence lists.
cgi_titles = []
cgi_sequences = []
with open('influenza.fna') as fasta_file:
    for title, sequence in SimpleFastaParser(fasta_file):
        cgi_titles.append(title)
        cgi_sequences.append(sequence)

# One row per FASTA record. Building from a dict of columns avoids
# materialising an intermediate list of (title, sequence) tuples.
influenza_na = pd.DataFrame({'titles': cgi_titles, 'sequences': cgi_sequences})

# Collapse records that share an identical sequence.
uniflna = influenza_na.drop_duplicates('sequences')

# Select hemagglutinin records by title.
# The pattern is a plain alternation with no capturing group: str.contains
# warns ("This pattern has match groups") when the regex contains one, and a
# group is not needed just to test for a match.
# NOTE(review): the spaces around each "|" are part of the alternatives
# ("(HA) ", " hemagglutinin ", " segment 4"). They are kept so the selected
# rows, and therefore the seeded subsamples drawn from them, do not change.
# Confirm the spaces are intended.
ha_title_pattern = r'\(HA\) | hemagglutinin | segment 4'
uniflna_HA = uniflna.loc[uniflna['titles'].str.contains(ha_title_pattern)]

  if __name__ == '__main__':


In [10]:
# Draw a 1000-row subsample of the HA records.
# The global NumPy RNG is seeded in the same cell as the draw so that
# re-running this cell yields the same rows.
np.random.seed(seed=12478)
uniflna_HA_sub1 = uniflna_HA.sample(n=1000, replace=False)

In [13]:
# Preview the first five rows of the HA subsample.
uniflna_HA_sub1.head(n=5)

Unnamed: 0,titles,sequences
252112,gi|469936972|gb|KC738886|Influenza A virus (A/...,ACAGCTACATATGCAGACACAATATGTATAGGCTACCATGCCAACA...
259962,gi|484849114|gb|KC865637|Influenza A virus (A/...,AAAGCAGGGGATAATTCTATTAACCATGAAGACTATCATTGCTTTG...
428295,gi|984690389|gb|KU590388|Influenza A virus (A/...,ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCG...
628621,gi|1409674136|gb|MH540925|Influenza A virus (A...,GGATAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCTACAT...
244546,gi|451789264|gb|KC535453|Influenza A virus (A/...,ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCG...


In [20]:
# Row count of the HA subsample (should equal the requested sample size).
uniflna_HA_sub1.shape[0]

1000

In [30]:
# Write the HA subsample back out as a plain-text FASTA file:
# one ">title" header line followed by one sequence line per record.
# The two columns are walked in lockstep rather than indexed through a freshly
# built list on every iteration, which made the loop quadratic in the row count.
with open("uniflna_HA_sub1.fasta", 'w') as output:
    for title, sequence in zip(uniflna_HA_sub1['titles'], uniflna_HA_sub1['sequences']):
        output.write(">" + title + "\n" + sequence + "\n")

In [31]:
start = time.time()
!sourmash compute uniflna_HA_sub1.fasta  -k 5 --singleton
print("finished in " + str(time.time()-start) + "seconds")

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcomputing signatures for files: uniflna_HA_sub1.fasta
[KComputing signature for ksizes: [5]
[KComputing only nucleotide (and not protein) signatures.
[KComputing a total of 1 signature(s).
[Kcalculated 1000 signatures for 1000 sequences in uniflna_HA_sub1.fasta
[Ksaved 1000 signature(s). Note: signature license is CC0.
finished in 1.0879552364349365seconds


In [32]:
start = time.time()
!sourmash compare uniflna_HA_sub1.fasta.sig -o uniflna_HA_sub1_cmp
print("finished in " + str(time.time()-start) + "seconds")

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

...sig loading 999_HA_sub1.fasta.sig
[Kloaded 1000 signatures total.                                                  
[K
min similarity in matrix: 0.221
[Ksaving labels to: uniflna_HA_sub1_cmp.labels.txt
[Ksaving distance matrix to: uniflna_HA_sub1_cmp
finished in 63.475311279296875seconds


In [33]:
start = time.time()
!sourmash plot uniflna_HA_sub1_cmp
print("finished in " + str(time.time()-start) + "seconds")

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading comparison matrix from uniflna_HA_sub1_cmp...
[K...got 1000 x 1000 matrix.
[Kloading labels from uniflna_HA_sub1_cmp.labels.txt
[Ksaving histogram of matrix values => uniflna_HA_sub1_cmp.hist.png
[Kwrote dendrogram to: uniflna_HA_sub1_cmp.dendro.png
[Kwrote numpy distance matrix to: uniflna_HA_sub1_cmp.matrix.png
finished in 10.18021273612976seconds


tree:  
![dendrogram](uniflna_HA_sub1_cmp.dendro.png)

matrix and heatmap:  
![matrix](uniflna_HA_sub1_cmp.matrix.png)

In [36]:
# comparison to csv for plotting in R
start = time.time()
!sourmash compare uniflna_HA_sub1.fasta.sig --csv uniflna_HA_sub1_cmp.csv
print("finished in " + str(time.time()-start) + "seconds")

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

...sig loading 999_HA_sub1.fasta.sig
[Kloaded 1000 signatures total.                                                  
[K
min similarity in matrix: 0.221
finished in 67.94533371925354seconds


## Not just HA: repeat the analysis on a sample of all unique sequences

In [4]:
# Draw a 1000-row subsample from all de-duplicated records (not only HA).
# The global NumPy RNG is seeded in the same cell as the draw so that
# re-running this cell yields the same rows.
np.random.seed(seed=8903)
uniflna_sub1 = uniflna.sample(n=1000, replace=False)

In [5]:
# Write the all-sequence subsample back out as a plain-text FASTA file:
# one ">title" header line followed by one sequence line per record.
# The two columns are walked in lockstep rather than indexed through a freshly
# built list on every iteration, which made the loop quadratic in the row count.
with open("uniflna_sub1.fasta", 'w') as output:
    for title, sequence in zip(uniflna_sub1['titles'], uniflna_sub1['sequences']):
        output.write(">" + title + "\n" + sequence + "\n")

In [8]:
start = time.time()
!sourmash compute uniflna_sub1.fasta  -k 5 --singleton
print("finished in " + str(time.time()-start) + "seconds")

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcomputing signatures for files: uniflna_sub1.fasta
[KComputing signature for ksizes: [5]
[KComputing only nucleotide (and not protein) signatures.
[KComputing a total of 1 signature(s).
[Kcalculated 1000 signatures for 1000 sequences in uniflna_sub1.fasta
[Ksaved 1000 signature(s). Note: signature license is CC0.
finished in 1.0115101337432861seconds


In [9]:
# comparison to csv for plotting in R
start = time.time()
!sourmash compare uniflna_sub1.fasta.sig --csv uniflna_sub1_cmp.csv
print("finished in " + str(time.time()-start) + "seconds")

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

...sig loading 999_sub1.fasta.sig
[Kloaded 1000 signatures total.                                                  
[K
min similarity in matrix: 0.079
finished in 65.50607323646545seconds


In [13]:
# comparison to matrx
start = time.time()
!sourmash compare uniflna_sub1.fasta.sig -o uniflna_sub1_cmp
print("finished in " + str(time.time()-start) + "seconds")

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

...sig loading 999_sub1.fasta.sig
[Kloaded 1000 signatures total.                                                  
[K
min similarity in matrix: 0.079
[Ksaving labels to: uniflna_sub1_cmp.labels.txt
[Ksaving distance matrix to: uniflna_sub1_cmp
finished in 67.71590304374695seconds


In [14]:
# Render the histogram, dendrogram and matrix PNGs from the saved k=5
# comparison matrix of the all-sequence subsample.
!sourmash plot uniflna_sub1_cmp

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading comparison matrix from uniflna_sub1_cmp...
[K...got 1000 x 1000 matrix.
[Kloading labels from uniflna_sub1_cmp.labels.txt
[Ksaving histogram of matrix values => uniflna_sub1_cmp.hist.png
[Kwrote dendrogram to: uniflna_sub1_cmp.dendro.png
[Kwrote numpy distance matrix to: uniflna_sub1_cmp.matrix.png


tree:  
![dendrogram](uniflna_sub1_cmp.dendro.png)

In [16]:
start = time.time()
!sourmash compute uniflna_sub1.fasta  -k 7 --singleton
print("finished in " + str(time.time()-start) + "seconds")

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcomputing signatures for files: uniflna_sub1.fasta
[KComputing signature for ksizes: [7]
[KComputing only nucleotide (and not protein) signatures.
[KComputing a total of 1 signature(s).
[Kcalculated 1000 signatures for 1000 sequences in uniflna_sub1.fasta
[Ksaved 1000 signature(s). Note: signature license is CC0.
finished in 1.1013703346252441seconds


In [17]:
# comparison to matrx
start = time.time()
!sourmash compare uniflna_sub1.fasta.sig -o uniflna_sub1_k7_cmp
print("finished in " + str(time.time()-start) + "seconds")

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

...sig loading 999_sub1.fasta.sig
[Kloaded 1000 signatures total.                                                  
[K
min similarity in matrix: 0.002
[Ksaving labels to: uniflna_sub1_k7_cmp.labels.txt
[Ksaving distance matrix to: uniflna_sub1_k7_cmp
finished in 57.62817907333374seconds


In [18]:
# Render the histogram, dendrogram and matrix PNGs from the saved
# uniflna_sub1_k7_cmp comparison matrix.
!sourmash plot uniflna_sub1_k7_cmp

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading comparison matrix from uniflna_sub1_k7_cmp...
[K...got 1000 x 1000 matrix.
[Kloading labels from uniflna_sub1_k7_cmp.labels.txt
[Ksaving histogram of matrix values => uniflna_sub1_k7_cmp.hist.png
[Kwrote dendrogram to: uniflna_sub1_k7_cmp.dendro.png
[Kwrote numpy distance matrix to: uniflna_sub1_k7_cmp.matrix.png


tree:  
![dendrogram](uniflna_sub1_k7_cmp.dendro.png)

In [19]:
# comparison to csv for plotting in R
start = time.time()
!sourmash compare uniflna_sub1.fasta.sig --csv uniflna_sub1_k7_cmp.csv
print("finished in " + str(time.time()-start) + "seconds")

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

...sig loading 999_sub1.fasta.sig
[Kloaded 1000 signatures total.                                                  
[K
min similarity in matrix: 0.002
finished in 57.83396553993225seconds
