## Minhashing with the sourmash library  
This notebook uses the sourmash library: https://github.com/dib-lab/sourmash  
sourmash is used here on nucleotide (DNA/RNA) sequences only.

In [4]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import pandas as pd
import numpy as np
import itertools as it
import matplotlib.pyplot as plt
import multiprocessing
import time
import pycurl
# Raise matplotlib's default figure resolution to 150 dpi for figures drawn in this kernel.
plt.rcParams['figure.dpi'] = 150

The steps below follow the sourmash command-line documentation:  
https://sourmash.readthedocs.io/en/latest/command-line.html

In [9]:
# Download three gzipped RefSeq genome assemblies (.fna.gz) from the NCBI FTP server:
# Escherichia coli, Salmonella enterica and Sphingobacteriaceae bacterium DW12.
# curl flags: -L follows redirects, -O saves each file under its remote file name
# in the current working directory.
!curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Escherichia_coli/reference/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz
!curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Salmonella_enterica/reference/GCF_000006945.2_ASM694v2/GCF_000006945.2_ASM694v2_genomic.fna.gz
!curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Sphingobacteriaceae_bacterium_DW12/latest_assembly_versions/GCF_000783305.1_ASM78330v1/GCF_000783305.1_ASM78330v1_genomic.fna.gz


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1437k  100 1437k    0     0   925k      0  0:00:01  0:00:01 --:--:--  925k


In [10]:
# Print the installed sourmash version and the path it is loaded from.
!sourmash info

[Ksourmash version 2.0.0
[K- loaded from path: /home/brian/anaconda3/lib/python3.7/site-packages/sourmash
[K


In [11]:
# Compute signatures for every *.fna.gz genome in the working directory with
# sourmash's default settings. Per the output below, that means nucleotide-only
# signatures at k-mer sizes 21, 31 and 51.
!sourmash compute *.fna.gz

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcomputing signatures for files: GCF_000005845.2_ASM584v2_genomic.fna.gz, GCF_000006945.2_ASM694v2_genomic.fna.gz, GCF_000783305.1_ASM78330v1_genomic.fna.gz
[KComputing signature for ksizes: [21, 31, 51]
[KComputing only nucleotide (and not protein) signatures.
[KComputing a total of 3 signature(s).
[K... reading sequences from GCF_000005845.2_ASM584v2_genomic.fna.gz
[Kcalculated 3 signatures for 1 sequences in GCF_000005845.2_ASM584v2_genomic.fna.gz
[Ksaved 3 signature(s). Note: signature license is CC0.
[K... reading sequences from GCF_000006945.2_ASM694v2_genomic.fna.gz
[Kcalculated 3 signatures for 2 sequences in GCF_000006945.2_ASM694v2_genomic.fna.gz
[Ksaved 3 signature(s). Note: signature license is CC0.
[K... reading sequences from GCF_000783305.1_ASM78330v1_genomic.fna.gz
[Kcalculated 3 signatures for 78 sequences in GCF_000783305.1_ASM78330v1_genomic.fn

In [14]:
# Compare all *.sig signatures pairwise at k-mer size 31 (-k 31) and save the
# result under the name `cmp` (-o cmp). Per the output below, the labels are
# written alongside it to cmp.labels.txt.
!sourmash compare *.sig -o cmp -k 31

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloaded 3 signatures total.                                                     
[K
0-GCF_000005845.2...	[1.    0.008 0.   ]
1-GCF_000006945.2...	[0.008 1.    0.   ]
2-GCF_000783305.1...	[0. 0. 1.]
min similarity in matrix: 0.000
[Ksaving labels to: cmp.labels.txt
[Ksaving distance matrix to: cmp


In [15]:
# Plot the saved `cmp` comparison. Per the output below, this writes
# cmp.hist.png, cmp.dendro.png and cmp.matrix.png.
!sourmash plot cmp

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading comparison matrix from cmp...
[K...got 3 x 3 matrix.
[Kloading labels from cmp.labels.txt
[Ksaving histogram of matrix values => cmp.hist.png
[Kwrote dendrogram to: cmp.dendro.png
[Kwrote numpy distance matrix to: cmp.matrix.png
0	GCF_000005845.2_ASM584v2_genomic.fna.gz
1	GCF_000006945.2_ASM694v2_genomic.fna.gz
2	GCF_000783305.1_ASM78330v1_genomic.fna.gz


In [24]:
start = time.time()
!sourmash compute canine_flu_test.fa  -k 5 --singleton
print("finished in " + str(time.time()-start) + "seconds")

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcomputing signatures for files: canine_flu_test.fa
[KComputing signature for ksizes: [5]
[KComputing only nucleotide (and not protein) signatures.
[KComputing a total of 1 signature(s).
[Kcalculated 1240 signatures for 1240 sequences in canine_flu_test.fa
[Ksaved 1240 signature(s). Note: signature license is CC0.


In [25]:
start = time.time()
!sourmash compare canine_flu_test.fa.sig -o cmp
print("finished in " + str(time.time()-start) + "seconds")

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

...sig loading 1,239u_test.fa.sig
[Kloaded 1240 signatures total.                                                  
[K
min similarity in matrix: 0.608
[Ksaving labels to: cmp.labels.txt
[Ksaving distance matrix to: cmp


In [26]:
start = time.time()
!sourmash plot cmp
print("finished in " + str(time.time()-start) + "seconds")

[K== This is sourmash version 2.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading comparison matrix from cmp...
[K...got 1240 x 1240 matrix.
[Kloading labels from cmp.labels.txt
[Ksaving histogram of matrix values => cmp.hist.png
[Kwrote dendrogram to: cmp.dendro.png
[Kwrote numpy distance matrix to: cmp.matrix.png


tree:  
![dendrogram](cmp.dendro.png)

matrix and heatmap:  
![matrix](cmp.matrix.png)