# Take DIAMOND output and make a counts table for each taxonomic assignment to be used for Robust Aitchison Principal Components Analysis (RPCA) in DEICODE

### Assembly ORFs were run against NCBI-nr using DIAMOND with the following command: diamond blastp -d /work/hpc/users/nvp29/databases/NCBI-nr/diamond/nr_diamond_tax.dmnd -q $f -o $n'_metaFlye.diamondout' -f 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids sscinames stitle --very-sensitive --threads 20

In [1]:
import pandas as pd
import os as os
import glob as glob
import numpy as np
import sys as sys

In [2]:
# Switch the working directory so the relative file names used in the cells
# below resolve. The absolute path is specific to one machine.
os.chdir("/Users/nastassia.patin/Desktop/Projects/Lasker2019/Long read paper/mSystems/Revisions/new-ORF-annotations/for_deicode-metaflye_illumina_hybrid")

### Option 1: One file at a time

In [3]:
# Build the per-taxid hit-count table for one sample from its DIAMOND
# tabular output (15 columns, no header row).
sample_name = "1903c124_15m_hybridSPAdes"
dmnd = pd.read_csv(
    sample_name + ".diamondout",
    sep="\t",
    header=None,
    names=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart',
           'qend', 'sstart', 'send', 'evalue', 'bitscore', 'staxids', 'sscinames',
           'stitle'],
    # Read taxids as text: the field may hold several ';'-separated ids, and a
    # file containing only single ids would otherwise be parsed as numbers and
    # break the .str accessor used below.
    dtype={'staxids': str},
)
dmnd = dmnd[['qseqid', 'bitscore', 'staxids', 'sscinames']]
# filter for hits with a bitscore greater than 50; only the taxid is needed
# from here on
dmnd_filt = dmnd.loc[dmnd['bitscore'] > 50, ['staxids']].copy()
dmnd_filt['staxids'] = dmnd_filt['staxids'].str.split(';', n=1).str[0]  # if multiple tax ids, just keep first
# Add the sample label as its own column rather than overwriting the float
# 'bitscore' column with a string.
dmnd_filt['sample'] = sample_name
dmnd_filt['count'] = 1
# Sum only 'count': on pandas >= 2.0 a bare .sum() no longer drops text
# columns and would concatenate their strings within each group.
dmnd_grped = dmnd_filt.groupby(['staxids', 'sample'])['count'].sum()
dmnd_grped = dmnd_grped.reset_index()
dmnd_grped.head()

Unnamed: 0,staxids,sample,count
0,10,1903c124_15m_hybridSPAdes,20
1,100,1903c124_15m_hybridSPAdes,42
2,100035,1903c124_15m_hybridSPAdes,2
3,1000413,1903c124_15m_hybridSPAdes,21
4,100053,1903c124_15m_hybridSPAdes,20


In [4]:
# Save this sample's count table without the row index.
dmnd_grped.to_csv("1903c124_15m_hybridSPAdes_annotations.csv", index=False)

### Concatenate all individual files (bash) and import concatenated annotations files

In [34]:
# Load the concatenated per-sample annotation tables (columns: staxids,
# sample, count) and display them.
annots_all = pd.read_csv("hybridSPAdes_annotations_ALL.csv")
annots_all

Unnamed: 0,staxids,sample,count
0,999948,Las19c107_10m_hybridSPAdes,3
1,999948,1903c123_10m_hybridSPAdes,2
2,999948,1903c111_10m_hybridSPAdes,2
3,999931,Las19c138_27m_hybridSPAdes,4
4,999931,Las19c138_27m_hybridSPAdes,3
...,...,...,...
1196972,10,1903c122_28m_hybridSPAdes,12
1196973,10,1903c119_11m_hybridSPAdes,15
1196974,10,1903c118_23m_hybridSPAdes,5
1196975,10,1903c117_50m_hybridSPAdes,29


In [32]:
# .size is the number of cells (rows x columns), not the number of rows.
annots_all.size

3590931

In [36]:
# Sum the counts of rows that share the same (staxids, sample) pair.
# annots_all is rebound to the grouped result (keys in the index);
# df_all is the same data with the keys restored as ordinary columns.
annots_all = annots_all.groupby(['staxids', 'sample']).sum()
df_all = annots_all.reset_index()
df_all

Unnamed: 0,staxids,sample,count
0,2,1903c111_10m_hybridSPAdes,142
1,2,1903c117_50m_hybridSPAdes,187
2,2,1903c118_23m_hybridSPAdes,21
3,2,1903c119_11m_hybridSPAdes,102
4,2,1903c122_28m_hybridSPAdes,240
...,...,...,...
682657,2929456,1903c129_26m_hybridSPAdes,5
682658,2929456,1903c144_13m_hybridSPAdes,4
682659,2929456,Las19c107_10m_hybridSPAdes,3
682660,2929456,Las19c135_5m_hybridSPAdes,1


In [38]:
# Cell count (rows x 3 columns) after aggregation, for comparison with the
# pre-aggregation figure.
df_all.size

2047986

In [37]:
# Save the aggregated table under a new name ("-v2"), without the row index.
df_all.to_csv("hybridSPAdes_annotations_ALL-v2.csv", index=False)

In [39]:
# Write the taxid column only: one id per line, no header, no index.
# NOTE(review): df_all has one row per (staxids, sample) pair, so ids repeat
# in this file; confirm the downstream taxonomy lookup tolerates duplicates.
df_all['staxids'].to_csv("taxids_hybridSPAdes_ALL.txt", index=False, header=False)

### Option 2: In a loop if files are small enough (<1.2 GB)

In [3]:
def format_diamondout_for_qiime(diamondout, min_bitscore=50):
    """Summarise one DIAMOND tabular output file as hit counts per taxid.

    The file is expected to be tab-separated, without a header row, and to
    carry the 15 columns named in ``columns`` below.

    Parameters
    ----------
    diamondout : str
        Path to the DIAMOND output file. The sample label is the file's base
        name up to (not including) its first '.'; any directory part is
        ignored.
    min_bitscore : float, optional
        Only hits with a bitscore strictly greater than this value are
        counted. Defaults to 50.

    Returns
    -------
    pandas.DataFrame
        Columns ``staxids`` (text), ``sample`` and ``count``: one row per
        taxid with the number of retained hits. Hits with a missing taxid
        are not counted.
    """
    # partition() never raises, unlike unpacking split('.') into exactly two
    # names, which fails for file names with no dot or more than one dot.
    sample = os.path.basename(diamondout).partition('.')[0]

    columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
               'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore',
               'staxids', 'sscinames', 'stitle']
    dmnd = pd.read_csv(
        diamondout,
        sep="\t",
        header=None,
        names=columns,
        # Only these two columns feed the result; skipping the rest keeps
        # memory use down on large files.
        usecols=['bitscore', 'staxids'],
        # Force text: a file with only single taxids would otherwise be
        # parsed as numbers and break the .str accessor below.
        dtype={'staxids': str},
    )

    hits = dmnd.loc[dmnd['bitscore'] > min_bitscore, ['staxids']].copy()
    # if multiple tax ids, just keep first
    hits['staxids'] = hits['staxids'].str.split(';', n=1).str[0]
    hits['sample'] = sample
    hits['count'] = 1

    # Sum only 'count' so the result does not depend on how the installed
    # pandas version treats non-numeric columns in groupby().sum().
    return hits.groupby(['staxids', 'sample'])['count'].sum().reset_index()

#### Run function on all DIAMOND output files

In [None]:
# Parse every "*.diamondout" file in the working directory into its own
# count table, then stack the tables into one long table.
df = [format_diamondout_for_qiime(f) for f in glob.glob("*.diamondout")]

df_all = pd.concat(df)

#### If very large files had to be split into sub-files, run the below loop to combine all sub-files into one sample file

In [20]:
# For every sample, parse each of its split sub-files
# ("<sample>-sub.a*") and collect the resulting count tables.
df = []

for f in glob.glob("*.diamondout"):
    # Sample prefix = file name up to its first '.'. partition() cannot fail
    # the way unpacking split('.') into exactly two names does when the file
    # name contains more than one dot.
    prefix = f.partition(".")[0]
    # Escape the prefix so characters such as '[' or '*' in a sample name are
    # matched literally; only the trailing "a*" acts as a wildcard.
    for n in glob.glob(glob.escape(prefix) + "-sub.a*"):
        df.append(format_diamondout_for_qiime(n))

# NOTE: pd.concat raises ValueError when no sub-file matched at all.
df_all = pd.concat(df)

In [21]:
# Preview the first rows of the stacked sub-file tables.
df_all.head()

Unnamed: 0,staxids,sample,count
0,10,1903c111_10m-1_SPAdes-sub,7
1,100,1903c111_10m-1_SPAdes-sub,8
2,100035,1903c111_10m-1_SPAdes-sub,1
3,1000413,1903c111_10m-1_SPAdes-sub,7
4,100053,1903c111_10m-1_SPAdes-sub,7


In [24]:
# Remove the trailing "-sub" marker so rows from sub-files carry their parent
# sample name. Matching the literal suffix at the end of the name (instead of
# cutting at the first "-s" found anywhere) leaves sample names that merely
# contain "-s" elsewhere intact.
df_all['sample'] = df_all['sample'].str.replace(r'-sub$', '', regex=True)
df_all['sample'].unique()

In [27]:
# Persist the table (without the row index) so it can be reloaded later
# instead of re-parsing the DIAMOND output.
df_all.to_csv("Illumina_large_annotations.csv", index=False)

### For the 'large' and 'small' data sets, load and combine them, then export the taxids

In [28]:
# Switch to the Illumina annotations folder (machine-specific absolute path).
os.chdir("/Users/nastassia.patin/Desktop/Projects/Lasker2019/Long read paper/mSystems/Revisions/new-ORF-annotations/Illumina/")

In [30]:
# Load the 'small' and 'large' Illumina annotation tables from the current
# working directory.
df_small = pd.read_csv("Illumina_small_annotations.csv")
df_large = pd.read_csv("Illumina_large_annotations.csv")

In [33]:
# Stack the two Illumina tables row-wise. Row labels are carried over from
# the inputs (not renumbered), so they may repeat.
df_illumina = pd.concat([df_small, df_large])
df_illumina

Unnamed: 0,staxids,sample,count
0,10,1903c126_45m-1_SPAdes,5
1,100,1903c126_45m-1_SPAdes,40
2,100035,1903c126_45m-1_SPAdes,52
3,1000413,1903c126_45m-1_SPAdes,357
4,100053,1903c126_45m-1_SPAdes,29
...,...,...,...
2729083,999883,1903c129_26m-2_SPAdes,2
2729084,999894,1903c129_26m-2_SPAdes,2
2729085,999898,1903c129_26m-2_SPAdes,1
2729086,9999,1903c129_26m-2_SPAdes,20


In [49]:
# Collapse rows sharing a (staxids, sample) pair into one summed count and
# turn the grouping keys back into ordinary columns.
df_illumina = df_illumina.groupby(['staxids', 'sample']).sum().reset_index()
df_illumina

Unnamed: 0,staxids,sample,count
0,2,1903c111_10m-1_SPAdes,217
1,2,1903c111_10m-2_SPAdes,141
2,2,1903c117_50m-1_SPAdes,185
3,2,1903c117_50m-2_SPAdes,83
4,2,1903c119_11m-2_SPAdes,102
...,...,...,...
1141963,2929456,Las19c107_10m-1_SPAdes,3
1141964,2929456,Las19c107_10m-2_SPAdes,3
1141965,2929456,Las19c135_5m-1_SPAdes,1
1141966,2929456,Las19c135_5m-2_SPAdes,1


In [50]:
# Write the taxid column (one id per line, no header, no index) for building
# the taxonomy metadata file.
# NOTE(review): one line is written per (staxids, sample) row, so ids repeat;
# confirm the downstream step tolerates duplicates.
df_illumina['staxids'].to_csv("taxids_Illumina.txt", index=False, header=False)

## Combine the three (Illumina, metaFlye, and hybridSPAdes) data sets into one big DF

In [36]:
# Move up to the parent annotations folder and load the metaFlye and
# hybridSPAdes count tables.
# NOTE(review): an earlier cell saves the re-aggregated hybrid table as
# "hybridSPAdes_annotations_ALL-v2.csv", while this loads the un-suffixed
# file; confirm the file read here has unique (staxids, sample) rows.
os.chdir("/Users/nastassia.patin/Desktop/Projects/Lasker2019/Long read paper/mSystems/Revisions/new-ORF-annotations/")
df_metaflye = pd.read_csv("metaFlye_annotations_ALL.csv")
df_hybrid = pd.read_csv("hybridSPAdes_annotations_ALL.csv")

In [51]:
# Stack the Illumina, hybridSPAdes and metaFlye long-format tables row-wise
# (row labels are carried over from each input, not renumbered).
dfs = [df_illumina, df_hybrid, df_metaflye]
df_all = pd.concat(dfs)
df_all

Unnamed: 0,staxids,sample,count
0,2,1903c111_10m-1_SPAdes,217
1,2,1903c111_10m-2_SPAdes,141
2,2,1903c117_50m-1_SPAdes,185
3,2,1903c117_50m-2_SPAdes,83
4,2,1903c119_11m-2_SPAdes,102
...,...,...,...
183576,989422,1903c127_7m-3_metaFlye,1
183577,989436,1903c127_7m-3_metaFlye,1
183578,991,1903c127_7m-3_metaFlye,1
183579,996,1903c127_7m-3_metaFlye,1


In [52]:
# pivot table so each sample is a column and counts are values
# pivot_table with aggfunc='sum' is used instead of DataFrame.pivot: pivot
# raises ValueError as soon as one (staxids, sample) pair occurs twice, which
# can happen when a concatenated input was not aggregated beforehand. Summing
# gives the same table as pivot when every pair is unique; pairs with no hits
# stay NaN.
df_all_pivot = df_all.pivot_table(index='staxids', columns='sample', values='count', aggfunc='sum')

In [53]:
# Display the pivoted table.
df_all_pivot

sample,1903c111_10m-1_SPAdes,1903c111_10m-2_SPAdes,1903c111_10m-3_metaFlye,1903c111_10m_hybridSPAdes,1903c117_50m-1_SPAdes,1903c117_50m-2_SPAdes,1903c117_50m-3_metaFlye,1903c117_50m_hybridSPAdes,1903c118_23m-3_metaFlye,1903c118_23m_hybridSPAdes,...,Las19c107_10m-2_SPAdes,Las19c107_10m-3_metaFlye,Las19c107_10m_hybridSPAdes,Las19c135_5m-1_SPAdes,Las19c135_5m-2_SPAdes,Las19c135_5m-3_metaFlye,Las19c135_5m_hybridSPAdes,Las19c138_27m-1_SPAdes,Las19c138_27m-3_metaFlye,Las19c138_27m_hybridSPAdes
staxids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,217.0,141.0,17.0,142.0,185.0,83.0,46.0,187.0,,21.0,...,218.0,,221.0,175.0,165.0,138.0,164.0,298.0,21.0,322.0
6,6.0,3.0,,3.0,3.0,1.0,1.0,2.0,,4.0,...,7.0,,7.0,6.0,3.0,1.0,3.0,12.0,,11.0
7,54.0,52.0,,52.0,40.0,9.0,20.0,39.0,,5.0,...,38.0,,38.0,75.0,55.0,7.0,54.0,110.0,5.0,117.0
9,26.0,24.0,,24.0,16.0,19.0,13.0,18.0,,17.0,...,26.0,,26.0,9.0,43.0,12.0,44.0,76.0,13.0,70.0
10,54.0,33.0,,34.0,32.0,10.0,6.0,29.0,,5.0,...,38.0,,37.0,23.0,24.0,5.0,23.0,26.0,,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2928627,28.0,17.0,1.0,17.0,1.0,2.0,4.0,2.0,,2.0,...,4.0,,4.0,3.0,10.0,34.0,10.0,14.0,2.0,14.0
2928628,29.0,17.0,1.0,17.0,2.0,1.0,10.0,3.0,,,...,,,,3.0,4.0,21.0,4.0,9.0,,11.0
2928629,46.0,34.0,2.0,34.0,7.0,2.0,2.0,7.0,,1.0,...,2.0,,2.0,5.0,5.0,37.0,5.0,25.0,5.0,26.0
2928680,,,,,1.0,,,1.0,,,...,,,,,,,,,,


In [54]:
# Replace NaN cells (taxid/sample pairs with no rows in the long table)
# with a count of 0.
df_all_pivot = df_all_pivot.fillna(0)
df_all_pivot

sample,1903c111_10m-1_SPAdes,1903c111_10m-2_SPAdes,1903c111_10m-3_metaFlye,1903c111_10m_hybridSPAdes,1903c117_50m-1_SPAdes,1903c117_50m-2_SPAdes,1903c117_50m-3_metaFlye,1903c117_50m_hybridSPAdes,1903c118_23m-3_metaFlye,1903c118_23m_hybridSPAdes,...,Las19c107_10m-2_SPAdes,Las19c107_10m-3_metaFlye,Las19c107_10m_hybridSPAdes,Las19c135_5m-1_SPAdes,Las19c135_5m-2_SPAdes,Las19c135_5m-3_metaFlye,Las19c135_5m_hybridSPAdes,Las19c138_27m-1_SPAdes,Las19c138_27m-3_metaFlye,Las19c138_27m_hybridSPAdes
staxids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,217.0,141.0,17.0,142.0,185.0,83.0,46.0,187.0,0.0,21.0,...,218.0,0.0,221.0,175.0,165.0,138.0,164.0,298.0,21.0,322.0
6,6.0,3.0,0.0,3.0,3.0,1.0,1.0,2.0,0.0,4.0,...,7.0,0.0,7.0,6.0,3.0,1.0,3.0,12.0,0.0,11.0
7,54.0,52.0,0.0,52.0,40.0,9.0,20.0,39.0,0.0,5.0,...,38.0,0.0,38.0,75.0,55.0,7.0,54.0,110.0,5.0,117.0
9,26.0,24.0,0.0,24.0,16.0,19.0,13.0,18.0,0.0,17.0,...,26.0,0.0,26.0,9.0,43.0,12.0,44.0,76.0,13.0,70.0
10,54.0,33.0,0.0,34.0,32.0,10.0,6.0,29.0,0.0,5.0,...,38.0,0.0,37.0,23.0,24.0,5.0,23.0,26.0,0.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2928627,28.0,17.0,1.0,17.0,1.0,2.0,4.0,2.0,0.0,2.0,...,4.0,0.0,4.0,3.0,10.0,34.0,10.0,14.0,2.0,14.0
2928628,29.0,17.0,1.0,17.0,2.0,1.0,10.0,3.0,0.0,0.0,...,0.0,0.0,0.0,3.0,4.0,21.0,4.0,9.0,0.0,11.0
2928629,46.0,34.0,2.0,34.0,7.0,2.0,2.0,7.0,0.0,1.0,...,2.0,0.0,2.0,5.0,5.0,37.0,5.0,25.0,5.0,26.0
2928680,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Convert floats to integers

In [56]:
# With the NaNs gone, cast the float columns back to integer counts.
df_all_pivot = df_all_pivot.astype(int)
df_all_pivot

In [58]:
# Write the count table as tab-separated text; the staxids index is kept as
# the first column.
df_all_pivot.to_csv("metaflye_hybrid_illumina_dfs.tsv", sep="\t")

In [10]:
# Reload the count table, using the first column as the row index.
# NOTE(review): the table is written elsewhere in this notebook as
# "metaflye_hybrid_illumina_dfs.tsv" but read here with a ".txt" extension;
# confirm the file was renamed or copied.
df_all_pivot = pd.read_csv("metaflye_hybrid_illumina_dfs.txt", sep="\t", index_col=0)

In [11]:
# Display the reloaded count table.
df_all_pivot

Unnamed: 0,1903c111_10m-1_SPAdes,1903c111_10m-2_SPAdes,1903c111_10m-3_metaFlye,1903c111_10m_hybridSPAdes,1903c117_50m-1_SPAdes,1903c117_50m-2_SPAdes,1903c117_50m-3_metaFlye,1903c117_50m_hybridSPAdes,1903c118_23m-3_metaFlye,1903c118_23m_hybridSPAdes,...,Las19c107_10m-2_SPAdes,Las19c107_10m-3_metaFlye,Las19c107_10m_hybridSPAdes,Las19c135_5m-1_SPAdes,Las19c135_5m-2_SPAdes,Las19c135_5m-3_metaFlye,Las19c135_5m_hybridSPAdes,Las19c138_27m-1_SPAdes,Las19c138_27m-3_metaFlye,Las19c138_27m_hybridSPAdes
2,217,141,17,142,185,83,46,187,0,21,...,218,0,221,175,165,138,164,298,21,322
6,6,3,0,3,3,1,1,2,0,4,...,7,0,7,6,3,1,3,12,0,11
7,54,52,0,52,40,9,20,39,0,5,...,38,0,38,75,55,7,54,110,5,117
9,26,24,0,24,16,19,13,18,0,17,...,26,0,26,9,43,12,44,76,13,70
10,54,33,0,34,32,10,6,29,0,5,...,38,0,37,23,24,5,23,26,0,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2928627,28,17,1,17,1,2,4,2,0,2,...,4,0,4,3,10,34,10,14,2,14
2928628,29,17,1,17,2,1,10,3,0,0,...,0,0,0,3,4,21,4,9,0,11
2928629,46,34,2,34,7,2,2,7,0,1,...,2,0,2,5,5,37,5,25,5,26
2928680,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Add 1 to every cell in place, so the zero cells become 1.
df_all_pivot += 1

In [13]:
# Display the table after the +1 shift.
df_all_pivot

Unnamed: 0,1903c111_10m-1_SPAdes,1903c111_10m-2_SPAdes,1903c111_10m-3_metaFlye,1903c111_10m_hybridSPAdes,1903c117_50m-1_SPAdes,1903c117_50m-2_SPAdes,1903c117_50m-3_metaFlye,1903c117_50m_hybridSPAdes,1903c118_23m-3_metaFlye,1903c118_23m_hybridSPAdes,...,Las19c107_10m-2_SPAdes,Las19c107_10m-3_metaFlye,Las19c107_10m_hybridSPAdes,Las19c135_5m-1_SPAdes,Las19c135_5m-2_SPAdes,Las19c135_5m-3_metaFlye,Las19c135_5m_hybridSPAdes,Las19c138_27m-1_SPAdes,Las19c138_27m-3_metaFlye,Las19c138_27m_hybridSPAdes
2,218,142,18,143,186,84,47,188,1,22,...,219,1,222,176,166,139,165,299,22,323
6,7,4,1,4,4,2,2,3,1,5,...,8,1,8,7,4,2,4,13,1,12
7,55,53,1,53,41,10,21,40,1,6,...,39,1,39,76,56,8,55,111,6,118
9,27,25,1,25,17,20,14,19,1,18,...,27,1,27,10,44,13,45,77,14,71
10,55,34,1,35,33,11,7,30,1,6,...,39,1,38,24,25,6,24,27,1,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2928627,29,18,2,18,2,3,5,3,1,3,...,5,1,5,4,11,35,11,15,3,15
2928628,30,18,2,18,3,2,11,4,1,1,...,1,1,1,4,5,22,5,10,1,12
2928629,47,35,3,35,8,3,3,8,1,2,...,3,1,3,6,6,38,6,26,6,27
2928680,1,1,1,1,2,1,1,2,1,1,...,1,1,1,1,1,1,1,1,1,1


In [14]:
# Save the +1-shifted table as tab-separated text under a separate name.
df_all_pivot.to_csv("metaflye_hybrid_illumina_dfs-transf.tsv", sep="\t")

In [15]:
# List the column labels (one per sample) of the count table.
df_all_pivot.columns

Index(['1903c111_10m-1_SPAdes', '1903c111_10m-2_SPAdes',
       '1903c111_10m-3_metaFlye', '1903c111_10m_hybridSPAdes',
       '1903c117_50m-1_SPAdes', '1903c117_50m-2_SPAdes',
       '1903c117_50m-3_metaFlye', '1903c117_50m_hybridSPAdes',
       '1903c118_23m-3_metaFlye', '1903c118_23m_hybridSPAdes',
       '1903c119_11m-2_SPAdes', '1903c119_11m-3_metaFlye',
       '1903c119_11m_hybridSPAdes', '1903c122_28m-1_SPAdes',
       '1903c122_28m-2_SPAdes', '1903c122_28m-3_metaFlye',
       '1903c122_28m_hybridSPAdes', '1903c123_10m-1_SPAdes',
       '1903c123_10m-2_SPAdes', '1903c123_10m-3_metaFlye',
       '1903c123_10m_hybridSPAdes', '1903c124_15m-1_SPAdes',
       '1903c124_15m-2_SPAdes', '1903c124_15m-3_metaFlye',
       '1903c124_15m_hybridSPAdes', '1903c126_45m-1_SPAdes',
       '1903c126_45m-2_SPAdes', '1903c126_45m-3_metaFlye',
       '1903c126_45m_hybridSPAdes', '1903c127_7m-1_SPAdes',
       '1903c127_7m-2_SPAdes', '1903c127_7m-3_metaFlye',
       '1903c127_7m_hybridSPAdes', '1903c