In [43]:
import pandas as pd
import numpy as np

#### Reading dataframes

In [64]:
# Load the per-group "HighQ" bin tables and the dRep dereplicated bin list.
# All paths are relative to the notebook's working directory.
df_lactobacillus_HQ = pd.read_csv(filepath_or_buffer='data56_related_files/HighQLactobacillus')
df_alistipes_HQ = pd.read_csv(filepath_or_buffer='data56_related_files/HighQAlistipes')
df_lactobacillacea_HQ = pd.read_csv(filepath_or_buffer='data56_related_files/HighQLActobacillaceae')
df_NCBI_lactobacillus_HQ = pd.read_csv(filepath_or_buffer='data56_related_files/HighQ_NCBI_Lactobacillus')
# NOTE(review): this file has a .tsv extension but is parsed with the default comma separator — confirm.
df_dereplicated_bins = pd.read_csv(filepath_or_buffer='data56_related_files/derepelicatedBins_dRep.tsv')

#### Renaming column to same key name and removing extension

In [67]:
# Give the bin-list column the same key name ('Bin_Id') that the merges below join on.
df_dereplicated_bins = df_dereplicated_bins.rename(columns={'derepelicatedBins_dRep': 'Bin_Id'})

In [68]:
# Drop the trailing '.fna' extension from each bin ID.
# An anchored regex removes only the literal suffix. str.rstrip('.fna') treats its
# argument as a *set of characters*, so it would also eat trailing '.', 'f', 'n' or 'a'
# characters that belong to the ID itself (e.g. 'bina.fna' -> 'bi').
df_dereplicated_bins['Bin_Id'] = df_dereplicated_bins['Bin_Id'].str.replace(r'\.fna$', '', regex=True)

In [69]:
# Display the dereplicated bin table to inspect the 'Bin_Id' column.
df_dereplicated_bins

Unnamed: 0,Bin_Id
0,S10C1039
1,S10C1295
2,S10C1297
3,S10C15900
4,S10C2556
...,...
728,S9C61
729,S9C8881
730,S9C92
731,S9C962


#### Joining the dereplicated bins with the high-quality bins using an inner join, keeping only the bins that are present in both datasets.

In [48]:
# Inner join on 'Bin_Id': keep only the Lactobacillus rows whose bin also appears in the dereplicated list.
derep_Lactobacillus = df_lactobacillus_HQ.merge(df_dereplicated_bins, on='Bin_Id', how='inner')

In [62]:
# Show the Species value of each Lactobacillus bin retained by the inner join.
derep_Lactobacillus['Species']

0    s__Lactobacillus gallinarum
1     s__Lactobacillus crispatus
2     s__Lactobacillus johnsonii
3                            s__
Name: Species, dtype: object

In [50]:
# Inner join on 'Bin_Id': keep only the Alistipes rows whose bin also appears in the dereplicated list.
derep_Alistipes = df_alistipes_HQ.merge(df_dereplicated_bins, on='Bin_Id', how='inner')

In [65]:
# Show the Species value of each Alistipes bin retained by the inner join.
derep_Alistipes['Species']

0     s__Alistipes sp000434235
1     s__Alistipes sp900544265
2      s__Alistipes finegoldii
3     s__Alistipes sp002161445
4        s__Alistipes communis
5     s__Alistipes sp900021155
6     s__Alistipes sp900542505
7                          s__
8     s__Alistipes onderdonkii
9                          s__
10         s__Alistipes dispar
11    s__Alistipes sp900544265
12                         s__
13                         s__
14    s__Alistipes sp900546065
15                         s__
16    s__Alistipes sp900546065
17    s__Alistipes sp900290115
18    s__Alistipes sp900544265
19    s__Alistipes sp002161445
20    s__Alistipes sp900546065
Name: Species, dtype: object

In [71]:
# Inner join on 'Bin_Id': keep only the Lactobacillaceae rows whose bin also appears in the dereplicated list.
derep_Lactobacillaceae = df_lactobacillacea_HQ.merge(df_dereplicated_bins, on='Bin_Id', how='inner')

In [81]:
# Count the distinct Species values among the retained Lactobacillaceae bins.
# Note: the bare 's__' placeholder that appears in this column is a string, so it counts as one distinct value.
derep_Lactobacillaceae['Species'].nunique()

14

In [85]:
# Show the Species value of each Lactobacillaceae bin retained by the inner join.
derep_Lactobacillaceae['Species']

0        s__Limosilactobacillus ingluviei
1             s__Ligilactobacillus agilis
2                                     s__
3                                     s__
4     s__Limosilactobacillus coleohominis
5        s__Limosilactobacillus vaginalis
6           s__Ligilactobacillus aviarius
7        s__Limosilactobacillus ingluviei
8                                     s__
9                                     s__
10       s__Limosilactobacillus ingluviei
11       s__Limosilactobacillus reuteri_E
12            s__Ligilactobacillus agilis
13        s__Ligilactobacillus salivarius
14          s__Ligilactobacillus aviarius
15            s__Lactobacillus gallinarum
16                                    s__
17            s__Ligilactobacillus agilis
18             s__Lactobacillus crispatus
19        s__Ligilactobacillus salivarius
20                                    s__
21            s__Limosilactobacillus oris
22       s__Limosilactobacillus ingluviei
23                   s__Weissella 

In [86]:
# Display the merged Lactobacillaceae table with all of its columns.
derep_Lactobacillaceae

Unnamed: 0,Bin_Id,classification,closest_placement_taxonomy,msa_percent,Domain,Phylum,Class,Order,Family,Genus,...,1,2,3,4,5+,Completeness,Contamination,Strain heterogeneity,bin_class,class_3
0,S10C1039,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,94.58,d__Bacteria,p__Firmicutes,c__Bacilli,o__Lactobacillales,f__Lactobacillaceae,g__Limosilactobacillus,...,331,0,0,0,0,98.91,0.0,0.0,0,0
1,S10C1297,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,93.47,d__Bacteria,p__Firmicutes,c__Bacilli,o__Lactobacillales,f__Lactobacillaceae,g__Ligilactobacillus,...,341,1,0,0,0,96.6,0.26,0.0,0,0
2,S10C3913,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,,95.89,d__Bacteria,p__Firmicutes,c__Bacilli,o__Lactobacillales,f__Lactobacillaceae,g__Ligilactobacillus,...,343,2,0,0,0,98.06,1.05,0.0,0,0
3,S10C455,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,90.81,d__Bacteria,p__Firmicutes,c__Bacilli,o__Lactobacillales,f__Lactobacillaceae,g__Ligilactobacillus,...,333,10,1,0,0,97.38,3.84,15.38,0,0
4,S12C1500,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,83.32,d__Bacteria,p__Firmicutes,c__Bacilli,o__Lactobacillales,f__Lactobacillaceae,g__Limosilactobacillus,...,301,1,0,0,0,98.91,0.55,100.0,0,0
5,S12C188,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,92.87,d__Bacteria,p__Firmicutes,c__Bacilli,o__Lactobacillales,f__Lactobacillaceae,g__Limosilactobacillus,...,324,0,0,0,0,96.55,0.0,0.0,0,0
6,S13C2782,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,90.73,d__Bacteria,p__Firmicutes,c__Bacilli,o__Lactobacillales,f__Lactobacillaceae,g__Ligilactobacillus,...,324,10,0,0,0,93.89,3.66,80.0,0,0
7,S14C1039,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,89.84,d__Bacteria,p__Firmicutes,c__Bacilli,o__Lactobacillales,f__Lactobacillaceae,g__Limosilactobacillus,...,320,0,0,0,0,98.55,0.0,0.0,0,0
8,S14C330,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,95.73,d__Bacteria,p__Firmicutes,c__Bacilli,o__Lactobacillales,f__Lactobacillaceae,g__Limosilactobacillus,...,332,1,0,0,0,99.45,0.55,100.0,0,0
9,S14C589,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,,82.69,d__Bacteria,p__Firmicutes,c__Bacilli,o__Lactobacillales,f__Lactobacillaceae,g__Limosilactobacillus,...,304,3,0,0,0,96.75,1.14,66.67,0,0


#### Saving the bin IDs of the high-quality bins in the Lactobacillaceae dataframe that were not removed by dRep.

In [92]:
# Write only the 'Bin_Id' column of the retained Lactobacillaceae bins to CSV, without the row index.
derep_Lactobacillaceae.loc[:, 'Bin_Id'].to_csv('data56_related_files/derep_lactobacillaceae_bins.csv', index=False)

#### Testing on the NCBI defined Lactobacillus  
One fewer unique species and one fewer bin than the Lactobacillaceae family.

In [73]:
# Inner join on 'Bin_Id': keep only the NCBI-defined Lactobacillus rows whose bin also appears in the dereplicated list.
derep_NCBI_Lactobacillus = df_NCBI_lactobacillus_HQ.merge(df_dereplicated_bins, on='Bin_Id', how='inner')

In [56]:
# Count the distinct Species values among the retained NCBI-defined Lactobacillus bins.
# NOTE(review): a bare 's__' placeholder, if present in this column, would count as one distinct value — confirm.
derep_NCBI_Lactobacillus['Species'].nunique()

13