In [43]:
import pandas as pd
import numpy as np

#### Reading dataframes created in [SpeciesBinPlotWithQuality](SpeciesBinPlotWithQuality.ipynb.ipynb#section_id2) notebook

<a id='section_id2'></a>

In [64]:
df_lactobacillus_HQ = pd.read_csv('data56_related_files/HighQLactobacillus')
df_alistipes_HQ = pd.read_csv('data56_related_files/HighQAlistipes')
df_lactobacillacea_HQ = pd.read_csv('data56_related_files/HighQLactobacillaceae')
df_NCBI_lactobacillus_HQ = pd.read_csv('data56_related_files/HighQ_NCBI_Lactobacillus')
df_dereplicated_bins = pd.read_csv('data56_related_files/derepelicatedBins_dRep.tsv')

#### Renaming column to same key name and removing extension

In [67]:
df_dereplicated_bins = df_dereplicated_bins.rename({'derepelicatedBins_dRep': 'Bin_Id'}, axis=1)

In [68]:
df_dereplicated_bins['Bin_Id'] = df_dereplicated_bins['Bin_Id'].map(lambda x: x.rstrip('.fna'))

In [69]:
df_dereplicated_bins

Unnamed: 0,Bin_Id
0,S10C1039
1,S10C1295
2,S10C1297
3,S10C15900
4,S10C2556
...,...
728,S9C61
729,S9C8881
730,S9C92
731,S9C962


#### Joining dereplicated bins with high quality bins using 'inner' keeping those who are in both datasets. 

In [48]:
derep_Lactobacillus = pd.merge(df_lactobacillus_HQ, df_dereplicated_bins, how="inner", on='Bin_Id')

In [62]:
derep_Lactobacillus['Species']

0    s__Lactobacillus gallinarum
1     s__Lactobacillus crispatus
2     s__Lactobacillus johnsonii
3                            s__
Name: Species, dtype: object

In [50]:
derep_Alistipes = pd.merge(df_alistipes_HQ, df_dereplicated_bins, how="inner", on='Bin_Id')

In [65]:
derep_Alistipes['Species']

0     s__Alistipes sp000434235
1     s__Alistipes sp900544265
2      s__Alistipes finegoldii
3     s__Alistipes sp002161445
4        s__Alistipes communis
5     s__Alistipes sp900021155
6     s__Alistipes sp900542505
7                          s__
8     s__Alistipes onderdonkii
9                          s__
10         s__Alistipes dispar
11    s__Alistipes sp900544265
12                         s__
13                         s__
14    s__Alistipes sp900546065
15                         s__
16    s__Alistipes sp900546065
17    s__Alistipes sp900290115
18    s__Alistipes sp900544265
19    s__Alistipes sp002161445
20    s__Alistipes sp900546065
Name: Species, dtype: object

In [100]:
derep_Lactobacillaceae = pd.merge(df_lactobacillacea_HQ, df_dereplicated_bins, how="inner", on='Bin_Id')

In [101]:
len(derep_Lactobacillaceae)

29

In [81]:
derep_Lactobacillaceae['Species'].nunique()

14

In [124]:
derep_Lactobacillaceae['Species']  

0        s__Limosilactobacillus ingluviei
1             s__Ligilactobacillus agilis
2                                     s__
3                                     s__
4     s__Limosilactobacillus coleohominis
5        s__Limosilactobacillus vaginalis
6           s__Ligilactobacillus aviarius
7        s__Limosilactobacillus ingluviei
8                                     s__
9                                     s__
10       s__Limosilactobacillus ingluviei
11       s__Limosilactobacillus reuteri_E
12            s__Ligilactobacillus agilis
13        s__Ligilactobacillus salivarius
14          s__Ligilactobacillus aviarius
15            s__Lactobacillus gallinarum
16                                    s__
17            s__Ligilactobacillus agilis
18             s__Lactobacillus crispatus
19        s__Ligilactobacillus salivarius
20                                    s__
21            s__Limosilactobacillus oris
22       s__Limosilactobacillus ingluviei
23                   s__Weissella 

#### Saving the bin ids for the high quality bins which was not removed by drep in the Lactobacillacea dataframe. 
These will be used as reference genomes in inStrain

In [111]:
derep_Lactobacillaceae['Bin_Id'].to_csv('data56_related_files/derep_lactobacillaceae_bins.csv', index = False)

#### Generating header names for ref bins 
This is done to resemble the downloaded reference genomes including species so they are easily manipulated later

In [219]:
df_headers = derep_Lactobacillaceae[['Genus', 'Species', 'Bin_Id']]

In [220]:
df_headers['header'] = ''
for index, row in df_headers.iterrows():
    if (row['Species'] == 's__'):
        row["header"] = row["Genus"] + row["Species"] + row["Bin_Id"]
    else:
        row["header"] = row["Species"] + row["Bin_Id"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [221]:
df_headers.head(5)

Unnamed: 0,Genus,Species,Bin_Id,header
0,g__Limosilactobacillus,s__Limosilactobacillus ingluviei,S10C1039,s__Limosilactobacillus ingluvieiS10C1039
1,g__Ligilactobacillus,s__Ligilactobacillus agilis,S10C1297,s__Ligilactobacillus agilisS10C1297
2,g__Ligilactobacillus,s__,S10C3913,g__Ligilactobacilluss__S10C3913
3,g__Ligilactobacillus,s__,S10C455,g__Ligilactobacilluss__S10C455
4,g__Limosilactobacillus,s__Limosilactobacillus coleohominis,S12C1500,s__Limosilactobacillus coleohominisS12C1500


In [222]:
df_headers['header'] = df_headers['header'].str.replace(r'g__', '')
df_headers['header'] = df_headers['header'].str.replace(r'S', '_S')
df_headers['header'] = df_headers['header'].str.replace(r's__', ' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [223]:
df_headers.head(5)

Unnamed: 0,Genus,Species,Bin_Id,header
0,g__Limosilactobacillus,s__Limosilactobacillus ingluviei,S10C1039,Limosilactobacillus ingluviei_S10C1039
1,g__Ligilactobacillus,s__Ligilactobacillus agilis,S10C1297,Ligilactobacillus agilis_S10C1297
2,g__Ligilactobacillus,s__,S10C3913,Ligilactobacillus _S10C3913
3,g__Ligilactobacillus,s__,S10C455,Ligilactobacillus _S10C455
4,g__Limosilactobacillus,s__Limosilactobacillus coleohominis,S12C1500,Limosilactobacillus coleohominis_S12C1500


In [224]:
df_headers['header'] =df_headers['header'] + ','


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [225]:
df_headers.head(4)

Unnamed: 0,Genus,Species,Bin_Id,header
0,g__Limosilactobacillus,s__Limosilactobacillus ingluviei,S10C1039,"Limosilactobacillus ingluviei_S10C1039,"
1,g__Ligilactobacillus,s__Ligilactobacillus agilis,S10C1297,"Ligilactobacillus agilis_S10C1297,"
2,g__Ligilactobacillus,s__,S10C3913,"Ligilactobacillus _S10C3913,"
3,g__Ligilactobacillus,s__,S10C455,"Ligilactobacillus _S10C455,"


In [226]:
df_headers['header'].to_csv('data56_related_files/ref_bins_header.csv', index = False, header = False)

#### Testing on the NCBI defined Lactobacillus  
One less uniqie species and one bin less than the Lactobacillaceae family. 

In [95]:
derep_NCBI_Lactobacillus = pd.merge(df_NCBI_lactobacillus_HQ, df_dereplicated_bins, how="inner", on='Bin_Id')

In [99]:
len(derep_NCBI_Lactobacillus)

28

In [102]:
derep_NCBI_Lactobacillus['Species'].nunique()

13

#### Extracting HQ Lactobacillaceae bins for use in InStrain - not used.

In [105]:
df_lactobacillacea_HQ['Bin_Id'].to_csv('data56_related_files/Lactobacillaceae_HQ_binIDs', index = False)