In [1]:
import pandas as pd 
import numpy as np 

## Load embedding and integrated dataset. 

In [5]:
# Read the embedding table and the integrated dataset from CSV.
umap_csv = '../data/df_umap.csv'
integrated_csv = '../data/integrated_rnaseq_tnseq_v0.csv'

df_umap = pd.read_csv(umap_csv)
integrated = pd.read_csv(integrated_csv)

In [16]:
# Use 'Rv_ID' as the gene identifier column in both tables, matching the
# key column of the annotation tables they are merged with later.
locus_to_rv = {'LocusTag': 'Rv_ID'}
df_umap = df_umap.rename(columns = locus_to_rv)
integrated = integrated.rename(columns = locus_to_rv)

In [21]:
# Check the dimensions (rows, columns) of the embedding table before annotating it.
df_umap.shape

(4055, 14)

## Load tuberculist and Sanger annotations. 

In [6]:
# Read the two annotation tables (Sanger roles and TubercuList categories).
sanger_csv = '../data/H37Rv_sanger_roles.csv'
tblist_csv = '../data/H37Rv_tuberculist_functional_categories.csv'

sanger = pd.read_csv(sanger_csv)
tblist = pd.read_csv(tblist_csv)

In [32]:
# Preview the TubercuList table to see which columns it provides.
tblist.head()

Unnamed: 0,Rv_ID,gene_name,category
0,Rv0001,dnaA,information pathways
1,Rv0002,dnaN,information pathways
2,Rv0003,recF,information pathways
3,Rv0004,-,conserved hypotheticals
4,Rv0005,gyrB,information pathways


In [33]:
# Keep only the merge key plus the single annotation column from each table.
sanger_keep = ['Rv_ID', 'function']
tblist_keep = ['Rv_ID', 'category']

sanger = sanger[sanger_keep]
tblist = tblist[tblist_keep]

In [None]:
# Suffix the Sanger annotation column with its source so it stays
# distinguishable from the other function columns after merging.
# Rebind instead of renaming in place: `sanger` was produced by column
# selection, and in-place changes on such a selection may trigger pandas'
# SettingWithCopyWarning.
sanger = sanger.rename(columns = {'function': 'function_sanger'})

### Annotate using Rv ID 

Let's use a left join to merge the datasets, so that every row of the embedding table is kept. 

In [47]:
# Attach the TubercuList category first, then the Sanger function.
# Both are left joins on Rv_ID, so every row of the embedding table is kept.
df_umap_tb = df_umap.merge(tblist, how = 'left', on = 'Rv_ID')
df_umap_annot = df_umap_tb.merge(sanger, how = 'left', on = 'Rv_ID')

In [48]:
# Remove rows that are exact duplicates across all columns (first occurrence is kept).
df_umap_annot = df_umap_annot.drop_duplicates()

In [49]:
# Development helper: loads the third-party blackcellmagic extension (code formatting).
# NOTE(review): presumably only needed while editing; none of the visible analysis cells use it.
%load_ext blackcellmagic

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic


In [50]:
# Columns to keep in the annotated embedding table, in output order:
# gene identifiers, then per-gene descriptors/annotations, then the
# coordinate columns (presumably the UMAP embeddings -- confirm).
identifier_cols = ["Rv_ID", "Gene name", "Geneid/Contrast_id"]
descriptor_cols = [
    "is_sdr",
    "category",
    "function_sanger",
    "redox_enzyme",
    "function_redox_",
    "Function",
    "groups",
    "UK_score_4",
]
coordinate_cols = ["u1", "u2", "u3", "u1_tn", "u2_tn"]

cols = identifier_cols + descriptor_cols + coordinate_cols

In [51]:
# Restrict the annotated table to the chosen columns, in the order given by `cols`.
df_umap_annot = df_umap_annot.loc[:, cols]

In [52]:
# Inspect the first rows of the annotated embedding table after column selection.
df_umap_annot.head()

Unnamed: 0,Rv_ID,Gene name,Geneid/Contrast_id,is_sdr,category,function_sanger,redox_enzyme,function_redox_,Function,groups,UK_score_4,u1,u2,u3,u1_tn,u2_tn
0,Rv0001,dnaA,1.0,0.0,information pathways,chromosomal replication initiator protein,0.0,,Chromosomal replication initiator protein DnaA,0.0,0.0,0.931896,0.631473,0.299345,-4.858897,1.413937
1,Rv0002,dnaN,2.0,0.0,information pathways,"DNA polymerase III, [beta] subunit",0.0,,DNA polymerase III beta subunit (EC 2.7.7.7),0.0,0.0,1.695993,0.697901,1.37974,-6.833129,1.925143
2,Rv0003,recF,3.0,0.0,information pathways,DNA replication and SOS induction,0.0,,DNA recombination and repair protein RecF,0.0,0.0,2.145068,0.298384,1.759979,10.352838,-0.587461
3,Rv0004,Rv0004,4.0,0.0,conserved hypotheticals,conserved hypothetical protein,0.0,,"Zn-ribbon-containing, possibly RNA-binding pro...",0.0,0.0,2.293828,0.307575,0.833794,-6.510271,0.908762
4,Rv0005,gyrB,5.0,0.0,information pathways,DNA gyrase subunit B,0.0,,DNA gyrase subunit B (EC 5.99.1.3),0.0,0.0,0.941413,0.684316,0.302338,-7.03945,1.838185


For the integrated dataset, let's first merge the Sanger and TubercuList annotations with each other, so that the large integrated dataframe only has to be merged once. 

In [53]:
# Combine both annotation sources into one table; the outer join keeps
# every Rv_ID that appears in either source.
annot = pd.merge(sanger, tblist, how = 'outer', on = 'Rv_ID')

In [55]:
# Dimensions (rows, columns) of the combined Sanger + TubercuList annotation table.
annot.shape

(4008, 3)

In [56]:
# Add the annotation columns to the integrated dataset, keeping all of its rows.
# NOTE(review): a left merge repeats rows of `integrated` whenever `annot` has
# more than one row for an Rv_ID; unlike the embedding table, no
# drop_duplicates() is applied here -- confirm the row count matches `integrated`.
integrated_annot = integrated.merge(annot, how = 'left', on = 'Rv_ID')

In [57]:
# Dimensions (rows, columns) of the integrated dataset after adding the annotation columns.
integrated_annot.shape

(4056, 1173)

In [58]:
# Preview the merged table; the two annotation columns are appended at the far right.
integrated_annot.head()

Unnamed: 0,Rv_ID,Gene name,Geneid/Contrast_id,is_sdr,Function,groups,redox_enzyme,function_redox_,UK_score_4,1,...,zhang_Fe_1.5mM_vs_zhang_Fe_450uM,zhang_Trp_Rescue_vs_zhang_in_vitro_control_Rescue,zhang_Tyloxapol_pH_6.5_vs_zhang_Tyloxapol_pH_4.5,zhang_Tyloxapol_pH_6.5_vs_zhang_pcit_pH_4.5,zhang_mhcii_mouse_d10_vs_zhang_wt_mouse_d10,zhang_mhcii_mouse_d45_vs_zhang_wt_mouse_d45,zhang_wt_mouse_d10_vs_zhang_input_library,zhang_wt_mouse_d45_vs_zhang_input_library,function_sanger,category
0,Rv0001,dnaA,1.0,0.0,Chromosomal replication initiator protein DnaA,0.0,0.0,,0.0,-0.065922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,chromosomal replication initiator protein,information pathways
1,Rv0002,dnaN,2.0,0.0,DNA polymerase III beta subunit (EC 2.7.7.7),0.0,0.0,,0.0,0.638581,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"DNA polymerase III, [beta] subunit",information pathways
2,Rv0003,recF,3.0,0.0,DNA recombination and repair protein RecF,0.0,0.0,,0.0,0.059921,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DNA replication and SOS induction,information pathways
3,Rv0004,Rv0004,4.0,0.0,"Zn-ribbon-containing, possibly RNA-binding pro...",0.0,0.0,,0.0,0.041393,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,conserved hypothetical protein,conserved hypotheticals
4,Rv0005,gyrB,5.0,0.0,DNA gyrase subunit B (EC 5.99.1.3),0.0,0.0,,0.0,0.229135,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DNA gyrase subunit B,information pathways


In [60]:
# Show the first ten columns of two rows to locate the boundary (position 9)
# used by the column reordering below.
integrated_annot.iloc[:2, :10]

Unnamed: 0,Rv_ID,Gene name,Geneid/Contrast_id,is_sdr,Function,groups,redox_enzyme,function_redox_,UK_score_4,1
0,Rv0001,dnaA,1.0,0.0,Chromosomal replication initiator protein DnaA,0.0,0.0,,0.0,-0.065922
1,Rv0002,dnaN,2.0,0.0,DNA polymerase III beta subunit (EC 2.7.7.7),0.0,0.0,,0.0,0.638581


In [61]:
# Move the two annotation columns so they sit right after the leading
# metadata columns instead of at the far right of the table.
# Columns are selected by name rather than by position so that re-running
# this cell leaves the column order unchanged (a positional slice would
# shuffle a different pair of columns forward on every re-run).
annot_cols = ["function_sanger", "category"]
n_meta_cols = 9  # leading columns Rv_ID through UK_score_4, as shown in the preview above
other_cols = [col for col in integrated_annot.columns if col not in annot_cols]
integrated_annot = integrated_annot[
    other_cols[:n_meta_cols] + annot_cols + other_cols[n_meta_cols:]
]

### Export datasets. 

In [62]:
# Write the annotated table as version 1 of the integrated dataset,
# leaving the pandas index out of the file.
output_csv = '../data/integrated_rnaseq_tnseq_v1.csv'
integrated_annot.to_csv(output_csv, index = False)

(4056, 1173)