# LISH-Harvard MoA Kaggle dataset annotation

This notebook combines the original Kaggle LISH-MoA challenge data (https://www.kaggle.com/c/lish-moa) with the annotations released by LISH-Harvard after the competition (https://github.com/LISHarvard/moa_challenge) into a single annotated CSV file that includes cell names and gene names.

In [1]:
import pandas as pd

In [2]:
def _read_by_sig_id(path):
    """Read one CSV file with pandas and index the result by its `sig_id` column."""
    return pd.read_csv(path).set_index('sig_id')


# Kaggle challenge files: features (train/test), scored and non-scored targets,
# and the per-signature drug identifiers.
df_training_ft = _read_by_sig_id('dataset/kaggle/train_features.csv.zip')
df_test_ft = _read_by_sig_id('dataset/kaggle/test_features.csv.zip')
df_training_targets = _read_by_sig_id('dataset/kaggle/train_targets_scored.csv.zip')
df_training_nonscored = _read_by_sig_id('dataset/kaggle/train_targets_nonscored.csv.zip')
df_ids = _read_by_sig_id('dataset/kaggle/train_drug.csv.zip')

Load the true gene and cell annotations from https://github.com/LISHarvard/moa_challenge

In [3]:
# Annotation tables published by LISH-Harvard, read straight from the repository's
# master branch (so the content can change if the upstream files are updated).
df_cell_info, df_cell_mapping, df_genes = map(
    pd.read_csv,
    (
        'https://raw.githubusercontent.com/LISHarvard/moa_challenge/master/cell_info.csv',
        'https://raw.githubusercontent.com/LISHarvard/moa_challenge/master/cell_mapping.csv',
        'https://raw.githubusercontent.com/LISHarvard/moa_challenge/master/gene_mapping.csv',
    ),
)

In [4]:
# Left-join the cell metadata (indexed by `rid`) with the id mapping (indexed by `old`),
# then build a readable alias of the form '<new>-<ccle_name>', e.g. 'c-24-MIAPACA2_PANCREAS'.
df_cells = (
    df_cell_info
    .set_index('rid')
    .join(df_cell_mapping.set_index('old'))
    .assign(alias=lambda cells: cells['new'] + '-' + cells['ccle_name'])
)
df_cells

Unnamed: 0_level_0,ccle_name,new,alias
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c-100,MIAPACA2_PANCREAS,c-24,c-24-MIAPACA2_PANCREAS
c-126,42MGBA_CENTRAL_NERVOUS_SYSTEM,c-25,c-25-42MGBA_CENTRAL_NERVOUS_SYSTEM
c-127,A375_SKIN,c-26,c-26-A375_SKIN
c-128,FTC133_THYROID,c-27,c-27-FTC133_THYROID
c-129,HT1080_SOFT_TISSUE,c-28,c-28-HT1080_SOFT_TISSUE
...,...,...,...
c-95,DMS273_LUNG,c-19,c-19-DMS273_LUNG
c-96,G361_SKIN,c-20,c-20-G361_SKIN
c-97,OAW42_OVARY,c-21,c-21-OAW42_OVARY
c-98,HMC18_BREAST,c-22,c-22-HMC18_BREAST


Load gene metadata and symbols mapped with gProfiler

In [5]:
# Build the gene alias as 'g-<name>_<initial_alias>', e.g. 'g-AARS1_16'.
# `initial_alias` is cast to str so it can be concatenated with the symbol.
df_gene_mapping = pd.read_csv('dataset/gProfiler_gene_symbols.csv.zip').assign(
    alias=lambda genes: 'g-' + genes['name'] + '_' + genes['initial_alias'].astype(str)
)
df_gene_mapping

Unnamed: 0,initial_alias,converted_alias,name,description,namespace,alias
0,16,AARS1,AARS1,alanyl-tRNA synthetase 1 [Source:HGNC Symbol;A...,ENTREZGENE_ACC,g-AARS1_16
1,23,ABCF1,ABCF1,ATP binding cassette subfamily F member 1 [Sou...,ENTREZGENE_ACC,g-ABCF1_23
2,25,ABL1,ABL1,"ABL proto-oncogene 1, non-receptor tyrosine ki...",ENTREZGENE_ACC,g-ABL1_25
3,30,ACAA1,ACAA1,acetyl-CoA acyltransferase 1 [Source:HGNC Symb...,ENTREZGENE_ACC,g-ACAA1_30
4,47,ACLY,ACLY,ATP citrate lyase [Source:HGNC Symbol;Acc:HGNC...,ENTREZGENE_ACC,g-ACLY_47
...,...,...,...,...,...,...
769,116832,RPL39L,RPL39L,ribosomal protein L39 like [Source:HGNC Symbol...,ENTREZGENE_ACC,g-RPL39L_116832
770,148022,TICAM1,TICAM1,toll like receptor adaptor molecule 1 [Source:...,ENTREZGENE_ACC,g-TICAM1_148022
771,200734,SPRED2,SPRED2,sprouty related EVH1 domain containing 2 [Sour...,ENTREZGENE_ACC,g-SPRED2_200734
772,256364,EML3,EML3,EMAP like 3 [Source:HGNC Symbol;Acc:HGNC:26666],ENTREZGENE_ACC,g-EML3_256364


In [6]:
# Put training features, scored targets and non-scored targets side by side
# (column-wise concat aligns the frames on their index).
training_parts = [df_training_ft, df_training_targets, df_training_nonscored]
df_tr = pd.concat(training_parts, axis='columns')
# Flag every row of this frame as training data; the flag becomes the first column.
df_tr.insert(loc=0, column='training', value=True)
df_tr

Unnamed: 0_level_0,training,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,ve-cadherin_antagonist,vesicular_monoamine_transporter_inhibitor,vitamin_k_antagonist,voltage-gated_calcium_channel_ligand,voltage-gated_potassium_channel_activator,voltage-gated_sodium_channel_blocker,wdr5_mll_interaction_inhibitor,wnt_agonist,xanthine_oxidase_inhibitor,xiap_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,True,trt_cp,24,D1,1.0620,0.5577,-0.2479,-0.6208,-0.1944,-1.0120,...,0,0,0,0,0,0,0,0,0,0
id_000779bfc,True,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.0190,0.5207,...,0,0,0,0,0,0,0,0,0,0
id_000a6266a,True,trt_cp,48,D1,0.6280,0.5817,1.5540,-0.0764,-0.0323,1.2390,...,0,0,0,0,0,0,0,0,0,0
id_0015fd391,True,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.0620,-0.8095,...,0,0,0,0,0,0,0,0,0,0
id_001626bd3,True,trt_cp,72,D2,-0.3254,-0.4009,0.9700,0.6919,1.4180,-0.8244,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
id_fffb1ceed,True,trt_cp,24,D2,0.1394,-0.0636,-0.1112,-0.5080,-0.4713,0.7201,...,0,0,0,0,0,0,0,0,0,0
id_fffb70c0c,True,trt_cp,24,D2,-1.3260,0.3478,-0.3743,0.9905,-0.7178,0.6621,...,0,0,0,0,0,0,0,0,0,0
id_fffc1c3f4,True,ctl_vehicle,48,D2,0.3942,0.3756,0.3109,-0.7389,0.5505,-0.0159,...,0,0,0,0,0,0,0,0,0,0
id_fffcb9e7c,True,trt_cp,24,D1,0.6660,0.2324,0.4392,0.2044,0.8531,-0.0343,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Work on a copy so the loaded test features are not modified by the in-place insert.
df_t = df_test_ft.copy(deep=True)
# Flag every row of this frame as non-training data; the flag becomes the first column.
df_t.insert(loc=0, column='training', value=False)
df_t

Unnamed: 0_level_0,training,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_0004d9e33,False,trt_cp,24,D1,-0.5458,0.1306,-0.5135,0.4408,1.5500,-0.1644,...,0.0981,0.7978,-0.1430,-0.2067,-0.2303,-0.1193,0.0210,-0.0502,0.1510,-0.7750
id_001897cda,False,trt_cp,72,D1,-0.1829,0.2320,1.2080,-0.4522,-0.3652,-0.3319,...,-0.1190,-0.1852,-1.0310,-1.3670,-0.3690,-0.5382,0.0359,-0.4764,-1.3810,-0.7300
id_002429b5b,False,ctl_vehicle,24,D1,0.1852,-0.1404,-0.3911,0.1310,-1.4380,0.2455,...,-0.2261,0.3370,-1.3840,0.8604,-1.9530,-1.0140,0.8662,1.0160,0.4924,-0.1942
id_00276f245,False,trt_cp,24,D2,0.4828,0.1955,0.3825,0.4244,-0.5855,-1.2020,...,0.1260,0.1570,-0.1784,-1.1200,-0.4325,-0.9005,0.8131,-0.1305,0.5645,-0.5809
id_0027f1083,False,trt_cp,48,D1,-0.3979,-1.2680,1.9130,0.2057,-0.5864,-0.0166,...,0.4965,0.7578,-0.1580,1.0510,0.5742,1.0900,-0.2962,-0.5313,0.9931,1.8380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
id_ff7004b87,False,trt_cp,24,D1,0.4571,-0.5743,3.3930,-0.6202,0.8557,1.6240,...,-1.1790,-0.6422,-0.4367,0.0159,-0.6539,-0.4791,-1.2680,-1.1280,-0.4167,-0.6600
id_ff925dd0d,False,trt_cp,24,D1,-0.5885,-0.2548,2.5850,0.3456,0.4401,0.3107,...,0.0210,0.5780,-0.5888,0.8057,0.9312,1.2730,0.2614,-0.2790,-0.0131,-0.0934
id_ffb710450,False,trt_cp,72,D1,-0.3985,-0.1554,0.2677,-0.6813,0.0152,0.4791,...,0.4418,0.9153,-0.1862,0.4049,0.9568,0.4666,0.0461,0.5888,-0.4205,-0.1504
id_ffbb869f2,False,trt_cp,48,D2,-1.0960,-1.7750,-0.3977,1.0160,-1.3350,-0.2207,...,0.3079,-0.4473,-0.8192,0.7785,0.3133,0.1286,-0.2618,0.5074,0.7430,-0.0484


In [8]:
# Stack training rows on top of test rows, then attach `df_ids` with an index-based
# left join. Columns that exist only for training rows are NaN for the test rows.
df_dataset = (
    pd.concat([df_tr, df_t], axis='index')
    .join(df_ids, how='left')
)
df_dataset

Unnamed: 0_level_0,training,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,vesicular_monoamine_transporter_inhibitor,vitamin_k_antagonist,voltage-gated_calcium_channel_ligand,voltage-gated_potassium_channel_activator,voltage-gated_sodium_channel_blocker,wdr5_mll_interaction_inhibitor,wnt_agonist,xanthine_oxidase_inhibitor,xiap_inhibitor,drug_id
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,True,trt_cp,24,D1,1.0620,0.5577,-0.2479,-0.6208,-0.1944,-1.0120,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b68db1d53
id_000779bfc,True,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.0190,0.5207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,df89a8e5a
id_000a6266a,True,trt_cp,48,D1,0.6280,0.5817,1.5540,-0.0764,-0.0323,1.2390,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18bb41b2c
id_0015fd391,True,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.0620,-0.8095,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8c7f86626
id_001626bd3,True,trt_cp,72,D2,-0.3254,-0.4009,0.9700,0.6919,1.4180,-0.8244,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7cbed3131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
id_ff7004b87,False,trt_cp,24,D1,0.4571,-0.5743,3.3930,-0.6202,0.8557,1.6240,...,,,,,,,,,,
id_ff925dd0d,False,trt_cp,24,D1,-0.5885,-0.2548,2.5850,0.3456,0.4401,0.3107,...,,,,,,,,,,
id_ffb710450,False,trt_cp,72,D1,-0.3985,-0.1554,0.2677,-0.6813,0.0152,0.4791,...,,,,,,,,,,
id_ffbb869f2,False,trt_cp,48,D2,-1.0960,-1.7750,-0.3977,1.0160,-1.3350,-0.2207,...,,,,,,,,,,


In [9]:
# Move the last column (drug_id) to position 0, then add gene and cell names.
n_columns = len(df_dataset.columns)
last_column_first = [n_columns - 1, *range(n_columns - 1)]

# The renames are applied in sequence because each one works on the labels
# produced by the previous step:
#   1. second column of df_genes -> first column of df_genes
#   2. gProfiler `initial_alias` -> gene `alias`
#   3. cell id `new`             -> cell `alias`
genes_second_to_first = {second: first for first, second in df_genes.values}
gene_id_to_alias = dict(df_gene_mapping[['initial_alias', 'alias']].values)
cell_id_to_alias = dict(df_cells[['new', 'alias']].values)

df_dt = (
    df_dataset
    .iloc[:, last_column_first]
    .rename(columns=genes_second_to_first)
    .rename(columns=gene_id_to_alias)
    .rename(columns=cell_id_to_alias)
)
df_dt

Unnamed: 0_level_0,drug_id,training,cp_type,cp_time,cp_dose,g-AARS1_16,g-ABCF1_23,g-ABL1_25,g-ACAA1_30,g-ACLY_47,...,ve-cadherin_antagonist,vesicular_monoamine_transporter_inhibitor,vitamin_k_antagonist,voltage-gated_calcium_channel_ligand,voltage-gated_potassium_channel_activator,voltage-gated_sodium_channel_blocker,wdr5_mll_interaction_inhibitor,wnt_agonist,xanthine_oxidase_inhibitor,xiap_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,b68db1d53,True,trt_cp,24,D1,1.0620,0.5577,-0.2479,-0.6208,-0.1944,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_000779bfc,df89a8e5a,True,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.0190,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_000a6266a,18bb41b2c,True,trt_cp,48,D1,0.6280,0.5817,1.5540,-0.0764,-0.0323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_0015fd391,8c7f86626,True,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.0620,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_001626bd3,7cbed3131,True,trt_cp,72,D2,-0.3254,-0.4009,0.9700,0.6919,1.4180,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
id_ff7004b87,,False,trt_cp,24,D1,0.4571,-0.5743,3.3930,-0.6202,0.8557,...,,,,,,,,,,
id_ff925dd0d,,False,trt_cp,24,D1,-0.5885,-0.2548,2.5850,0.3456,0.4401,...,,,,,,,,,,
id_ffb710450,,False,trt_cp,72,D1,-0.3985,-0.1554,0.2677,-0.6813,0.0152,...,,,,,,,,,,
id_ffbb869f2,,False,trt_cp,48,D2,-1.0960,-1.7750,-0.3977,1.0160,-1.3350,...,,,,,,,,,,


In [11]:
# Write the annotated dataset as a zip-compressed CSV.
# `archive_name` fixes the name of the CSV stored inside the archive; with a bare
# compression='zip' the member name is left to pandas and, depending on the
# version, may end up being the archive's own name ('lish_moa_annotated.csv.zip').
df_dt.to_csv(
    'lish_moa_annotated.csv.zip',
    compression={'method': 'zip', 'archive_name': 'lish_moa_annotated.csv'},
)