In [1]:
import pandas as pd
import numpy as np

In [2]:
from tqdm.notebook import tqdm

In [3]:
import os
# Walk up from the kernel's start directory until the project root is the
# working directory, so the relative data paths used below resolve.
project_root = '/home/jupyter/crisp'
while os.getcwd() != project_root:
    previous_dir = os.getcwd()
    os.chdir("..")
    # At the filesystem root, chdir("..") is a no-op. Without this check the
    # loop would spin forever when the kernel starts outside the project tree.
    if os.getcwd() == previous_dir:
        raise FileNotFoundError(
            "Reached the filesystem root without finding " + project_root
        )
%pwd

'/home/jupyter/crisp'

In [4]:
# Build probe-ID -> lower-cased gene-symbol lookups for both mapping files.
# dict(zip(...)) keeps the last symbol seen when a probe ID is repeated.
mouse_mapping = pd.read_csv(
    'notebooks/heavy_ion_runs/GLDS148.ProbeID.GeneSymbol.Mapping.txt',
    sep='\t',
)
mouse_symbols = mouse_mapping['GeneSymbol'].str.lower()
mouse_probe_dict = dict(zip(mouse_mapping['ProbeID'], mouse_symbols))

human_mapping = pd.read_csv(
    'notebooks/heavy_ion_runs/GLDS73.ProbeID.GeneSymbol.Mapping.txt',
    sep='\t',
)
human_symbols = human_mapping['GeneSymbol'].str.lower()
human_probe_dict = dict(zip(human_mapping['ProbeID'], human_symbols))

In [5]:
# Load every per-sample table in the human raw-data directory and stack them
# into one frame: one row per sample, one column per probe / gene symbol.
human_raw_dir = 'data/cosmic_rads/human cells/re_upload_raw_data/'
sample_table_suffix = '_sample_table.txt'

# Match on the full suffix instead of the substring '.txt', so the sample name
# derived below is always exactly the file name minus that suffix.
human_files = [f for f in os.listdir(human_raw_dir) if f.endswith(sample_table_suffix)]

# Each table is transposed and its first row (the identifier column of the
# file) is dropped, leaving the remaining column(s) as row(s).
human_frame = pd.concat([pd.read_csv(human_raw_dir + f, sep='\t').T.iloc[1:] for f in tqdm(human_files)])

# Column labels come from the identifier column of one reference table.
# NOTE(review): assumes every table lists its rows in the same order as this
# reference file -- nothing here verifies it; TODO confirm.
human_frame.columns = pd.read_csv(human_raw_dir + 'GSM1082165_sample_table.txt', sep='\t').T.iloc[0]

# Index rows by sample name (file name without the table suffix).
human_frame['Source Name'] = [f[:-len(sample_table_suffix)] for f in human_files]
human_frame.set_index('Source Name', inplace=True)

# Translate probe IDs to gene symbols; unmapped labels are kept unchanged.
human_frame.columns = [human_probe_dict.get(x, x) for x in human_frame.columns]

  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/101 [00:00<?, ?it/s]

In [6]:
# Checkpoint the assembled human frame to disk, then reload it from that file.
human_frame_pickle = 'data/cosmic_rads/human cells/re_upload_raw_data/complete_human_frame.pkl'
human_frame.to_pickle(human_frame_pickle)
human_frame = pd.read_pickle(human_frame_pickle)

In [7]:
# Read the human sample metadata and derive the label columns used later.
human_metadata = pd.read_csv(
    'data/cosmic_rads/human cells/human_metadata.txt',
    sep='\t',
)

# Drop the last two characters of each source name.
# NOTE(review): presumably this strips a suffix so the names match the
# expression-table file names -- confirm against the data.
human_metadata['Source Name'] = human_metadata['Source Name'].str.slice(stop=-2)

# Binary label: 1.0 when the dose exceeds 0.01, otherwise 0.0.
dose_values = tqdm(human_metadata['Characteristics [dose gray]'])
human_metadata["irradiated"] = [float(dose > 0.01) for dose in dose_values]

# Environment label: organism and cell line concatenated with no separator.
organism = human_metadata['Characteristics[organism]']
cell_line = human_metadata['Characteristics [cell line]']
human_metadata['environment'] = organism + cell_line

  0%|          | 0/134 [00:00<?, ?it/s]

In [8]:
# FIXME: there was an error with human_frame here - we only got a quarter of the data. This needs a re-run!

In [9]:
# Load every per-sample table in the mouse raw-data directory and stack them
# into one frame: one row per sample, one column per probe / gene symbol.
mouse_raw_dir = 'data/cosmic_rads/mice_lungs/raw_data/'
sample_table_suffix = '_sample_table.txt'

# Match on the full suffix instead of the substring '.txt', so the sample name
# derived below is always exactly the file name minus that suffix.
mouse_files = [f for f in os.listdir(mouse_raw_dir) if f.endswith(sample_table_suffix)]

# Each table is transposed and its first row (the identifier column of the
# file) is dropped, leaving the remaining column(s) as row(s).
mouse_frame = pd.concat([pd.read_csv(mouse_raw_dir + f, sep='\t').T.iloc[1:] for f in tqdm(mouse_files)])

# Column labels come from the identifier column of one reference table.
# NOTE(review): assumes every table lists its rows in the same order as this
# reference file -- nothing here verifies it; TODO confirm.
mouse_frame.columns = pd.read_csv(mouse_raw_dir + 'GSM1035974_sample_table.txt', sep='\t').T.iloc[0]

# Index rows by sample name (file name without the table suffix).
mouse_frame['Source Name'] = [f[:-len(sample_table_suffix)] for f in mouse_files]
mouse_frame.set_index('Source Name', inplace=True)

# Translate probe IDs to gene symbols; unmapped labels are kept unchanged.
mouse_frame.columns = [mouse_probe_dict.get(x, x) for x in mouse_frame.columns]

  0%|          | 0/41 [00:00<?, ?it/s]

In [10]:
# Checkpoint the assembled mouse frame to disk, then reload it from that file.
# NOTE(review): the path sits under the 'human cells' directory; left as-is
# because other code may already read it from there -- confirm it is intended.
mouse_frame_pickle = 'data/cosmic_rads/human cells/re_upload_raw_data/complete_mouse_frame.pkl'
mouse_frame.to_pickle(mouse_frame_pickle)
mouse_frame = pd.read_pickle(mouse_frame_pickle)

In [11]:
# Read the mouse sample metadata and derive the same label columns that the
# human metadata carries.
mouse_metadata = pd.read_csv(
    'data/cosmic_rads/mice_lungs/mouse_metadata.txt',
    sep='\t',
)

# Drop the last two characters of each source name.
# NOTE(review): presumably this strips a suffix so the names match the
# expression-table file names -- confirm against the data.
mouse_metadata['Source Name'] = mouse_metadata['Source Name'].str.slice(stop=-2)

# No dose column is read for the mice: every sample that is not labelled
# "Unirradiated Control" gets a placeholder dose of 1.0 and radiation type 'Fe'.
sample_sources = mouse_metadata['Comment [Sample_source_name]']
mouse_metadata['Characteristics [dose gray]'] = [
    float(source != "Unirradiated Control") for source in sample_sources
]
mouse_metadata['Factor Value[radiation type]'] = [
    'Fe' if source != "Unirradiated Control" else np.nan
    for source in mouse_metadata['Comment [Sample_source_name]']
]

# Binary label: 1.0 when the dose exceeds 0.01, otherwise 0.0.
mouse_metadata["irradiated"] = [
    float(dose > 0.01) for dose in mouse_metadata['Characteristics [dose gray]']
]

# Environment label: organism and sex concatenated with no separator.
organism = mouse_metadata['Characteristics[organism]']
sex = mouse_metadata['Characteristics [Sex]']
mouse_metadata['environment'] = organism + sex

In [12]:
# Work out which genes are measured in both experiments, expressed as human
# gene names, using the mouse <-> human gene mapping table.
human_mouse_gene_map = pd.read_csv('data/mouse_human_mapping.txt', sep='\t')

mapped_human_names = human_mouse_gene_map['Gene name'].str.lower()
mapped_mouse_names = human_mouse_gene_map['Mouse gene name'].str.lower()

# Gene names known to the mapping table, per species.
db_mouse_genes = set(mapped_mouse_names)
db_human_genes = set(mapped_human_names)

# Gene names present in each experiment that the mapping table also knows.
experiment_human_genes = set(human_frame.columns)
experiment_mouse_genes = set(mouse_frame.columns)
human_subset = experiment_human_genes & db_human_genes
mouse_subset = experiment_mouse_genes & db_mouse_genes

# Name translation in both directions. When a name appears on several rows of
# the mapping table, only the last pairing survives in the dict.
human_to_mouse_dic = dict(zip(mapped_human_names, mapped_mouse_names))
mouse_to_human_dic = dict(zip(mapped_mouse_names, mapped_human_names))

# Mouse genes from the experiment, renamed to their human counterparts.
humanised_mouse_genes = {mouse_to_human_dic[gene] for gene in mouse_subset}

# Genes available in both experiments, under both naming schemes.
human_genes_to_use = list(humanised_mouse_genes & human_subset)
mouse_genes_to_use = [human_to_mouse_dic[gene] for gene in human_genes_to_use]

In [13]:
# Keep only the shared genes in the human frame.
human_frame = human_frame[human_genes_to_use]

# Rename mouse columns to their human gene names (labels without a mapping are
# left untouched), then keep the same shared genes in the same order.
# NOTE: this relabels mouse_frame in place, so the cell is not idempotent.
humanised_labels = [mouse_to_human_dic.get(label, label) for label in mouse_frame.columns]
mouse_frame.columns = humanised_labels
mouse_frame = mouse_frame[human_genes_to_use]

In [14]:
# Attach the mouse sample metadata to the expression rows, matching the
# metadata 'Source Name' column against the expression frame's index.
m_merged_data = pd.merge(mouse_metadata, mouse_frame, left_on="Source Name", right_index=True)

# Keep only the first occurrence of every column label. Masking the columns
# directly replaces the previous double transpose, which copied the whole
# frame twice and turned every column into object dtype along the way.
m_merged_data = m_merged_data.loc[:, ~m_merged_data.columns.duplicated(keep='first')]

In [15]:
# Attach the human sample metadata to the expression rows, matching the
# metadata 'Source Name' column against the expression frame's index.
h_merged_data = pd.merge(human_metadata, human_frame, left_on='Source Name', right_index=True)

# Discard the x-ray samples.
h_merged_data = h_merged_data[h_merged_data['Factor Value[radiation type]'] != 'x-ray']

# Keep only the first occurrence of every column label. Masking the columns
# directly replaces the previous double transpose, which copied the whole
# frame twice and turned every column into object dtype along the way.
h_merged_data = h_merged_data.loc[:, ~h_merged_data.columns.duplicated(keep='first')]

In [16]:
# NOTE: the merged frames may contain duplicate columns at this point; once they are removed, the system should work.

In [17]:
# Stack the human rows on top of the mouse rows. pd.concat is the supported
# replacement for DataFrame.append, which was deprecated and later removed
# from pandas; columns present in only one frame are filled with NaN.
h_m_data = pd.concat([h_merged_data, m_merged_data])

In [18]:
# Persist the combined human + mouse frame (metadata columns still included).
combined_pickle_path = 'data/cosmic_rads/docker_exps/all_human_mice_data_combined.pkl'
h_m_data.to_pickle(combined_pickle_path)

In [19]:
# TODO: talk through with Lauren - what should we do about the fact that we're dropping huge numbers of duplicates?
# The human data has duplicates too, with the well!

In [20]:
# This won't work until both frames use equivalent genes!
# m_h_data = m_merged_data.append(h_merged_data)

In [21]:
# Hold on to the environment labels so they can be re-attached after the
# metadata columns are sliced away from h_m_data.
environment = h_m_data.loc[:, 'environment']

In [22]:
# Hold on to the irradiated flag; it is re-attached later as the 'target' column.
target = h_m_data.loc[:, 'irradiated']

In [23]:
# Trim the frame to a positional slice of its columns, but only when it still
# has its full, untrimmed width, so re-running the cell does not trim twice.
# NOTE(review): 15374, 43 and 4 are hard-coded; presumably the first 43 and
# last 4 columns are metadata and the rest are genes -- confirm. If the width
# ever differs, the slice is silently skipped.
column_count = h_m_data.shape[1]
if column_count == 15374:
    kept_columns = h_m_data.columns[43:-4]
    h_m_data = h_m_data[kept_columns]

In [24]:
# Re-attach the previously captured label columns by position (as plain
# lists, so no index alignment takes place).
for label_name, label_values in (('environment', environment), ('target', target)):
    h_m_data[label_name] = list(label_values)

In [25]:
# Overwrite the earlier pickle with the final frame, which now carries the
# 'environment' and 'target' columns.
combined_pickle_path = 'data/cosmic_rads/docker_exps/all_human_mice_data_combined.pkl'
h_m_data.to_pickle(combined_pickle_path)

In [26]:
# Display the final combined frame as the cell output.
h_m_data

Unnamed: 0,galnt13,spart,usp12,ttc9b,rps25,lrrc14b,znf777,lypla2,cntrob,itih3,...,traf5,serinc1,gpx6,tspan5,adgrl3,ube2g1,smarcal1,s1pr3,environment,target
0,4.46424,5.44465,6.03254,4.00528,14.1401,3.47801,6.93685,7.25607,5.69813,4.03223,...,4.63137,8.83469,3.43998,10.1032,3.37714,8.16892,8.06357,7.02017,Homo sapiensHBEC3KT,0.0
1,4.59663,6.24428,4.80986,3.93824,14.0324,3.44717,7.29125,6.72747,5.93035,3.15744,...,4.46322,9.23907,3.48334,9.35297,3.26959,8.7853,8.84771,5.35438,Homo sapiensHBEC3KT,1.0
2,5.49855,6.73741,4.68409,3.9535,13.7697,3.8469,6.92817,6.78872,5.84161,3.51335,...,3.97759,9.01184,3.56301,10.0574,3.50771,8.82566,8.87777,7.72982,Homo sapiensHBEC3KT,0.0
3,3.97014,5.35934,6.3925,4.07191,14.0186,3.31506,7.82133,6.64026,5.35815,3.61417,...,4.22047,9.03961,3.4445,9.90618,3.67186,8.52964,7.7856,7.21256,Homo sapiensHBEC3KT,0.0
4,5.22817,6.28206,7.09742,3.67017,13.66,3.50986,7.97077,7.18961,5.05405,3.43657,...,3.6634,9.53451,3.94549,10.0767,3.38848,9.5753,8.7571,7.82985,Homo sapiensHBEC3KT,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,3.81,8.44,3.09,2.57,14.46,5.49,4.12,12.88,3.02,4.27,...,2.93,11.47,3.52,3.59,6.26,9.6,8.18,3.06,Mus musculusmale,1.0
38,3.31,8.31,3.69,2.96,14.28,5.22,3.54,12.9,2.99,3.7,...,3.87,11.11,3.83,3.56,6.22,9.26,8.32,2.59,Mus musculusfemale,1.0
39,3.62,8.56,3.73,3.94,14.26,6.02,3.84,12.88,3.57,3.36,...,3.24,11.29,3.96,4.47,5.83,9.35,8.32,2.52,Mus musculusmale,1.0
40,4.0,8.7,3.89,3.76,14.32,5.08,4.56,12.67,2.64,3.59,...,2.97,11.73,3.62,3.4,6.33,9.64,8.15,2.84,Mus musculusmale,1.0
