In [30]:
import os
import json
import pandas as pd
import numpy as np
# Step up one directory unless the kernel is already sitting in the project
# root, so the `utils` import and the relative `data/` path below resolve.
# NOTE(review): the root path is hard-coded for one machine; on any other
# checkout this always moves up exactly one level - confirm that is intended.
if not os.getcwd() == '/home/jupyter/crisp':
    os.chdir("..")
from utils.gcp_helpers import get_dataframe_from_bucket
# Echo the current working directory so the effect of the chdir guard above is visible.
%pwd

'/home/jupyter/crisp'

In [31]:
# Human combined expression table (tab-separated) pulled from cloud storage.
# NOTE(review): the two "ah21_data" positionals are presumably bucket/project
# identifiers - confirm against utils.gcp_helpers.get_dataframe_from_bucket.
human_data = get_dataframe_from_bucket(
    "Radiation/human/combined/human.combined.expression.071321.tsv",
    "ah21_data",
    "ah21_data",
    sep="\t",
)

In [32]:
# Per-sample metadata (comma-separated) for the human combined dataset.
# NOTE(review): the two "ah21_data" positionals are presumably bucket/project
# identifiers - confirm against utils.gcp_helpers.get_dataframe_from_bucket.
human_metadata = get_dataframe_from_bucket(
    "Radiation/human/combined/human.metadata.combined.071321.csv",
    "ah21_data",
    "ah21_data",
    sep=",",
)

In [33]:
# Mouse combined expression table (tab-separated) pulled from cloud storage.
# NOTE(review): the two "ah21_data" positionals are presumably bucket/project
# identifiers - confirm against utils.gcp_helpers.get_dataframe_from_bucket.
mouse_data = get_dataframe_from_bucket(
    "Radiation/mouse/combined/mouse.combined.expression.071121.tsv",
    "ah21_data",
    "ah21_data",
    sep="\t",
)

In [34]:
# https://www.ncbi.nlm.nih.gov/homologene
# there are a couple of R packages dedicated to mapping between RefSeq, Ensembl and HomoloGene identifiers
# you can ping frank on R if you need it!

In [6]:
# Local tab-separated mouse/human gene mapping table; the cells below read its
# 'Gene name' and 'Mouse gene name' columns.
human_mouse_gene_map = pd.read_csv(
    'data/mouse_human_mapping.txt',
    sep='\t',
)

In [7]:
# Lower-cased mouse gene names listed in the mouse/human mapping table.
# Rows without a mouse gene name are dropped first: a missing value passes
# through `.str.lower()` as NaN and would otherwise become a bogus member of
# the set, inflating its size and potentially leaking into later intersections.
db_mouse_genes = set(human_mouse_gene_map['Mouse gene name'].dropna().str.lower())

In [8]:
# Lower-cased human gene names listed in the mouse/human mapping table.
# Rows without a human gene name are dropped first: a missing value passes
# through `.str.lower()` as NaN and would otherwise become a bogus member of
# the set, inflating its size and potentially leaking into later intersections.
db_human_genes = set(human_mouse_gene_map['Gene name'].dropna().str.lower())

In [9]:
# Case-insensitive set of gene names present in the human expression table.
exp_human_genes = set(human_data["GeneName"].str.lower())

In [10]:
# Case-insensitive set of gene names present in the mouse expression table.
exp_mouse_genes = set(mouse_data["GeneName"].str.lower())

In [11]:
# Number of distinct lower-cased gene names in the human expression data.
len(exp_human_genes)

23818

In [12]:
# Number of distinct lower-cased mouse gene names in the mapping table.
len(db_mouse_genes)

19614

In [13]:
# Number of distinct lower-cased human gene names in the mapping table.
len(db_human_genes)

18830

In [14]:
# Human gene names that appear both in the expression data and in the mapping table.
human_sub = exp_human_genes & db_human_genes

In [15]:
# Mouse gene names that appear both in the expression data and in the mapping table.
mouse_sub = exp_mouse_genes & db_mouse_genes

In [16]:
# How many lower-cased gene names occur in both the mouse and the human subset.
# NOTE(review): this matches genes by identical symbol only; the human<->mouse
# pairing stored in human_mouse_gene_map is not used here, so pairs whose
# symbols differ between species are not counted - confirm that is intended.
len(mouse_sub & human_sub)

10493

In [19]:
# Work on the human metadata under a generic name for the checks below.
# This binds a second name to the same DataFrame; it is not a copy.
metadata = human_metadata

In [29]:
# Display the metadata table (the rendered output elides the middle rows).
metadata

Unnamed: 0,sample,organism,source,gender,age_years,timepoint_hr,dose_Gy,radiation,dataset,irradiated
0,GSM226029,Homo sapiens,Peripheral Blood,female,23,24,0.0,Control,GLDS152,No
1,GSM226014,Homo sapiens,Peripheral Blood,male,47,24,0.0,Control,GLDS152,No
2,GSM225984,Homo sapiens,Peripheral Blood,male,39,6,0.0,Control,GLDS152,No
3,GSM226024,Homo sapiens,Peripheral Blood,female,46,24,0.0,Control,GLDS152,No
4,GSM225989,Homo sapiens,Peripheral Blood,female,20,6,0.0,Control,GLDS152,No
...,...,...,...,...,...,...,...,...,...,...
120,GSM1080653,Homo sapiens,Peripheral Blood,male,25,48,8.0,gamma,GLDS157,Yes
121,GSM1080603,Homo sapiens,Peripheral Blood,male,53,6,8.0,gamma,GLDS157,Yes
122,GSM1080593,Homo sapiens,Peripheral Blood,male,39,6,8.0,gamma,GLDS157,Yes
123,GSM1080628,Homo sapiens,Peripheral Blood,male,38,24,8.0,gamma,GLDS157,Yes


In [20]:
# Column index of the metadata table, plus the distinct dataset identifiers and
# irradiation states, used by the summary cells that follow.
meta_columns = metadata.columns
datasets = list(metadata["dataset"].unique())
radstates = list(metadata["irradiated"].unique())

In [24]:
# Missing-value audit of the metadata table.
# The previous loop reassigned `check_nan` on every column and never looked at
# it, so only the last column's result survived and nothing was displayed.
# Count NaNs per column instead and show the counts so the claim is checkable.
nan_counts = metadata.isnull().sum()
# Rows with at least one missing field, kept under the original variable name.
check_nan = metadata[metadata.isnull().any(axis=1)]
nan_counts
# the original run noted that no nan values were found in the human metadata;
# every count above should therefore be 0

In [21]:
# Tally of irradiated vs. non-irradiated samples, one column per dataset.
value_matrix = {
    name: metadata.loc[metadata["dataset"] == name, "irradiated"].value_counts()
    for name in datasets
}
pd.DataFrame(value_matrix)
# both datasets contain a mix of irradiated ("Yes") and non-irradiated ("No") samples

Unnamed: 0,GLDS152,GLDS157
Yes,40,60
No,10,15


In [25]:
# Tally of samples per post-exposure timepoint (hours), one column per dataset.
value_matrix = {
    name: metadata.loc[metadata["dataset"] == name, "timepoint_hr"].value_counts()
    for name in datasets
}
pd.DataFrame(value_matrix)
# both datasets have 6 h and 24 h samples; only GLDS157 also has a 48 h
# timepoint (shown as NaN for GLDS152)

Unnamed: 0,GLDS152,GLDS157
6,25.0,25
24,25.0,25
48,,25


In [26]:
# Tally of samples per gender, one column per dataset.
value_matrix = {
    name: metadata.loc[metadata["dataset"] == name, "gender"].value_counts()
    for name in datasets
}
pd.DataFrame(value_matrix)
# both datasets include female and male samples

Unnamed: 0,GLDS152,GLDS157
female,25,40
male,25,35


In [27]:
# Tally of samples per radiation dose (Gy), one column per dataset.
value_matrix = {
    name: metadata.loc[metadata["dataset"] == name, "dose_Gy"].value_counts()
    for name in datasets
}
pd.DataFrame(value_matrix)
# both datasets cover the same five dose levels (0, 0.5, 2, 5 and 8 Gy)

Unnamed: 0,GLDS152,GLDS157
0.0,10,15
2.0,10,15
5.0,10,15
0.5,10,15
8.0,10,15


In [None]:
# just on the off chance, look at https://genelab-data.ndc.nasa.gov/genelab/accession/GLDS-73/
# because the radiation there looks like cosmic rays
# there are some mouse lung studies here https://genelab-data.ndc.nasa.gov/genelab/accession/GLDS-148/
# these mimic it, so they can be used. However - not sure if the mouse data uses Iron unlike the human data? NOPE! They both use Iron!

In [28]:
# Tally of samples per radiation type, one column per dataset.
# The unnormalised labels produced two separate "gamma" rows (one per
# dataset), so the strings differ invisibly between datasets - most likely
# stray whitespace. Strip the labels before counting so they collapse into one
# row. If the rows still split, inspect repr() of metadata.radiation.unique().
value_matrix = {
    name: metadata.loc[metadata["dataset"] == name, "radiation"].str.strip().value_counts()
    for name in datasets
}
pd.DataFrame(value_matrix)
# both datasets consist of unirradiated controls plus gamma-irradiated samples

Unnamed: 0,GLDS152,GLDS157
Control,10.0,15.0
gamma,40.0,
gamma,,60.0
