In [1]:
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Goal: build a dataset that is compatible with the R DESeq2 toolkit

Steps

1. read the necessary columns and cell types (row names) from the external files provided by Jimmy

2. apply scaling corrections to dataset

3. drop non-tumor cells

4. transpose data so genes are rows and patient/cell are columns

5. generate unique patient/cell column names

6. generate metadata table


## Outputs
    * metadata_df -> 'data/colData_deseq.csv' (metadata describing the columns of the count matrix, i.e. the DESeq2 colData)
   
    * df_raw_calc_tumor_T -> 'data/DE_data_tumor_TP10k.csv' (DESeq2 count data)

### build de dataset

In [3]:
# Row labels of X: one cell-type name per line (no header), lower-cased.
rows = pd.read_csv('data/rows for X.csv', header=None)
cell_type = [label.lower() for label in rows[0]]

# Names of all columns of X, and the subset of genes wanted for DE.
headers = pd.read_csv('data/columns for X.csv', header=None)
column_names = list(headers[0])
genes_to_keep = list(pd.read_csv('data/Gene list for DE.csv', header=None)[0])

# X.csv is read without a header row: every column is named from
# `column_names`, but only the DE genes are actually loaded.
df_raw = pd.read_csv('data/X.csv', names=column_names, usecols=genes_to_keep)

# Attach the cell-type label to each row (matched by position).
df_raw['cell_type'] = cell_type
df_raw.head()

Unnamed: 0,HTR1D,GLUL,GAD1,HTR2B,SLC1A3,PAM,GABRP,DDC,ACHE,SLC38A5,...,SLC6A12,SLC38A1,SLC38A2,TAC3,ALDH2,CHRNA5,ABAT,ACE,GRIN2D,cell_type
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tumor
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tumor
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.208186,0.0,0.0,0.0,0.0,0.0,0.0,fibroblast
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.274803,2.274803,0.0,0.0,0.0,0.0,0.0,0.0,tumor
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.431281,0.0,0.0,0.0,0.0,0.0,0.0,endothelial


## join patient

In [5]:
# Load the patient table; the file is read without a header row, so its
# columns are numbered from 0.
df_patient = pd.read_csv('data/patients.csv', header = None)

In [13]:
# `patient_list` was not defined in any cell of this notebook (it only existed
# as leftover kernel state), so a fresh "Restart & Run All" failed here with a
# NameError. Rebuild it from the patient table loaded above.
# NOTE(review): assumes column 0 of data/patients.csv holds one patient id per
# row of X, in the same row order - confirm against the file.
patient_list = list(df_patient[0])
assert len(patient_list) == len(df_raw), (
    'patients.csv has {} rows but the expression table has {}'.format(
        len(patient_list), len(df_raw)))

# Label every row with its patient and use that label as the index.
df_raw['patient_id'] = patient_list
df_raw = df_raw.set_index('patient_id')
df_raw.head()

Unnamed: 0_level_0,HTR1D,GLUL,GAD1,HTR2B,SLC1A3,PAM,GABRP,DDC,ACHE,SLC38A5,...,SLC6A12,SLC38A1,SLC38A2,TAC3,ALDH2,CHRNA5,ABAT,ACE,GRIN2D,cell_type
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tumor
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tumor
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.208186,0.0,0.0,0.0,0.0,0.0,0.0,fibroblast
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.274803,2.274803,0.0,0.0,0.0,0.0,0.0,0.0,tumor
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.431281,0.0,0.0,0.0,0.0,0.0,0.0,endothelial


### calculate de dataset

In [14]:
# Row-wise multipliers used by the rescaling step: first column of the
# header-less UMI file, each value cast to int.
df_UMI = pd.read_csv('data/normalization by UMI.csv', header=None)
UMI_multiplier = np.array(list(map(int, df_UMI[0])))

In [15]:
# Number of UMI multipliers read; compare with the row count of df_raw
# displayed in the next cell.
len(UMI_multiplier)

88031

In [16]:
# Shape of df_raw; its row count has to equal len(UMI_multiplier) because the
# multiplier is applied element-wise down each column in the next cell.
df_raw.shape

(88031, 23)

In [17]:
# Work on a copy so df_raw keeps the values as loaded.
df_raw_calc = df_raw.copy()

# For every gene column compute 2**x - 1 (the inverse of log2(x + 1)),
# multiply by the row's UMI multiplier, divide by 10000 and round to whole
# numbers. 'cell_type' is the only non-gene column and is left untouched.
# NOTE(review): presumably the stored values are log2(TP10k + 1) - confirm.
gene_columns = [name for name in df_raw.columns if name != 'cell_type']
for gene in gene_columns:
    unlogged = 2 ** df_raw[gene] - 1
    df_raw_calc[gene] = np.round(unlogged * UMI_multiplier / 10000)

In [40]:
# Keep only the tumor cells and drop the now-constant label column.
# Filter and drop are chained without inplace=True: an in-place drop on a
# boolean-mask selection raises SettingWithCopyWarning, because pandas cannot
# tell whether the selection is a view of df_raw_calc or a copy.
df_raw_calc_tumor = (
    df_raw_calc.loc[df_raw_calc['cell_type'] == 'tumor']
    .drop(columns=['cell_type'])
)

In [41]:
# Move 'patient_id' out of the index and back into a regular column.
df_raw_calc_tumor = df_raw_calc_tumor.reset_index()

In [42]:
# Preview the tumor-only table; 'patient_id' is a regular column again after
# the reset_index above.
df_raw_calc_tumor.head()

Unnamed: 0,patient_id,HTR1D,GLUL,GAD1,HTR2B,SLC1A3,PAM,GABRP,DDC,ACHE,...,GLUD1,SLC6A12,SLC38A1,SLC38A2,TAC3,ALDH2,CHRNA5,ABAT,ACE,GRIN2D
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
def apply_new_patient_id(df):
    """Return a copy of ``df`` whose 'patient_id' values are unique per row.

    The patient id is read from the first row of ``df`` and every row is
    relabelled 'patient_<id>_cell_<n>', with ``n`` counting rows from 1 in
    their current order. All other columns are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single patient; must contain a 'patient_id'
        column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the relabelled 'patient_id' column. The input is not
        modified (pandas does not support mutating the object handed to a
        ``groupby(...).apply`` callback). An empty input yields an empty copy.
    """
    if df.empty:
        # No first row to read the patient id from; nothing to relabel.
        return df.copy()

    patient_id = df['patient_id'].iloc[0]
    new_id_list = [
        'patient_{}_cell_{}'.format(patient_id, cell_number)
        for cell_number in range(1, len(df) + 1)
    ]
    # assign() builds a new frame, keeping the existing column order.
    return df.assign(patient_id=new_id_list)
    

In [43]:
# Give every tumor cell a unique 'patient_<id>_cell_<n>' label, numbering the
# cells of each patient from 1 in their current row order.
#
# This uses groupby().cumcount() rather than groupby().apply(): calling apply()
# on a frame that still contains the grouping column is deprecated in pandas
# 2.2, and in pandas 3 the grouping column is no longer passed to the callback,
# so a helper that reads df['patient_id'] fails. The stable sort reproduces the
# patient-by-patient row order that apply() returned.
df_raw_calc_tumor = df_raw_calc_tumor.sort_values('patient_id', kind='stable')
cell_number = df_raw_calc_tumor.groupby('patient_id').cumcount() + 1
df_raw_calc_tumor['patient_id'] = (
    'patient_' + df_raw_calc_tumor['patient_id'].astype(str)
    + '_cell_' + cell_number.astype(str)
)

In [45]:
# Use the unique per-cell label as the row index.
df_raw_calc_tumor = df_raw_calc_tumor.set_index('patient_id')

In [46]:
# Inspect the last rows to check the new 'patient_<id>_cell_<n>' index labels.
df_raw_calc_tumor.tail()

Unnamed: 0_level_0,HTR1D,GLUL,GAD1,HTR2B,SLC1A3,PAM,GABRP,DDC,ACHE,SLC38A5,...,GLUD1,SLC6A12,SLC38A1,SLC38A2,TAC3,ALDH2,CHRNA5,ABAT,ACE,GRIN2D
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
patient_14_cell_3321,0.0,0.0,1.0,0.0,1.0,6.0,21.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,3.0,1.0,0.0,0.0,0.0
patient_14_cell_3322,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
patient_14_cell_3323,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
patient_14_cell_3324,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,...,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
patient_14_cell_3325,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Flip the table so that genes are rows and cells are columns.
df_raw_calc_tumor_T = df_raw_calc_tumor.transpose()

In [49]:
# Write the gene-by-cell table; the index (gene names) is saved under the
# column header 'gene'.
df_raw_calc_tumor_T.to_csv('data/DE_data_tumor_TP10k.csv',index_label = 'gene')

In [50]:
# Column labels of the transposed table (the unique per-cell ids).
df_raw_calc_tumor_T.columns

Index(['patient_0_cell_1', 'patient_0_cell_2', 'patient_0_cell_3',
       'patient_0_cell_4', 'patient_0_cell_5', 'patient_0_cell_6',
       'patient_0_cell_7', 'patient_0_cell_8', 'patient_0_cell_9',
       'patient_0_cell_10',
       ...
       'patient_14_cell_3316', 'patient_14_cell_3317', 'patient_14_cell_3318',
       'patient_14_cell_3319', 'patient_14_cell_3320', 'patient_14_cell_3321',
       'patient_14_cell_3322', 'patient_14_cell_3323', 'patient_14_cell_3324',
       'patient_14_cell_3325'],
      dtype='object', name='patient_id', length=43817)

In [51]:
# Preview of the transposed table: rows are genes, columns are per-cell ids.
df_raw_calc_tumor_T.head()

patient_id,patient_0_cell_1,patient_0_cell_2,patient_0_cell_3,patient_0_cell_4,patient_0_cell_5,patient_0_cell_6,patient_0_cell_7,patient_0_cell_8,patient_0_cell_9,patient_0_cell_10,...,patient_14_cell_3316,patient_14_cell_3317,patient_14_cell_3318,patient_14_cell_3319,patient_14_cell_3320,patient_14_cell_3321,patient_14_cell_3322,patient_14_cell_3323,patient_14_cell_3324,patient_14_cell_3325
HTR1D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GLUL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GAD1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
HTR2B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SLC1A3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [71]:
# Shape of the un-transposed tumor table (cells x genes).
df_raw_calc_tumor.shape

(43817, 22)

In [53]:
# Start the column metadata: one row per column of the count matrix, holding
# that column's label as 'cell_id'.
metadata_df = df_raw_calc_tumor_T.columns.to_frame(index=False, name='cell_id')

In [68]:
# 'patient_<id>_cell_<n>' -> '<id>': the second '_'-separated token.
cell_id_tokens = metadata_df['cell_id'].str.split('_')
metadata_df['patient_id'] = cell_id_tokens.str[1]

In [89]:
# For every gene, flag each tumor cell as having a 'zero' or 'nonzero' count
# and store the flags as a '<gene>_zero' column of the metadata table.
# np.where returns a plain array, so the flags are matched to metadata_df rows
# by position; this relies on metadata_df having been built from the columns
# of the transposed table, i.e. in the same cell order as df_raw_calc_tumor.
# (The original loop also built a filtered frame on every iteration and threw
# the result away; that dead statement has been removed.)
for col in df_raw_calc_tumor.columns:
    zero_list = np.where(df_raw_calc_tumor[col] == 0, 'zero', 'nonzero')
    metadata_df['{}_zero'.format(col)] = zero_list
    

In [91]:
# Write the column metadata (one row per cell) without the DataFrame index.
metadata_df.to_csv('data/colData_deseq.csv',index = False)