# For LD: code for exploring statistics on the CRM files

In [1]:
# basic packages
import os, glob
import pandas as pd
import numpy as np; np.random.seed(0)
import itertools
from collections import Counter, defaultdict
import time

# machine learning packages from sklearn
from sklearn.preprocessing import MinMaxScaler #StandardScaler 
from sklearn import preprocessing, metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV
from IPython.display import Image


# for IRF
from functools import reduce
# Needed for the scikit-learn wrapper function
import irf
from irf import (irf_utils, utils,
                 irf_jupyter_utils)
from irf.ensemble.wrf import RandomForestClassifierWithWeights
from math import ceil

# Import our custom utilities
from imp import reload


# Import tools needed for visualization
import seaborn as sns; sns.set()
import matplotlib
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz
import pydot

In [2]:
%load_ext autoreload
%autoreload 2

In [48]:
# Output directory for this notebook's results.
save_dir = '../data/processed/fig4_modelling/common_genes'
# exist_ok=True creates the directory tree if needed without the
# check-then-create race of `if not os.path.exists(...)`, and raises if the
# path already exists but is not a directory instead of silently continuing.
os.makedirs(save_dir, exist_ok=True)

In [4]:
# Cut-off compared against log10(exp + 1e-2) when the binary expression
# labels are built in the CRM data-wrangling cell.
THRES = 1

In [5]:
# Tissue names kept for the analysis; list order defines each tissue's integer code.
normal_tissues = [
    'Airway', 'Astrocytes', 'Bladder', 'Colon', 'Esophageal',
    'GDSD6', 'GM12878', 'HMEC', 'Melanocytes', 'Ovarian',
    'Pancreas', 'Prostate', 'Renal', 'Thyroid', 'Uterine',
]
# tissue name -> position in normal_tissues
normal_tissues_dict = {tissue: code for code, tissue in enumerate(normal_tissues)}

In [14]:
# RNA table; the first CSV column becomes the row index
# (presumably gene symbols, given the file name -- confirm).
rna_df = pd.read_csv('../data/interim/rna/tissue_tpm_sym.csv', index_col=0)

# Sub-table restricted to the columns named in normal_tissues.
rna_df_norm = rna_df[normal_tissues]

# Row-index -> value lookups for the three GDSD* columns.
rna_D0_dict = rna_df['GDSD0'].to_dict()
rna_D3_dict = rna_df['GDSD3'].to_dict()
rna_D6_dict = rna_df['GDSD6'].to_dict()

# 0. CRM data wrangling

In [7]:
%%time
# Load the combined CRM count table; the first CSV column becomes the row
# index (used below as the gene identifier).
# NOTE(review): this is an absolute, machine-specific path. Other cells use
# paths relative to '../data' -- consider pointing this at a configurable data
# directory once the equivalent relative location is confirmed.
data_all = pd.read_csv('/Users/mguo123/Google Drive/1_khavari/omics_project-LD/pan_omics/data/processed/tissue_crms/all_count_comb_overall.csv', index_col=0, header=0)
# Keep only rows whose tissue is listed in normal_tissues.
data_all = data_all[data_all.tissue.isin(normal_tissues)]
# Drop rows where the columns from position 2 onward (everything after
# `tissue` and `exp`) sum to 0.1 or less.
data_all = data_all[data_all.iloc[:, 2:].sum(axis=1) > 1e-1]

# Expression labels: log10(exp + 0.01), then binarised at THRES.
exp_label = list(np.log10(data_all.exp.values + 1e-2))
labels_all = np.array(exp_label) > THRES

# Per-row tissue name and its integer code from normal_tissues_dict.
tissues_label = data_all.tissue.values
tissue_num_labels = data_all.tissue.map(normal_tissues_dict).values

# Per-row gene identifier and its integer code. A single np.unique call yields
# both the sorted unique genes and each row's position within them, which
# replaces the second np.unique pass and the element-wise np.vectorize lookup
# (np.vectorize also raises on empty input when otypes is not set).
genes_all = data_all.index.values
genes_unique, genes_num_all = np.unique(genes_all, return_inverse=True)
gene_to_num_dict = dict(zip(genes_unique, range(len(genes_unique))))



In [8]:
# Report the (rows, columns) of the filtered CRM table.
print(f"files_loaded {data_all.shape}")


files_loaded (137054, 1063)


In [9]:
# Preview the first five rows of the filtered CRM table.
data_all.head()

Unnamed: 0,tissue,exp,num_loop_counts,num_loops,num_atac_regions_pro,num_atac_regions_loop,AHR_pro,ARID5B_pro,ARNT_pro,ARNTL_pro,...,MAFA_loop,MEOX1_loop,E2F5_pro,E2F5_loop,ESR2_pro,KLF14_pro,TBX15_pro,ESR2_loop,KLF14_loop,TBX15_loop
A4GALT,Airway,5.435,45.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GNT,Airway,0.0,5060.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAMP,Airway,43.71,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AANAT,Airway,0.0,13.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ABAT,Airway,0.125,21.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
## only tfs
# Remove the tissue/expression/summary-count columns so that only the
# remaining `*_pro` / `*_loop` feature columns are left (presumably one pair
# per transcription factor -- confirm).
# Reassigning rather than dropping in place avoids mutating the frame an
# earlier cell displayed, and errors='ignore' lets this cell be re-run without
# raising KeyError once the columns are already gone.
data_all = data_all.drop(
    columns=['tissue', 'exp', 'num_loop_counts', 'num_loops',
             'num_atac_regions_pro', 'num_atac_regions_loop'],
    errors='ignore',
)

data_all.shape




(137054, 1057)

In [11]:
# Variance filter with VarianceThreshold's default settings; record which
# column names survive so the filtered array can be labelled again.
selector = VarianceThreshold()
data_all_varfilt = selector.fit_transform(data_all)
data_all_varfilt_cols = data_all.columns[selector.get_support()]
print(data_all.shape, data_all_varfilt.shape, len(data_all_varfilt_cols))

# Min-max scale each surviving column (MinMaxScaler defaults) and wrap the
# result back into a DataFrame. Only column labels are passed, so the frame
# gets a fresh 0..n-1 row index rather than data_all's index.
scaler = MinMaxScaler()
data_all_norm = pd.DataFrame(
    scaler.fit_transform(data_all_varfilt),
    columns=data_all_varfilt_cols,
)

(137054, 1057) (137054, 980) 980


In [12]:
# Preview the first five rows of the scaled feature table.
data_all_norm.head()

Unnamed: 0,AHR_pro,ARID5B_pro,ARNT_pro,ARNTL_pro,ATF1_pro,ATF2_pro,ATF3_pro,ATF4_pro,ATF6_pro,ATF7_pro,...,TFAP2B_pro,ZNF382_pro,MYOD1_loop,NR4A3_loop,TFAP2B_loop,ZNF382_loop,CUX2_pro,CUX2_loop,E2F5_pro,E2F5_loop
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
