In [2]:
import pandas as pd
import numpy as np
import re
import sklearn.decomposition

In [3]:
# Directory holding one '<cancer type>_training_genes_set.csv' file per entry below.
folder = 'training_sets_genes'
# Order matters: get_labels() uses each entry's position in this list as its numeric label.
cancer_types = [
    'bladder', 'breast', 'colon', 'kidney', 'leukemia',
    'liver', 'lung', 'ovarian', 'pancreatic',
]

# Every table is transposed right after loading, so the CSV's column headers
# become the row index of the resulting frame.
df1 = pd.read_csv('CRISPR_gene_dependency.csv', index_col='DepMap_ID').transpose()
df2 = pd.read_csv('CRISPR_gene_effect.csv', index_col='DepMap_ID').transpose()
df3 = pd.read_csv('CCLE_expression.csv', index_col=0).transpose()
info = pd.read_csv('sample_info_new.csv', index_col='DepMap_ID').transpose()

In [4]:
# Matches "<token> (<digits>)": a run of non-space characters, one space, then
# a parenthesised integer. Written as a raw string so that `\S` and `\(` reach
# the regex engine untouched instead of being parsed as (invalid) string escapes.
_LABEL_WITH_ID_RE = re.compile(r"(\S+) \([0-9]+\)")


def regex_func(x):
    """Strip a trailing " (<digits>)" suffix from a label.

    Parameters
    ----------
    x : str
        Label to clean, e.g. "TP53 (7157)".

    Returns
    -------
    str
        The token immediately preceding the first " (<digits>)" occurrence
        found in ``x``; ``x`` itself when the pattern does not occur.
    """
    match = _LABEL_WITH_ID_RE.search(x)
    if match is None:
        return x
    return match.group(1)

def convert_indexes(df):
    """Return a NaN-free copy of ``df`` whose row labels went through ``regex_func``.

    Rows containing any missing value are dropped first; the surviving row
    labels are then rewritten one by one with ``regex_func``. The frame passed
    in by the caller is not modified, because ``dropna`` returns a new object.
    """
    cleaned = df.dropna(axis='index')
    relabelled = cleaned.index.to_series().apply(regex_func)
    cleaned.index = relabelled

    return cleaned

def get_PCA_data(df, labels, name, n_components=8, random_state=0):
    """Project the labelled rows of ``df`` onto whitened principal components.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; rows are the samples handed to PCA. Rows with missing
        values are dropped and row labels are normalised by ``convert_indexes``.
    labels : pandas.Series
        Only rows of ``df`` whose label appears in ``labels.index`` are kept.
    name : str
        Prefix for the output column names (``'<name>_0'``, ``'<name>_1'``, ...).
    n_components : int, optional
        Number of principal components to keep. Defaults to 8, the value that
        was previously hard-coded.
    random_state : int or None, optional
        Seed forwarded to scikit-learn's PCA so that repeated runs produce the
        same components when a randomized SVD solver is selected.

    Returns
    -------
    pandas.DataFrame
        One row per retained input row (same index), one column per component.
    """
    df = convert_indexes(df)
    df = df.loc[df.index.isin(labels.index)]
    pca = sklearn.decomposition.PCA(
        n_components=n_components,
        whiten=True,
        random_state=random_state,
    )
    components = pca.fit_transform(df)
    # Name the columns from the fitted width rather than renaming in place afterwards.
    columns = [name + '_' + str(i) for i in range(components.shape[1])]
    return pd.DataFrame(components, index=df.index, columns=columns)

def get_labels():
    """Build the label Series from the per-cancer-type training files.

    For each entry of the module-level ``cancer_types`` list, reads
    ``'<folder>/<cancer_type>_training_genes_set.csv'`` and keeps the index
    labels of the rows whose first column starts with ``'positive'``. Each kept
    label is assigned the position of its cancer type in ``cancer_types`` as a
    float.

    Returns
    -------
    pandas.Series
        float64 Series indexed by the kept labels. A label that is positive in
        several files appears once per file. Empty when ``cancer_types`` is empty.
    """
    per_type = []
    for i, cancer_type in enumerate(cancer_types):
        df_new = pd.read_csv('{}/{}_training_genes_set.csv'.format(folder, cancer_type), index_col=0)
        # na=False: a missing cell counts as "not positive" instead of producing
        # a NaN mask entry, which cannot be used for boolean indexing.
        is_positive = df_new.iloc[:, 0].str.match('positive', na=False)
        indexes = df_new.index[is_positive]
        per_type.append(pd.Series(np.full(len(indexes), float(i)), index=indexes))

    if not per_type:
        # pd.concat refuses an empty list; return the same empty float Series as before.
        return pd.Series(np.zeros(0))

    # Concatenate once rather than growing the Series inside the loop.
    return pd.concat(per_type)
    
# Labels come first: each get_PCA_data call keeps only the rows whose index
# appears in labels.index. The string argument is the column-name prefix.
labels = get_labels()
df1 = get_PCA_data(df=df1, labels=labels, name='dep')
df2 = get_PCA_data(df=df2, labels=labels, name='eff')
df3 = get_PCA_data(df=df3, labels=labels, name='exp')

In [5]:
# Start from the label Series (named so it becomes the 'label' column) and
# inner-join each PCA feature table onto it by index.
df = labels.rename('label')
for data in (df1, df2, df3):
    df = pd.merge(left=df, right=data, how='inner', left_index=True, right_index=True)

# Optional export of the merged table, left disabled:
#df.to_csv('cancer_type_data.csv', index=True)

In [6]:
# Keep only the rows whose index label occurs exactly once in the merged table,
# then report how many rows remain.
occurrences = df.index.value_counts()
indexes = occurrences.index[occurrences == 1]
df = df.loc[df.index.isin(indexes)]
print(len(df))

275
