In [1]:
import pandas as pd
import random
import itertools
import numpy as np
import time 
import pickle
from scipy.stats import ttest_ind, chi2_contingency, fisher_exact, ranksums
from scipy.stats import ttest_ind, chi2_contingency, fisher_exact, ranksums
from scipy.stats import pearsonr, spearmanr

In [2]:
# Mount Google Drive at /content/drive so the project files stored there can be
# read and written by the cells below. Requires the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Context and Background

In pancreatic cancer, two main subtypes have been identified: 'basal-like' and 'classical.' However, we don't yet know which genes to target in the 'basal-like' subtype and which to target in the 'classical' subtype, so the classification is functionally useless for now: we don't know how to treat each subtype uniquely. Therefore, I think it could be interesting to perform a correlation analysis between each of the genes in our lists (both the FDA list and the novel drug list) and see whether any of these genes associate strongly with basal-like or classical signature strength.

So, more precisely, we are assessing the correlation between gene X's expression values and the basal-like/classical signature scores across all cells that contain non-zero values for each. Do this only on tumor cells; we don't care about the other cell types. The non-zero expression value of gene X should be straightforward to obtain, since it is already present in the single-cell matrix. To determine the signature score, we will use the same formula as before:

**Equation:**
Signature Mean - Background Mean = Signature Score 

For each cell, there will be a unique Signature Mean and a unique Background Mean, and therefore a unique Signature Score for each cell.

Calculating Signature Mean:
Extract the specific columns that line up with the gene names in the signature (see each attachment separately).
Take the average value of all of those column values for that one cell/row. You do not take the average of values down a column in this case.

Use the attached lists for basal-like (Moffitt basal-like) and classical (Moffitt classical-like) to calculate a basal-like signature mean and a classical signature mean for each cell (you have already computed a background mean in the past, so you can reuse that same unique value for each cell). Then subtract the background mean from the signature mean to get the signature score for each cell. Then, similar to the analysis in Part #4, let's generate a list of genes that have the strongest correlations with basal-like scores and, separately, a list of genes that correlate with classical signature scores (using both Pearson and Spearman, reporting p and r).

In [3]:
# Switch into the project folder; the 'data/...' and 'outputs/...' paths used
# below are relative to it. The target is itself a relative path, so this cell
# is meant to be run once from the notebook's starting directory.
%cd drive/My\ Drive/Broad

/content/drive/My Drive/Broad


In [4]:
# Sanity check: show the current working directory after the %cd above.
%pwd

'/content/drive/My Drive/Broad'

In [5]:
# Run configuration.
# subset    -- tag used as the prefix of the correlation output file names.
# gene_file -- name of the gene-list CSV, read from the data/ folder.
subset = 'novel'
gene_file = '110920_new drug list v1.csv'

In [6]:
# The column labels of the expression matrix are stored in their own
# header-less, single-column CSV; take that first column as a plain list.
column_names = pd.read_csv('data/columns for X.csv', header=None)[0].tolist()

In [7]:
# One label per row of the expression matrix, read from a header-less CSV.
rows = pd.read_csv('data/rows for X.csv', header=None)
# Lower-case the labels so the later comparison against 'tumor' ignores case.
cell_type = [label.lower() for label in rows[0]]

In [8]:
# Positions of every row whose label is not 'tumor'; these are handed to
# read_csv(skiprows=...) so that only tumor rows get loaded.
skip_rows = []
for row_idx, label in enumerate(cell_type):
    if label != 'tumor':
        skip_rows.append(row_idx)

In [9]:
# Genes of interest: first column of the header-less gene-list CSV chosen by
# gene_file.
gene_list = pd.read_csv('data/{}'.format(gene_file), header=None)[0].tolist()

In [10]:
# Number of entries read from the gene-list file.
len(gene_list)

125

In [11]:
def _is_listed_gene(col_name):
    """Column filter for read_csv: keep a column only if it is in gene_list."""
    return col_name in gene_list


# Load the expression matrix restricted to tumor rows (all other rows are
# skipped) and to the columns named in gene_list.
df_gene = pd.read_csv(
    'data/X.csv',
    names=column_names,
    skiprows=skip_rows,
    usecols=_is_listed_gene,
)

In [12]:
# Number of non-tumor rows that were skipped when loading df_gene.
len(skip_rows)

44214

def get_background(row, n_iter=25, n_sample=50):
    """Estimate the background mean of one row by repeated random sampling.

    ``n_iter`` times, ``n_sample`` distinct positions of ``row`` are drawn
    without replacement and their NaN-ignoring mean is taken; the result is
    the NaN-ignoring mean of those per-draw means.

    Parameters
    ----------
    row : pandas.Series
        One row of the expression matrix (indexed positionally via ``.iloc``).
    n_iter : int, default 25
        Number of random draws.
    n_sample : int, default 50
        Number of positions per draw. Clamped to ``len(row)`` so that rows
        shorter than ``n_sample`` no longer make ``random.sample`` raise
        ``ValueError``.

    Returns
    -------
    float
        The background mean, or NaN when ``row`` is empty.

    Notes
    -----
    Uses the global ``random`` module state; call ``random.seed(...)`` before
    use if reproducible values are required.
    """
    n_cols = len(row)
    if n_cols == 0:
        # Nothing to sample from; report "no value" instead of raising.
        return np.nan

    sample_size = min(n_sample, n_cols)
    mean_list = []
    for _ in range(n_iter):
        col_rand = random.sample(range(n_cols), sample_size)
        mean_list.append(np.nanmean(row.iloc[col_rand]))

    return np.nanmean(mean_list)

# Stream the full expression matrix in fixed-size row chunks and compute one
# background mean per row with get_background, logging elapsed time per chunk.
chunksize = 3000
background_list = []
start = time.time()
chunk_reader = pd.read_csv('data/X.csv', names=column_names, chunksize=chunksize)
for idx, chunk in enumerate(chunk_reader):
    chunk_background = chunk.apply(get_background, axis=1)
    background_list.append(chunk_background)
    elapsed = time.time() - start
    print('completed with {} at {:.2f}'.format(idx, elapsed))

# Stitch the per-chunk Series back together in file order.
background_arr = pd.concat(background_list, axis=0)

df_background = pd.DataFrame({'value': background_arr})

# Persist the result so later cells can reload it from disk.
df_background.to_csv('outputs/background_values.csv', index=False)

In [13]:
# Load the per-row background means previously saved to
# outputs/background_values.csv (single column named 'value').
df_background = pd.read_csv('outputs/background_values.csv')

In [14]:
# (label, path) pairs for the two Moffitt signature gene lists.
# The label previously read 'classifical' (typo). The 'Moffit' spelling in the
# second path is left as-is because the same path is read further down.
groups_list = [
    ('basal','data/Moffitt Basal-like.csv'),
    ('classical','data/Moffit Classical-like.csv'),
]

In [None]:

# Moffitt basal-like signature: header-less, single-column list of gene names.
df_basal_list = pd.read_csv('data/Moffitt Basal-like.csv', header=None)
basal_gene_list = df_basal_list[0].tolist()

# Load only the signature columns, for every row of the matrix, so the rows
# line up one-to-one with df_background and cell_type.
df_gene_basal = pd.read_csv(
    'data/X.csv',
    names=column_names,
    usecols=lambda col: col in basal_gene_list,
)

# Signature score = row-wise signature mean minus that row's background mean.
# .values makes the subtraction positional rather than index-aligned.
basal_signature_mean = df_gene_basal.mean(axis=1)
df_gene_basal['score'] = basal_signature_mean - df_background['value'].values
df_gene_basal['cell_type'] = cell_type

# Keep the scores of tumor rows only.
basal_is_tumor = df_gene_basal['cell_type'] == 'tumor'
df_gene_basal_tumor = df_gene_basal[basal_is_tumor]
basal_score_tumor = df_gene_basal_tumor['score']

In [None]:
# Number of tumor rows that received a basal-like score.
basal_score_tumor.shape

In [None]:
# Moffitt classical signature: header-less, single-column list of gene names.
df_classical_list = pd.read_csv('data/Moffit Classical-like.csv', header=None)
classical_gene_list = df_classical_list[0].tolist()

# Load only the signature columns, for every row of the matrix, so the rows
# line up one-to-one with df_background and cell_type.
df_gene_classical = pd.read_csv(
    'data/X.csv',
    names=column_names,
    usecols=lambda col: col in classical_gene_list,
)

# Signature score = row-wise signature mean minus that row's background mean.
# .values makes the subtraction positional rather than index-aligned.
classical_signature_mean = df_gene_classical.mean(axis=1)
df_gene_classical['score'] = classical_signature_mean - df_background['value'].values
df_gene_classical['cell_type'] = cell_type

# Keep the scores of tumor rows only.
classical_is_tumor = df_gene_classical['cell_type'] == 'tumor'
df_gene_classical_tumor = df_gene_classical[classical_is_tumor]
classical_score_tumor = df_gene_classical_tumor['score']

In [None]:
# Attach both signature scores to the tumor-only gene frame. The scores are
# converted to plain lists so they are assigned by position: the score Series
# keep the row labels of the full matrix, while df_gene was loaded with the
# non-tumor rows skipped.
for score_col, tumor_scores in (
    ('basal_score', basal_score_tumor),
    ('classical_score', classical_score_tumor),
):
    df_gene[score_col] = list(tumor_scores)


In [None]:
# Turn every 0 in df_gene into NaN so that cells with zero expression can be
# filtered out per gene later on. This mutates df_gene in place and applies to
# all of its columns, including basal_score and classical_score.
df_gene.replace(0,np.nan, inplace = True)

In [None]:
# Correlate every listed gene with each signature score (Pearson and Spearman)
# and write one CSV per signature, sorted by Pearson r in descending order.
for group in ['basal_score','classical_score']:
  # Fresh accumulators for every signature. They used to be created once before
  # this loop, so the second CSV (classical) also contained all the basal rows.
  gene_col_list = []
  r_pearson_list = []
  r_spearman_list = []
  p_pearson_list = []
  p_spearman_list = []
  n_rows_list = []

  for gene in gene_list:
    # A listed gene that is not a column of df_gene cannot be correlated;
    # indexing it would raise KeyError, so skip it.
    if gene not in df_gene.columns:
      continue

    df_subset = df_gene[[gene, group]]

    # Zeros were replaced by NaN earlier, so dropping NaN keeps only cells with
    # a non-zero value. Rows with a missing score are dropped as well, because
    # a single NaN would make both coefficients NaN.
    df_subset_filtered = df_subset.dropna()
    n_rows = df_subset_filtered.shape[0]

    if n_rows >= 2:
      r_pearson, p_pearson = pearsonr(df_subset_filtered[gene], df_subset_filtered[group])
      r_spearman, p_spearman = spearmanr(df_subset_filtered[gene], df_subset_filtered[group])
    else:
      # pearsonr raises for fewer than two observations; record "undefined"
      # instead of aborting the whole run.
      r_pearson = p_pearson = r_spearman = p_spearman = np.nan

    gene_col_list.append(gene)
    r_pearson_list.append(r_pearson)
    p_pearson_list.append(p_pearson)
    r_spearman_list.append(r_spearman)
    p_spearman_list.append(p_spearman)
    n_rows_list.append(n_rows)

  # n_rows_nonzero is the number of cells that actually entered the correlation.
  df_gene_corr = pd.DataFrame({'gene':gene_col_list,
                                'r_pearson':np.round(r_pearson_list,5),
                                'p_pearson':p_pearson_list,
                                'r_spearman':np.round(r_spearman_list,5),
                                'p_spearman':p_spearman_list,
                                'n_rows_nonzero' : n_rows_list
                              })
  df_gene_corr = df_gene_corr.sort_values(by= 'r_pearson', ascending = False)
  df_gene_corr.to_csv('outputs/{}_{}_correlation.csv'.format(subset,group),index = False)




In [None]:
# Display the row-wise mean over all columns of df_gene.
# NOTE(review): at this point df_gene also holds basal_score/classical_score and
# has had zeros replaced by NaN, so this mixes scores with expression values;
# looks like leftover scratch output -- confirm whether it is still needed.
df_gene.mean(axis = 1)