As you know, the goal of this project has basically been to take a large group of genes at the beginning and then to continually whittle them down via a set of criteria or screens. At one of our steps, we used a filter step at the patient level but we want to be more inclusive and include an 'OR' statement that also accounts for single-cell expression.

So here is how this would be done:
1. Take the genes that are present in 'COMPARTMENTS >=4  UPDATED GENE NAMES.csv' which can be found here: https://drive.google.com/drive/folders/1RxLl65T5mD7P9jL584zareH6gUBs3t_B . Then, using these genes, ask if they have non-zero expression in at least (>=) 10% of naive tumor cells OR naive atypical ductal cells OR treated tumor cells OR treated atypical ductal cells. 
2. Once you have the whittled down list from #1, compare that to the genes found in the attached file, (120520) Post >=4 Surface and TCGA >25 FPKM.csv.
3. Take the SUPERSET of the genes found in #1 and #2 (so any gene that is in either #1 or #2), and then remove the duplicates, and send this gene list over to me.

In [1]:
import pandas as pd
import numpy as np

### read in columns and rows for single cell

In [2]:
columns_naive = pd.read_csv('../data/columns for X.csv',header = None)
columns_naive = list(columns_naive[0])

rows_naive = pd.read_csv('../data/rows for X.csv',header = None)
cell_type_naive = list(rows_naive[0])
cell_type_naive = [val.lower() for val in cell_type_naive]

In [3]:
rows_treated = pd.read_csv('../data/treated snRNA cell types list.csv',header = None)
cell_type_treated = list(rows_treated[0])
cell_type_treated = [val.lower() for val in cell_type_treated]

columns_treated = pd.read_csv('../data/treated snRNA gene list.csv', header = None)
columns_treated = list(columns_treated[0])

## read in gene names

In [4]:
gene_list = pd.read_csv('../data/COMPARTMENTS _=4 UDPATED GENE NAMES.csv', header = None)
gene_list = list(gene_list[0])
gene_list[0:10]

['CFTR',
 'RALA',
 'CACNG3',
 'SKAP2',
 'CEACAM7',
 'ITGA3',
 'CD4',
 'TSPAN9',
 'GPRC5A',
 'PSD']

## Naive Analysis

In [24]:
def get_not_null_cols(df, thresh = .9):

    df = df.replace(0,np.nan)
    df_null_ratio = pd.DataFrame(df.isna().sum()/df.shape[0],columns = ['null_ratio'])

    sublist = list(df_null_ratio[df_null_ratio['null_ratio'] < .9].index)
    
    return sublist

In [30]:
import pickle
import time

In [28]:
naive_gene_col_list = [idx for idx, col in enumerate(columns_naive) if col in gene_list]

In [33]:
naive_gene_list = []

start = time.time()

for i in range(0, len(naive_gene_col_list), 250):
    short_gene_list = naive_gene_col_list[i:i + 250]
    df_naive = pd.read_csv('../data/X.csv', 
                             names = columns_naive, 
                             usecols = short_gene_list)
    df_naive['cell_type'] = cell_type_naive
    df_naive_tumor = df_naive[df_naive['cell_type']=='tumor']
    df_naive_atypical = df_naive[df_naive['cell_type'] == 'atypical_ductal']
    
    sublist_tumor = get_not_null_cols(df_naive_tumor,thresh = .9)
    sublist_atypical = get_not_null_cols(df_naive_atypical,thresh = .9)
    
    naive_gene_list.append(sublist_tumor)
    naive_gene_list.append(sublist_atypical)
    
    print('done with {} at {:.2f}'.format(i+250, time.time()-start))

with open('outputs/naive_gene_list_compartments_filtered.pkl', 'wb') as f:
    pickle.dump(naive_gene_list, f)

done with 250 at 137.40
done with 500 at 230.80
done with 750 at 311.75
done with 1000 at 391.04
done with 1250 at 470.67
done with 1500 at 551.18
done with 1750 at 632.42
done with 2000 at 713.91
done with 2250 at 795.78
done with 2500 at 877.61
done with 2750 at 959.75
done with 3000 at 1042.33
done with 3250 at 1125.65
done with 3500 at 1209.83
done with 3750 at 1294.47
done with 4000 at 1379.24
done with 4250 at 1469.96
done with 4500 at 1562.20


## Treated

In [36]:
treated_gene_col_list = [idx for idx, col in enumerate(columns_treated) if col in gene_list]

In [38]:
treated_gene_list = []
start = time.time()

for i in range(0, len(treated_gene_col_list), 250):
    short_gene_list = treated_gene_col_list[i:i + 250]
    df_treated = pd.read_csv('../data/X treated.csv', 
                             names = columns_treated, 
                             usecols = short_gene_list)
    
    df_treated['cell_type'] = cell_type_treated

    df_treated_tumor = df_treated[df_treated['cell_type']=='tumor']
    df_treated_atypical = df_treated[df_treated['cell_type'] == 'atypical_ductal']
    
    sublist_tumor = get_not_null_cols(df_treated_tumor,thresh = .9)
    sublist_atypical = get_not_null_cols(df_treated_atypical,thresh = .9)
    
    treated_gene_list.append(sublist_tumor)
    treated_gene_list.append(sublist_atypical)
    
    print('done with {} at {:.2f}'.format(i+250, time.time()-start))

with open('outputs/treated_gene_list_compartments_filtered.pkl', 'wb') as f:
    pickle.dump(treated_gene_list, f)

done with 250 at 45.22
done with 500 at 89.80
done with 750 at 134.84
done with 1000 at 179.95
done with 1250 at 225.32
done with 1500 at 270.58
done with 1750 at 320.82
done with 2000 at 367.24
done with 2250 at 413.60
done with 2500 at 461.70
done with 2750 at 508.05
done with 3000 at 555.38
done with 3250 at 602.54
done with 3500 at 653.04
done with 3750 at 701.03
done with 4000 at 748.68
done with 4250 at 797.13
done with 4500 at 845.51


In [55]:
import itertools

set_treated = set(itertools.chain.from_iterable(treated_gene_list))
set_naive = set(itertools.chain.from_iterable(naive_gene_list))

In [56]:
set_total = set_treated.union(set_naive)

In [61]:
original_set = pd.read_csv('../data/(120520) Post _=4 SURFACE and TCGA _25 FPKM.csv',header = None)

In [62]:
original_set = set(original_set[0])

In [64]:
final_list = list(original_set.union(set_total))

In [70]:
df_output = pd.DataFrame(final_list, columns = ['genes'])
df_output.to_csv('outputs/gene_list_superset.csv',index = False)

In [71]:
new_gene_list = [gene for gene in final_list if gene not in original_set]

In [72]:
len(new_gene_list)

66