In [None]:
import numpy as np
import pandas as pd
import os
import glob
import seaborn as sns
import matplotlib
from collections import Counter

Read in Data

In [None]:
# Load the tab-separated expression table; the first line of the file holds the column names.
OVCAR_dt = pd.read_csv(
    "DESEQ_Normalized_Output_dt_nonzero_genes.tsv",
    sep="\t",
    header=0,
)
OVCAR_dt.head(10)

Process the data to define the mean of every gene and its quartile information

In [None]:
# Sample columns are the ones whose header starts with the three characters 'OVC'.
samplenames = [column for column in OVCAR_dt.columns if column[0:3] == 'OVC']
samplenames[:10]

In [None]:
# NumPy copy of the table. In top-to-bottom execution order this is taken before
# any derived columns (Gene_Mean, thresholds, ranks, ...) are added to OVCAR_dt.
# Later cells slice this array positionally with [:, 1:95].
# NOTE(review): presumably column 0 is the gene identifier and columns 1..94 are
# the sample columns -- confirm against the input file.
OVCAR_nozero = np.array(OVCAR_dt)

In [None]:
# Per-gene mean over array columns 1..94 (one row of OVCAR_nozero per gene).
# NOTE(review): the 1:95 slice is positional -- presumably the sample columns; confirm.
genemean = [np.mean(gene_values) for gene_values in OVCAR_nozero[:, 1:95]]
OVCAR_dt['Gene_Mean'] = genemean

In [None]:
# Per-gene outlier fences: Q1 - 1.5*IQR and Q3 + 1.5*IQR, computed over array
# columns 1..94 of each row.
lowerthreshold_TQN = []
upperthreshold_TQN = []
for gene_values in OVCAR_nozero[:, 1:95]:
    first_quartile = np.percentile(gene_values, 25)
    third_quartile = np.percentile(gene_values, 75)
    fence_width = 1.5 * (third_quartile - first_quartile)
    lowerthreshold_TQN.append(first_quartile - fence_width)
    upperthreshold_TQN.append(third_quartile + fence_width)

OVCAR_dt['LowerThreshold'] = lowerthreshold_TQN
OVCAR_dt['UpperThreshold'] = upperthreshold_TQN

OVCAR_dt.head(10)

Rank all genes by their mean expression across the entire cohort, from highest to lowest expression (Rank 1 = highest expression)

In [None]:
# Cohort-wide rank of each gene by mean expression; descending, so rank 1 is the
# highest mean. Ties share the average of the ranks they span.
OVCAR_dt['Rank'] = OVCAR_dt['Gene_Mean'].rank(ascending=False, method='average')

OVCAR_dt.head(10)

In [None]:
# One '<sample>_rank' column per sample: genes ranked within that sample,
# descending (rank 1 = highest value), ties averaged.
ranknames = [sample + '_rank' for sample in samplenames]
for sample, rank_column in zip(samplenames, ranknames):
    OVCAR_dt[rank_column] = OVCAR_dt[sample].rank(ascending=False, method='average')
OVCAR_dt.head(10)

Determine the change in rank value for every gene in every sample

In [None]:
# Long-format record of rank deviations: one [rank column name, gene, deviation]
# entry per gene per sample, grouped by sample in `ranknames` order.
# deviation = cohort-wide rank - within-sample rank, so a positive value means the
# gene has a smaller rank number (higher expression rank) in that sample than in
# the cohort as a whole.
DevNames = [rank_column + '_deviation_in_rank' for rank_column in ranknames]

# Extract the shared columns once instead of doing chained OVCAR_dt[col][i]
# lookups for every gene of every sample.
cohort_ranks = OVCAR_dt['Rank'].to_numpy()
gene_labels = OVCAR_dt['Gene'].to_numpy()

Deviation_in_Rank = []
for rank_column in ranknames:
    sample_ranks = OVCAR_dt[rank_column].to_numpy()
    for gene, true_rank, sample_rank in zip(gene_labels, cohort_ranks, sample_ranks):
        Deviation_in_Rank.append([rank_column, gene, true_rank - sample_rank])
    

In [None]:
# Attach one '<sample>_rank_deviation_in_rank' column per sample holding
# cohort-wide rank minus within-sample rank for every gene.
#
# The deviation is computed directly from each sample's own rank column.
# Previously the values were recovered by scanning the whole Deviation_in_Rank
# list once per sample and matching on only the first three '_'-separated tokens
# of the name; that was quadratic in the number of samples and would merge any
# two samples whose names share those three tokens.
# DevNames was built by iterating ranknames, so the two lists pair up by position.
for rank_column, deviation_column in zip(ranknames, DevNames):
    OVCAR_dt[deviation_column] = OVCAR_dt['Rank'] - OVCAR_dt[rank_column]
OVCAR_dt[:10]

Establish Negative Controls

In [None]:
# Rank-deviation columns of the two samples used as negative controls.
ControlNames = [
    'OVCAR3_H10_DORM_1_no_chro_rank_deviation_in_rank',
    'OVCAR3_G11_SCRAMBLE_1_no_chro_rank_deviation_in_rank',
]

In [None]:
def _collect_deviations(column_names):
    """Gather rank-deviation values from the given OVCAR_dt columns.

    Parameters
    ----------
    column_names : iterable of str
        Names of rank-deviation columns present in the notebook-level OVCAR_dt.

    Returns
    -------
    pooled : list
        Every value of every requested column, concatenated in column order.
    column_maxes : list
        np.max of each column, in column order.
    column_mins : list
        np.min of each column, in column order.
    """
    pooled = []
    column_maxes = []
    column_mins = []
    for column in column_names:
        column_values = list(OVCAR_dt[column].to_numpy())
        pooled.extend(column_values)
        column_maxes.append(np.max(column_values))
        column_mins.append(np.min(column_values))
    return pooled, column_maxes, column_mins


# Negative controls only. Their extremes get their own names; previously they were
# stored in `maxes`/`mins` and immediately overwritten by the all-sample pass below.
Dev_values_control, maxes_control, mins_control = _collect_deviations(ControlNames)

# All samples (controls included, since their columns are also in DevNames).
Dev_values_only, maxes, mins = _collect_deviations(DevNames)
Dev_values_only[:3]


# FDR Calculation

In [None]:
def FDR_Calculation(normals, rest):
    """Return the ratio median(normals) / count for every entry of ``rest``.

    Intended to be fed gene-list lengths: ``normals`` holds the counts for the
    control samples and ``rest`` the count for each sample being scored, so the
    comparison is done sample by sample against the median of the controls.

    Parameters
    ----------
    normals : sequence of numbers
        Counts for the control samples; only their median is used.
    rest : sequence of numbers
        One count per sample.

    Returns
    -------
    list
        One ratio per entry of ``rest``, in the same order.

    Notes
    -----
    A zero entry in ``rest`` is not special-cased here; use
    FDR_Calculation_Threshold when zero counts need to be reported as 'NA'.
    """
    control_median = np.median(normals)
    # The original also built per-sample pass/fail flags against a fixed 0.05
    # cutoff but never returned them; only the ratios are part of the result.
    return [control_median / count for count in rest]

def FDR_Calculation_Threshold(normals, rest, thresh, genesets=None):
    """Score each sample's count against the control median and flag those under ``thresh``.

    Parameters
    ----------
    normals : sequence of numbers
        Counts for the control samples; only their median is used.
    rest : sequence of numbers
        One count per sample.
    thresh : number
        A sample is flagged True when median(normals) / count < thresh.
    genesets : sequence of sequences, optional
        Per-sample lists whose first element is the sample's name, aligned with
        ``rest`` by position. When omitted, the notebook-level FDR_Threshold_Up
        is used, which is what this function always read before the parameter
        existed.

    Returns
    -------
    list of [value, flag, name, index]
        One entry per element of ``rest``. For a count of 0 both ``value`` and
        ``flag`` are the string 'NA'; otherwise ``value`` is the ratio and
        ``flag`` the result of ``value < thresh``.
    """
    if genesets is None:
        # Backward-compatible fallback to the notebook global.
        genesets = FDR_Threshold_Up
    control_median = np.median(normals)
    combo_of_vals = []
    for i, count in enumerate(rest):
        name = genesets[i][0]
        if count == 0:
            # No genes called for this sample: the ratio is undefined.
            combo_of_vals.append(['NA', 'NA', name, i])
        else:
            value = control_median / count
            combo_of_vals.append([value, value < thresh, name, i])
    return combo_of_vals

# Downregulated

In [None]:
# Down-regulated calls: for every sample, collect the genes whose rank deviation
# is below the cutoff. Each entry is [deviation column name, gene, gene, ...].
# Note the result is stored in FDR_Threshold_Up (despite the name), and the
# up-regulated cell further down reassigns that same variable.
# NOTE(review): -2190 is an unexplained cutoff -- document how it was chosen.
Threshold = -2190
# NOTE(review): positions within DevNames treated as the control samples;
# presumably these correspond to ControlNames -- confirm.
control_positions = (0, 67)

FDR_Threshold_Up = []
for val in DevNames:
    # Boolean mask over all genes at once instead of per-element Series lookups.
    below_cutoff = OVCAR_dt[val] < Threshold
    FDR_Threshold_Up.append([val] + OVCAR_dt.loc[below_cutoff, 'Gene'].tolist())

# Gene counts per sample (minus 1 for the leading column name). Controls are
# counted in both lists; every other sample only in non_norms.
norms = []
non_norms = []
for i, geneset in enumerate(FDR_Threshold_Up):
    lengths = len(geneset) - 1
    if i in control_positions:
        norms.append(lengths)
    non_norms.append(lengths)
print(np.sum(norms),np.sum(non_norms))






# Upregulated

In [None]:
# Up-regulated calls: for every sample, collect the genes whose rank deviation
# is above the cutoff. Each entry is [deviation column name, gene, gene, ...].
# NOTE(review): 2200 is an unexplained cutoff -- document how it was chosen.
Threshold = 2200
# NOTE(review): positions within DevNames treated as the control samples;
# presumably these correspond to ControlNames -- confirm.
control_positions = (0, 67)

FDR_Threshold_Up = []
for val in DevNames:
    # Boolean mask over all genes at once instead of per-element Series lookups.
    above_cutoff = OVCAR_dt[val] > Threshold
    FDR_Threshold_Up.append([val] + OVCAR_dt.loc[above_cutoff, 'Gene'].tolist())

# Gene counts per sample (minus 1 for the leading column name). Controls are
# counted in both lists; every other sample only in non_norms.
norms = []
non_norms = []
for i, geneset in enumerate(FDR_Threshold_Up):
    lengths = len(geneset) - 1
    if i in control_positions:
        norms.append(lengths)
    non_norms.append(lengths)
print(np.sum(norms),np.sum(non_norms))





In [None]:
# Score every sample's gene count against the control median with a 0.1 cutoff.
# Uses whichever norms / non_norms / FDR_Threshold_Up were computed most recently.
FDR_RES = FDR_Calculation_Threshold(norms, non_norms, .1)

# Keep the gene sets of the samples whose flag is True. The comparison is
# deliberately `== True`: flags may be NumPy booleans or the string 'NA',
# so an identity check (`is True`) would not behave the same way.
FDR_Passed_Set = [
    FDR_Threshold_Up[position]
    for position, fdr_row in enumerate(FDR_RES)
    if fdr_row[1] == True
]
FDR_Passed_Set_dt = pd.DataFrame(FDR_Passed_Set)
FDR_Passed_Set_dt.head(10)

In [None]:
# Optional export of the annotated table as a tab-separated file without the index.
# Left disabled; uncomment the line below to write the file.
#OVCAR_dt.to_csv('OVCAR3_CRISPRI_SCREEN_DESEQ2_Normalized_with_Rank_and_Rank_Change.tsv', sep='\t', index= False)