In [1]:
#Matthew Markamn, 3/4/19
#Python3 script to parse the data for Tempus coding challenge

In [2]:
import pandas as pd

In [3]:
#Load the CNV challenge CSV into a pandas dataframe
#NOTE(review): the path below is an absolute, machine-specific location
cnv_data = pd.read_csv(
    filepath_or_buffer = '/Users/MacProMatt/Desktop/CNV_challenge (5).csv'
)

In [4]:
#Question one
#For each gene, what proportion of patients have probe regions that are called loss,
#neutral, or gain? These numbers should not necessarily sum to 1, as a patient may have
#more than one call per gene.

#subset dataframe by columns and drop duplicates so that every
#(patient, gene, call) combination is counted exactly once
new_cnv = cnv_data[['orderid', 'gene', 'call']].drop_duplicates(keep = 'first')

#fixed row/column order of the summary matrix
genes, calls = ['TP53', 'EGFR', 'CDKN2A'], ['loss', 'gain', 'neutral']

#count the patients per (call, gene) in one vectorized step.
#The previous row-by-row `call_df[gene][call] += 1` was chained-indexing assignment,
#which does not update the dataframe when pandas Copy-on-Write is active.
#reindex pins the row/column order and fills combinations that never occur with 0;
#rename_axis drops the axis labels so the matrix displays as before.
call_df = (
    pd.crosstab(new_cnv['call'], new_cnv['gene'])
    .reindex(index = calls, columns = genes, fill_value = 0)
    .rename_axis(index = None, columns = None)
)

#proportion of patients per (call, gene): each count divided by the number of
#distinct patients in the data (derived here instead of hardcoding the total)
n_patients = cnv_data['orderid'].nunique()
call_prop_df = call_df / n_patients

#visualize our matrix of counts
#the proportion for each data point is count / n_patients (see call_prop_df)
call_df

Unnamed: 0,TP53,EGFR,CDKN2A
loss,26,50,186
gain,12,85,7
neutral,1502,1356,1337


In [5]:
#Question 2
#How many cases have multiple calls per gene?
#Are there any that have more than one gene with a conflicting call? List them.

#total number of (deduplicated) calls per patient, in first-appearance order
pat_dict = new_cnv.groupby('orderid', sort = False).size().to_dict()

#number of distinct calls for every (patient, gene) pair.
#Counting per pair (rather than thresholding the total rows per patient) stays correct
#when a patient is missing a gene or when one gene carries all three calls.
#NOTE: this cell intentionally does not touch call_df; incrementing it here again
#would double the Question 1 counts.
calls_per_gene = new_cnv.groupby(['orderid', 'gene'], sort = False)['call'].nunique()

#a (patient, gene) pair with more than one distinct call is a conflict
conflicting_pairs = calls_per_gene[calls_per_gene > 1]

#number of conflicting genes per patient (only patients with at least one conflict)
conflicts_per_patient = conflicting_pairs.groupby(level = 'orderid', sort = False).size()

#patients with at least one conflicting gene, and patients with more than one conflicting gene
mult_calls = conflicts_per_patient.index.tolist()
potential_tri_calls = conflicts_per_patient[conflicts_per_patient > 1].index.tolist()

#the number of patients with conflicting calls (includes patients that may have conflicts for more than one gene)
print(len(mult_calls))

#the patients with more than one gene with a conflicting call (can be visually confirmed below)
print(potential_tri_calls)

#subset the deduplicated dataframe to the patients with multiple conflicting genes
#so the individual calls can be inspected
mult_calls_df = new_cnv.loc[new_cnv['orderid'].isin(potential_tri_calls)]
mult_calls_df

38
['PAT180', 'PAT532', 'PAT1256', 'PAT1421']


Unnamed: 0,orderid,gene,call
7408,PAT180,TP53,neutral
7420,PAT180,EGFR,neutral
7424,PAT180,EGFR,gain
7445,PAT180,CDKN2A,neutral
7446,PAT180,CDKN2A,loss
21854,PAT532,TP53,loss
21857,PAT532,TP53,gain
21869,PAT532,EGFR,neutral
21874,PAT532,EGFR,gain
21894,PAT532,CDKN2A,neutral


In [6]:
#Question3
#Cases with multiple calls are considered conflicting. These conflicts can come in
#different combinations. For example, in the two patients above, PATX has a loss/gain
#conflict and PATY has a neutral/gain conflict. What is the most common type of conflict
#in the dataset?

#keep only the rows whose (orderid, gene) pair occurs more than once in the deduplicated
#dataframe; since identical (orderid, gene, call) rows were already removed, every such
#pair carries at least two different calls
double_calls_index = new_cnv.duplicated(subset = ['orderid', 'gene'], keep = False)
double_calls = new_cnv.loc[double_calls_index]

#set of distinct calls for each conflicting (patient, gene) pair;
#grouping makes the result independent of the order of the rows
calls_by_pair = double_calls.groupby(['orderid', 'gene'], sort = False)['call'].agg(frozenset)

#the pairwise conflict types and the label each one is reported under
conflict_labels = {
    frozenset(("loss", "neutral")): "neg/neutral",
    frozenset(("gain", "neutral")): "pos/neutral",
    frozenset(("gain", "loss")): "neg/pos",
}

#count how many (patient, gene) pairs show each type of conflict
#a pair carrying all three calls contributes to every pairwise type it contains
call_dict = {"neg/neutral":0, "pos/neutral":0, "neg/pos":0}
for pair_calls in calls_by_pair:
    for conflict, label in conflict_labels.items():
        if conflict <= pair_calls:
            call_dict[label] += 1

#output the numbers of each type of conflicting call
print(call_dict)

{'neg/neutral': 21, 'pos/neutral': 20, 'neg/pos': 1}
