In [1]:
import pandas as pd
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the tab-separated Posey results table; its first column is used as
# the row index.
df_posey = pd.read_csv(
    'posey_results.txt',
    sep='\t',
    index_col=0,
)
df_posey.head()

Unnamed: 0_level_0,GeneA,GeneB,Alt_varA,PosA,Alt_varB,PosB,Support_score,Classification_score,Predicted_class,95-99%_confidence,Type
Comb_No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,,,,,,,,,,,
2,,,,,,,,,,,
3,,,,,,,,,,,
4,,,,,,,,,,,
5,ENG,ASXL3,C/-,130581941/-,T/-,31323161/-,93.2,0.626779,Disease-causing,95%,Distinct


In [3]:
from collections import namedtuple
# Record type for one row of df_posey. Fields are filled positionally from
# the table's columns, so the order here must match the column order; the
# tenth field is named `confidence`.
Row = namedtuple(
    'Row',
    [
        'GeneA', 'GeneB',
        'Alt_varA', 'PosA',
        'Alt_varB', 'PosB',
        'Support_score', 'Classification_score',
        'Predicted_class', 'confidence', 'Type',
    ],
)

In [4]:
# Index the Posey table by 'GeneA/GeneB'. Rows whose first cell is a plain
# float are skipped (presumably NaN placeholders for empty rows).
posey_rows = {}
for record in array(df_posey):
    gene_a = record[0]
    if type(gene_a) is not float:
        pair_key = gene_a + '/' + record[1]
        posey_rows[pair_key] = Row(*record)

In [5]:
# Record type holding the eleven per-gene-pair classifier features.
RowClassifier = namedtuple(
    'RowClassifier',
    [
        'CADD1', 'CADD2', 'RecA', 'EssA',
        'CADD3', 'CADD4', 'RecB', 'EssB',
        'Pathway', 'GenePair', 'Type',
    ],
)

In [6]:
# Keep only the gene symbol and its consensus essentiality call.
df_ess = pd.read_csv('Mus musculus_consolidated.csv')[['symbols', 'essentiality consensus']]
df_ess.head()

# Upper-cased gene symbol -> 1 when the consensus is 'Essential', else 0.
# Rows whose symbol is a plain float (presumably NaN) are skipped.
essentialities = {
    symbol.upper(): int(consensus == 'Essential')
    for symbol, consensus in array(df_ess)
    if type(symbol) is not float
}

In [7]:
# Keep only the gene name and its P(rec) value from the spreadsheet.
excel_rec = pd.read_excel('recessiveness_science.xlsx')[['gene', 'P(rec)']]

# Upper-cased gene name -> P(rec). Rows whose gene cell is a plain float
# (presumably NaN) are skipped.
recessiveness = {
    gene.upper(): p_rec
    for gene, p_rec in array(excel_rec)
    if type(gene) is not float
}

In [8]:
# NOTE(review): the file is named .tsv but is read with the default comma
# separator -- confirm that matches the file's actual delimiter.
df_cpra = pd.read_csv('dual_diag_chrom_pos_no_space.tsv', index_col=0)

# Write a minimal five-column VCF-style file of the variants. Values are
# taken positionally: columns 2..5 of each row are used as chromosome,
# position, ref and alt (assumed from how they are written -- verify against
# the input file's column order).
vcf_columns = ('#CHROM', 'POS', 'ID', 'REF', 'ALT')
with open('dual_diagnosis_tocadd_nodots.vcf', 'w') as f_output:
    f_output.write('\t'.join(vcf_columns) + '\n')
    for variant_no, record in enumerate(array(df_cpra)):
        chrom, pos, ref, alt = record[2:6]
        if chrom == '.':
            # No coordinates for this variant; its number is still consumed,
            # so IDs follow the row position in df_cpra.
            continue
        # Drop the first three characters of the chromosome (a 'chr' prefix
        # is assumed to always be present here).
        fields = (chrom[3:], pos, 'var' + str(variant_no), ref, alt)
        f_output.write('\t'.join(map(str, fields)) + '\n')

In [9]:
# Load the tab-separated CADD score table.
df_cadd = pd.read_csv(
    'dual_diag_cadd_scores.tsv',
    sep='\t',
)
df_cadd.head()

def get_cpra(row, c, p, r, a):
    """Build a 'chrom-pos-ref-alt' lookup key from one variant record.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Record indexed by column name.
    c, p, r, a : hashable
        Names of the chromosome, position, ref and alt columns in ``row``.

    Returns
    -------
    str
        ``'0'`` when the chromosome cell is ``'.'``; otherwise the four
        values joined with ``'-'``, with a leading ``'chr'`` removed from the
        chromosome.
    """
    chrom = row[c]
    if chrom == '.':
        return '0'
    # The chromosome column may be parsed as integers when every value is
    # numeric; normalise to text before testing for the prefix.
    chrom = str(chrom)
    # Strip 'chr' only when it really is a prefix, not any substring.
    if chrom.startswith('chr'):
        chrom = chrom[3:]
    return '-'.join([chrom, str(row[p]), str(row[r]), str(row[a])])

# 'chrom-pos-ref-alt' key -> RawScore for every row of the CADD table.
# When a key occurs more than once, the last row wins.
cadd_from_cpra = {
    get_cpra(record, 'CHROM', 'POS', 'REF', 'ALT'): record['RawScore']
    for _idx, record in df_cadd.iterrows()
}

In [10]:
# Attach a CADD score to every variant row; variants with no matching
# 'chrom-pos-ref-alt' key get 0. `.get` is used instead of `.setdefault` so
# that misses are not written back into cadd_from_cpra.
df_cpra['CADD'] = df_cpra.apply(
    lambda row: cadd_from_cpra.get(get_cpra(row, 'chr', 'pos', 'ref', 'alt'), 0),
    axis=1,
)

In [11]:
# gene + position + alt (concatenated without separators) -> the full
# df_cpra row. When a key occurs more than once, the last row wins.
row_from_gpa = {
    record['gene'] + str(record['pos']) + record['alt']: record
    for _idx, record in df_cpra.iterrows()
}

In [12]:
def _gene_features(gene, positions, alt_variants):
    """Return the four per-gene features for one side of a gene pair.

    Parameters
    ----------
    gene : str
        Gene name, used both as the prefix of the ``row_from_gpa`` key and as
        the key into ``recessiveness`` / ``essentialities``.
    positions : str
        Two positions separated by ``'/'``.
    alt_variants : str
        Two alt alleles separated by ``'/'``.

    Returns
    -------
    list
        ``[CADD of first variant, CADD of second variant (0 when it has no
        row), recessiveness or 'N/A', essentiality or 'N/A']``.

    Raises
    ------
    KeyError
        If the first variant has no row in ``row_from_gpa``.
    """
    pos_a, pos_b = positions.split('/')
    mod_a, mod_b = alt_variants.split('/')
    # An 'na' second position/allele falls back to the first one.
    if mod_b == 'na':
        mod_b = mod_a
    if pos_b == 'na':
        pos_b = pos_a

    key_a = gene + str(pos_a) + mod_a
    key_b = gene + str(pos_b) + mod_b
    # Plain lookups: `.get` leaves the shared lookup tables untouched on a
    # miss, whereas `.setdefault` would insert the default into them.
    row_a = row_from_gpa.get(key_a)
    row_b = row_from_gpa.get(key_b)
    if row_a is None:
        # Fail with the offending key instead of an opaque
        # "'NoneType' object is not subscriptable".
        raise KeyError('no variant row found for key %r' % key_a)

    return [
        row_a['CADD'],
        0 if row_b is None else row_b['CADD'],
        recessiveness.get(gene, 'N/A'),
        essentialities.get(gene, 'N/A'),
    ]


# Write one feature row per gene pair for the downstream predictor.
with open('posey_to_predict.csv', 'w') as f_out:
    f_out.write('CADD1,CADD2,RecA,EssA,CADD3,CADD4,RecB,EssB,Path,GenePair,Type\n')
    for gene_pair, row in posey_rows.items():
        features = (
            # CADD1, CADD2, RecA, EssA
            _gene_features(row.GeneA, row.PosA, row.Alt_varA)
            # CADD3, CADD4, RecB, EssB
            + _gene_features(row.GeneB, row.PosB, row.Alt_varB)
            # Pathway is written as the placeholder '?', then the pair key and type.
            + ['?', gene_pair, row.Type]
        )

        # Any feature equal to -1 is written as 'N/A'.
        # NOTE(review): this also rewrites a CADD score of exactly -1 --
        # confirm that is intended.
        features = [value if value != -1 else 'N/A' for value in features]

        f_out.write(','.join(map(str, features)) + '\n')