## Creation of input files for the DT
### Imports and constants

In [None]:
import pandas as pd
import numpy as np
import os
from math import log

# Root directory of the data set; the sample directories are listed from here.
basepath = "/mnt/container-nle-neoagdt/data/tran2015/"

# File names of the per-sample outputs written by the workflow cells.
variants_out = "variants.csv"
peptide_sequences_out = "peptide-sequences.csv"
genes_out = "genes.csv"
hlas_out = "hlas.csv"
binding_scores_out = "binding-scores.csv"

# Generate gene mapping dict: upper-cased name or alias -> value of column 10
# of the tab-separated mapping file (read from basepath with the 'tran2015'
# path component removed).
genemap = {}
with open(os.path.join(basepath.replace('tran2015', ''), 'genenames_mapping.txt'), 'r') as fh:
    for row in fh:
        columns = row.split('\t')
        target = columns[10]  # alternatively column 12
        genemap[columns[1].upper()] = target
        genemap[target.upper()] = target
        # Columns 4, 5 and 7 are treated as comma-separated alternative names.
        for i in (4, 5, 7):
            for altname in columns[i].split(','):
                alias = altname.strip().upper()
                # str.split never yields None, so an `is not None` guard cannot
                # filter anything; empty cells have to be skipped explicitly,
                # otherwise '' becomes a key pointing at whichever row came last.
                if alias:
                    genemap[alias] = target

# Samples are the purely numeric entries of basepath.
# Excluding sample 3971 since it is not present in the supplementary mutation tables
# (filtered instead of list.remove so a missing 3971 directory is not an error).
samples = [x for x in os.listdir(basepath) if x.isnumeric() and x != '3971']

### Functions

In [None]:
def load_expression(sample):
    """Read the tab-separated ``rna/genes.fpkm_tracking`` table of *sample*.

    The file is looked up below ``basepath/<sample>`` and returned as a DataFrame.
    """
    tracking_file = os.path.join(basepath, sample, 'rna', 'genes.fpkm_tracking')
    return pd.read_csv(tracking_file, sep='\t')

def load_prediction(sample):
    """Read ``MHC_Class_I/<sample>.all_epitopes.netchop.tsv`` of *sample*.

    The tab-separated file is looked up below ``basepath/<sample>`` and
    returned as a DataFrame.
    """
    file_name = '{}.all_epitopes.netchop.tsv'.format(sample)
    prediction_file = os.path.join(basepath, sample, 'MHC_Class_I', file_name)
    return pd.read_csv(prediction_file, sep='\t')

def load_stability(sample):
    """Read ``MHC_Class_I/<sample>.all_epitopes.netmhcstab.tsv`` of *sample*.

    The tab-separated file is looked up below ``basepath/<sample>`` and
    returned as a DataFrame.
    """
    file_name = '{}.all_epitopes.netmhcstab.tsv'.format(sample)
    stability_file = os.path.join(basepath, sample, 'MHC_Class_I', file_name)
    return pd.read_csv(stability_file, sep='\t')

def load_mutation(sample):
    """Read the mutation sheet of *sample* from ``tran_patients_mutations.xlsx``.

    The workbook is looked up in ``basepath`` with ``tran2015`` replaced by
    ``tran2015_supp``; the sheet named like the sample is read without a header
    row after skipping the first three rows, so columns are labelled 0, 1, 2, ...

    For sample '3812' the two sides of the '>' found in the last three
    characters of column 4 are swapped (the source table has wt and mut
    exchanged for this sample); missing values stay NaN.
    """
    workbook = os.path.join(basepath.replace('tran2015', 'tran2015_supp'),
                            'tran_patients_mutations.xlsx')
    df = pd.read_excel(workbook, sheet_name=sample, skiprows=3, header=None)

    def swap_wt_mut(change):
        # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0
        if pd.isnull(change):
            return np.nan
        # Only the trailing three characters (e.g. 'A>G') are swapped
        head, tail = change[:-3], change[-3:]
        sides = tail.split('>')
        return ''.join([head, sides[1], '>', sides[0]])

    # Accounting for mistake of swapping wt and mut in sample 3812
    if sample == '3812':
        df[4] = df[4].apply(swap_wt_mut)
    return df

def load_hla(sample):
    """Read ``hlatyping.txt`` of *sample* as a header-less, comma-separated table.

    The file is looked up below ``basepath/<sample>`` and returned as a DataFrame.
    """
    hla_file = os.path.join(basepath, sample, 'hlatyping.txt')
    return pd.read_csv(hla_file, sep=',', header=None)

### Workflow
#### Predictions and peptide sequences

In [None]:
def _prediction_mutation_id(mutation):
    """Build ``<gene_id>_<change without digits>`` from one ``Mutation`` value.

    The text before the first '_' is the gene name, the text between the first
    and second '_' is the change. Returns NaN when the upper-cased gene name
    is not a key of ``genemap``.
    """
    fields = mutation.split('_')
    gene_key = fields[0].upper()
    if gene_key not in genemap:
        return np.nan
    change = ''.join(ch for ch in fields[1] if not ch.isdigit())
    return '_'.join([str(genemap[gene_key]), change])


for sample in samples:
    # Load data; columns that exist in both tables but are not merge keys get
    # the default pandas suffixes (_x = prediction table, _y = stability table)
    df_stab = load_stability(sample)
    df_pred = load_prediction(sample)
    df_pred = pd.merge(df_pred, df_stab,
                       how='left',
                       on=['Mutation', 'Epitope Seq', 'HLA Allele'])

    # Scores and column mapping
    # Allele name: part after the first '-' with '*' and ':' removed
    df_pred['allele name'] = df_pred['HLA Allele'].apply(
        lambda x: x.split('-')[1].translate({ord(i): None for i in '*:'}))
    df_pred['genename'] = df_pred['Mutation'].apply(lambda x: x.split('_')[0])
    # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0
    df_pred['gene_id'] = df_pred['genename'].apply(
        lambda x: genemap.get(x.upper(), np.nan))
    df_pred['Mutation_ID'] = df_pred['Mutation'].apply(_prediction_mutation_id)
    df_pred['Mut_peptide'] = df_pred['Epitope Seq']
    # Percentile (0..100 scale assumed) turned into 1 - percentile / 100
    df_pred['binding_score'] = df_pred['NetMHCpan Percentile_x'].apply(lambda x: 1 - x / 100)
    df_pred['cleavage_score'] = df_pred['Best Cleavage Score']
    df_pred['stability_score'] = df_pred['Predicted Stability']

    # Create list of all gene names that could not be mapped
    # (context manager so the handle is closed even if a write fails)
    unmapped = df_pred.loc[df_pred['gene_id'].isnull(), 'genename'].unique()
    with open(os.path.join(basepath, sample, 'unmapped_genes.txt'), 'w') as f:
        for name in unmapped:
            f.write(name + '\n')

    # Filter for unmapped genes
    df_pred = df_pred.dropna(subset=['gene_id'])

    # Sample specific cleanup
    if sample == '3812':
        df_pred['Mutation_ID'] = df_pred['Mutation_ID'].apply(lambda x: x.replace('.', ''))

    # Export
    df_pred.to_csv(os.path.join(basepath, sample, peptide_sequences_out),
                   index=False,
                   columns=['Mutation_ID', 'Mut_peptide'])
    df_pred.to_csv(os.path.join(basepath, sample, binding_scores_out),
                   index=False,
                   columns=['allele name', 'gene_id', 'Mutation_ID', 'Mut_peptide',
                            'binding_score', 'cleavage_score', 'stability_score'])

#### Expression

In [None]:
for sample in samples:
    # Read the per-gene expression table of this sample
    df_expr = load_expression(sample)
    # gene_id is the tracking_id up to (not including) its first '.'
    df_expr['gene_id'] = df_expr['tracking_id'].apply(lambda tid: tid.partition('.')[0])

    # Confidence interval -> variance: the full interval width (hi - lo) is
    # divided by 3.92 to obtain a standard deviation, which is then squared.
    # 3.92 = 2 * 1.96, i.e. presumably a two-sided 95% interval.
    ci_width = df_expr['FPKM_conf_hi'] - df_expr['FPKM_conf_lo']
    df_expr['FPKM_VAR'] = np.power(ci_width / 3.92, 2)

    # Write the selected columns to the per-sample genes file
    export_columns = ['tracking_id', 'gene_id', 'gene_short_name', 'FPKM', 'FPKM_VAR']
    target_file = os.path.join(basepath, sample, genes_out)
    df_expr.to_csv(target_file, index=False, columns=export_columns)

#### HLA allele specific expression

In [None]:
for sample in samples:
    # Read the expression table and keep the rows whose gene_short_name
    # starts with 'HLA'; typed alleles are mapped to gene identifiers via these
    df_expr = load_expression(sample)
    is_hla_gene = df_expr['gene_short_name'].str.startswith('HLA')
    df_hla = df_expr[is_hla_gene]

    allele_name = []
    gene_id = []
    # The typed alleles are the values of the first row of the HLA typing file
    typed_alleles = load_hla(sample).loc[0].to_list()
    for allele in typed_alleles:
        # Short allele name: part after the first '-' without '*' and ':'
        after_dash = allele.split('-')[1]
        allele_name.append(after_dash.translate({ord('*'): None, ord(':'): None}))
        # The text before the first '*' must equal a gene_short_name; the first
        # matching tracking_id, cut at its first '.', becomes the gene id
        locus = allele.partition('*')[0]
        matching_ids = df_hla.loc[df_hla['gene_short_name'] == locus, 'tracking_id']
        gene_id.append(matching_ids.iloc[0].split('.')[0])

    # Write one (allele_name, gene_id) row per typed allele
    hla_table = pd.DataFrame(list(zip(allele_name, gene_id)),
                             columns=['allele_name', 'gene_id'])
    hla_table.to_csv(os.path.join(basepath, sample, hlas_out), index=False)

#### Mutations

In [None]:
def _variant_mutation_id(gene_field, change):
    """Build the ``Mutation_ID`` of one row of the mutation table.

    *gene_field* is the value of column 0; its part before the first '.' is
    upper-cased and looked up in ``genemap`` ('nan' is used when it is not
    mapped). *change* is the value of column 4. Both are joined with '_', the
    result is split at '_' again, and all digits are removed from the second
    piece, giving ``<first piece>_<second piece without digits>``.
    """
    gene_key = gene_field.split('.')[0].upper()
    # str(np.nan) == 'nan'; np.NaN is avoided because it was removed in NumPy 2.0
    prefix = genemap[gene_key] if gene_key in genemap else str(np.nan)
    pieces = '_'.join([prefix, str(change)]).split('_')
    return '_'.join([pieces[0], ''.join(ch for ch in pieces[1] if not ch.isdigit())])


for sample in samples:
    # Load data
    df_mut = load_mutation(sample)
    df_pep = pd.read_csv(os.path.join(basepath, sample, peptide_sequences_out))

    # Gene_ID: genemap entry for the part of column 0 before the first '.',
    # NaN when the name is not mapped
    df_mut['Gene_ID'] = df_mut[0].apply(
        lambda x: genemap.get(x.split('.')[0].upper(), np.nan))
    # Built from columns 0 and 4 directly instead of a row-wise DataFrame.apply
    df_mut['Mutation_ID'] = [
        _variant_mutation_id(gene_field, change)
        for gene_field, change in zip(df_mut[0], df_mut[4])
    ]
    df_mut['VAF'] = df_mut[7]

    # NOTE(review): none of the columns contributed by df_pep is exported below,
    # so the only visible effect of this left merge is that a variant row is
    # repeated once per matching row of df_pep - confirm this is intended.
    df_mut = pd.merge(df_mut, df_pep,
                      how='left', on='Mutation_ID')

    # Export
    df_mut.to_csv(os.path.join(basepath, sample, variants_out),
                  index=False,
                  columns=['Mutation_ID', 'Gene_ID', 'VAF'])