In [1]:
import os
import sys
sys.path.append("..")  # add top folder to path
from collections import Counter

import numpy as np
import pandas as pd
import scipy.stats as sts
import matplotlib.pyplot as plt
import torch
import impepdom

In [8]:
def weighted_harmonic_mean(var_1, var_2, beta=1):  # should make it generalizable to many variables
    '''
    Weighted harmonic mean of two variables (same form as the F-beta score).

    Evaluates, element-wise,
    ``(1 + beta**2) * var_1 * var_2 / (beta**2 * var_1 + var_2)``.

    Parameters
    ----------
    var_1, var_2: int, float, ndarray, list or tuple
        Variables to consider. Lists and tuples are converted to float
        ndarrays so the arithmetic is element-wise; other inputs are used
        as given and follow the usual NumPy broadcasting rules.

    beta: float, optional
        Importance of `var_2` relative to `var_1`.
        If beta == 1, the result equals the plain harmonic mean of the two
        variables, i.e. `scipy.stats.hmean([var_1, var_2])`.

    Returns
    -------
    NumPy scalar or ndarray
        The weighted harmonic mean, with the broadcast shape of the inputs.
    '''

    # Plain Python sequences would otherwise be repeated/concatenated by `*`
    # and `+` below instead of being combined element-wise.
    if isinstance(var_1, (list, tuple)):
        var_1 = np.asarray(var_1, dtype=float)
    if isinstance(var_2, (list, tuple)):
        var_2 = np.asarray(var_2, dtype=float)

    beta_sq = beta**2
    # np.multiply keeps the numerator a NumPy value even for Python scalars,
    # so a zero denominator follows NumPy division semantics.
    return (1 + beta_sq) * np.multiply(var_1, var_2) / (beta_sq * var_1 + var_2)

### CNN Training and Evaluation

In [20]:
### Import csv and extract best hyperparameters
# For each results file: score every row by the weighted harmonic mean of
# `min_auc` and `mean_pcc`, take the 3 best rows, and aggregate their
# hyperparameters (mode for batch size, mean for the rest).

path = '../store/hyperparams'

hyperparams = []

beta = 0.6  # how much the second metric should be weighted compared to the first

for file in ['/mlp_2x100_cnn_b08:01.csv']: # add '/mlp_2x100_cnn_a01/01.csv', 
    # Allele tag is the part after the last underscore, without the extension
    # (e.g. 'b08:01'); avoids relying on a fixed prefix length.
    all_name = os.path.splitext(file)[0].rsplit('_', 1)[-1]
    allele = 'HLA-' + all_name.upper() # change to appropriate name
    df = pd.read_csv(path + file)
    idx = (df['min_auc'].notna() & df['mean_pcc'].notna())

    # Work on the filtered rows only, re-indexed from 0, so the positions
    # returned by argsort below refer to the same rows in both the metric
    # arrays and the hyperparameter columns.
    df_valid = df[idx].reset_index(drop=True)

    metric_1, metric_2 = np.array(df_valid['min_auc']), np.array(df_valid['mean_pcc']) # change from ppv to pcc
    w_hmean = weighted_harmonic_mean(metric_1, metric_2, beta=beta)

    best_3_rows = (-w_hmean).argsort()[:3] # for top 3 rows with best harmonic mean value
    best_3_df = df_valid.iloc[best_3_rows]

    # Batch size is categorical: take the most frequent value among the top rows.
    batch_sizes = list(best_3_df['batch_size'].astype('int'))
    batch_counter = Counter(batch_sizes)
    batch_sz = batch_counter.most_common(1)[0][0]

    hyperparams.append({
        'hla_allele': allele, 
        'padding': 'flurry',
        'batch_size': batch_sz, 
        'num_epochs': int(np.mean(best_3_df['num_epochs'])),  # mean is truncated to an int
        'learning_rate': float(np.mean(best_3_df['learning_rate'])),
        'dropout_input': float(np.mean(best_3_df['dropout_input'])),
        'dropout_hidden': float(np.mean(best_3_df['dropout_hidden'])),
        'min_auc': list(metric_1[best_3_rows]),
        'mean_pcc': list(metric_2[best_3_rows])
    })
    


In [21]:
# Show the selected hyperparameters (one dict per processed results file)
hyperparams

[{'hla_allele': 'HLA-B08:01',
  'padding': 'flurry',
  'batch_size': 32,
  'num_epochs': 8,
  'learning_rate': 0.001,
  'dropout_input': 0.3333333333333333,
  'dropout_hidden': 0.6666666666666666,
  'min_auc': [0.961533306486236, 0.9586539891169714, 0.9529939536379968],
  'mean_pcc': [0.7810577608677632, 0.7834574843780465, 0.7852789903522068]}]

In [24]:
# Train one model per allele on folds 0-3 and collect its predictions on
# the held-out fold 4.
results = []

impepdom.time_tracker.reset_timer() 

for hyp in hyperparams:
    print('working with allele', hyp['hla_allele'])
    model = impepdom.MultilayerPerceptron(num_hidden_layers=2, hidden_layer_size=100, dropout_input=hyp['dropout_input'], 
                                                 dropout_hidden=hyp['dropout_hidden'], conv=True, num_conv_layers=2, conv_filt_sz=5, conv_stride=1)
    
    # NOTE(review): hyp['padding'] is 'flurry' but the dataset is built with
    # padding='end' -- confirm which one matches the hyperparameter search.
    dataset = impepdom.PeptideDataset(
        hla_allele=hyp['hla_allele'],
        padding='end',
        toy=False)

    # NOTE(review): the recorded run of this cell stopped with KeyError: 'cnn'
    # after the dataset was initialized, presumably inside run_experiment --
    # confirm which `model_type` values it accepts.
    save_folder, baseline_metrics, _ = impepdom.run_experiment(
        model_type='cnn',
        dataset=dataset,
        train_fold_idx=[0, 1, 2, 3],
        learning_rate=hyp['learning_rate'],
        num_epochs=hyp['num_epochs'],
        batch_size=hyp['batch_size'],
    )
    
    trained_model, train_history = impepdom.load_trained_model(model, save_folder)
    
    
    X_test, y_test = dataset.get_fold(fold_idx=[4])

    # Predict with the loaded model rather than the freshly constructed one.
    # Assumes the model is a torch.nn.Module (TODO confirm): eval() switches
    # dropout off for inference, and no_grad() skips autograd bookkeeping.
    trained_model.eval()
    with torch.no_grad():
        y_proba = trained_model(torch.tensor(X_test, dtype=torch.float)).detach().numpy()
    
    results.append({
        'hla_allele': hyp['hla_allele'],
        'y_test': y_test,
        'y_proba': y_proba
        })

working with allele HLA-B08:01
[0 m 23 s] peptide dataset initialized


KeyError: 'cnn'

In [None]:
### Allele A01:01 test set eval - calculating the F ranks

In [None]:
### Allele B08:01 test set eval - calculating the AUC, PCC


