# ACE Neural Validation Experiments (In-Silico)

#### Dhuvarakesh Karthikeyan and Jin Seok (Andy) Lee

In [5]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt

import os
import re

In [6]:
all_df = pd.read_csv('val_refs/iedb_mmer_all.csv')
pos_df = all_df[all_df['Binding']==1].reset_index(drop=True)
neg_df = all_df[all_df['Binding']==0].reset_index(drop=True)

print(f'Number of positive examples: {len(pos_df)}')
print(f'Number of negative examples: {len(neg_df)}')

Number of positive examples: 2297
Number of negative examples: 4760


In [15]:
from acelib.elispot import ELIspot

test_df = pos_df.sample(25).reset_index(drop=True)

# Step 1. Create an ELIspot configuration
elispot = ELIspot(
    num_peptides_per_pool=5,
    num_coverage=3,
    num_processes=10,
    peptide_ids=list(np.arange(0, len(pos_df))),
    #peptide_sequences=list(pos_df['Epitope'].values)
)


In [16]:
elispot.generate_configuration()

KeyboardInterrupt: 

In [7]:
def sample_peptide_list_rough(data_df, num_samples, num_positives):
    """
    Samples a list of peptides from a dataframe of peptides.

    Returns a dataframe of num_samples peptides, with num_positives
    positive peptides (immunogenic)
    """
    # Sample num_positives positive peptides
    pos_df = data_df[data_df['Binding']==1].reset_index(drop=True)
    assert num_positives <= len(pos_df), 'num_positives must be less than or equal to the number of positive peptides in the dataframe'
    pos_sample = pos_df.sample(n=num_positives, replace=False)
    
    # Sample num_samples - num_positives negative peptides
    neg_df = data_df[data_df['Binding']==0].reset_index(drop=True)
    assert num_samples - num_positives <= len(neg_df), 'num_samples - num_positives must be less than or equal to the number of negative peptides in the dataframe'
    neg_sample = neg_df.sample(n=num_samples - num_positives)
    
    # Combine the two samples
    sample_df = pd.concat([pos_sample, neg_sample]).reset_index(drop=True)
    
    return sample_df


In [9]:
sorted(['peptide_1', 'peptide_10', 'peptide_100'])

['peptide_1', 'peptide_10', 'peptide_100']

In [8]:
sample_peptide_list_rough(temp_df, 100, 5)['Allele Name'].value_counts()

HLA-A*02:01    32
HLA-B*07:02    15
HLA-A*11:01     8
HLA-A*24:02     6
HLA-A*03:01     6
HLA-B*51:01     5
HLA-B*35:01     5
HLA-C*04:01     4
HLA-C*07:01     4
HLA-B*18:01     3
HLA-B*58:01     2
HLA-B*15:01     2
HLA-A*68:01     1
HLA-B*57:01     1
HLA-C*05:01     1
HLA-A*32:01     1
HLA-B*27:05     1
HLA-B*40:01     1
HLA-B*35:08     1
HLA-B*44:03     1
Name: Allele Name, dtype: int64

In [9]:
def sample_peptide_list(data_df, n=10, weighted=True, haplotype=True):
    """
    Sample a potential ELISPOT List of peptides from a dataframe
    of in-vitro or otherwise validated peptides. Current support
    only includes MHC-I and human data. Expanding to MHC-II and
    mouse soon.

    Args:
        data_df (pd.DataFrame): Dataframe of Eptiope:MHC:Binding data
        n (int): Number of peptides to sample from each allele
        weighted (bool): Whether to weight the sampling by dset prevalence

    Returns:
        pd.DataFrame: Dataframe of sampled peptides
    """
    if weighted:
        # Weighted according to prevalence in the dataset
        alleles = data_df['Allele Name']
    else:
        # Unweighted, chosen from uniform across all alleles
        alleles = data_df['Allele Name'].unique()
    
    if haplotype:
        # Split alleles into HLA-A, HLA-B, HLA-C
        hla_a = [a for a in alleles if 'HLA-A' in a]
        hla_b = [b for b in alleles if 'HLA-B' in b]
        hla_c = [c for c in alleles if 'HLA-C' in c]

        # Grab 2 random alleles from each HLA-A, HLA-B, HLA-C
        # Represents one from mom and one for dad
        hla_a_sample = np.random.choice(hla_a, 2, replace=True)
        hla_b_sample = np.random.choice(hla_b, 2, replace=True)
        hla_c_sample = np.random.choice(hla_c, 2, replace=True)
        alleles = np.concatenate([hla_a_sample, hla_b_sample, hla_c_sample])
        
    # Subset the dataframe to only include the sampled alleles
    sampled_df = data_df[data_df['Allele Name'].isin(alleles)]

    # Sample n peptides from each allele    
    counts = sampled_df.groupby(['Allele Name']).count()
    return_df = pd.DataFrame()
    for allele in counts.index:
        if counts.loc[allele, 'Epitope'] <= n:
            return_df = pd.concat([return_df, sampled_df[sampled_df['Allele Name'] == allele]])
        else:
            return_df = pd.concat([return_df, sampled_df[sampled_df['Allele Name'] == allele].sample(n=n, replace=False)])
            
    return_df = return_df.reset_index(drop=True)
    return return_df

In [10]:
sample_df = sample_peptide_list(temp_df, n=25, haplotype=True).sample(25, replace=False).reset_index(drop=True)

In [11]:
sample_df

Unnamed: 0,Epitope,Allele Name,Binding
0,AMQSYTWSL,HLA-C*04:01,0
1,MEDSRDEHRKL,HLA-C*04:01,0
2,GMPPHMLPVL,HLA-C*04:01,0
3,TPVTPRWPEV,HLA-B*07:02,0
4,RLSTASFPT,HLA-A*02:01,0
5,LPTVKLAEV,HLA-B*07:02,0
6,FPQLTTRRL,HLA-B*07:02,1
7,YPRMDIPKI,HLA-B*07:02,1
8,VMPFSIVYIV,HLA-A*02:01,1
9,KVSWAAVTLLL,HLA-C*04:01,0


In [17]:
! ace generate --num-peptides 25 --num-peptides-per-pool 5 --num-coverage 3 --num-processes 10 --output-csv output_csv.csv

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
2023-06-28 15:46:41 INFO     CP solver started.
2023-06-28 15:46:42 INFO     CP solver finished.
2023-06-28 15:46:42 INFO     Solution is optimal.
2023-06-28 15:46:42 INFO     An optimal configuration has been generated.


In [112]:
ace_config = pd.read_csv('output_csv.csv')
ace_config[ace_config['peptide_id']=='peptide_25']

Unnamed: 0,pool_id,peptide_id,plate_id,well_id
9,pool_2,peptide_25,1,A8
34,pool_7,peptide_25,1,B1
59,pool_12,peptide_25,1,A4


In [115]:
ace_config[ace_config['pool_id'] in ['pool_2', 'pool_4', 'pool_13']]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [104]:
def simulate_ace_readout(ace_config, ground_truth_df):
    """
    Simulate the readout of an ACE experiment, given a configuration,
    the ground truth values of the peptides.
    """
    ace_results = ace_config.copy()
    for index in ground_truth_df.index:
        peptide_id = f'peptide_{index+1}'
        binding = ground_truth_df.loc[index, 'Binding']
        ace_results.loc[ace_results['peptide_id']==peptide_id, 'Binding'] = binding

    print(ace_results[ace_results['Binding']==1])
    for pool_id in ace_results['pool_id'].unique():
        pool_df = ace_results[ace_results['pool_id']==pool_id]
        pool_binding = 1 if pool_df['Binding'].sum() > 0 else 0
        ace_results.loc[ace_results['pool_id']==pool_id, 'spot_count'] = pool_binding
    return ace_results

In [105]:
simulate_ace_readout(ace_config, sample_df)

    pool_id  peptide_id  plate_id well_id  Binding
1    pool_1   peptide_7         1      A1      1.0
5    pool_2   peptide_1         1      A8      1.0
9    pool_2  peptide_25         1      A8      1.0
17   pool_4  peptide_14         1     A10      1.0
27   pool_6  peptide_14         1     A12      1.0
32   pool_7   peptide_7         1      B1      1.0
34   pool_7  peptide_25         1      B1      1.0
40   pool_9   peptide_1         1      B3      1.0
51  pool_11  peptide_14         1      A3      1.0
59  pool_12  peptide_25         1      A4      1.0
60  pool_13   peptide_1         1      A5      1.0
70  pool_15   peptide_7         1      A7      1.0


Unnamed: 0,pool_id,peptide_id,plate_id,well_id,Binding,spot_count
0,pool_1,peptide_5,1,A1,0.0,1.0
1,pool_1,peptide_7,1,A1,1.0,1.0
2,pool_1,peptide_8,1,A1,0.0,1.0
3,pool_1,peptide_19,1,A1,0.0,1.0
4,pool_1,peptide_20,1,A1,0.0,1.0
...,...,...,...,...,...,...
70,pool_15,peptide_7,1,A7,1.0,1.0
71,pool_15,peptide_9,1,A7,0.0,1.0
72,pool_15,peptide_11,1,A7,0.0,1.0
73,pool_15,peptide_12,1,A7,0.0,1.0


In [90]:
readout_df = simulate_ace_readout(ace_config, sample_df)
readout_df = readout_df[['pool_id', 'spot_count']].drop_duplicates()  
readout_df.to_csv('readout.csv', index=False)
readout_df

Unnamed: 0,pool_id,spot_count
0,pool_1,1.0
5,pool_2,1.0
10,pool_3,0.0
15,pool_4,1.0
20,pool_5,0.0
25,pool_6,1.0
30,pool_7,1.0
35,pool_8,0.0
40,pool_9,1.0
45,pool_10,0.0


In [91]:
! ace identify --readout-file-type pool_id --readout-files readout.csv --configuration-csv-file output_csv.csv --min-positive-spot-count 1 --output-csv identify_output.csv

In [118]:
identified_peptides = pd.read_csv('identify_output.csv')
identified_peptides[identified_peptides['num_coverage'] == 3]

Unnamed: 0,peptide_id,pool_ids,num_coverage
1,peptide_7,"pool_1,pool_7,pool_15",3
4,peptide_20,"pool_1,pool_6,pool_12",3
5,peptide_1,"pool_2,pool_9,pool_13",3
6,peptide_11,"pool_2,pool_6,pool_15",3
9,peptide_25,"pool_2,pool_7,pool_12",3
11,peptide_9,"pool_4,pool_9,pool_15",3
12,peptide_14,"pool_4,pool_6,pool_11",3


In [123]:
identified_peptides[identified_peptides['num_coverage'] == 3]['pool_ids'].apply(lambda x: re.sub(r'pool_', '', x))

1     1,7,15
4     1,6,12
5     2,9,13
6     2,6,15
9     2,7,12
11    4,9,15
12    4,6,11
Name: pool_ids, dtype: object

### Modeling the ELISPOT False Discovery Rate

#### Without Sequence Features

In [5]:
### Sample without replacement
np.random.randint(0, 100, 10, dtype=np.int32, endpoint=True)

TypeError: randint() got an unexpected keyword argument 'with_replacement'

#### With Sequence Features