# ACE Neural Validation Experiments (In-Silico)

#### Dhuvarakesh Karthikeyan and Jin Seok (Andy) Lee

In [1]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt

import os
import re

from acelib.elispot import ELISpot

In [2]:
all_df = pd.read_csv('val_refs/iedb_mmer_all.csv')
pos_df = all_df[all_df['Binding']==1].reset_index(drop=True)
neg_df = all_df[all_df['Binding']==0].reset_index(drop=True)

print(f'Number of positive examples: {len(pos_df)}')
print(f'Number of negative examples: {len(neg_df)}')

Number of positive examples: 2297
Number of negative examples: 4760


In [5]:
class AceSimulatedExperiment:
    """
    Class to simulate an ELISPOT experiment with optimal peptide pools.

    Experiment procedure:
    1. Initialize objecty
    2. Load reference CSV
    3. Get optimized config
    4. Simulate ELISPOT assay
    5. Calculate sensitivity and specificity
    """
    def __init__(self, num_peptides, num_positives, peptides_per_pool, coverage, disallowed_peptides=False, enforced_peptides=True,
                 sliding_window_sample=False, deterministic=False, dispersion_factor=1, num_processes=8, random_seed=70203):
        # Define the pooled ELISPOT parameters
        self.num_peptides = num_peptides
        self.num_positives = num_positives
        self.peptides_per_pool = peptides_per_pool
        self.coverage = coverage

        # Define ACE parameters
        self.disallowed_peptides = disallowed_peptides
        self.enforced_peptides = enforced_peptides
        if self.disallowed_peptides:
            self.enforced_peptides = False
        
        # Define the simulation parameters
        self.sliding_window_sample = sliding_window_sample
        self.deterministic = deterministic
        self.dispersion_factor = dispersion_factor
        self.num_processes = num_processes
        self.random_state = random_seed
    
    def load_reference_csv(self, path):
        """
        Load the reference immunogenicity data from a csv format.
        At minimum this data should have peptide, MHC allele, and 
        whether or not the pMHC complex is immunogenic.

        Expected Columnn Names for the CSV:
            - Epitope: The peptide sequences, all capital letters
            - Allele: The MHC allele: HLA-[A,B,C]*[0-9][0-9]:[0-9][0-9]
            - Binding: 1 if the pMHC was shown to be immunogenic, 0 otherwise
        """
        assert os.path.exists(path), f'File does not exist: {path}'
        data_df = pd.read_csv(path)
        assert 'Epitope' in data_df.columns, f'Epitope column not found in {path}'
        assert 'Binding' in data_df.columns, f'Binding column not found in {path}'
        data_df.sort_values(by=['Epitope'], inplace=True)
        self.data = data_df

    def __run__(self):
        """
        Run one iteration of the simulation.
        """
        # Sample the peptides
        peptide_ids, peptide_sequences, labels, disallowed_peptide_pairs, enforced_peptide_pairs = self.sample_peptides()
        
        # Run ACE to get the optimal configuration
        assay, config_df = self.compute_optimized_config(peptide_ids, peptide_sequences, disallowed_peptide_pairs, enforced_peptide_pairs)
        
        # Simulate the ELISPOT assay
        res_df = self.simulate_spot_counts(assay, config_df, peptide_ids, labels)

        #hits = res_df[res_df['deconvolution_result']=='hit']['peptide_id'].values
        hits = res_df['peptide_id'].values

        positive_peptide_ids = [f'peptide_{i}' for i in range(len(peptide_ids)) if labels[i]==1]
        negative_peptide_ids = [f'peptide_{i}' for i in range(len(peptide_ids)) if labels[i]==0]

        # Calculate the sensitivity and specificity
        sensitivity = len(set(hits).intersection(set(positive_peptide_ids)))/len(positive_peptide_ids)
        specificity = len(set(hits).intersection(set(positive_peptide_ids)))/len(hits)

        #print(f'Number of positive peptides: {len(positive_peptide_ids)}')
        #print(f'Number of identified peptides: {len(hits)}')


        res_df['Label'] = [1 if id in positive_peptide_ids else 0 for id in res_df['peptide_id']]
        return res_df, sensitivity, specificity
        
    def run(self, num_iterations=1):
        sensitivities = []
        specificities = []
        for _ in range(num_iterations):
            res_df, sensitivity, specificity = self.__run__()
            sensitivities.append(sensitivity)
            specificities.append(specificity)
        if num_iterations == 1:
            print(f'Sensitivity: {sensitivity}')
            print(f'Specificity: {specificity}')
            return res_df
        return sensitivities, specificities

    def sample_peptides(self):
        """
        Sample the peptide sequences and their immunogenicity status
        from the reference data.

        NOTE: We do this instead of randomly initializing peptide sequences
        because the ACE Neural Engine was trained on real sequence data. Thus,
        to make meaningful disallowed peptide pairings we utilize a reference
        dataset.
        """
        # Sample the peptides
        pos_df = self.data[self.data['Binding']==1].reset_index(drop=True)
        neg_df = self.data[self.data['Binding']==0].reset_index(drop=True)

        if self.sliding_window_sample:
            pos_idx = np.random.randint(0, len(pos_df)-self.num_positives)
            neg_idx = np.random.randint(0, len(neg_df)-(self.num_peptides))
            # Pick peptides that are close to each other
            positive_peptides = self.data[self.data['Binding']==1].iloc[pos_idx:pos_idx+self.num_positives]
            negative_peptides = self.data[self.data['Binding']==0].iloc[neg_idx:(neg_idx+self.num_peptides-self.num_positives)]
        else:
            positive_peptides = self.data[self.data['Binding']==1].sample(self.num_positives)
            negative_peptides = self.data[self.data['Binding']==0].sample(self.num_peptides-self.num_positives)
        
        # Combine the peptides and shuffle them
        peptides = pd.concat([positive_peptides, negative_peptides]).sample(frac=1).reset_index(drop=True)
        
        #############################
        ### Temporary for testing ###
        #############################
        # to be replaced by bonafide sequence similarity
        positives = peptides[peptides['Binding']==1]
        disallowed_peptide_pairs = []
        enforced_peptide_pairs = []
        if self.disallowed_peptides:
            for idx, _ in positives.iterrows():
                for idx2, _ in positives.iterrows():
                    if idx != idx2:
                        disallowed_peptide_pairs.append((f'peptide_{idx}', f'peptide_{idx2}'))
        if self.enforced_peptides:
            enforced_peptide_pairs = []
            for idx, _ in positives.iterrows():
                for idx2, _ in positives.iterrows():
                    if idx != idx2:
                        enforced_peptide_pairs.append((f'peptide_{idx}', f'peptide_{idx2}'))
        #############################
        #############################
        
        peptide_ids = [f'peptide_{idx}' for idx in peptides.index.values]
        peptide_sequences = peptides['Epitope'].values
        labels = peptides['Binding'].values
        return peptide_ids, peptide_sequences, labels, disallowed_peptide_pairs, enforced_peptide_pairs

    def compute_optimized_config(self, peptide_ids, peptide_sequences, disallowed_peptide_pairs, enforced_peptide_pairs):
        """
        Run ACE to generate the optimal peptide pools.
        """
        
        # Create the ELISPOT assay object
        assay = ELISpot(
            num_peptides_per_pool=self.peptides_per_pool,
            num_coverage=self.coverage,
            num_processes=10,
            peptide_ids=peptide_ids,
            peptide_sequences=peptide_sequences
        )

        # Generate the configuration using the optimization algorithm
        generation_status, config_df = assay.generate_configuration(disallowed_peptide_pairs=disallowed_peptide_pairs, random_seed=self.random_state)
        try:
            assert len(config_df) > 0
        except:
            raise AssertionError(f'No valid configurations found for the assay parameters. Please try again with different parameters.')
        return assay, config_df
    
    @staticmethod
    def sample_spot_counts(mean: float, dispersion_factor, num_samples):
        """
        Sample the number of spots for a peptide pool using a negative binomial distribution.

        Derivation of NegBinom Parameters:

            Let X be the number RV of spots sampled for a peptide pool.
            Let p be the probability of sampling a spot for a peptide in the pool.
            Let r be the number of successes (spots) we want to sample.
            Let k be the number of failures (non-spots) we want to sample.

            Then, the probability mass function for X is given by:

            P(X=k) = (k+r-1)C(k) * p^r * (1-p)^k
            
            where (k+r-1)C(k) is the binomial coefficient.

            The mean and variance of X are given by:

            mean = r(1-p)/p
            var = r(1-p)/p^2

            We can solve for p and r in terms of the mean and variance:

            p = mean / var
            r = mean^2 / (var - mean)
        """
        if dispersion_factor < 1:
            raise ValueError("dispersion_factor must be greater than or equal to 1")
        elif dispersion_factor == 1:
            return np.random.poisson(mean, num_samples)
        else:    
            variance = mean*dispersion_factor
            p = mean/variance
            r = mean**2/(variance-mean)
            return np.random.negative_binomial(r, p, num_samples)

    def simulate_spot_counts(self, assay, config_df, peptide_ids, labels):
        """
        Simulate the ELISPOT assay using the optimal peptide pools configuration.
        Determine how spots are sampled for each peptide pool. 

        Assumptions:
            1. The number of spots sampled for each peptide pool follows a negative binomial distribution.
            2. The number of spots sampled for each peptide pool is independent of the other peptide pools.
            3. Immunogenic and non-immunogenic peptides have the same shape parameters for the negative binomial distribution, 
            but different mean parameters. This is because we assume the experimental error/noise is the same for both whereas
            true immunogenic peptides will likely have more spots than non-immunogenic peptides.
            4. We assume that having multiple immunogenic peptides in a pool addiditively increases the number of spots sampled


        Parameters:
            config_df: The optimal peptide pools configuration dataframe
            peptide_ids: The peptide ids for each peptide sequence
            labels: The immunogenicity labels for each peptide sequence

        Returns:
            hit_pool_list: A list of the pool ids that were determined to be hits
        """
        label_dict = {peptide_id:label for peptide_id, label in zip(peptide_ids, labels)}
        hit_pool_list = []
        # In the deterministic case we just use the labels to get 0/1 spot counts
        if self.deterministic:
            # Check to see if that pool has an immunogenic peptide in it
            for pool_id in config_df['pool_id'].unique():
                pool_df = config_df[config_df['pool_id']==pool_id]
                for peptide_id in pool_df['peptide_id']:
                    if label_dict[peptide_id] == 1:
                        hit_pool_list.append(pool_id)
                        break

        # Simulate according to a probabilistic model
        else:
            immunogenic_mean = 100
            non_immunogenic_mean = 10
            
            pool_spot_counts = {i:0 for i in config_df['pool_id'].unique()}
            for pool_id in config_df['pool_id'].unique():
                pool_df = config_df[config_df['pool_id']==pool_id]
                for peptide_id in pool_df['peptide_id']:
                    if label_dict[peptide_id] == 1:
                        pool_spot_counts[pool_id] += AceSimulatedExperiment.sample_spot_counts(immunogenic_mean, self.dispersion_factor, num_samples=1)[0]
                    else:
                        pool_spot_counts[pool_id] += AceSimulatedExperiment.sample_spot_counts(non_immunogenic_mean, self.dispersion_factor, num_samples=1)[0]
                    # Check to see if we have enough spots to call it a hit
                    if pool_spot_counts[pool_id] >= immunogenic_mean:
                            hit_pool_list.append(pool_id)
                            break    
        
        res_df = assay.identify_hit_peptides(hit_pool_list, config_df)
        return res_df


In [10]:
simulation = AceSimulatedExperiment(
                                    num_peptides=20,
                                    num_positives=5,
                                    peptides_per_pool=3,
                                    coverage=3,
                                    disallowed_peptides=False,
                                    enforced_peptides=True,
                                    sliding_window_sample=False,
                                    deterministic=False,
                                    dispersion_factor=2,
                                    num_processes=8
                                )
simulation.load_reference_csv('val_refs/iedb_mmer_all.csv')
sensitivities, specificities = simulation.run(10)

min(sensitivities), max(sensitivities), np.median(sensitivities), np.mean(sensitivities), np.std(sensitivities)


2023-07-03 18:56:59 INFO     CP solver started.
2023-07-03 18:56:59 INFO     CP solver finished.
2023-07-03 18:56:59 INFO     An optimal feasible solution was found.
2023-07-03 18:56:59 INFO     CP solver started.
2023-07-03 18:57:00 INFO     CP solver finished.
2023-07-03 18:57:00 INFO     An optimal feasible solution was found.
2023-07-03 18:57:00 INFO     CP solver started.
2023-07-03 18:57:00 INFO     CP solver finished.
2023-07-03 18:57:00 INFO     An optimal feasible solution was found.
2023-07-03 18:57:00 INFO     CP solver started.
2023-07-03 18:57:00 INFO     CP solver finished.
2023-07-03 18:57:00 INFO     An optimal feasible solution was found.
2023-07-03 18:57:00 INFO     CP solver started.
2023-07-03 18:57:01 INFO     CP solver finished.
2023-07-03 18:57:01 INFO     An optimal feasible solution was found.
2023-07-03 18:57:01 INFO     CP solver started.
2023-07-03 18:57:01 INFO     CP solver finished.
2023-07-03 18:57:01 INFO     An optimal feasible solution was found.
2023

(0.8, 1.0, 0.8, 0.8400000000000001, 0.07999999999999999)

In [None]:
simulation = AceSimulatedExperiment(
                                    num_peptides=20,
                                    num_positives=5,
                                    peptides_per_pool=3,
                                    coverage=3,
                                    disallowed_peptides=False,
                                    enforced_peptides=True,
                                    sliding_window_sample=False,
                                    deterministic=False,
                                    dispersion_factor=2,
                                    num_processes=8
                                )
simulation.load_reference_csv('val_refs/iedb_mmer_all.csv')
sensitivities, specificities = simulation.run(10)

min(sensitivities), max(sensitivities), np.median(sensitivities), np.mean(sensitivities), np.std(sensitivities)


In [7]:
def sample_peptide_list_rough(data_df, num_samples, num_positives):
    """
    Samples a list of peptides from a dataframe of peptides.

    Returns a dataframe of num_samples peptides, with num_positives
    positive peptides (immunogenic)
    """
    # Sample num_positives positive peptides
    pos_df = data_df[data_df['Binding']==1].reset_index(drop=True)
    assert num_positives <= len(pos_df), 'num_positives must be less than or equal to the number of positive peptides in the dataframe'
    pos_sample = pos_df.sample(n=num_positives, replace=False)
    
    # Sample num_samples - num_positives negative peptides
    neg_df = data_df[data_df['Binding']==0].reset_index(drop=True)
    assert num_samples - num_positives <= len(neg_df), 'num_samples - num_positives must be less than or equal to the number of negative peptides in the dataframe'
    neg_sample = neg_df.sample(n=num_samples - num_positives)
    
    # Combine the two samples
    sample_df = pd.concat([pos_sample, neg_sample]).reset_index(drop=True)
    
    return sample_df


In [9]:
sorted(['peptide_1', 'peptide_10', 'peptide_100'])

['peptide_1', 'peptide_10', 'peptide_100']

In [8]:
sample_peptide_list_rough(temp_df, 100, 5)['Allele Name'].value_counts()

HLA-A*02:01    32
HLA-B*07:02    15
HLA-A*11:01     8
HLA-A*24:02     6
HLA-A*03:01     6
HLA-B*51:01     5
HLA-B*35:01     5
HLA-C*04:01     4
HLA-C*07:01     4
HLA-B*18:01     3
HLA-B*58:01     2
HLA-B*15:01     2
HLA-A*68:01     1
HLA-B*57:01     1
HLA-C*05:01     1
HLA-A*32:01     1
HLA-B*27:05     1
HLA-B*40:01     1
HLA-B*35:08     1
HLA-B*44:03     1
Name: Allele Name, dtype: int64

In [9]:
def sample_peptide_list(data_df, n=10, weighted=True, haplotype=True):
    """
    Sample a potential ELISPOT List of peptides from a dataframe
    of in-vitro or otherwise validated peptides. Current support
    only includes MHC-I and human data. Expanding to MHC-II and
    mouse soon.

    Args:
        data_df (pd.DataFrame): Dataframe of Eptiope:MHC:Binding data
        n (int): Number of peptides to sample from each allele
        weighted (bool): Whether to weight the sampling by dset prevalence

    Returns:
        pd.DataFrame: Dataframe of sampled peptides
    """
    if weighted:
        # Weighted according to prevalence in the dataset
        alleles = data_df['Allele Name']
    else:
        # Unweighted, chosen from uniform across all alleles
        alleles = data_df['Allele Name'].unique()
    
    if haplotype:
        # Split alleles into HLA-A, HLA-B, HLA-C
        hla_a = [a for a in alleles if 'HLA-A' in a]
        hla_b = [b for b in alleles if 'HLA-B' in b]
        hla_c = [c for c in alleles if 'HLA-C' in c]

        # Grab 2 random alleles from each HLA-A, HLA-B, HLA-C
        # Represents one from mom and one for dad
        hla_a_sample = np.random.choice(hla_a, 2, replace=True)
        hla_b_sample = np.random.choice(hla_b, 2, replace=True)
        hla_c_sample = np.random.choice(hla_c, 2, replace=True)
        alleles = np.concatenate([hla_a_sample, hla_b_sample, hla_c_sample])
        
    # Subset the dataframe to only include the sampled alleles
    sampled_df = data_df[data_df['Allele Name'].isin(alleles)]

    # Sample n peptides from each allele    
    counts = sampled_df.groupby(['Allele Name']).count()
    return_df = pd.DataFrame()
    for allele in counts.index:
        if counts.loc[allele, 'Epitope'] <= n:
            return_df = pd.concat([return_df, sampled_df[sampled_df['Allele Name'] == allele]])
        else:
            return_df = pd.concat([return_df, sampled_df[sampled_df['Allele Name'] == allele].sample(n=n, replace=False)])
            
    return_df = return_df.reset_index(drop=True)
    return return_df

In [10]:
sample_df = sample_peptide_list(temp_df, n=25, haplotype=True).sample(25, replace=False).reset_index(drop=True)

In [11]:
sample_df

Unnamed: 0,Epitope,Allele Name,Binding
0,AMQSYTWSL,HLA-C*04:01,0
1,MEDSRDEHRKL,HLA-C*04:01,0
2,GMPPHMLPVL,HLA-C*04:01,0
3,TPVTPRWPEV,HLA-B*07:02,0
4,RLSTASFPT,HLA-A*02:01,0
5,LPTVKLAEV,HLA-B*07:02,0
6,FPQLTTRRL,HLA-B*07:02,1
7,YPRMDIPKI,HLA-B*07:02,1
8,VMPFSIVYIV,HLA-A*02:01,1
9,KVSWAAVTLLL,HLA-C*04:01,0


In [17]:
! ace generate --num-peptides 25 --num-peptides-per-pool 5 --num-coverage 3 --num-processes 10 --output-csv output_csv.csv

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
2023-06-28 15:46:41 INFO     CP solver started.
2023-06-28 15:46:42 INFO     CP solver finished.
2023-06-28 15:46:42 INFO     Solution is optimal.
2023-06-28 15:46:42 INFO     An optimal configuration has been generated.


In [112]:
ace_config = pd.read_csv('output_csv.csv')
ace_config[ace_config['peptide_id']=='peptide_25']

Unnamed: 0,pool_id,peptide_id,plate_id,well_id
9,pool_2,peptide_25,1,A8
34,pool_7,peptide_25,1,B1
59,pool_12,peptide_25,1,A4


In [115]:
ace_config[ace_config['pool_id'] in ['pool_2', 'pool_4', 'pool_13']]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [104]:
def simulate_ace_readout(ace_config, ground_truth_df):
    """
    Simulate the readout of an ACE experiment, given a configuration,
    the ground truth values of the peptides.
    """
    ace_results = ace_config.copy()
    for index in ground_truth_df.index:
        peptide_id = f'peptide_{index+1}'
        binding = ground_truth_df.loc[index, 'Binding']
        ace_results.loc[ace_results['peptide_id']==peptide_id, 'Binding'] = binding

    print(ace_results[ace_results['Binding']==1])
    for pool_id in ace_results['pool_id'].unique():
        pool_df = ace_results[ace_results['pool_id']==pool_id]
        pool_binding = 1 if pool_df['Binding'].sum() > 0 else 0
        ace_results.loc[ace_results['pool_id']==pool_id, 'spot_count'] = pool_binding
    return ace_results

In [105]:
simulate_ace_readout(ace_config, sample_df)

    pool_id  peptide_id  plate_id well_id  Binding
1    pool_1   peptide_7         1      A1      1.0
5    pool_2   peptide_1         1      A8      1.0
9    pool_2  peptide_25         1      A8      1.0
17   pool_4  peptide_14         1     A10      1.0
27   pool_6  peptide_14         1     A12      1.0
32   pool_7   peptide_7         1      B1      1.0
34   pool_7  peptide_25         1      B1      1.0
40   pool_9   peptide_1         1      B3      1.0
51  pool_11  peptide_14         1      A3      1.0
59  pool_12  peptide_25         1      A4      1.0
60  pool_13   peptide_1         1      A5      1.0
70  pool_15   peptide_7         1      A7      1.0


Unnamed: 0,pool_id,peptide_id,plate_id,well_id,Binding,spot_count
0,pool_1,peptide_5,1,A1,0.0,1.0
1,pool_1,peptide_7,1,A1,1.0,1.0
2,pool_1,peptide_8,1,A1,0.0,1.0
3,pool_1,peptide_19,1,A1,0.0,1.0
4,pool_1,peptide_20,1,A1,0.0,1.0
...,...,...,...,...,...,...
70,pool_15,peptide_7,1,A7,1.0,1.0
71,pool_15,peptide_9,1,A7,0.0,1.0
72,pool_15,peptide_11,1,A7,0.0,1.0
73,pool_15,peptide_12,1,A7,0.0,1.0


In [90]:
readout_df = simulate_ace_readout(ace_config, sample_df)
readout_df = readout_df[['pool_id', 'spot_count']].drop_duplicates()  
readout_df.to_csv('readout.csv', index=False)
readout_df

Unnamed: 0,pool_id,spot_count
0,pool_1,1.0
5,pool_2,1.0
10,pool_3,0.0
15,pool_4,1.0
20,pool_5,0.0
25,pool_6,1.0
30,pool_7,1.0
35,pool_8,0.0
40,pool_9,1.0
45,pool_10,0.0


In [91]:
! ace identify --readout-file-type pool_id --readout-files readout.csv --configuration-csv-file output_csv.csv --min-positive-spot-count 1 --output-csv identify_output.csv

In [118]:
identified_peptides = pd.read_csv('identify_output.csv')
identified_peptides[identified_peptides['num_coverage'] == 3]

Unnamed: 0,peptide_id,pool_ids,num_coverage
1,peptide_7,"pool_1,pool_7,pool_15",3
4,peptide_20,"pool_1,pool_6,pool_12",3
5,peptide_1,"pool_2,pool_9,pool_13",3
6,peptide_11,"pool_2,pool_6,pool_15",3
9,peptide_25,"pool_2,pool_7,pool_12",3
11,peptide_9,"pool_4,pool_9,pool_15",3
12,peptide_14,"pool_4,pool_6,pool_11",3


In [123]:
identified_peptides[identified_peptides['num_coverage'] == 3]['pool_ids'].apply(lambda x: re.sub(r'pool_', '', x))

1     1,7,15
4     1,6,12
5     2,9,13
6     2,6,15
9     2,7,12
11    4,9,15
12    4,6,11
Name: pool_ids, dtype: object

### Modeling the ELISPOT False Discovery Rate

#### Without Sequence Features

In [5]:
### Sample without replacement
np.random.randint(0, 100, 10, dtype=np.int32, endpoint=True)

TypeError: randint() got an unexpected keyword argument 'with_replacement'

#### With Sequence Features