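### Goal

Pick the top-scoring peptides from the first ELISA screen (`elisa_screen_1.csv`), apply the point mutations suggested by ESM-1v (singly and in pairs), and save the resulting peptide list to `esm1v.csv`.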

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt

sys.path.append("../../data_analysis/")
import helper as hp

In [3]:
# Working folder of this notebook: elisa_screen_1.csv is read from it and
# esm1v.csv is written to it. exist_ok=True keeps re-runs from failing when
# the folder is already there.
savepath = "C01_elisa_extract_top_seq/"
os.makedirs(savepath, exist_ok=True)

In [4]:
# Load the ELISA screen results stored in the notebook's working folder.
screen_csv = f"{savepath}elisa_screen_1.csv"
df = pd.read_csv(screen_csv)
df.head(2)

Unnamed: 0,ID,DNA,AA,elisa_score
0,1,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,QAQAQHLCCAFCCCKKCELCPK,0.092
1,2,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,AGCVQCLCCCCCCYAKVCAFPK,0.149


In [5]:
len(df)

96

In [6]:
# Rank the peptides from highest to lowest ELISA score.
df_sorted = df.sort_values(by='elisa_score', ascending=False)
df_sorted.head(5)

Unnamed: 0,ID,DNA,AA,elisa_score
84,85,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,CCCCQHLCACFSCYCKVECFCK,0.685
31,32,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,QGCCCHCCCCFCQCCCVELFPC,0.275
39,40,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,ACQVQHCCCAFCCYCCVCCFCC,0.23
92,93,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,CGQVCCCQACFSCCCKCCLFCK,0.221
35,36,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,QCQVQHLCCCFCCYKKVECCPC,0.205


In [7]:
# Exclude peptide ID 85 from the rest of the analysis.
# NOTE(review): ID 85 is the top scorer above (0.685 vs 0.275 for the next one);
# presumably it is dropped as an outlier -- confirm and record the reason here.
rows_to_drop = df_sorted.loc[df_sorted['ID'] == 85].index
df_sorted = df_sorted.drop(index=rows_to_drop)
df_sorted.head(5)

Unnamed: 0,ID,DNA,AA,elisa_score
31,32,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,QGCCCHCCCCFCQCCCVELFPC,0.275
39,40,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,ACQVQHCCCAFCCYCCVCCFCC,0.23
92,93,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,CGQVCCCQACFSCCCKCCLFCK,0.221
35,36,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,QCQVQHLCCCFCCYKKVECCPC,0.205
64,65,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,CCQCCCLCCAFCCCCKCELFPK,0.198


In [8]:
len(df_sorted)

95

In [9]:
# Keep only the peptides whose ELISA score is strictly above 0.2.
above_threshold = df_sorted['elisa_score'].gt(0.2)
df_filtered = df_sorted.loc[above_threshold]
df_filtered.head(5)


Unnamed: 0,ID,DNA,AA,elisa_score
31,32,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,QGCCCHCCCCFCQCCCVELFPC,0.275
39,40,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,ACQVQHCCCAFCCYCCVCCFCC,0.23
92,93,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,CGQVCCCQACFSCCCKCCLFCK,0.221
35,36,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,QCQVQHLCCCFCCYKKVECCPC,0.205


In [10]:
len(df_filtered)

4

In [11]:
# Amino-acid sequences of the peptides that passed the score filter.
all_seqs = df_filtered['AA'].tolist()
all_seqs

['QGCCCHCCCCFCQCCCVELFPC',
 'ACQVQHCCCAFCCYCCVCCFCC',
 'CGQVCCCQACFSCCCKCCLFCK',
 'QCQVQHLCCCFCCYKKVECCPC']

In [12]:
import Levenshtein

# Levenshtein distance for every unordered pair of filtered sequences,
# stored as (sequence_a, sequence_b, distance). Each sequence is paired only
# with the ones that come after it, so no pair appears twice.
lev_distances = [
    (seq_a, seq_b, Levenshtein.distance(seq_a, seq_b))
    for start, seq_a in enumerate(all_seqs)
    for seq_b in all_seqs[start + 1:]
]

lev_distances

[('QGCCCHCCCCFCQCCCVELFPC', 'ACQVQHCCCAFCCYCCVCCFCC', 11),
 ('QGCCCHCCCCFCQCCCVELFPC', 'CGQVCCCQACFSCCCKCCLFCK', 13),
 ('QGCCCHCCCCFCQCCCVELFPC', 'QCQVQHLCCCFCCYKKVECCPC', 11),
 ('ACQVQHCCCAFCCYCCVCCFCC', 'CGQVCCCQACFSCCCKCCLFCK', 11),
 ('ACQVQHCCCAFCCYCCVCCFCC', 'QCQVQHLCCCFCCYKKVECCPC', 8),
 ('CGQVCCCQACFSCCCKCCLFCK', 'QCQVQHLCCCFCCYKKVECCPC', 15)]

Check the correlation between the number of cysteines (C) in each peptide and its ELISA score.

In [13]:
df_sorted.head(2)

Unnamed: 0,ID,DNA,AA,elisa_score
31,32,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,QGCCCHCCCCFCQCCCVELFPC,0.275
39,40,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,ACQVQHCCCAFCCYCCVCCFCC,0.23


In [14]:
len(df_sorted)

95

In [15]:
# Number of 'C' characters in each amino-acid sequence.
cysteine_counts = df_sorted['AA'].str.count('C')
df_sorted = df_sorted.assign(C_count=cysteine_counts)
df_sorted.head(2)

Unnamed: 0,ID,DNA,AA,elisa_score,C_count
31,32,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,QGCCCHCCCCFCQCCCVELFPC,0.275,12
39,40,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,ACQVQHCCCAFCCYCCVCCFCC,0.23,12


In [16]:
# Correlation (Series.corr with its default method) between ELISA score and C count.
elisa_scores = df_sorted['elisa_score']
c_counts = df_sorted['C_count']
correlation = elisa_scores.corr(c_counts)
round(correlation, 2)

0.51

In [17]:
# Length of each amino-acid sequence. Uses the vectorised string accessor
# (like the C count above) instead of a per-row Python lambda.
df_sorted['AA_length'] = df_sorted['AA'].str.len()
df_sorted.head(2)

Unnamed: 0,ID,DNA,AA,elisa_score,C_count,AA_length
31,32,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,QGCCCHCCCCFCQCCCVELFPC,0.275,12,22
39,40,GCGAATTAATACGACTCACTATAGGGAGACCACAACGGTTTCCCTC...,ACQVQHCCCAFCCYCCVCCFCC,0.23,12,22


In [18]:
len(df_sorted)

95

In [19]:
# Correlation (Series.corr with its default method) between ELISA score and peptide length.
elisa_scores = df_sorted['elisa_score']
aa_lengths = df_sorted['AA_length']
correlation = elisa_scores.corr(aa_lengths)
round(correlation, 2)

0.12

What is the closest peptide (by Levenshtein distance) in the whole dataset to the peptide with the highest score?

In [20]:
# Every amino-acid sequence still present in df_sorted.
all_aas = df_sorted['AA'].tolist()
len(all_aas)

95

In [21]:
# Sequence of the row with the largest ELISA score in df_sorted.
top_row_label = df_sorted['elisa_score'].idxmax()
highest_elisa_score_aa = df_sorted.at[top_row_label, 'AA']
highest_elisa_score_aa

'QGCCCHCCCCFCQCCCVELFPC'

In [22]:
# Distance from every other sequence to the top-scoring one, as (distance, sequence).
distance_to_top = [
    (Levenshtein.distance(candidate, highest_elisa_score_aa), candidate)
    for candidate in all_aas
    if candidate != highest_elisa_score_aa
]

# Fallback values when there is nothing to compare against.
min_distance, closest_aa = float('inf'), ''
if distance_to_top:
    # min() returns the first entry with the smallest distance, so ties go to
    # the sequence that appears earliest in all_aas.
    min_distance, closest_aa = min(distance_to_top, key=lambda entry: entry[0])

closest_aa, min_distance

('QGCVCHLCCCCCCCKCVECFCK', 8)

ESM-1v predictions

In [23]:
# Point mutations recorded for each base peptide.
# Labels read <original residue><1-based position><new residue>, e.g. "H6C".
# NOTE(review): the meaning of the integer attached to each mutation is not
# stated in this notebook -- confirm what it counts.
d_res = {
    "QGCVCHLCCCCCCCKCVECFCK": {
        "Q1M": 4,
        "H6C": 4,
        "E18C": 3,
        "K15C": 2,
        "L7C": 1,
        "Q1C": 1,
    },
    "ACQVQHCCCAFCCYCCVCCFCC": {
        "A10C": 5,
        "A1M": 4,
        "V17C": 4,
        "Y14C": 2,
        "A1C": 1,
        "F11C": 1,
    },
    "CGQVCCCQACFSCCCKCCLFCK": {
        "K16C": 1,
        "C1M": 1,
    },
    "QCQVQHLCCCFCCYKKVECCPC": {
        "Q1M": 6,
    },
    "CCQCCCLCCAFCCCCKCELFPK": {
        "Q3C": 4,
        "K16C": 4,
        "E18C": 3,
        "A10C": 2,
        "F20C": 1,
        "C1M": 1,
    },
}

In [24]:
d_res

{'QGCVCHLCCCCCCCKCVECFCK': {'Q1M': 4,
  'H6C': 4,
  'E18C': 3,
  'K15C': 2,
  'L7C': 1,
  'Q1C': 1},
 'ACQVQHCCCAFCCYCCVCCFCC': {'A10C': 5,
  'A1M': 4,
  'V17C': 4,
  'Y14C': 2,
  'A1C': 1,
  'F11C': 1},
 'CGQVCCCQACFSCCCKCCLFCK': {'K16C': 1, 'C1M': 1},
 'QCQVQHLCCCFCCYKKVECCPC': {'Q1M': 6},
 'CCQCCCLCCAFCCCCKCELFPK': {'Q3C': 4,
  'K16C': 4,
  'E18C': 3,
  'A10C': 2,
  'F20C': 1,
  'C1M': 1}}

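Let's remove the position-1 substitutions to methionine (`Q1M`, `A1M`, `C1M`). `QCQVQHLCCCFCCYKKVECCPC` only had `Q1M`, so it drops out entirely.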

In [25]:
# Same mutation table with every "<residue>1M" entry left out; a peptide
# whose only entry was such a mutation no longer appears.
d_res = {
    "QGCVCHLCCCCCCCKCVECFCK": {
        "H6C": 4,
        "E18C": 3,
        "K15C": 2,
        "L7C": 1,
        "Q1C": 1,
    },
    "ACQVQHCCCAFCCYCCVCCFCC": {
        "A10C": 5,
        "V17C": 4,
        "Y14C": 2,
        "A1C": 1,
        "F11C": 1,
    },
    "CGQVCCCQACFSCCCKCCLFCK": {
        "K16C": 1,
    },
    "CCQCCCLCCAFCCCCKCELFPK": {
        "Q3C": 4,
        "K16C": 4,
        "E18C": 3,
        "A10C": 2,
        "F20C": 1,
    },
}

In [26]:

import re

# Build one single-point mutant per (base peptide, mutation) entry of d_res.
# A label such as "K16C" means: residue K at 1-based position 16 becomes C.
new_list = []
for base_seq, res in d_res.items():
    for mut, score in res.items():
        label = mut
        # Split the label into [original residue, position, new residue].
        mut = re.findall(r'[A-Za-z]+|\d+', mut)
        assert len(mut) == 3, f"cannot parse mutation label {label!r}"
        wild_type, pos, replacement = mut
        idx = int(pos) - 1  # labels are 1-based, string indices are 0-based
        assert 0 <= idx < len(base_seq), f"{label}: position outside of {base_seq}"
        # Catch typos in d_res: the residue named in the label must really be
        # the one found at that position of the base peptide.
        assert base_seq[idx] == wild_type, (
            f"{label}: {base_seq} has {base_seq[idx]!r} at position {pos}, not {wild_type!r}"
        )
        mutated = base_seq[:idx] + replacement + base_seq[idx+1:]
        assert len(mutated) == len(base_seq)
        assert mutated[idx] == replacement
        new_list.append(mutated)
# De-duplicate. sorted() rather than list(): the iteration order of a set of
# strings changes from one kernel session to the next.
new_list = sorted(set(new_list))
len(new_list)

16

Let's build pairwise combinations of the mutations with a score of at least 1 (a threshold of 2 was used before, but let's try 1 anyway).

In [27]:
# For every base peptide, keep the labels of the mutations whose value is at
# least 1, in the order they appear in d_res.
d_combi = {
    base_seq: [mut for mut, score in res.items() if score >= 1]
    for base_seq, res in d_res.items()
}
d_combi

{'QGCVCHLCCCCCCCKCVECFCK': ['H6C', 'E18C', 'K15C', 'L7C', 'Q1C'],
 'ACQVQHCCCAFCCYCCVCCFCC': ['A10C', 'V17C', 'Y14C', 'A1C', 'F11C'],
 'CGQVCCCQACFSCCCKCCLFCK': ['K16C'],
 'CCQCCCLCCAFCCCCKCELFPK': ['Q3C', 'K16C', 'E18C', 'A10C', 'F20C']}

In [28]:
import itertools

def pairwise_combinations(lst):
    """Return every unordered pair of items from ``lst`` as a list of 2-tuples.

    Pairs are listed in the order ``itertools.combinations`` yields them; an
    input with fewer than two items gives an empty list.
    """
    pair_iter = itertools.combinations(lst, 2)
    return [pair for pair in pair_iter]

# Build the double mutants: for each base peptide, apply every pair of its
# selected mutations to the same sequence.
combi_list = []
for base_seq, muts in d_combi.items():
    all_muts = pairwise_combinations(muts)
    for pairs in all_muts:
        mutated = base_seq
        for mut in pairs:
            label = mut
            # Split the label into [original residue, position, new residue].
            mut = re.findall(r'[A-Za-z]+|\d+', mut)
            assert len(mut) == 3, f"cannot parse mutation label {label!r}"
            wild_type, pos, replacement = mut
            idx = int(pos) - 1  # labels are 1-based, string indices are 0-based
            assert 0 <= idx < len(base_seq), f"{label}: position outside of {base_seq}"
            # Catch typos in the mutation table: the residue named in the label
            # must be the one found at that position of the base peptide.
            # (Checked against base_seq, so two labels hitting the same position
            # still collapse into a single mutant, as before.)
            assert base_seq[idx] == wild_type, (
                f"{label}: {base_seq} has {base_seq[idx]!r} at position {pos}, not {wild_type!r}"
            )
            mutated = mutated[:idx] + replacement + mutated[idx+1:]
            assert len(mutated) == len(base_seq)
            assert mutated[idx] == replacement

        combi_list.append(mutated)

# De-duplicate. sorted() rather than list(): the iteration order of a set of
# strings changes from one kernel session to the next.
combi_list = sorted(set(combi_list))
len(combi_list)


30

In [29]:
# Merge single and double mutants and drop duplicates.
# sorted() rather than list(): the iteration order of a set of strings changes
# between kernel sessions, which would reshuffle the rows of the exported CSV
# on every run.
all_new_peptides = sorted(set(new_list + combi_list))
len(all_new_peptides)

46

In [30]:
# Save the designed peptides as a one-column table ('AA'), without the index.
df_all_new_peptides = pd.DataFrame(all_new_peptides, columns=['AA'])
df_all_new_peptides.to_csv(f"{savepath}esm1v.csv", index=False)