In [109]:
#Author: Habib Bashour (Greiff lab) ## modified Oct 2024 Gemma Gordon 
# convert from R to python
# https://www.rdocumentation.org/packages/Peptides/versions/2.4.6 
# https://github.com/althonos/peptides.py/tree/main/docs
# https://peptides.readthedocs.io/en/stable/api/peptide.html 

#######PART 1#########

In [2]:
import peptides
import pandas as pd
import numpy as np

In [111]:
#first load/read a dataframe that contains the sequences you are interested in
#this data should include aaSeqAbChain column that contains the sequence of the variable region (fv) of the antibody.

#df = pd.read_csv('/data/localhost/gordon/TNP_Project/datasets/greiff_inputs/greiff_all_datasets.csv')
# rename cols
#df.columns = ['SeqID','Seq','Dataset']

In [54]:
dataset_name = 'VHH_TWIST'

In [55]:
with open('/data/localhost/gordon/TNP_Project/DATA/datasets/seqs/vhh_twist/vhh_twist.fasta', 'r') as vh_seqs:
    entries = [v.strip('\n') for v in vh_seqs.readlines()]
    headers = [e for e in entries if '>' in e]
    seqs = [e for e in entries if '>' not in e]

df = pd.DataFrame()
df['SeqID'] = [h.strip('>') for h in headers]
df['Seq'] = seqs
df['Dataset'] = dataset_name


In [56]:
df

Unnamed: 0,SeqID,Seq,Dataset
0,seq_Brivekimig1_VHH1,DVQLVESGGGVVQPGGSLRLSCAASGRTFSSIYAKGWFRQAPGKER...,VHH_TWIST
1,seq_Brivekimig1_VHH2,EVQLVESGGGVVQPGGSLRLSCAASGRTFSSIYAKGWFRQAPGKER...,VHH_TWIST
2,seq_Brivekimig2_VHH1,EVQLVESGGGVVQPGGSLRLSCAASGFTFRSFGMSWVRQAPGKGPE...,VHH_TWIST
3,seq_Caplacizumab_VHH1,EVQLVESGGGLVQPGGSLRLSCAASGRTFSYNPMGWFRQAPGKGRE...,VHH_TWIST
4,seq_Enristomig_VHH1,EVQLLESGGGEVQPGGSLRLSCAASGGIFAIKPISWYRQAPGKQRE...,VHH_TWIST
...,...,...,...
103,seq_D9D88792-8769-KV-cFR2muts,EVQLVESGGGLVQPGGSLRLSCVASGSILSTLLMGWYRQAPGKQRE...,VHH_TWIST
104,seq_D6D88695-15168-C,QVQLVESGGGLVKPGGSLRLSCVASARGSIFSFDAMAWYRQAPGKA...,VHH_TWIST
105,seq_D6D88695-15168-N,QVQLVESGGGLVKPGGSLRLSCVASARGSIFSFDAMAWYRQAPGKA...,VHH_TWIST
106,seq_D6D88695-15168-C-cFR2muts,QVQLVESGGGLVKPGGSLRLSCVASARGSIFSFDAMAWYRQAPGKQ...,VHH_TWIST


In [57]:
#molecular weight etc ------
def get_mw(seq):
  seq = peptides.Peptide(seq) # have to instantiate class first
  mw = seq.molecular_weight()
  return mw

def get_seqlength(seq):
  length = len(seq)
  return length

def get_avresweight(seq):
  seq = peptides.Peptide(seq)
  avresweight = get_mw(seq) / (len(seq))
  return avresweight

In [58]:
df['Molecular Weight'] = df['Seq'].apply(get_mw)
df['Seq Length'] = df['Seq'].apply(get_seqlength)
df['Average Residue Weight'] = df['Seq'].apply(get_avresweight)

In [59]:
#seq = peptides.Peptide('EVQLLESGGGEVQPGGSLRLSCAASGGIFAIKPISWYRQAPGKQREWVSTTTSSGATNYAESVKGRFTISRDNAKNTLYLQMSSLRAEDTAVYYCNVFEYWGQGTLVTVKP')

In [60]:
#seq.charge(pH=14, pKscale= 'Lehninger')

In [61]:
#charge----

def get_all_charges(seq):

    all_charges = dict()
    charge_intervals = np.arange(1,15,1) # for pH 1-14
    seq = peptides.Peptide(seq)

    for i in charge_intervals:
        charge = seq.charge(pH=i, pKscale= 'Lehninger')
        all_charges[i] = charge

    return all_charges

In [62]:
#   developability_data = developability_data %>%
#     mutate(!!paste0("AbChain_",charge_intervals[i],"_charge") := unlist(clusterMap(cl, get_charge, seq_input = developability_data$aaSeqAbChain, pH_input = pH_input)))
# }

df['Charges (all pH values)'] = df['Seq'].apply(get_all_charges)
# split into new columns for individual pH values


In [63]:
df.head()

Unnamed: 0,SeqID,Seq,Dataset,Molecular Weight,Seq Length,Average Residue Weight,Charges (all pH values)
0,seq_Brivekimig1_VHH1,DVQLVESGGGVVQPGGSLRLSCAASGRTFSSIYAKGWFRQAPGKER...,VHH_TWIST,13260.71204,124,106.941226,"{1: 13.942310283084252, 2: 13.548854022923434,..."
1,seq_Brivekimig1_VHH2,EVQLVESGGGVVQPGGSLRLSCAASGRTFSSIYAKGWFRQAPGKER...,VHH_TWIST,13274.73894,124,107.054346,"{1: 13.943981978270456, 2: 13.565159054262834,..."
2,seq_Brivekimig2_VHH1,EVQLVESGGGVVQPGGSLRLSCAASGFTFRSFGMSWVRQAPGKGPE...,VHH_TWIST,12138.51884,115,105.552338,"{1: 10.94510603082478, 2: 10.576343008920697, ..."
3,seq_Caplacizumab_VHH1,EVQLVESGGGLVQPGGSLRLSCAASGRTFSYNPMGWFRQAPGKGRE...,VHH_TWIST,13842.46674,128,108.144271,"{1: 14.943419952724474, 2: 14.559567084245698,..."
4,seq_Enristomig_VHH1,EVQLLESGGGEVQPGGSLRLSCAASGGIFAIKPISWYRQAPGKQRE...,VHH_TWIST,12090.54814,111,108.923857,"{1: 11.947887393632824, 2: 11.603361081398594,..."


In [64]:
#pI----

def get_pI(seq):

    seq = peptides.Peptide(seq)
    pI = seq.isoelectric_point(pKscale = "Lehninger")
    return pI 

In [65]:
df['pI'] = df['Seq'].apply(get_pI)

In [66]:
df.head(5)

Unnamed: 0,SeqID,Seq,Dataset,Molecular Weight,Seq Length,Average Residue Weight,Charges (all pH values),pI
0,seq_Brivekimig1_VHH1,DVQLVESGGGVVQPGGSLRLSCAASGRTFSSIYAKGWFRQAPGKER...,VHH_TWIST,13260.71204,124,106.941226,"{1: 13.942310283084252, 2: 13.548854022923434,...",9.212508
1,seq_Brivekimig1_VHH2,EVQLVESGGGVVQPGGSLRLSCAASGRTFSSIYAKGWFRQAPGKER...,VHH_TWIST,13274.73894,124,107.054346,"{1: 13.943981978270456, 2: 13.565159054262834,...",9.212511
2,seq_Brivekimig2_VHH1,EVQLVESGGGVVQPGGSLRLSCAASGFTFRSFGMSWVRQAPGKGPE...,VHH_TWIST,12138.51884,115,105.552338,"{1: 10.94510603082478, 2: 10.576343008920697, ...",8.808379
3,seq_Caplacizumab_VHH1,EVQLVESGGGLVQPGGSLRLSCAASGRTFSYNPMGWFRQAPGKGRE...,VHH_TWIST,13842.46674,128,108.144271,"{1: 14.943419952724474, 2: 14.559567084245698,...",9.200839
4,seq_Enristomig_VHH1,EVQLLESGGGEVQPGGSLRLSCAASGGIFAIKPISWYRQAPGKQRE...,VHH_TWIST,12090.54814,111,108.923857,"{1: 11.947887393632824, 2: 11.603361081398594,...",8.729907


In [67]:
#photochemical----

def get_mol_extcoef(seq):
  
  if seq.count('C') % 2 == 0: 
    return (seq.count('Y')*1490) + (seq.count('W')*5500) + ((seq.count('C')/2)*125)

  else:
    return (seq.count('Y')*1490) + (seq.count('W')*5500) + (((seq.count('C')-1)/2)*125)

def get_mol_extcoef_cysteine_bridges(seq): 

  if seq.count('C') % 2 == 0:
    return (seq.count('C')/2)*125
  
  else:
    return ((seq.count('C')-1)/2)*125

In [68]:
df['Molecular Extinction Coefficient'] = df['Seq'].apply(get_mol_extcoef)
df['Molecular Extinction Coefficient (Cysteine bridges)'] = df['Seq'].apply(get_mol_extcoef_cysteine_bridges)

In [69]:
def get_percent_mol_extcoef(seq):
  
    if seq.count('C') % 2 == 0: 
        mol_extcoef = (seq.count('Y')*1490) + (seq.count('W')*5500) + ((seq.count('C')/2)*125)

    else:
        mol_extcoef = (seq.count('Y')*1490) + (seq.count('W')*5500) + (((seq.count('C')-1)/2)*125)
    
    return (mol_extcoef * 10) / get_mw(seq)

def get_percent_mol_extcoef_cysteine_bridges(seq): 

    if seq.count('C') % 2 == 0:
        mol_extcoef_cys = (seq.count('C')/2)*125

    else:
        mol_extcoef_cys = ((seq.count('C')-1)/2)*125

    return (mol_extcoef_cys * 10) / get_mw(seq)

In [70]:
df['% Molecular Extinction Coefficient'] = df['Seq'].apply(get_percent_mol_extcoef)
df['% Molecular Extinction Coefficient (Cysteine bridges)'] = df['Seq'].apply(get_percent_mol_extcoef_cysteine_bridges)

In [71]:
df.head()

Unnamed: 0,SeqID,Seq,Dataset,Molecular Weight,Seq Length,Average Residue Weight,Charges (all pH values),pI,Molecular Extinction Coefficient,Molecular Extinction Coefficient (Cysteine bridges),% Molecular Extinction Coefficient,% Molecular Extinction Coefficient (Cysteine bridges)
0,seq_Brivekimig1_VHH1,DVQLVESGGGVVQPGGSLRLSCAASGRTFSSIYAKGWFRQAPGKER...,VHH_TWIST,13260.71204,124,106.941226,"{1: 13.942310283084252, 2: 13.548854022923434,...",9.212508,25565.0,125.0,19.278754,0.094263
1,seq_Brivekimig1_VHH2,EVQLVESGGGVVQPGGSLRLSCAASGRTFSSIYAKGWFRQAPGKER...,VHH_TWIST,13274.73894,124,107.054346,"{1: 13.943981978270456, 2: 13.565159054262834,...",9.212511,25565.0,125.0,19.258382,0.094164
2,seq_Brivekimig2_VHH1,EVQLVESGGGVVQPGGSLRLSCAASGFTFRSFGMSWVRQAPGKGPE...,VHH_TWIST,12138.51884,115,105.552338,"{1: 10.94510603082478, 2: 10.576343008920697, ...",8.808379,17085.0,125.0,14.075029,0.102978
3,seq_Caplacizumab_VHH1,EVQLVESGGGLVQPGGSLRLSCAASGRTFSYNPMGWFRQAPGKGRE...,VHH_TWIST,13842.46674,128,108.144271,"{1: 14.943419952724474, 2: 14.559567084245698,...",9.200839,21555.0,125.0,15.571647,0.090302
4,seq_Enristomig_VHH1,EVQLLESGGGEVQPGGSLRLSCAASGGIFAIKPISWYRQAPGKQRE...,VHH_TWIST,12090.54814,111,108.923857,"{1: 11.947887393632824, 2: 11.603361081398594,...",8.729907,25565.0,125.0,21.144616,0.103387


In [72]:
#indexes---- 
def get_instaindex(seq):
  if len(seq)<3:
   return 'NA'
  else: 
   seq = peptides.Peptide(seq)
   return seq.instability_index()

def get_aliphindex(seq):
   seq = peptides.Peptide(seq)
   return seq.aliphatic_index()

In [73]:
df['Instability Index'] = df['Seq'].apply(get_instaindex)
df['Aliphatic Index'] = df['Seq'].apply(get_aliphindex)

In [74]:
#hydrophobicity and hmom----

def get_hydrophobicity(seq):
  seq = peptides.Peptide(seq)
  return seq.hydrophobicity(scale='Eisenberg')

def get_hmom(seq): 
  seq = peptides.Peptide(seq)
  return seq.hydrophobic_moment(angle=160,window=10)

In [75]:
df['Hydrophobicity'] = df['Seq'].apply(get_hydrophobicity)
df['Hydrophobic moment'] = df['Seq'].apply(get_hmom)

In [76]:

#amino acid categorical content----- 

def get_Aromaticity(seq):
  return (seq.count('F') + seq.count('H') + seq.count('W') 
          + seq.count('Y'))/len(seq) * 100

# Tiny (A+C+G+S+T)
def get_Tiny(seq):
  return (seq.count('A') + seq.count('C') + seq.count('G') 
          + seq.count('S') + seq.count('T'))/len(seq) * 100

# Small (A+C+D+G+N+P+S+T+V)
def get_Small(seq):
  return (seq.count('A') + seq.count('C') + seq.count('D') 
          + seq.count('G') + seq.count('N') + seq.count('P') 
          + seq.count('S') + seq.count('T') + seq.count('V'))/len(seq) * 100

# Aliphatic (A+I+L+V)
def get_Aliphatic(seq): 
  return (seq.count('A') + seq.count('I') + seq.count('L') 
          + seq.count('V'))/len(seq) * 100

# Nonpolar (A+C+F+G+I+L+M+P+V+W+Y) 
def get_Nonpolar(seq): 
  return (seq.count('A') + seq.count('C') + seq.count('F')
          + seq.count('G') + seq.count('I') + seq.count('L') 
          + seq.count('M') + seq.count('P') + seq.count('V') 
          + seq.count('W') + seq.count('Y')) / len(seq) * 100

# Polar  (D+E+H+K+N+Q+R+S+T)
def get_Polar(seq): 
  return (seq.count('D') + seq.count('E') + seq.count('H')
          + seq.count('K') + seq.count('N') + seq.count('Q') 
          + seq.count('R') + seq.count('S') + seq.count('T'))/ len(seq) * 100

# Basic (H+K+R)
def get_Basic(seq):
  return seq.count('H')+ seq.count('K') + seq.count('R')/len(seq) * 100

# Acidic (D+E)
def get_Acidic(seq): 
  return seq.count('D')+ seq.count('E')/len(seq) * 100

In [77]:
df['Aromatic content'] = df['Seq'].apply(get_Aromaticity)
df['Tiny content'] = df['Seq'].apply(get_Tiny)
df['Small content'] = df['Seq'].apply(get_Small)
df['Aliphatic content'] = df['Seq'].apply(get_Aliphatic)
df['Nonpolar content'] = df['Seq'].apply(get_Nonpolar)
df['Polar content'] = df['Seq'].apply(get_Polar)
df['Basic content'] = df['Seq'].apply(get_Basic)
df['Acidic content'] = df['Seq'].apply(get_Acidic)

In [78]:
df

Unnamed: 0,SeqID,Seq,Dataset,Molecular Weight,Seq Length,Average Residue Weight,Charges (all pH values),pI,Molecular Extinction Coefficient,Molecular Extinction Coefficient (Cysteine bridges),...,Hydrophobicity,Hydrophobic moment,Aromatic content,Tiny content,Small content,Aliphatic content,Nonpolar content,Polar content,Basic content,Acidic content
0,seq_Brivekimig1_VHH1,DVQLVESGGGVVQPGGSLRLSCAASGRTFSSIYAKGWFRQAPGKER...,VHH_TWIST,13260.71204,124,106.941226,"{1: 13.942310283084252, 2: 13.548854022923434,...",9.212508,25565.0,125.0,...,0.006532,0.760206,10.483871,44.354839,62.096774,26.612903,53.225806,46.774194,11.258065,9.032258
1,seq_Brivekimig1_VHH2,EVQLVESGGGVVQPGGSLRLSCAASGRTFSSIYAKGWFRQAPGKER...,VHH_TWIST,13274.73894,124,107.054346,"{1: 13.943981978270456, 2: 13.565159054262834,...",9.212511,25565.0,125.0,...,0.007823,0.760206,10.483871,44.354839,61.290323,26.612903,53.225806,46.774194,11.258065,8.838710
2,seq_Brivekimig2_VHH1,EVQLVESGGGVVQPGGSLRLSCAASGFTFRSFGMSWVRQAPGKGPE...,VHH_TWIST,12138.51884,115,105.552338,"{1: 10.94510603082478, 2: 10.576343008920697, ...",8.808379,17085.0,125.0,...,0.043304,0.760206,8.695652,44.347826,61.739130,23.478261,52.173913,47.826087,9.086957,7.478261
3,seq_Caplacizumab_VHH1,EVQLVESGGGLVQPGGSLRLSCAASGRTFSYNPMGWFRQAPGKGRE...,VHH_TWIST,13842.46674,128,108.144271,"{1: 14.943419952724474, 2: 14.559567084245698,...",9.200839,21555.0,125.0,...,-0.022109,0.922387,10.156250,40.625000,58.593750,25.781250,56.250000,43.750000,11.375000,9.468750
4,seq_Enristomig_VHH1,EVQLLESGGGEVQPGGSLRLSCAASGGIFAIKPISWYRQAPGKQRE...,VHH_TWIST,12090.54814,111,108.923857,"{1: 11.947887393632824, 2: 11.603361081398594,...",8.729907,25565.0,125.0,...,0.024775,0.760206,10.810811,39.639640,55.855856,26.126126,54.054054,45.945946,10.405405,8.306306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,seq_D9D88792-8769-KV-cFR2muts,EVQLVESGGGLVQPGGSLRLSCVASGSILSTLLMGWYRQAPGKQRE...,VHH_TWIST,12657.20064,119,106.363031,"{1: 10.94398197772106, 2: 10.565159048768885, ...",6.476237,21555.0,125.0,...,0.078487,0.760206,9.243697,40.336134,59.663866,28.571429,57.983193,42.016807,8.882353,9.042017
104,seq_D6D88695-15168-C,QVQLVESGGGLVKPGGSLRLSCVASARGSIFSFDAMAWYRQAPGKA...,VHH_TWIST,13623.32894,126,108.121658,"{1: 14.941190611838445, 2: 14.538040954544298,...",9.429943,21555.0,125.0,...,0.036111,0.760206,11.111111,38.095238,59.523810,28.571429,54.761905,45.238095,12.555556,8.380952
105,seq_D6D88695-15168-N,QVQLVESGGGLVKPGGSLRLSCVASARGSIFSFDAMAWYRQAPGKA...,VHH_TWIST,13666.35394,126,108.463127,"{1: 14.941190611838445, 2: 14.538040954544298,...",9.429943,21555.0,125.0,...,0.025000,0.760206,11.111111,37.301587,59.523810,27.777778,53.968254,46.031746,12.555556,8.380952
106,seq_D6D88695-15168-C-cFR2muts,QVQLVESGGGLVKPGGSLRLSCVASARGSIFSFDAMAWYRQAPGKQ...,VHH_TWIST,13723.40894,126,108.915944,"{1: 15.941190611835134, 2: 15.538040954511184,...",9.640658,21555.0,125.0,...,-0.004048,0.760206,11.111111,37.301587,58.730159,26.984127,53.174603,46.825397,13.349206,8.380952


In [79]:
# write out developability data to csv
df.to_csv('greiff_seq_VHH_Twist_results.csv', index=False)