In [122]:
import os
import copy
import numpy as np
import pandas as pd

In [36]:
from pfabnet import utils
from pfabnet.base import ENTITY_KEY, VISCOSITY_KEY

In [28]:
# Project root: one level above the directory of the path returned by utils.get_file_path()
BASE_DIR = os.path.dirname(utils.get_file_path()) + '/../'
# All inputs and outputs of this notebook live under <BASE_DIR>/data
DATA_DIR = os.path.join(BASE_DIR, 'data')
RAW_DATA_DIR, FASTA_DIR = (os.path.join(DATA_DIR, subdir) for subdir in ('raw', 'fasta'))

In [29]:
# make sure the data directory tree exists (directories already present are left untouched)
for directory in (DATA_DIR, RAW_DATA_DIR, FASTA_DIR):
    os.makedirs(directory, exist_ok=True)

## Process Ab21 dataset

In [27]:
# Extract supplementary data from Lai et al. Mol. Pharmaceutics 2021, 18, 3, 1167–1175
# https://pubs.acs.org/doi/suppl/10.1021/acs.molpharmaceut.0c01073/suppl_file/mp0c01073_si_001.zip

# 1. copy mp0c01073_si_001.zip into DATA_DIR
# 2. unzip mp0c01073_si_001.zip there - this will create a subdirectory SI

In [39]:
# SI feature table: keep the IgG1 rows only and expose the antibody name under ENTITY_KEY
df_Ab21 = (
    pd.read_csv(os.path.join(DATA_DIR, 'SI', 'features_values_SI.csv'))
    .loc[lambda features: features.Isotype == 'IgG1']
    .rename(columns={'mabs': ENTITY_KEY})
    .reset_index(drop=True)
)

# join the SI features onto the raw Ab21 table (default inner merge on the entity name)
df_Ab21_2 = pd.read_csv(os.path.join(DATA_DIR, 'Ab21_raw.csv'))
df_Ab21 = df_Ab21_2.merge(df_Ab21, on=ENTITY_KEY)

# persist the merged table, then report its size and preview it
df_Ab21.to_csv(os.path.join(DATA_DIR, 'Ab21.csv'), index=False)
print('Number of antibodies in Ab21 set: %d' % len(df_Ab21))
df_Ab21.head()

Number of antibodies in Ab21 set: 21


Unnamed: 0,Entity,Viscosity_at_150,Isotype,N_hydrophobic Fv,N_hydrophobic mAb,N_hydrophilic Fv,N_hydrophilic mAb,HI_Fv,HI_mAb,SASA_phobic_Fv,...,net charges mAb,FvCSP,mAbCSP,Fv_pI,mAb_pI,SAP Fv,SAP mAb,SCM Fv,SCM mAb,classifier
0,mAb1,14.4,IgG1,83,492,120,712,1.098922,1.002246,3760.633301,...,26,-10,40,8.88,8.96,134.8,526.3,2522.9,6979.0,0
1,mAb2,20.9,IgG1,85,498,110,690,1.364037,1.078138,4469.273438,...,22,0,10,8.02,8.75,161.4,573.4,1687.7,5731.8,0
2,mAb3,14.9,IgG1,80,486,122,720,1.317641,1.057277,4007.478271,...,26,0,12,7.67,8.71,149.5,552.1,2170.0,6075.0,0
3,mAb4,93.4,IgG1,78,482,122,714,1.195039,1.024895,3754.267578,...,20,-2,-24,8.19,8.83,161.9,539.7,2406.3,7008.6,1
4,mAb5,8.6,IgG1,89,504,112,700,1.273285,1.052817,5683.70459,...,26,0,22,7.86,8.85,213.3,598.2,1636.9,5795.4,0


In [141]:
# Split the combined SI fasta file into one two-record fasta file per Ab21 antibody

from Bio.SeqIO.FastaIO import FastaIterator

si_fasta_path = os.path.join(DATA_DIR, 'SI', 'seq_vis_SI.fasta')

# Collect the chain sequences keyed by antibody name. Each record id is split on '_':
# the first field is used as the antibody name, the second as the chain type.
light_chains = {}
heavy_chains = {}
with open(si_fasta_path, 'r') as handle:
    for record in FastaIterator(handle):
        fields = record.id.split('_')
        # the SI file spells this one entry 'mAB27'; normalise it to the 'mAb' prefix
        name = 'mAb27' if fields[0] == 'mAB27' else fields[0]
        chain_type = fields[1]
        # every chain type other than 'light' is stored as a heavy chain
        target = light_chains if chain_type == 'light' else heavy_chains
        target[name] = str(record.seq)

# Write <name>.fasta (heavy chain record first, then light chain) for every
# antibody that is part of the Ab21 table
fasta_files = []
ab21_entities = df_Ab21[ENTITY_KEY].values
for name, light_seq in light_chains.items():
    if name not in ab21_entities:
        continue
    fasta_file = os.path.join(FASTA_DIR, name + '.fasta')
    fasta_files.append(fasta_file)
    with open(fasta_file, 'w') as fptr:
        fptr.write('>' + name + '_VH\n')
        fptr.write(heavy_chains[name] + '\n')
        fptr.write('>' + name + '_VL\n')
        fptr.write(light_seq + '\n')

print('%d fasta files were saved in FASTA_DIR' % len(fasta_files))

21 fasta files were saved in FASTA_DIR


## Process PDGF38 dataset

In [129]:
def get_plos_seq_data(df_seq_plos):
    """Rebuild full sequences from a table written as differences to its first row.

    The columns from 'FW1' through 'FW4' are concatenated per row into a single
    string. The first row is the reference sequence; in every other row a '-'
    keeps the reference residue at that position and any other character
    replaces it.

    Parameters
    ----------
    df_seq_plos : pandas.DataFrame
        Table with a 'Name' column and string columns spanning 'FW1' to 'FW4'.
        A 'seq' column holding the concatenated row strings is added in place.

    Returns
    -------
    dict
        Maps each 'Name' to its full sequence as a list of single characters.
        An empty table yields an empty dict.
    """
    region_columns = df_seq_plos.loc[:, 'FW1':'FW4']
    # plain tuples per row; joining them also works for a table without rows
    df_seq_plos['seq'] = [''.join(parts) for parts in region_columns.itertuples(index=False, name=None)]

    entity_to_sequence = {}
    if df_seq_plos.empty:
        return entity_to_sequence

    names = df_seq_plos['Name'].tolist()
    aligned_sequences = df_seq_plos['seq'].tolist()

    # only the first row is a complete sequence; all other rows are relative to it
    ref_name = names[0]
    ref_sequence = list(aligned_sequences[0])
    entity_to_sequence[ref_name] = ref_sequence

    for name, aligned in zip(names[1:], aligned_sequences[1:]):
        if name == ref_name:
            # a repeated reference entry must not overwrite the reference itself
            continue
        rebuilt = list(ref_sequence)
        # positions beyond the shorter of the two strings keep the reference residue
        for idx, (_, residue) in enumerate(zip(ref_sequence, aligned)):
            if residue != '-':
                rebuilt[idx] = residue
        entity_to_sequence[name] = rebuilt

    return entity_to_sequence

In [142]:
# Build the PDGF38 table: values from the Lai SI joined with the heavy/light chain
# sequences reconstructed from the PLOS SI tables

# Extract sequences from PLOS SI (tab-separated files, read from DATA_DIR like every other input)
df_plos_lc = pd.read_csv(os.path.join(DATA_DIR, 'PDGF38_light.csv'), sep='\t')
df_plos_hc = pd.read_csv(os.path.join(DATA_DIR, 'PDGF38_heavy.csv'), sep='\t')

entity_to_sequence_hc = get_plos_seq_data(df_plos_hc)
entity_to_sequence_lc = get_plos_seq_data(df_plos_lc)
# one (entity, heavy chain, light chain) record per entity of the light chain table
plos_data = [(k, ''.join(entity_to_sequence_hc[k]), ''.join(v)) for k, v in entity_to_sequence_lc.items()]
df_plos = pd.DataFrame(plos_data, columns=[ENTITY_KEY, 'HC', 'LC'])

# Lai SI
df_PDGF38_raw = pd.read_csv(os.path.join(DATA_DIR, 'PDGF38_raw.csv'))

# `xls` used to come from kernel state only, so this cell failed on a fresh kernel.
# NOTE(review): workbook path assumed to be SI/mutants_SI.xlsx under DATA_DIR - confirm
xls = pd.ExcelFile(os.path.join(DATA_DIR, 'SI', 'mutants_SI.xlsx'))
df_PDGF38_sheet3 = pd.read_excel(xls, sheet_name='result')
df_PDGF38_sheet3.rename({'Unnamed: 0':ENTITY_KEY}, inplace=True, axis=1)

# default inner merge: only entities present in both tables are kept
df_PDGF38 = df_PDGF38_raw.merge(df_plos, on=ENTITY_KEY)

df_PDGF38 = df_PDGF38[[ENTITY_KEY, VISCOSITY_KEY, 'SCM', 'HC', 'LC']]
df_PDGF38.to_csv(os.path.join(DATA_DIR, 'PDGF38.csv'), index=False)

df_PDGF38.head()

Unnamed: 0,Entity,Viscosity_at_150,SCM,HC,LC
0,AB-001,440,-2213,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV...
1,R1-002,288,-2008,EVQLLQSGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV...
2,R1-003,523,-1985,EVQLLESGGGLVKPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV...
3,R1-004,310,-1961,EVQLLESGGGLVQPGGSLRLSCRASGFTFSSYAMSWVRQAPGKGLE...,SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV...
4,R1-005,190,-1838,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV...


In [143]:
# Write one fasta file per PDGF38 antibody: heavy chain record first, then light chain
fasta_files = []
for entity, hc, lc in df_PDGF38[[ENTITY_KEY, 'HC', 'LC']].itertuples(index=False, name=None):
    fasta_file = os.path.join(FASTA_DIR, entity + '.fasta')
    with open(fasta_file, 'w') as fptr:
        fptr.write('>' + entity + '_VH\n')
        fptr.write(hc + '\n')
        fptr.write('>' + entity + '_VL\n')
        fptr.write(lc + '\n')
    # record the path only after the file was written, so the count below is accurate
    fasta_files.append(fasta_file)

print('%d fasta files were saved in FASTA_DIR' % len(fasta_files))


38 fasta files were saved in FASTA_DIR
