In [1]:
import os
import copy
import numpy as np
import pandas as pd

In [2]:
# Import the pfabnet package. `base` must be bound on BOTH paths because the
# next cell calls base.get_file_path(); importing only names *from*
# pfabnet.base does not bind the name `base` itself.
try:
    from pfabnet import base, utils
    from pfabnet.base import ENTITY_KEY, VISCOSITY_KEY
except ModuleNotFoundError:
    # The import failed from the current working directory: move one
    # directory up and retry from there.
    os.chdir(os.getcwd() + '/../')
    from pfabnet import base
    from pfabnet.base import ENTITY_KEY, VISCOSITY_KEY

In [3]:
# Directory layout, resolved relative to the path reported by base.get_file_path().
BASE_DIR = os.path.dirname(base.get_file_path()) + '/../'
DATA_DIR = os.path.join(BASE_DIR, 'data')
# Both sub-directories live directly under DATA_DIR.
RAW_DATA_DIR, FASTA_DIR = (
    os.path.join(DATA_DIR, sub_dir) for sub_dir in ('raw', 'fasta')
)

In [4]:
# Create the data directory tree; directories that already exist are left alone.
for _data_dir in (DATA_DIR, RAW_DATA_DIR, FASTA_DIR):
    os.makedirs(_data_dir, exist_ok=True)

## Process Ab21 dataset

In [11]:
# Extract supplementary data from Lai et al. Mol. Pharmaceutics 2021, 18, 3, 1167–1175
# https://pubs.acs.org/doi/suppl/10.1021/acs.molpharmaceut.0c01073/suppl_file/mp0c01073_si_001.zip

# 1. copy mp0c01073_si_001.zip to data
# 2. unzip mp0c01073_si_001.zip - this will create a sub directory SI

In [19]:
# Load the SI feature table, keep only the IgG1 rows, and expose the 'mabs'
# column under ENTITY_KEY so it can be joined with the raw Ab21 table.
si_features_path = os.path.join(DATA_DIR, 'SI', 'features_values_SI.csv')
df_Ab21 = (
    pd.read_csv(si_features_path)
    .loc[lambda frame: frame.Isotype == 'IgG1']
    .rename(columns={'mabs': ENTITY_KEY})
    .reset_index(drop=True)
)

# Join the raw Ab21 table with the SI features on the entity name.
df_Ab21_2 = pd.read_csv(os.path.join(DATA_DIR, 'Ab21_raw.csv'))
df_Ab21_merged = df_Ab21_2.merge(df_Ab21, on=ENTITY_KEY)

# Persist the merged table and report how many antibodies it contains.
ab21_csv_path = os.path.join(DATA_DIR, 'Ab21.csv')
df_Ab21_merged.to_csv(ab21_csv_path, index=False)
print('Number of antibodies in Ab21 set: %d' % len(df_Ab21_merged))
df_Ab21_merged.head()

Number of antibodies in Ab21 set: 21


Unnamed: 0,Entity,Viscosity_at_150,Isotype,N_hydrophobic Fv,N_hydrophobic mAb,N_hydrophilic Fv,N_hydrophilic mAb,HI_Fv,HI_mAb,SASA_phobic_Fv,...,net charges mAb,FvCSP,mAbCSP,Fv_pI,mAb_pI,SAP Fv,SAP mAb,SCM Fv,SCM mAb,classifier
0,mAb1,14.4,IgG1,83,492,120,712,1.098922,1.002246,3760.633301,...,26,-10,40,8.88,8.96,134.8,526.3,2522.9,6979.0,0
1,mAb2,20.9,IgG1,85,498,110,690,1.364037,1.078138,4469.273438,...,22,0,10,8.02,8.75,161.4,573.4,1687.7,5731.8,0
2,mAb3,14.9,IgG1,80,486,122,720,1.317641,1.057277,4007.478271,...,26,0,12,7.67,8.71,149.5,552.1,2170.0,6075.0,0
3,mAb4,93.4,IgG1,78,482,122,714,1.195039,1.024895,3754.267578,...,20,-2,-24,8.19,8.83,161.9,539.7,2406.3,7008.6,1
4,mAb5,8.6,IgG1,89,504,112,700,1.273285,1.052817,5683.70459,...,26,0,22,7.86,8.85,213.3,598.2,1636.9,5795.4,0


In [20]:
# Split the SI fasta file into one two-record fasta file per Ab21 antibody.

from Bio.SeqIO.FastaIO import FastaIterator

# Record ids are split on '_': the first field is the antibody name and the
# second is the chain type. Anything not labelled 'light' is stored as heavy.
light_chains = {}
heavy_chains = {}
si_fasta_path = os.path.join(DATA_DIR, 'SI', 'seq_vis_SI.fasta')
with open(si_fasta_path, 'r') as handle:
    for record in FastaIterator(handle):
        id_fields = record.id.split('_')
        # The SI file spells one entry 'mAB27'; normalise it to 'mAb27'.
        title = 'mAb27' if id_fields[0] == 'mAB27' else id_fields[0]
        chain_type = id_fields[1]
        chain_store = light_chains if chain_type == 'light' else heavy_chains
        chain_store[title] = str(record.seq)

# Write a fasta file only for antibodies present in the merged Ab21 table.
ab21_entities = df_Ab21_merged[ENTITY_KEY].values
fasta_files = []
for name, light_seq in light_chains.items():
    if name not in ab21_entities:
        continue
    fasta_file = os.path.join(FASTA_DIR, name + '.fasta')
    fasta_files.append(fasta_file)
    with open(fasta_file, 'w') as fptr:
        # Heavy chain record first, then light chain.
        fptr.write('>%s_VH\n' % name)
        fptr.write('%s\n' % heavy_chains[name])
        fptr.write('>%s_VL\n' % name)
        fptr.write('%s\n' % light_seq)

print('%d fasta files were saved in FASTA_DIR' % len(fasta_files))

21 fasta files were saved in FASTA_DIR


In [22]:
# Collect name / light-chain / heavy-chain columns from the parsed fasta
# records, in light_chains iteration order.
Ab21_entity_list = list(light_chains.keys())
Ab21_LC_list = list(light_chains.values())
Ab21_HC_list = [heavy_chains[name] for name in Ab21_entity_list]

df_tmp = pd.DataFrame({
    ENTITY_KEY: Ab21_entity_list,
    'LC': Ab21_LC_list,
    'HC': Ab21_HC_list,
})

# Attach the sequences to the merged Ab21 table.
df_Ab21 = df_Ab21_merged.merge(df_tmp, on=ENTITY_KEY)
df_Ab21.head()

Unnamed: 0,Entity,Viscosity_at_150,Isotype,N_hydrophobic Fv,N_hydrophobic mAb,N_hydrophilic Fv,N_hydrophilic mAb,HI_Fv,HI_mAb,SASA_phobic_Fv,...,mAbCSP,Fv_pI,mAb_pI,SAP Fv,SAP mAb,SCM Fv,SCM mAb,classifier,LC,HC
0,mAb1,14.4,IgG1,83,492,120,712,1.098922,1.002246,3760.633301,...,40,8.88,8.96,134.8,526.3,2522.9,6979.0,0,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...
1,mAb2,20.9,IgG1,85,498,110,690,1.364037,1.078138,4469.273438,...,10,8.02,8.75,161.4,573.4,1687.7,5731.8,0,DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKL...,EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLE...
2,mAb3,14.9,IgG1,80,486,122,720,1.317641,1.057277,4007.478271,...,12,7.67,8.71,149.5,552.1,2170.0,6075.0,0,DIQMTQSPSSLSASVGDRVTITCSASQDISNYLNWYQQKPGKAPKV...,EVQLVESGGGLVQPGGSLRLSCAASGYTFTNYGMNWVRQAPGKGLE...
3,mAb4,93.4,IgG1,78,482,122,714,1.195039,1.024895,3754.267578,...,-24,8.19,8.83,161.9,539.7,2406.3,7008.6,1,DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRL...,QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLE...
4,mAb5,8.6,IgG1,89,504,112,700,1.273285,1.052817,5683.70459,...,22,7.86,8.85,213.3,598.2,1636.9,5795.4,0,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL...,EVQLLESGGGLVQPGGSLRLSCAVSGFTFNSFAMSWVRQAPGKGLE...


## Process PDGF38 dataset

In [23]:
def get_plos_seq_data(df_seq_plos):
    """Expand difference-encoded sequences into full per-entity residue lists.

    The region columns from 'FW1' through 'FW4' (inclusive, in column order)
    are concatenated into one string per row. The FIRST row is used as the
    reference sequence; for every other row, a '-' at a position means "same
    residue as the reference" and any other character overrides the reference
    residue at that position.
    NOTE(review): presumably the first row is the parent antibody - confirm
    against the PLOS supplementary table.

    Parameters
    ----------
    df_seq_plos : pandas.DataFrame
        Must contain a 'Name' column and contiguous columns 'FW1'..'FW4'.
        Side effect: a 'seq' column holding the concatenated string is added
        to this frame (skipped when the frame is empty).

    Returns
    -------
    dict
        Maps each 'Name' to a list of single-character residues. Rows that
        share the reference row's name are skipped. An empty frame yields an
        empty dict.
    """
    # An empty table has no reference row; return an empty mapping instead of
    # falling through and implicitly returning None.
    if df_seq_plos.empty:
        return {}

    df_seq_plos['seq'] = df_seq_plos.loc[:, 'FW1':'FW4'].apply(''.join, axis=1)

    names = df_seq_plos['Name'].tolist()
    aligned_sequences = df_seq_plos['seq'].tolist()

    ref_name = names[0]
    ref_sequence = list(aligned_sequences[0])
    entity_to_sequence = {ref_name: ref_sequence}

    for name, aligned in zip(names[1:], aligned_sequences[1:]):
        if name == ref_name:
            continue
        # Start from a copy of the reference and apply the listed differences.
        full_sequence = list(ref_sequence)
        # zip() stops at the shorter of the two sequences, so positions
        # beyond the reference length are ignored.
        for idx, (_, residue) in enumerate(zip(ref_sequence, aligned)):
            if residue != '-':
                full_sequence[idx] = residue
        entity_to_sequence[name] = full_sequence

    return entity_to_sequence

In [25]:
# Extract PDGF sequences and viscosity values from Lai SI: SI/mutants_SI.xlsx

# Extract sequences from PLOS SI.
# Paths are built from DATA_DIR (rather than a cwd-relative 'data/...') so the
# cell does not depend on the kernel's working directory, matching every other
# read in this notebook.
df_plos_lc = pd.read_csv(os.path.join(DATA_DIR, 'PDGF38_light.csv'), sep='\t')
df_plos_hc = pd.read_csv(os.path.join(DATA_DIR, 'PDGF38_heavy.csv'), sep='\t')

entity_to_sequence_hc = get_plos_seq_data(df_plos_hc)
entity_to_sequence_lc = get_plos_seq_data(df_plos_lc)
# One (entity, heavy, light) record per entity found in the light-chain table;
# the residue lists are joined back into strings.
plos_data = [(k, ''.join(entity_to_sequence_hc[k]), ''.join(v)) for k, v in entity_to_sequence_lc.items()]
df_plos = pd.DataFrame(plos_data, columns=[ENTITY_KEY, 'HC', 'LC'])

# Lai SI
df_PDGF38_raw = pd.read_csv(os.path.join(DATA_DIR, 'PDGF38_raw.csv'))

# Read the 'result' sheet inside a context manager so the file handle is
# closed as soon as the sheet has been parsed.
with open(os.path.join(DATA_DIR, 'SI', 'mutants_SI.xlsx'), 'rb') as xls:
    df_PDGF38_sheet3 = pd.read_excel(xls, sheet_name='result')
df_PDGF38_sheet3 = df_PDGF38_sheet3.rename(columns={'Unnamed: 0': ENTITY_KEY})

# Attach the reconstructed sequences to the raw PDGF38 table.
df_PDGF38 = df_PDGF38_raw.merge(df_plos, on=ENTITY_KEY)

df_PDGF38 = df_PDGF38[[ENTITY_KEY, VISCOSITY_KEY, 'SCM', 'HC', 'LC']]
df_PDGF38.to_csv(os.path.join(DATA_DIR, 'PDGF38.csv'), index=False)

df_PDGF38.head()

Unnamed: 0,Entity,Viscosity_at_150,SCM,HC,LC
0,AB-001,440,-2213,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV...
1,R1-002,288,-2008,EVQLLQSGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV...
2,R1-003,523,-1985,EVQLLESGGGLVKPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV...
3,R1-004,310,-1961,EVQLLESGGGLVQPGGSLRLSCRASGFTFSSYAMSWVRQAPGKGLE...,SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV...
4,R1-005,190,-1838,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV...


In [26]:
# Write one two-record fasta file (heavy chain, then light chain) per PDGF38 entity.
fasta_files = []
pdgf38_records = df_PDGF38[[ENTITY_KEY, 'HC', 'LC']].itertuples(index=False, name=None)
for entity, heavy_seq, light_seq in pdgf38_records:
    # Prints any entity whose name contains 'R1-001'.
    if 'R1-001' in entity:
        print(entity)
    fasta_file = os.path.join(FASTA_DIR, entity + '.fasta')
    fasta_files.append(fasta_file)
    with open(fasta_file, 'w') as fptr:
        fptr.write('>%s_VH\n' % entity)
        fptr.write('%s\n' % heavy_seq)
        fptr.write('>%s_VL\n' % entity)
        fptr.write('%s\n' % light_seq)

print('%d fasta files were saved in FASTA_DIR' % len(fasta_files))


38 fasta files were saved in FASTA_DIR


### Prepare Ab8 dataset

In [27]:
# Extract supplementary data from Lai et al. MABS 2021, VOL. 13, NO. 1, e1991256 (19 pages) 
# https://www.tandfonline.com/doi/suppl/10.1080/19420862.2021.1991256/suppl_file/kmab_a_1991256_sm4057.zip
#
# 1. copy kmab_a_1991256_sm4057.zip to data
# 2. unzip kmab_a_1991256_sm4057.zip - this will extract files in data directory

In [107]:
# Read the three sheets of the supplemental workbook in a single call.
# Passing the path lets pandas open and close the file itself, so no file
# handle is left open.
ab14_workbook_path = os.path.join(DATA_DIR, 'supplemental Table.xlsx')
ab14_sheets = pd.read_excel(ab14_workbook_path,
                            sheet_name=['Sequence Listing', 'VHs', 'VLs'])

# Sequence listing: keep only the IgG1 / Kappa antibodies.
df_Ab14 = ab14_sheets['Sequence Listing']
df_Ab14 = df_Ab14.loc[df_Ab14.ISOTYPE == 'IgG1 / Kappa'].reset_index(drop=True)

# Heavy-chain variable domain: concatenate framework and CDR regions in order.
# .copy() makes the filtered frame independent before a column is added,
# which avoids pandas' SettingWithCopyWarning.
vh_regions = ['HFR1', 'CDRH1', 'HFR2', 'CDRH2', 'HFR3', 'CDRH3', 'HFR4']
df_Ab14_VH = ab14_sheets['VHs']
df_Ab14_VH = df_Ab14_VH.loc[df_Ab14_VH['HC Class'] == 'IgG1'].copy()
df_Ab14_VH['VH'] = df_Ab14_VH[vh_regions].apply(''.join, axis=1)

# Light-chain variable domain: same construction, then drop duplicated rows.
vl_regions = ['LFR1', 'CDRL1', 'LFR2', 'CDRL2', 'LFR3', 'CDRL3', 'LFR4']
df_Ab14_VL = ab14_sheets['VLs']
df_Ab14_VL['VL'] = df_Ab14_VL[vl_regions].apply(''.join, axis=1)
df_Ab14_VL = df_Ab14_VL.drop_duplicates()

# Attach both variable domains to the sequence listing by antibody name.
df_Ab14 = df_Ab14.merge(df_Ab14_VH, on='mAb')
df_Ab14 = df_Ab14.merge(df_Ab14_VL, on='mAb')

df_Ab14 = df_Ab14.rename(columns={'mAb': ENTITY_KEY,
                                  'Amino Acids, Mature Heavy Chain': 'HC',
                                  'Amino Acids, Mature Light Chain': 'LC'})

df_Ab14.head()

Unnamed: 0,Clone Name,Entity,ISOTYPE,HC,LC,Unnamed: 5,Variable Domain Source,Source Details,HC Class,HFR1,...,VH,LC Class,LFR1,CDRL1,LFR2,CDRL2,LFR3,CDRL3,LFR4,VL
0,TGN1412 analog,TGN1412,IgG1 / Kappa,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYIHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKL...,,PDB,1YJD,IgG1,QVQLVQSGAEVKKPGASVKVSCKAS,...,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYIHWVRQAPGQGLE...,Kappa,DIQMTQSPSSLSASVGDRVTITC,HASQNIYVWLN,WYQQKPGKAPKLLIY,KASNLHT,GVPSRFSGSGSGTDFTLTISSLQPEDFATYYC,QQGQTYPYT,FGGGTKVEIK,DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKL...
1,Avastin analog,Bevacizumab,IgG1 / Kappa,EVQLVESGGGLVQPGGSLRLSCAASGYTFTNYGMNWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCSASQDISNYLNWYQQKPGKAPKV...,,PDB,1BJ1,IgG1,EVQLVESGGGLVQPGGSLRLSCAAS,...,EVQLVESGGGLVQPGGSLRLSCAASGYTFTNYGMNWVRQAPGKGLE...,Kappa,DIQMTQSPSSLSASVGDRVTITC,SASQDISNYLN,WYQQKPGKAPKVLIY,FTSSLHS,GVPSRFSGSGSGTDFTLTISSLQPEDFATYYC,QQYSTVPWT,FGQGTKVEIK,DIQMTQSPSSLSASVGDRVTITCSASQDISNYLNWYQQKPGKAPKV...
2,Herceptin analog,Trastuzumab,IgG1 / Kappa,EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKL...,,PDB,1N8Z,IgG1,EVQLVESGGGLVQPGGSLRLSCAAS,...,EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLE...,Kappa,DIQMTQSPSSLSASVGDRVTITC,RASQDVNTAVA,WYQQKPGKAPKLLIY,SASFLYS,GVPSRFSGSRSGTDFTLTISSLQPEDFATYYC,QQHYTTPPT,FGQGTKVEIK,DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKL...
3,Basiliximab analog,Basiliximab,IgG1 / Kappa,QLQQSGTVLARPGASVKMSCKASGYSFTRYWMHWIKQRPGQGLEWI...,QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRW...,,PDB,1MIM,IgG1,QLQQSGTVLARPGASVKMSCKAS,...,QLQQSGTVLARPGASVKMSCKASGYSFTRYWMHWIKQRPGQGLEWI...,Kappa,QIVSTQSPAIMSASPGEKVTMTC,SASSSRSYMQ,WYQQKPGTSPKRWIY,DTSKLAS,GVPARFSGSGSGTSYSLTISSMEAEDAATYYC,HQRSSYT,FGGGTKLEIK,QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRW...
4,Natalizumab analog,Natalizumab,IgG1 / Kappa,QVQLVQSGAEVKKPGASVKVSCKASGFNIKDTYIHWVRQAPGQRLE...,DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRL...,,US Patent,US5840299A,IgG1,QVQLVQSGAEVKKPGASVKVSCKAS,...,QVQLVQSGAEVKKPGASVKVSCKASGFNIKDTYIHWVRQAPGQRLE...,Kappa,DIQMTQSPSSLSASVGDRVTITC,KTSQDINKYMA,WYQQTPGKAPRLLIH,YTSALQP,GIPSRFSGSGSGRDYTFTISSLQPEDIATYYC,LQYDNLWT,FGQGTKVEIK,DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRL...


In [108]:
def vl_vh_match_in_Ab21(x):
    """Tell whether a row's variable domains both occur in one Ab21 antibody.

    `x` is indexed by 'VL' and 'VH'. Returns True when some row of the
    notebook-level `df_Ab21` frame has an LC value containing x['VL'] as a
    substring AND an HC value containing x['VH']; otherwise False.
    """
    return any(
        x['VL'] in light_chain and x['VH'] in heavy_chain
        for light_chain, heavy_chain in zip(df_Ab21.LC.values, df_Ab21.HC.values)
    )

# Flag every Ab14 row whose VL and VH are both found in an Ab21 antibody,
# then keep only the rows that are NOT flagged.
df_Ab14['match'] = df_Ab14.apply(vl_vh_match_in_Ab21, axis=1)
df_Ab8 = df_Ab14.loc[~df_Ab14['match']].reset_index(drop=True)
df_Ab8.head()

Unnamed: 0,Clone Name,Entity,ISOTYPE,HC,LC,Unnamed: 5,Variable Domain Source,Source Details,HC Class,HFR1,...,LC Class,LFR1,CDRL1,LFR2,CDRL2,LFR3,CDRL3,LFR4,VL,match
0,TGN1412 analog,TGN1412,IgG1 / Kappa,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYIHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKL...,,PDB,1YJD,IgG1,QVQLVQSGAEVKKPGASVKVSCKAS,...,Kappa,DIQMTQSPSSLSASVGDRVTITC,HASQNIYVWLN,WYQQKPGKAPKLLIY,KASNLHT,GVPSRFSGSGSGTDFTLTISSLQPEDFATYYC,QQGQTYPYT,FGGGTKVEIK,DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKL...,False
1,Basiliximab analog,Basiliximab,IgG1 / Kappa,QLQQSGTVLARPGASVKMSCKASGYSFTRYWMHWIKQRPGQGLEWI...,QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRW...,,PDB,1MIM,IgG1,QLQQSGTVLARPGASVKMSCKAS,...,Kappa,QIVSTQSPAIMSASPGEKVTMTC,SASSSRSYMQ,WYQQKPGTSPKRWIY,DTSKLAS,GVPARFSGSGSGTSYSLTISSMEAEDAATYYC,HQRSSYT,FGGGTKLEIK,QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRW...,False
2,Natalizumab analog,Natalizumab,IgG1 / Kappa,QVQLVQSGAEVKKPGASVKVSCKASGFNIKDTYIHWVRQAPGQRLE...,DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRL...,,US Patent,US5840299A,IgG1,QVQLVQSGAEVKKPGASVKVSCKAS,...,Kappa,DIQMTQSPSSLSASVGDRVTITC,KTSQDINKYMA,WYQQTPGKAPRLLIH,YTSALQP,GIPSRFSGSGSGRDYTFTISSLQPEDIATYYC,LQYDNLWT,FGQGTKVEIK,DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRL...,False
3,Tremelimumab analog,Tremelimumab,IgG1 / Kappa,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQSINSYLDWYQQKPGKAPKL...,,US Patent,US6682736,IgG1,QVQLVESGGGVVQPGRSLRLSCAAS,...,Kappa,DIQMTQSPSSLSASVGDRVTITC,RASQSINSYLD,WYQQKPGKAPKLLIY,AASSLQS,GVPSRFSGSGSGTDFTLTISSLQPEDFATYYC,QQYYSTPFT,FGPGTKVEIK,DIQMTQSPSSLSASVGDRVTITCRASQSINSYLDWYQQKPGKAPKL...,False
4,Ipilimumab analog,Ipilimumab,IgG1 / Kappa,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYTMHWVRQAPGKGLE...,EIVLTQSPGTLSLSPGERATLSCRASQSVGSSYLAWYQQKPGQAPR...,,US Patent,US6984720,IgG1,QVQLVESGGGVVQPGRSLRLSCAAS,...,Kappa,EIVLTQSPGTLSLSPGERATLSC,RASQSVGSSYLA,WYQQKPGQAPRLLIY,GAFSRAT,GIPDRFSGSGSGTDFTLTISRLEPEDFAVYYC,QQYGSSPWT,FGQGTKVEIK,EIVLTQSPGTLSLSPGERATLSCRASQSVGSSYLAWYQQKPGQAPR...,False


In [109]:
# Write one two-record fasta file (heavy chain, then light chain) per Ab8 entity.
fasta_files = []
ab8_records = df_Ab8[[ENTITY_KEY, 'HC', 'LC']].itertuples(index=False, name=None)
for entity, heavy_seq, light_seq in ab8_records:
    fasta_file = os.path.join(FASTA_DIR, entity + '.fasta')
    fasta_files.append(fasta_file)
    with open(fasta_file, 'w') as fptr:
        fptr.write('>%s_VH\n' % entity)
        fptr.write('%s\n' % heavy_seq)
        fptr.write('>%s_VL\n' % entity)
        fptr.write('%s\n' % light_seq)

print('%d fasta files were saved in FASTA_DIR' % len(fasta_files))

8 fasta files were saved in FASTA_DIR


In [111]:
# Save viscosity and other computed properties
# One (entity, viscosity, SCM) record per antibody.
# NOTE(review): the values are hard-coded here; their source is not shown in
# this notebook - confirm provenance.
_ab8_records = [
    ('TGN1412', 16.42, 844.6),
    ('Basiliximab', 25.05, 640.8),
    ('Natalizumab', 13.67, 815.5),
    ('Tremelimumab', 8.8, 704.2),
    ('Ipilimumab', 8.6, 754),
    ('Atezolizumab', 11.56, 759.6),
    ('Ganitumab', 10.1, 806.5),
    ('Vesencumab', 23.57, 661.3),
]
# Transpose the records back into the three parallel column lists.
Ab8_entities, Ab8_visc, Ab8_SCM = (list(column) for column in zip(*_ab8_records))
df_Ab8_visc = pd.DataFrame({
    ENTITY_KEY: Ab8_entities,
    VISCOSITY_KEY: Ab8_visc,
    'SCM': Ab8_SCM,
})

ab8_csv_path = os.path.join(DATA_DIR, 'Ab8.csv')
df_Ab8_visc.to_csv(ab8_csv_path, index=False)
print('Number of antibodies in Ab8 set: %d' % len(df_Ab8_visc))
df_Ab8_visc.head()

Number of antibodies in Ab8 set: 8


Unnamed: 0,Entity,Viscosity_at_150,SCM
0,TGN1412,16.42,844.6
1,Basiliximab,25.05,640.8
2,Natalizumab,13.67,815.5
3,Tremelimumab,8.8,704.2
4,Ipilimumab,8.6,754.0
