# 03. BioLip Preprocessing

In [1]:
from pathlib import Path
import re

import pandas as pd
import numpy as np
from gemmi import cif

Download BioLip2 here: https://zhanggroup.org/BioLiP/download/BioLiP.txt.gz

In [2]:
!mkdir -p ../data
!wget https://zhanggroup.org/BioLiP/download/BioLiP.txt.gz -O ../data/BioLiP.txt.gz

--2024-11-13 15:59:02--  https://zhanggroup.org/BioLiP/download/BioLiP.txt.gz
Resolving zhanggroup.org (zhanggroup.org)... 141.213.137.249
Connecting to zhanggroup.org (zhanggroup.org)|141.213.137.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 71879529 (69M) [application/x-gzip]
Saving to: ‘../data/BioLiP.txt.gz’


2024-11-13 15:59:03 (68.3 MB/s) - ‘../data/BioLiP.txt.gz’ saved [71879529/71879529]



In [3]:
DATA_DIR = Path('../data')
PATH_TO_BIOLIP = DATA_DIR / 'BioLiP.txt.gz'

In [4]:
biolip = pd.read_csv(PATH_TO_BIOLIP, compression='gzip', sep='\t', header=None, low_memory=False)
biolip

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,101m,A,2.07,BS01,HEM,A,1,F43 R45 V68 S92 H93 H97 I99 Y103,F44 R46 V69 S93 H94 H98 I100 Y104,,...,"1.11.1.-,1.7.-.-","0004601,0005344,0005737,0015671,0016491,001652...",,,,,P02185,,155,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
1,102m,A,1.84,BS01,HEM,A,1,F43 R45 T67 L89 S92 H93 H97 I99 Y103,F44 R46 T68 L90 S93 H94 H98 I100 Y104,,...,"1.11.1.-,1.7.-.-","0004601,0005344,0005737,0015671,0016491,001652...",,,,,P02185,,155,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
2,103m,A,2.07,BS01,HEM,A,1,F43 R45 S92 H93 H97 I99 Y103,F44 R46 S93 H94 H98 I100 Y104,,...,"1.11.1.-,1.7.-.-","0004601,0005344,0005737,0015671,0016491,001652...",,,,,P02185,,155,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
3,104m,A,1.71,BS01,HEM,A,1,F43 R45 V68 S92 H93 H97 I99 Y103 F138,F43 R45 V68 S92 H93 H97 I99 Y103 F138,,...,"1.11.1.-,1.7.-.-","0004601,0005344,0005737,0015671,0016491,001652...",,,,,P02185,,155,VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRF...
4,105m,A,2.02,BS01,HEM,A,1,F43 R45 H64 V68 L89 H93 H97 I99,F43 R45 H64 V68 L89 H93 H97 I99,,...,"1.11.1.-,1.7.-.-","0004601,0005344,0005737,0015671,0016491,001652...",,,,,P02185,,155,VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRF...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
902751,9xim,C,2.40,BS02,MN,C,1,E181 E217 D245 D292,E178 E214 D242 D289,H54 D57 M88 E181 K183 E217 H220 D245 D255 D257...,...,5.3.1.5,"0000287,0005737,0005975,0009045,0016853,004273...",,,,,P12851,1610791.0,395,QATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIGAY...
902752,9xim,C,2.40,BS03,MN,C,2,E217 H220 D255 D257,E214 H217 D252 D254,H54 D57 M88 E181 K183 E217 H220 D245 D255 D257...,...,5.3.1.5,"0000287,0005737,0005975,0009045,0016853,004273...",,,,,P12851,1610791.0,396,QATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIGAY...
902753,9xim,D,2.40,BS01,XLS,D,1,H54 W137 E181 K183 H220 D292,H52 W135 E179 K181 H218 D290,H54 D57 M88 E181 K183 E217 H220 D245 D255 D257...,...,5.3.1.5,"0000287,0005737,0005975,0009045,0016853,004273...",,,,,P12851,1610791.0,397,VQATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIGA...
902754,9xim,D,2.40,BS02,MN,D,1,E181 E217 D245 D292,E179 E215 D243 D290,H54 D57 M88 E181 K183 E217 H220 D245 D255 D257...,...,5.3.1.5,"0000287,0005737,0005975,0009045,0016853,004273...",,,,,P12851,1610791.0,395,VQATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIGA...


Instead of having SMILES of the ligands, BioLip contains ligand ids in the Chemical Component Dictionary used by PDB database. We need to convert them back to SMILES.

Download the CCD: https://files.wwpdb.org/pub/pdb/data/monomers/components.cif

In [5]:
!wget https://files.wwpdb.org/pub/pdb/data/monomers/components.cif -O ../data/components.cif

--2024-11-13 15:59:13--  https://files.wwpdb.org/pub/pdb/data/monomers/components.cif
Resolving files.wwpdb.org (files.wwpdb.org)... 132.249.213.193
Connecting to files.wwpdb.org (files.wwpdb.org)|132.249.213.193|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 444481438 (424M) [chemical/x-cif]
Saving to: ‘../data/components.cif’


2024-11-13 15:59:29 (28.3 MB/s) - ‘../data/components.cif’ saved [444481438/444481438]



In [6]:
PATH_TO_CCD = DATA_DIR / 'components.cif'
ccd = cif.read(str(PATH_TO_CCD))
ccd

<gemmi.cif.Document with 44714 blocks (000, 001, 002...)>

In [7]:
# Parse SMILES from CCD
ligand_ids = list(set(biolip[4].to_list()))
ligand_smiles = []
for ligand_id in ligand_ids:
    try:
        ligand = ccd[ligand_id]
    except:
        ligand_smiles.append(None)
        continue
    descriptor_types = list(ligand.find_values('_pdbx_chem_comp_descriptor.type'))
    is_smiles_found = False
    for i, dtype in enumerate(descriptor_types):
        if dtype == 'SMILES':
            ligand_smiles.append(ligand.find_values('_pdbx_chem_comp_descriptor.descriptor')[i].strip('"'))
            is_smiles_found = True
            break
    if not is_smiles_found:
        ligand_smiles.append(None)

In [8]:
ligand2smiles = dict(zip(ligand_ids, ligand_smiles))
smiles = biolip[4].map(ligand2smiles)

Now let's process the activities

In [9]:
biolip = biolip[[0, 14, 15, 16, 17, 20]].reset_index(drop=True)
biolip.columns = [
    'PDB',
    'activity_bmoad',
    'activity_pdbbind',
    'activity_bdb',
    'uniprot_id',
    'sequence',
]
biolip['smiles'] = smiles
biolip

Unnamed: 0,PDB,activity_bmoad,activity_pdbbind,activity_bdb,uniprot_id,sequence,smiles
0,101m,,,,P02185,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,C=1c3c(c(c4C=C5C(=C(C=6C=C7C(=C(C8=CC=2C(=C(C=...
1,102m,,,,P02185,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,C=1c3c(c(c4C=C5C(=C(C=6C=C7C(=C(C8=CC=2C(=C(C=...
2,103m,,,,P02185,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,C=1c3c(c(c4C=C5C(=C(C=6C=C7C(=C(C8=CC=2C(=C(C=...
3,104m,,,,P02185,VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRF...,C=1c3c(c(c4C=C5C(=C(C=6C=C7C(=C(C8=CC=2C(=C(C=...
4,105m,,,,P02185,VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRF...,C=1c3c(c(c4C=C5C(=C(C=6C=C7C(=C(C8=CC=2C(=C(C=...
...,...,...,...,...,...,...,...
902751,9xim,,,,P12851,QATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIGAY...,[Mn+2]
902752,9xim,,,,P12851,QATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIGAY...,[Mn+2]
902753,9xim,,,,P12851,VQATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIGA...,O=CC(O)C(O)C(O)CO
902754,9xim,,,,P12851,VQATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIGA...,[Mn+2]


In [10]:
moad = ~biolip['activity_bmoad'].isna()
pdbbind = ~biolip['activity_pdbbind'].isna()
bdb = ~biolip['activity_bdb'].isna()
has_activity =  moad | pdbbind | bdb

In [11]:
biolip = biolip[has_activity]
biolip 

Unnamed: 0,PDB,activity_bmoad,activity_pdbbind,activity_bdb,uniprot_id,sequence,smiles
9,10gs,Ki=0.4uM,"-logKd/Ki=6.40,Ki=0.4uM",Ki=420nM,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,O=C(O)C(N)CCC(=O)NC(C(=O)NC(c1ccccc1)C(=O)O)CS...
10,10gs,Ki=0.4uM,"-logKd/Ki=6.40,Ki=0.4uM",Ki=420nM,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,O=C(O)C(N)CCC(=O)NC(C(=O)NC(c1ccccc1)C(=O)O)CS...
38,11gs,,"-logKd/Ki=5.82,Ki=1.5uM",,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,O=C(NCC(=O)O)C(NC(=O)CCC(C(=O)O)N)CS
39,11gs,,"-logKd/Ki=5.82,Ki=1.5uM",IC50=2.6e+4nM,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,Clc1c(C(=O)\C(=C)CC)ccc(OCC(=O)O)c1Cl
40,11gs,,"-logKd/Ki=5.82,Ki=1.5uM",,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,O=C(NCC(=O)O)C(NC(=O)CCC(C(=O)O)N)CS
...,...,...,...,...,...,...,...
902425,9icd,Kd=125uM,"-logKd/Ki=3.90,Kd=125uM",,P08200,SKVVVPAQGKKITLQNGKLNVPENPIIPYIEGDGIGVDVTPAMLKV...,NC(=O)c1ccc[n+](c1)[CH]2O[CH](CO[P]([O-])(=O)O...
902479,9icu,,,IC50=1400nM,P06746,ETLNGGITDMLTELANFEKNVSQAIHKYNAYRKAASVIAKYPHKIK...,O=P(O)(O)OP(=O)(O)OP(=O)(O)OCC2OC(N1C(=O)NC(=O...
902728,9nse,Ki=0.039uM,,,P29473,GPKFPRVKNWELGSITYDTLCAQSQQDGPCTPRRCLGSLVLPRKLQ...,[N@H]=C([Se]CC)N
902734,9nse,Ki=0.039uM,,,P29473,KFPRVKNWELGSITYDTLCAQSQQDGPCTPRRCLGSLVLPRKLQTR...,[N@H]=C([Se]CC)N


In [12]:
# We don't need BDB data, because we will join it later
no_bdb = biolip['activity_bdb'].isna()
biolip = biolip[no_bdb].reset_index(drop=True)
biolip

Unnamed: 0,PDB,activity_bmoad,activity_pdbbind,activity_bdb,uniprot_id,sequence,smiles
0,11gs,,"-logKd/Ki=5.82,Ki=1.5uM",,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,O=C(NCC(=O)O)C(NC(=O)CCC(C(=O)O)N)CS
1,11gs,,"-logKd/Ki=5.82,Ki=1.5uM",,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,O=C(NCC(=O)O)C(NC(=O)CCC(C(=O)O)N)CS
2,13gs,Ki=24uM,"-logKd/Ki=4.62,Ki=24uM",,P09211,MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...,O=S(=O)(Nc1ncccc1)c3ccc(/N=N/c2cc(C(=O)O)c(O)c...
3,13gs,Ki=24uM,,,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,O=S(=O)(Nc1ncccc1)c3ccc(/N=N/c2cc(C(=O)O)c(O)c...
4,16pk,,"-logKd/Ki=5.22,Ki=6uM",,P07378,EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,FC(F)(P(=O)(O)O)CCCC(F)(F)P(=O)(O)OP(=O)(O)OCC...
...,...,...,...,...,...,...,...
28851,9hvp,,"-logKd/Ki=8.35,Ki=4.5nM",,P12497,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,O=C(OCc1ccccc1)NC(C(=O)NC(Cc2ccccc2)C(O)C(NC(=...
28852,9icd,Kd=125uM,"-logKd/Ki=3.90,Kd=125uM",,P08200,SKVVVPAQGKKITLQNGKLNVPENPIIPYIEGDGIGVDVTPAMLKV...,NC(=O)c1ccc[n+](c1)[CH]2O[CH](CO[P]([O-])(=O)O...
28853,9nse,Ki=0.039uM,,,P29473,GPKFPRVKNWELGSITYDTLCAQSQQDGPCTPRRCLGSLVLPRKLQ...,[N@H]=C([Se]CC)N
28854,9nse,Ki=0.039uM,,,P29473,KFPRVKNWELGSITYDTLCAQSQQDGPCTPRRCLGSLVLPRKLQTR...,[N@H]=C([Se]CC)N


In [13]:
for x in biolip[~biolip['activity_bmoad'].isna()]['activity_bmoad']:
    print(x)

Ki=24uM
Ki=24uM
Ka=8900M^-1
Ka=5170M^-1
Ka=51000M^-1
Kd=289uM
Kd=14uM
Kd=469uM
Ki=27nM
ic50=0.004uM
ic50=0.004uM
Kd=0.01uM
Kd=0.01uM
Ki~0.1pM
Ki~0.1pM
Ki~0.1pM
Ki~0.1pM
ic50=3.6uM
ic50=3.6uM
Kd=9.4pM
Kd=0.22uM
Ki=1.2uM
Kd=0.35uM
Kd=0.35uM
ic50=800uM
Ki=5uM
Ki=5uM
Ki=5uM
Kd=0.2nM
Ki=7.4nM
Ki=14nM
Ki=14nM
Kd=2uM
Kd=2uM
Kd=2uM
Kd=2uM
Ki=119nM
Ki=3nM
Kd=3.8uM
Kd=3.8uM
Ki=0.18uM
Kd=4.4uM
Ki=240nM
Ki=240nM
Ki=520nM
Ki=520nM
Kd=1nM
Kd=1nM
Kd=120uM
Kd=120uM
Ki=2.7uM
Ki=403uM
Ki=3.13mM
Ki=0.189mM
Ki=15uM
Kd=135uM
Ki=2.32mM
Ki=0.049mM
Ki=19.1nM
Ki=12.2nM
Kd=0.79nM
Kd=0.82nM
Kd=3.2nM
Kd=746nM
Kd=0.0026nM
Kd=2.6nM
Ki=4.8uM
Ki=0.3uM
Ki=0.3uM
Kd=132mM
Ki=0.05mM
Kd=29uM
Kd=1.51uM
Kd=1.51uM
Ki=0.52uM
Ki=0.52uM
Kd=60nM
Kd=0.125mM
Ki=540uM
Ka=1570M^-1
Ka=1570M^-1
Kd=2.4nM
Ki=41nM
Kd=0.254uM
Kd=0.12uM
Kd=97.9uM
Kd=40nM
Kd=40nM
Ki=550nM
Ki=550nM
Ki=550nM
Ki=1.8nM
Ki=5nM
Ki=4nM
Ki=3nM
Ki=50mM
Ki=23pM
Ki=14nM
Ki=20nM
Kd=0.14uM
Kd=0.14uM
Ka=500000M^-1
Ka=15000M^-1
Ki=460nM
Ka~10000000M^-1
ic50=0.5nM
ic50=0.5

We need to parse the activities to work with them

In [14]:
def parse_value(line):
    line = str(line)
    pattern = r'(?<!-)(Ki|Kd|IC50|EC50)\b\s*([<=>])\s*([0-9]*\.?[0-9]+)\s*(pM|nM|uM|mM)\b'
    matches = re.finditer(pattern, line, re.IGNORECASE)
    for match in matches:
        activity_type = match.group(1).upper()
        if activity_type == 'KI':
            activity_type = 'Ki'
        if activity_type == 'KD':
            activity_type = 'Kd'
        relation = match.group(2)
        value = float(match.group(3))
        unit = match.group(4).lower()
        # Convert value to nM
        if unit == 'pm':
            value /= 1000.0
        elif unit == 'nm':
            pass  # value is already in nM
        elif unit == 'um':
            value *= 1000.0
        else:
            return None  # Unknown unit
        return (activity_type, value, relation)
    return None

In [15]:
biolip['activity_pdbbind_parsed'] = biolip['activity_pdbbind'].map(parse_value)
biolip['activity_bmoad_parsed'] = biolip['activity_bmoad'].map(parse_value)

In [17]:
biolip = biolip.dropna(
    subset=['activity_pdbbind_parsed', 'activity_bmoad_parsed'],
    how='all'
).reset_index(drop=True)
biolip

Unnamed: 0,PDB,activity_bmoad,activity_pdbbind,activity_bdb,uniprot_id,sequence,smiles,activity_pdbbind_parsed,activity_bmoad_parsed
0,11gs,,"-logKd/Ki=5.82,Ki=1.5uM",,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,O=C(NCC(=O)O)C(NC(=O)CCC(C(=O)O)N)CS,"(Ki, 1500.0, =)",
1,11gs,,"-logKd/Ki=5.82,Ki=1.5uM",,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,O=C(NCC(=O)O)C(NC(=O)CCC(C(=O)O)N)CS,"(Ki, 1500.0, =)",
2,13gs,Ki=24uM,"-logKd/Ki=4.62,Ki=24uM",,P09211,MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...,O=S(=O)(Nc1ncccc1)c3ccc(/N=N/c2cc(C(=O)O)c(O)c...,"(Ki, 24000.0, =)","(Ki, 24000.0, =)"
3,13gs,Ki=24uM,,,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,O=S(=O)(Nc1ncccc1)c3ccc(/N=N/c2cc(C(=O)O)c(O)c...,,"(Ki, 24000.0, =)"
4,16pk,,"-logKd/Ki=5.22,Ki=6uM",,P07378,EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,FC(F)(P(=O)(O)O)CCCC(F)(F)P(=O)(O)OP(=O)(O)OCC...,"(Ki, 6000.0, =)",
...,...,...,...,...,...,...,...,...,...
26310,9hvp,,"-logKd/Ki=8.35,Ki=4.5nM",,P12497,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,O=C(OCc1ccccc1)NC(C(=O)NC(Cc2ccccc2)C(O)C(NC(=...,"(Ki, 4.5, =)",
26311,9icd,Kd=125uM,"-logKd/Ki=3.90,Kd=125uM",,P08200,SKVVVPAQGKKITLQNGKLNVPENPIIPYIEGDGIGVDVTPAMLKV...,NC(=O)c1ccc[n+](c1)[CH]2O[CH](CO[P]([O-])(=O)O...,"(Kd, 125000.0, =)","(Kd, 125000.0, =)"
26312,9nse,Ki=0.039uM,,,P29473,GPKFPRVKNWELGSITYDTLCAQSQQDGPCTPRRCLGSLVLPRKLQ...,[N@H]=C([Se]CC)N,,"(Ki, 39.0, =)"
26313,9nse,Ki=0.039uM,,,P29473,KFPRVKNWELGSITYDTLCAQSQQDGPCTPRRCLGSLVLPRKLQTR...,[N@H]=C([Se]CC)N,,"(Ki, 39.0, =)"


In [18]:
def get_activity(row):
    if row['activity_pdbbind_parsed'] is not None:
        return row['activity_pdbbind_parsed']
    return row['activity_bmoad_parsed']

def unfold_activities(value):
    # Return (Kd, Ki, IC50)
    measure, activity, sign = value
    ki = None
    ki_sign = None
    kd = None
    kd_sign = None
    ic50 = None
    ic50_sign = None

    if measure == 'Kd':
        kd = activity
        kd_sign = sign
    if measure == 'Ki':
        ki = activity
        ki_sign = sign
    if measure == 'IC50':
        ic50 = activity
        ic50_sign = sign
    return (kd, kd_sign, ki, ki_sign, ic50, ic50_sign)

In [19]:
single_activity = biolip.apply(get_activity, axis=1)
single_activity

0          (Ki, 1500.0, =)
1          (Ki, 1500.0, =)
2         (Ki, 24000.0, =)
3         (Ki, 24000.0, =)
4          (Ki, 6000.0, =)
               ...        
26310         (Ki, 4.5, =)
26311    (Kd, 125000.0, =)
26312        (Ki, 39.0, =)
26313        (Ki, 39.0, =)
26314        (Ki, 39.0, =)
Length: 26315, dtype: object

In [20]:
unfolded_activities = single_activity.apply(unfold_activities)
unfolded_activities = pd.DataFrame(unfolded_activities.to_list(), columns=['kd', 'kd_sign', 'ki', 'ki_sign', 'ic50', 'ic50_sign'])
unfolded_activities

Unnamed: 0,kd,kd_sign,ki,ki_sign,ic50,ic50_sign
0,,,1500.0,=,,
1,,,1500.0,=,,
2,,,24000.0,=,,
3,,,24000.0,=,,
4,,,6000.0,=,,
...,...,...,...,...,...,...
26310,,,4.5,=,,
26311,125000.0,=,,,,
26312,,,39.0,=,,
26313,,,39.0,=,,


In [21]:
biolip = pd.DataFrame({
    'smiles': biolip['smiles'],
    'uniprot_id': biolip['uniprot_id'],
    'sequence': biolip['sequence'],
    'kd': unfolded_activities['kd'],
    'ki': unfolded_activities['ki'],
    'ic50': unfolded_activities['ic50'],
    'kd_sign': unfolded_activities['kd_sign'],
    'ki_sign': unfolded_activities['ki_sign'],
    'ic50_sign': unfolded_activities['ic50_sign'],
})
biolip['source'] = 'biolip'
biolip['ec50'] = None
biolip['ec50_sign'] = None
biolip

Unnamed: 0,smiles,uniprot_id,sequence,kd,ki,ic50,kd_sign,ki_sign,ic50_sign,source,ec50,ec50_sign
0,O=C(NCC(=O)O)C(NC(=O)CCC(C(=O)O)N)CS,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,,1500.0,,,=,,biolip,,
1,O=C(NCC(=O)O)C(NC(=O)CCC(C(=O)O)N)CS,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,,1500.0,,,=,,biolip,,
2,O=S(=O)(Nc1ncccc1)c3ccc(/N=N/c2cc(C(=O)O)c(O)c...,P09211,MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...,,24000.0,,,=,,biolip,,
3,O=S(=O)(Nc1ncccc1)c3ccc(/N=N/c2cc(C(=O)O)c(O)c...,P09211,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,,24000.0,,,=,,biolip,,
4,FC(F)(P(=O)(O)O)CCCC(F)(F)P(=O)(O)OP(=O)(O)OCC...,P07378,EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,,6000.0,,,=,,biolip,,
...,...,...,...,...,...,...,...,...,...,...,...,...
26310,O=C(OCc1ccccc1)NC(C(=O)NC(Cc2ccccc2)C(O)C(NC(=...,P12497,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,,4.5,,,=,,biolip,,
26311,NC(=O)c1ccc[n+](c1)[CH]2O[CH](CO[P]([O-])(=O)O...,P08200,SKVVVPAQGKKITLQNGKLNVPENPIIPYIEGDGIGVDVTPAMLKV...,125000.0,,,=,,,biolip,,
26312,[N@H]=C([Se]CC)N,P29473,GPKFPRVKNWELGSITYDTLCAQSQQDGPCTPRRCLGSLVLPRKLQ...,,39.0,,,=,,biolip,,
26313,[N@H]=C([Se]CC)N,P29473,KFPRVKNWELGSITYDTLCAQSQQDGPCTPRRCLGSLVLPRKLQTR...,,39.0,,,=,,biolip,,


In [22]:
biolip.to_csv(DATA_DIR / 'biolip.csv', index=False)