# BindingDB Polymer table extraction

This Jupyter notebook extracts the relevant data from the BindingDB dataset, which can be downloaded [here](https://www.bindingdb.org/rwd/bind/chemsearch/marvin/Download.jsp).

Extract the polymer table from the MySQL DB.

In [1]:
import pandas as pd
import numpy as np
import yaml

In [2]:
def load_config(yaml_path="P1-config.yaml"):
    """Read a YAML configuration file and return its parsed content.

    Args:
        yaml_path: Path of the YAML file to read. Defaults to
            "P1-config.yaml"; a relative path is resolved against the
            current working directory.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    with open(yaml_path, encoding="utf-8") as config_file:
        parsed = yaml.safe_load(config_file)
    return parsed


# Parse the default config once; later cells look up their file paths in it.
config = load_config()

In [3]:
# Location of the raw polymer table, as configured under raw_paths.
polymer_path = config["raw_paths"]["polymer_table"]

# The raw file is read as tab-separated text.
# NOTE(review): the preview below shows a stray '"' at the end of the first
# column name and of its values -- confirm the quoting of the exported file.
polymer_data = pd.read_csv(polymer_path, sep="\t")

# Preview the first rows as the cell output.
polymer_data.head()



Unnamed: 0,"component_id""",comments,topology,weight,source_organism,unpid2,scientific_name,type,display_name,res_count,sequence,n_pdb_ids,taxid,unpid1,polymerid,pdb_ids,short_name,common_name,chembl_id
0,"NULL""",P0DTH5,Linear,40978.57,HHV-1,,Human herpesvirus 1 (strain 17),Enzyme,Thymidine kinase,376.0,MASYPCHQHASAFDQAARSRGHNNRRTALRPRRQQKATEVRLEQKM...,33,10299.0,P0DTH5,1,"1OF1,1QHI,2KI5,1KIM,3VTK,2VTK,1VTK,1P7C,4OQX,4...",,HHV11,
1,"NULL""",Biotin-Binding Protein,Linear,16447.7,Streptomyces avidinii,,Streptomyces avidinii,Protein,Streptavadin(N23A),159.0,DPSKDSKAQV SAAEAGITGT WYAQLGSTFI VTAGADGALT GT...,213,1895.0,,2,"8GVK,7EFD,7EFC,7DY0,5VCQ,5N99,5N8W,5N8T,5N8J,5...",,STRAV,
2,"NULL""",Biotin-Binding Protein,Linear,16398.63,Streptomyces avidinii,,Streptomyces avidinii,Protein,Streptavadin(Y43A),159.0,DPSKDSKAQV SAAEAGITGT WYNQLGSTFI VTAGADGALT GT...,218,1895.0,,3,"8GVK,7EFD,7EFC,7DY0,5VCQ,5N99,5N8W,5N8T,5N8J,5...",,STRAV,
3,"NULL""",Biotin-Binding Protein,Linear,16474.72,Streptomyces avidinii,,Streptomyces avidinii,Protein,Streptavadin(S27A),159.0,DPSKDSKAQV SAAEAGITGT WYNQLGATFI VTAGADGALT GT...,215,1895.0,,4,"8GVK,7EFD,7EFC,7DY0,5VCQ,5N99,5N8W,5N8T,5N8J,5...",,STRAV,
4,"4041""",P22629,Linear,18835.54,Streptomyces avidinii,,Streptomyces avidinii,Protein,Streptavidin,183.0,MRKIVVAAIAVSLTTVSITASASADPSKDSKAQVSAAEAGITGTWY...,218,1895.0,P22629,5,"8GVK,7EFD,7EFC,7DY0,5VCQ,5N99,5N8W,5N8T,5N8J,5...",,STRAV,CHEMBL1075026


## Filter table

### Filter by source organism

In [4]:
# Keep only the rows whose source_organism is one of the accepted labels
# (exact string match against the list below).
source_organism = ['Human']

is_selected_organism = polymer_data['source_organism'].isin(source_organism)
polymer_data_filtered = polymer_data[is_selected_organism]

# Report (rows, columns) of the filtered table.
print(polymer_data_filtered.shape)


(4542, 19)


## Keep only relevant columns

In [6]:
# Reduce the filtered table to the display_name and pdb_ids columns,
# drop repeated (display_name, pdb_ids) pairs and renumber the rows from 0.
polymer_final = (
    polymer_data_filtered[['display_name', 'pdb_ids']]
    .drop_duplicates()
    .reset_index(drop=True)
)
print(polymer_final.shape)

# Write the result to the configured CSV path, leaving out the index column.
polymer_final_path = config["processed_paths"]["polymer_final"]
polymer_final.to_csv(polymer_final_path, index=False)

(4522, 2)
