In [None]:
# Jupyter notebook for filtering ChEMBL GPCRSARfari database

import os
import pandas as pd
import csv
import random


# Drug-likeness cut-offs applied to the ChEMBL export. All bounds are
# exclusive: a row exactly at a threshold is dropped.
MAX_MOLECULAR_WEIGHT = 500
MAX_ALOGP = 5
MIN_QED_WEIGHTED = 0.6

# Size and seed of the reproducible random subset written to disk.
SAMPLE_SIZE = 30000
RANDOM_SEED = 1


def filter_druglike(
    frame: pd.DataFrame,
    max_mol_weight: float = MAX_MOLECULAR_WEIGHT,
    max_alogp: float = MAX_ALOGP,
    min_qed: float = MIN_QED_WEIGHTED,
) -> pd.DataFrame:
    """Return the rows of *frame* that pass all three drug-likeness filters.

    A row is kept when 'Molecular Weight' < max_mol_weight, 'AlogP' <
    max_alogp and 'QED Weighted' > min_qed. Rows with a missing value in
    any of these columns are dropped, because comparisons against NaN
    evaluate to False. The original index is preserved.

    Raises:
        KeyError: if one of the three columns is absent from *frame*.
    """
    keep = (
        (frame['Molecular Weight'] < max_mol_weight)
        & (frame['AlogP'] < max_alogp)
        & (frame['QED Weighted'] > min_qed)
    )
    return frame[keep]


def sample_rows(
    frame: pd.DataFrame,
    n: int = SAMPLE_SIZE,
    seed: int = RANDOM_SEED,
) -> pd.DataFrame:
    """Return a reproducible random sample of at most *n* rows of *frame*.

    DataFrame.sample raises ValueError when asked for more rows than exist
    (without replacement), so the request is capped at len(frame). When the
    frame has at least *n* rows the result is identical to
    ``frame.sample(n=n, random_state=seed)``.
    """
    return frame.sample(n=min(n, len(frame)), random_state=seed)


# The guard keeps the file importable (e.g. for testing the helpers) without
# touching the disk; in a notebook or when run as a script __name__ is
# "__main__", so the pipeline below executes and `df` / `output_file` are
# still defined as globals.
if __name__ == "__main__":
    # Load the ChEMBL export. The re-exported file is semicolon-delimited.
    # NOTE(review): the recorded run reports 1048575 rows, which matches the
    # spreadsheet row limit minus the header -- the re-export may have been
    # truncated; confirm against the original dump.
    df = pd.read_csv("chEMBL_dump_ReExport.csv", sep=';')

    # Show the available column names.
    print(df.columns)
    print(f"Initial number of rows: {len(df)}")

    # Apply the molecular weight, AlogP and QED filters.
    df = filter_druglike(df)
    print(f"Number of rows after filtering: {len(df)}")

    # Reduce to a reproducible random subset (all rows if fewer remain).
    df = sample_rows(df)
    print(f"Number of rows after random sampling: {len(df)}")

    # Save the result. Note the output is tab-separated despite the .csv
    # extension.
    output_file = "chEMBL_filtered.csv"
    df.to_csv(output_file, sep="\t", index=False)





Index(['ChEMBL ID', 'Type', 'Max Phase', 'Molecular Weight', 'Targets',
       'Bioactivities', 'AlogP', 'Polar Surface Area', 'HBA', 'HBD',
       '#RO5 Violations', '#Rotatable Bonds', 'Passes Ro3', 'QED Weighted',
       'CX Acidic pKa', 'CX Basic pKa', 'CX LogP', 'CX LogD', 'Aromatic Rings',
       'Structure Type', 'Inorganic Flag', 'Heavy Atoms', 'HBA (Lipinski)',
       'HBD (Lipinski)', '#RO5 Violations (Lipinski)',
       'Molecular Weight (Monoisotopic)', 'Np Likeness Score',
       'Molecular Species', 'Molecular Formula', 'Smiles'],
      dtype='object')
Initial number of rows: 1048575
Number of rows after filtering: 441711
Number of rows after random sampling: 30000
