In [None]:
# Jupyter notebook for filtering ChEMBL GPCRSARfari database

import os
import pandas as pd
import csv
import random


def filter_med_chem_friendly(frame: pd.DataFrame) -> pd.DataFrame:
    """Return only the rows whose MED_CHEM_FRIENDLY flag is exactly "Y".

    Rows flagged "N" and rows with a missing flag are both dropped (a missing
    value never compares equal to "Y").
    """
    return frame[frame["MED_CHEM_FRIENDLY"] == "Y"]


def filter_by_mol_weight(frame: pd.DataFrame, max_weight: float = 600) -> pd.DataFrame:
    """Return the rows whose MOLWEIGHT is strictly below ``max_weight``."""
    return frame[frame["MOLWEIGHT"] < max_weight]


def sample_rows(frame: pd.DataFrame, n: int = 50000, random_state: int = 1) -> pd.DataFrame:
    """Return a reproducible random sample of at most ``n`` rows.

    ``DataFrame.sample`` raises ``ValueError`` when asked for more rows than
    exist (without replacement), so the request is capped at ``len(frame)``.
    """
    return frame.sample(n=min(n, len(frame)), random_state=random_state)


def clean_external_sources(sources: pd.Series) -> pd.Series:
    """Normalise EXTERNAL_SOURCES entries, e.g. "629429 (CHEMBL)" -> "629429_CHEMBL".

    Spaces and parentheses are removed and every capital "C" is prefixed with
    an underscore, all in a single pass over each string.
    """
    # NOTE(review): *every* "C" gets an underscore prefix, so a source name
    # containing more than one capital C would gain several underscores --
    # confirm that this is acceptable for all sources present in the data.
    table = str.maketrans({" ": None, "(": None, ")": None, "C": "_C"})
    return sources.str.translate(table)


# The guard is true both when this runs as a notebook cell and as a script, so
# `df` and `output_file` are still defined at module level afterwards; it only
# prevents the file I/O below from running when the helpers are imported.
if __name__ == "__main__":
    # Load the compound table from the ChEMBL GPCRSARfari database (tab-separated).
    df = pd.read_csv("gs_compound.txt", sep="\t")
    print(f"Initial number of rows: {len(df)}")

    # Exclude rows where MED_CHEM_FRIENDLY is "N" or empty, i.e. keep only "Y".
    df = filter_med_chem_friendly(df)
    print(f"Number of rows after filtering: {len(df)}")

    # Keep only compounds with MOLWEIGHT below 600.
    df = filter_by_mol_weight(df, max_weight=600)
    print(f"Number of rows after filtering by MOL_WEIGHT: {len(df)}")

    # Draw a reproducible random subset of (at most) 50000 rows.
    df = sample_rows(df, n=50000, random_state=1)
    print(f"Number of rows after random sampling: {len(df)}")

    # Tidy the EXTERNAL_SOURCES column, then print a few entries to verify it.
    df["EXTERNAL_SOURCES"] = clean_external_sources(df["EXTERNAL_SOURCES"])
    print(df["EXTERNAL_SOURCES"].head(5))

    # Save the filtered dataframe. The file is tab-separated despite its .csv
    # extension, and the original row index is written as the first column.
    output_file = "gs_compound_filtered.csv"
    df.to_csv(output_file, sep="\t")





Initial number of rows: 141990
Number of rows after filtering: 134060
Number of rows after filtering by MOL_WEIGHT: 122663
Number of rows after random sampling: 50000
107724    629429_CHEMBL
88952     480565_CHEMBL
733         1723_CHEMBL
56977      93549_CHEMBL
109239    632264_CHEMBL
Name: EXTERNAL_SOURCES, dtype: object
