# In [None]:
# Jupyter notebook for filtering GPCRSARfari Database
import os
import pandas as pd
import csv
import random
import re
from rdkit import Chem

# Functions:
def format_external_sources(val):
    """Return a space-free version of an EXTERNAL_SOURCES cell.

    The value is converted with ``str()`` first. When the text starts with
    ``"<digits> (<text>)"`` the result is ``"<digits>_<text>"`` and anything
    after the closing parenthesis is discarded; otherwise the whole text is
    kept. In both cases every space character is removed.
    """
    text = str(val)
    found = re.match(r'(\d+) \(([^)]+)\)', text)
    if found is None:
        # No leading "<digits> (<text>)" prefix: just strip the spaces.
        return text.replace(' ', '')
    number, label = found.groups()
    return f"{number}_{label}".replace(' ', '')


# Load the compound table from the ChEMBL GPCRSARfari database.
# The file is tab-separated.
df = pd.read_csv("gs_compound.txt", sep='\t')

# Print the column names of the dataframe
print(df.columns)

print(f"Initial number of rows: {len(df)}")

# Keep only rows with MOLWEIGHT < 500, ALOGP < 5 and MED_CHEM_FRIENDLY == 'Y'.
# Rows where any of these values is missing are dropped as well, because
# comparisons against NaN evaluate to False.
df = df[df['MOLWEIGHT'] < 500]
df = df[df['ALOGP'] < 5]
df = df[df['MED_CHEM_FRIENDLY'] == 'Y']

# Print the number of rows remaining after all three filters
print(f"Number of rows after filtering: {len(df)}")

# Take a reproducible random subset of at most SAMPLE_SIZE rows.
# DataFrame.sample raises ValueError when asked for more rows than exist
# (without replacement), so the request is capped at the rows available.
SAMPLE_SIZE = 30000
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=1)

# Print the number of rows after random sampling
print(f"Number of rows after random sampling: {len(df)}")

# Remove spaces / reformat the EXTERNAL_SOURCES column
df['EXTERNAL_SOURCES'] = df['EXTERNAL_SOURCES'].apply(format_external_sources)


def _to_canonical_smiles(smiles):
    """Return the canonical SMILES for *smiles*, or None if it is unusable.

    Non-string cells (e.g. NaN for a missing value) and strings RDKit cannot
    parse both yield None instead of raising, so one bad record does not
    abort the whole run.
    """
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:  # RDKit signals a parse failure by returning None
        return None
    return Chem.MolToSmiles(mol, canonical=True)


# Convert SMILES strings to canonical SMILES and drop rows that could not be
# converted.
df['SMILES'] = df['SMILES'].apply(_to_canonical_smiles)
rows_before_drop = len(df)
df = df.dropna(subset=['SMILES'])
dropped = rows_before_drop - len(df)
if dropped:
    print(f"Dropped {dropped} rows with missing or unparsable SMILES")

# Save the filtered dataframe to a new CSV file
output_file = "gpcrsarfari_filtered.csv"
df.to_csv(output_file, sep=",", index=False)



