<a href="https://colab.research.google.com/github/maxgeds/PCOL3911-QSAR-Toxicology-Project/blob/main/PCOL3911_520513026_QSAR_GENERATE_FINGERPRINTS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Read the raw CSV (Veselinovic et al., 2015) from the project repository into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="https://github.com/maxgeds/PCOL3911-QSAR-Toxicology-Project/raw/main/PCOL3911_OP_EXPR.csv",
)

In [None]:
# Preview the first two rows to confirm the CSV loaded as expected.
data.head(n=2)

Unnamed: 0,ID,SMILES,Expr,DCW,Calc,Expr-Calc,Set,DCW.1,Calc.1,Expr-Calc.1,Set.1,DCW.2,Calc.2,Expr-Calc.2,Set.2
0,1,CSP(=O)(c1ccccc1)c1ccccc1,2.34,39.0137,2.5431,-0.2031,Tr,30.28552,2.6612,-0.3212,ST,34.79287,2.3165,0.0235,Ca
1,2,CCSP(=O)(c1ccccc1)c1ccccc1,2.69,40.13724,2.8413,-0.1513,Va,31.2953,2.9406,-0.2506,Va,36.00961,2.687,0.003,Va


In [None]:
# Data looking awesome so lets get rdkit in here.
!pip install rdkit



In [None]:
# Build an RDKit molecule for every SMILES string and store it in a new 'Molecule'
# column, then display the first row as a quick check of the conversion.
from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
PandasTools.AddMoleculeColumnToFrame(data, smilesCol='SMILES', molCol='Molecule')
data.loc[:, ["SMILES", "Molecule"]].head(n=1)

Unnamed: 0,SMILES,Molecule
0,CSP(=O)(c1ccccc1)c1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x7e648919b0d0>


In [None]:
# Count the SMILES that RDKit could not convert (missing entries in 'Molecule').
# A total of 0 means no structure is problematic.
data["Molecule"].isna().sum()

0

In [None]:
# Generate Morgan fingerprints (radius 2) from the parsed molecules at a range of bit sizes.
# 1024 bits is standard, but 2048 and 4096 are generated as well to improve the fidelity of the later model.
from rdkit.Chem import rdFingerprintGenerator


def morgan_fingerprints(mols, fp_size, radius=2):
    """Return one Morgan fingerprint per molecule as a list of NumPy arrays.

    Args:
        mols: Iterable of RDKit molecules.
        fp_size: Length of the fingerprint bit vector (``fpSize`` of the generator).
        radius: Morgan radius passed to the generator.

    Returns:
        A list with one array per molecule, in the same order as ``mols``.
    """
    # One generator is enough for all molecules at a given size.
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=fp_size)
    return [generator.GetFingerprintAsNumPy(mol) for mol in mols]


# Output column name -> fingerprint length.
FINGERPRINT_SIZES = {'MFP1': 1024, 'MFP2': 2048, 'MFP4': 4096}

for column_name, fp_size in FINGERPRINT_SIZES.items():
    data[column_name] = morgan_fingerprints(data['Molecule'], fp_size)

In [None]:
# Preview the first two rows, which now include the molecule and fingerprint columns.
data.head(n=2)

Unnamed: 0,ID,SMILES,Expr,DCW,Calc,Expr-Calc,Set,DCW.1,Calc.1,Expr-Calc.1,Set.1,DCW.2,Calc.2,Expr-Calc.2,Set.2,Molecule,MFP1,MFP2,MFP4
0,1,CSP(=O)(c1ccccc1)c1ccccc1,2.34,39.0137,2.5431,-0.2031,Tr,30.28552,2.6612,-0.3212,ST,34.79287,2.3165,0.0235,Ca,<rdkit.Chem.rdchem.Mol object at 0x7e648919b0d0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,CCSP(=O)(c1ccccc1)c1ccccc1,2.69,40.13724,2.8413,-0.1513,Va,31.2953,2.9406,-0.2506,Va,36.00961,2.687,0.003,Va,<rdkit.Chem.rdchem.Mol object at 0x7e648919b290>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Create three datasets, one per fingerprint size, with one column per fingerprint bit.
def expand_fingerprints(column):
    """Return a DataFrame whose rows are the fingerprint arrays stored in ``column``.

    The arrays are stacked into a single 2-D array in one step instead of building
    a pandas Series for every row, which is what ``.apply(pd.Series)`` does.

    Args:
        column: pandas Series holding one equal-length 1-D array per row.

    Returns:
        DataFrame with the same index as ``column`` and integer column labels
        0..n_bits-1.
    """
    # np.stack cannot stack an empty sequence, so return an empty frame instead.
    if column.empty:
        return pd.DataFrame(index=column.index)
    return pd.DataFrame(np.stack(column.tolist()), index=column.index)


data_fp1 = expand_fingerprints(data['MFP1'])
data_fp2 = expand_fingerprints(data['MFP2'])
data_fp4 = expand_fingerprints(data['MFP4'])

data_fp4.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# Re-attach the Expr column as the last column of each fingerprint dataset.
# Plain column assignment appends the new column at the end, so no hard-coded
# position tied to the fingerprint length is needed. Re-running the cell simply
# overwrites the column, whereas DataFrame.insert raises ValueError if it already exists.
for fingerprint_table in (data_fp1, data_fp2, data_fp4):
    fingerprint_table["Expr"] = data["Expr"]

# Only the last expression of a cell is displayed, so preview a single table.
data_fp4.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,Expr
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.34
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.69
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,3.08


In [None]:
# Save each fingerprint dataset to its own CSV file without the row index.
# index=False is the documented boolean form; index=None only worked because None is falsy.
data_fp1.to_csv('organophosphate_fp1024.csv', index=False)
data_fp2.to_csv('organophosphate_fp2048.csv', index=False)
data_fp4.to_csv('organophosphate_fp4096.csv', index=False)