<h1>Data preprocessing</h1>

<h2>Importing necessary modules</h2>

In [None]:
import csv
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
IPythonConsole.ipython_useSVG=True
from rdkit.Chem import Descriptors

<p><em>TODO: clarify which file is actually read in by the cell below.</em></p>

In [None]:
# Define filepath to Molecules_combined.csv.
# NOTE: `path` is still empty and has to be filled in before this cell can run.
path = r""

# Load every row of the csv into memory as a list of lists of strings.
# The context manager closes the file handle as soon as the rows are read, and
# newline='' is what the csv module expects so that line breaks embedded in
# quoted fields are parsed correctly.
with open(path, 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    data = list(reader)

<h2>Create DataFrame containing molecules and descriptors</h2>
<p>Possible to export it as csv file.</p>

In [None]:
# Compute the RDKit descriptors of every molecule listed in `data` and collect them in
# df_full: one row per descriptor, column 0 holding the descriptor names and one
# further column per molecule, labelled with its SMILES string.
# data[0] is skipped, presumably because it is the header row of the csv.
descriptor_names = None
descriptors_by_smiles = {}

for row in data[1:]:
    if not row:
        # Blank line in the csv: there is no SMILES string to process.
        continue
    smiles = row[0]
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # RDKit returns None instead of raising when a SMILES string cannot be parsed.
        raise ValueError(f"RDKit could not parse SMILES string {smiles!r}")
    descriptors = Descriptors.CalcMolDescriptors(molecule)
    if descriptor_names is None:
        # The descriptor names are taken once, from the first molecule.
        descriptor_names = list(descriptors)
    descriptors_by_smiles[smiles] = list(descriptors.values())

if descriptor_names is None:
    raise ValueError("No molecules found in `data`; cannot build the descriptor table.")

# Build the frame in one go: inserting the molecule columns one at a time
# fragments the DataFrame and is much slower.
df_full = pd.DataFrame({0: descriptor_names, **descriptors_by_smiles})

#df_full.to_csv('descriptors_known_molecs.csv')

<p>DataFrame won't have to be read in anymore, as it is created above already.</p>

In [None]:
# Optional: reload a previously exported descriptor table instead of recomputing it.
#df_full = pd.read_csv("descriptors_known_molecs.csv")
#df_full.tail()

<h2>Create DataFrame containing ALDH1 inhibition data for 2000 given molecules</h2>

In [None]:
# Read the two csv files with tested molecules, one DataFrame per file.
df_inhibition1, df_inhibition2 = (
    pd.read_csv(csv_name)
    for csv_name in ("tested_molecules-1.csv", "tested_molecules_v2.csv")
)

In [None]:
# Stack both tables below each other and renumber the rows from 0.
inhibition_batches = [df_inhibition1, df_inhibition2]
df_inhibition = pd.concat(inhibition_batches, ignore_index=True)

# Show the last rows of the combined table.
df_inhibition.tail()

In [None]:
# Row counts of the combined table and of each separate table, shown as a tuple.
tuple(len(frame) for frame in (df_inhibition, df_inhibition1, df_inhibition2))

<h2>Combine feature data and label data</h2>
<p>A row containing the ALDH1 inhibition of every molecule is added. If no inhibition data is available for a molecule, <br>the column of that molecule is removed from the DataFrame.</p>

In [None]:
# Label of the extra row that will hold the ALDH1 inhibition. It is derived from the
# number of descriptor rows instead of being hard-coded, so this cell keeps working
# when the number of descriptors in df_full changes.
label_row = len(df_full)

# Columns of df_full that do not represent a molecule. The descriptor-name column is
# labelled 0 when df_full was built in this notebook and "0" when it was re-read from
# csv; in the latter case the saved index additionally shows up as "Unnamed: 0".
non_molecule_columns = [col for col in df_full.columns if col in (0, "0", "Unnamed: 0")]
descriptor_name_column = 0 if 0 in df_full.columns else "0"

# Create list with all molecules reported in df_full,
# leaving out the non-molecule names.
all_molecules = [col for col in df_full.columns if col not in non_molecule_columns]

# Create list with all molecules reported in df_inhibition.
select_molecules = list(df_inhibition["SMILES"])

# Create list to store molecules in which are not found in the bigger dataset, if any.
molecule_not_found = []

# Inhibition value per SMILES string; if a SMILES string occurs more than once in
# df_inhibition, its first entry is used.
inhibition_by_smiles = (
    df_inhibition.drop_duplicates(subset="SMILES", keep="first")
    .set_index("SMILES")["ALDH1_inhibition"]
)

# Create new DataFrame with the content of df_full plus one extra, still empty (NaN),
# row for the ALDH1 inhibition data. Object dtype is used so that the "Yes"/"No"
# labels can share a column with the numerical descriptor values.
df_combined = (
    df_full.astype(object)
    .reset_index(drop=True)
    .reindex(pd.RangeIndex(label_row + 1))
)

# Loop through the molecules in df_inhibition and check whether the molecules can be
# found in df_full. If so, write "Yes"/"No" into the label row of df_combined.
# If a molecule from df_inhibition is NOT found in df_full, it is added to the list
# "molecule_not_found".
molecule_columns = set(all_molecules)
for molecule in select_molecules:
    if molecule in molecule_columns:
        is_inhibitor = inhibition_by_smiles[molecule] == 1
        df_combined.loc[label_row, molecule] = "Yes" if is_inhibitor else "No"
    else:
        molecule_not_found.append(molecule)

# Variable name "ALDH1_inhibition" is added to the descriptor-name column of df_combined.
df_combined.loc[label_row, descriptor_name_column] = "ALDH1_inhibition"

# Finally keep only the descriptor-name column and the molecules for which the ALDH1
# inhibition is known; every other column is removed from df_combined.
labelled_molecules = set(select_molecules)
columns_to_keep = [
    col
    for col in df_combined.columns
    if col == descriptor_name_column or col in labelled_molecules
]
df_combined = df_combined[columns_to_keep]

<p>Check how many molecules from df_inhibition were not found in df_full.</p>

In [None]:
# Stop here if any molecule from df_inhibition was collected in molecule_not_found.
not_found = len(molecule_not_found)
missing_message = f"{not_found} molecule(s) from df_inhibition was/were not in df_full."
assert not not_found, missing_message

<h2>Transpose df_combined and export it as csv</h2>

In [None]:
# Swap rows and columns of df_combined and write the result to disk.
df_transposed = df_combined.T
df_transposed.to_csv(path_or_buf="Nearly_useful_sheet.csv")

<h3><em>After "Nearly_useful_sheet.csv" has been saved, it has to be edited slightly <br>
        so that it is more practical to use for further purposes.<br>
        TODO: we might want to do this editing in a code cell as well, instead of manually.</em></h3>

<h2>Create DataFrame containing altered version of previously saved csv</h2>

<p>For the sake of variable analysis, the 2 non-numerical columns are removed.</p>

In [None]:
# DF3 holds the complete sheet, including the two non-numerical columns.
DF3 = pd.read_csv("Useful_sheet.csv")

# DF2 is DF3 without the "MolecularStructure" column; DF additionally leaves out the
# "ALDH1_inhibition" column. The explicit .copy() makes each frame independent of the
# frame it was sliced from, so that modifying DF later on neither raises a
# SettingWithCopyWarning nor depends on whether pandas returned a view or a copy.
DF2 = DF3.loc[:, DF3.columns != "MolecularStructure"].copy()
DF = DF2.loc[:, DF2.columns != "ALDH1_inhibition"].copy()

<h2>Scale the data using a MinMax scaler to prepare for PCA</h2>

In [None]:
# Rescale every column of DF with a MinMaxScaler fitted on DF itself.
scaler = MinMaxScaler()

# Build a new frame from the scaled values instead of writing them into DF in place:
# the scaler returns floats, and slice-assigning floats into integer columns forces an
# in-place dtype upcast, which pandas has deprecated. Row and column labels are kept.
DF = pd.DataFrame(scaler.fit_transform(DF), index=DF.index, columns=DF.columns)

<h2>Perform a PCA on DF, so that the given fraction of the data variance (= n_components) is retained.</h2>

In [None]:
# Fit the PCA on DF and project DF onto the fitted components.
pca = PCA(n_components=0.99)
principal_components = pca.fit(DF).transform(DF)
df_reduced = pd.DataFrame(principal_components)

# Show snippet of reduced DataFrame.
df_reduced.head()

<h2>Re-enter the previously removed columns</h2>

In [None]:
# Put the two columns that were removed before scaling back into df_reduced:
# "MolecularStructure" as the first column and "ALDH1_inhibition" as the last one.
# The position of the last column is derived from the current number of columns, as
# the number of components the PCA keeps is not fixed. The membership checks make it
# safe to run this cell more than once.
if "MolecularStructure" not in df_reduced.columns:
    df_reduced.insert(0, "MolecularStructure", DF3["MolecularStructure"])
if "ALDH1_inhibition" not in df_reduced.columns:
    df_reduced.insert(len(df_reduced.columns), "ALDH1_inhibition", DF3["ALDH1_inhibition"])

# Show snippet of final df_reduced.
df_reduced.head()

<h2>Export the final df_reduced as csv for further use</h2>

In [None]:
# Write the final table (including its index) to disk.
df_reduced.to_csv(path_or_buf="reduced_scaled_list_of_molecs.csv")