In [96]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint

In [97]:
csv_file = "reduced_scaled_list_of_molecs_v2.csv"
data = pd.read_csv(csv_file, header = 0)
print(data.head())

                                          Unnamed: 0       PC1       PC2  \
0  COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...  0.873691  1.011789   
1             O=C(CSc1nc2cccnc2n1Cc1ccccc1)NCc1ccco1 -0.070396  0.836068   
2  Cc1cccc2cc(C[NH+](CC3CCCO3)C(c3nnnn3Cc3ccco3)C...  0.568791  1.049937   
3                  CCN(CC)c1ccc2c(Cl)c(Br)c(=O)oc2c1 -0.358623 -0.421055   
4  CS(=O)(=O)N1CCc2cc(-c3csc(NC(=O)Cc4cccs4)n3)ccc21  0.437417  0.474394   

        PC3       PC4       PC5       PC6       PC7       PC8       PC9  ...  \
0 -0.182139 -0.223956  0.492136  0.135993 -0.127539  0.019637 -0.268770  ...   
1  0.068009 -0.372655 -0.195054  0.073084 -0.009113  0.026222  0.142134  ...   
2 -0.180732 -0.860063  0.427130 -0.503204 -0.012247  0.523686  0.085882  ...   
3  0.059214 -0.008505 -0.312905  0.392098  0.258758  0.156754  0.273009  ...   
4  0.312002 -0.176393 -0.255540  0.271514  0.048435  0.332471 -0.144189  ...   

       PC25      PC26      PC27      PC28      PC29      PC30 

In [98]:
# Verwijderen van irrelevante kolommen
#columns_to_drop = ["MolecularStructure"]  # Voeg hier de namen van de irrelevante kolommen toe
#data = data.drop(columns=columns_to_drop)
data = data.drop(data.columns[0], axis=1)

data = data.dropna() #verwijderd rijen met ontbrekende waarden
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data["ALDH1_inhibition"] = label_encoder.fit_transform(data["ALDH1_inhibition"]) #yes = 1 no = 0
print(data.head())

        PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0  0.873691  1.011789 -0.182139 -0.223956  0.492136  0.135993 -0.127539   
1 -0.070396  0.836068  0.068009 -0.372655 -0.195054  0.073084 -0.009113   
2  0.568791  1.049937 -0.180732 -0.860063  0.427130 -0.503204 -0.012247   
3 -0.358623 -0.421055  0.059214 -0.008505 -0.312905  0.392098  0.258758   
4  0.437417  0.474394  0.312002 -0.176393 -0.255540  0.271514  0.048435   

        PC8       PC9      PC10  ...      PC25      PC26      PC27      PC28  \
0  0.019637 -0.268770  0.096624  ...  0.194245 -0.039295 -0.026117  0.195499   
1  0.026222  0.142134 -0.404668  ...  0.138816  0.109786  0.078531 -0.168753   
2  0.523686  0.085882 -0.498935  ... -0.165430 -0.001814 -0.016709  0.321922   
3  0.156754  0.273009  0.205335  ... -0.076742  0.075923  0.206183  0.116790   
4  0.332471 -0.144189  0.215461  ... -0.017319  0.087436 -0.018026 -0.305317   

       PC29      PC30      PC31      PC32      PC33  ALDH1_inhibitio

In [99]:

X = data.drop(columns=["ALDH1_inhibition"])
y = data["ALDH1_inhibition"]

# 20% testset, 80% training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Schalen van de functies
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

In [100]:
# Importeren van het machine learning-model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=12, n_estimators=246)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [101]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluatie van het model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Uitvoer van de evaluatieresultaten
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.7873417721518987
Precision: 0.7341772151898734
Recall: 0.4793388429752066
F1 Score: 0.58


Tunen hyperparamters

In [102]:
parm_distribution = {'n_estimators': randint(50,500),
                    'max_depth': randint(1,20)}
rf_model = RandomForestClassifier()
randomized_parm_optimization = RandomizedSearchCV(rf_model,
                                                  param_distributions=parm_distribution,
                                                  n_iter=6,
                                                  cv=5)
randomized_parm_optimization.fit(X_train,y_train)

top_rf_model = randomized_parm_optimization.best_estimator_
print(top_rf_model)

RandomForestClassifier(max_depth=12, n_estimators=403)
