In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np

# read the csv
train_data = pd.read_csv("/content/drive/MyDrive/1:1_Neel Banga/Datasets/train.csv")

In [4]:
# seperate features and labels
X = train_data.loc[:, [cols for cols in list(train_data.columns) if cols != "pIC50"]].values
Y = train_data.loc[:, "pIC50"].values

### 5 fold cross validation for different models

In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

def cross_validation(model, data = (X, Y), splits = 5):
    kf = KFold(n_splits=splits, shuffle=True, random_state=42)

    # Perform k-fold cross-validation
    rmse_scores = []
    r2_scores = []

    for train_index, valid_index in kf.split(data[0]):
        X_train, X_valid = data[0][train_index], data[0][valid_index]
        y_train, y_valid = data[1][train_index], data[1][valid_index]

        # Fit the KNN model on the training data
        model.fit(X_train, y_train)

        # Make predictions on the test data
        y_pred = model.predict(X_valid)

        # Calculate Mean Squared Error (MSE)
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        r2 = r2_score(y_valid, y_pred)
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    # get arrays
    rmse_set = np.array(rmse_scores)
    r2_set = np.array(r2_scores)

    #print("RMSE scores: {}".format(rmse_set))
    #print("R2 scores: {}".format(r2_set))
    #print("Mean RMSE: {}".format(rmse_set.mean()))
    #print("Mean r2 score: {}".format(r2_set.mean()))

    return rmse_set.mean(), r2_set.mean()

### KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Use 5-fold cross validation for hyper-parameter tuning
# Try out different values and choose the best hyper-parameters


knn = KNeighborsRegressor(n_neighbors=5)
rmse, r2 = cross_validation(knn)

RMSE scores: [0.90525947 0.86123095 0.86315038 0.89105987 0.86343273]
R2 scores: [0.56940441 0.62222656 0.6420847  0.5872566  0.62534779]
Mean RMSE: 0.8768266791277082
Mean r2 score: 0.609264010551654


### RF

In [None]:
from sklearn.ensemble import RandomForestRegressor


# Use 5-fold cross validation for hyper-parameter tuning
# Try out different values and choose the best hyper-parameters

n=10
for d in range(5, 8):
  while n<=100:
    random_forest = RandomForestRegressor(n_estimators=100, max_depth=7)
    rmse, r2 = cross_validation(random_forest)
    print(f'Trees: {d}; Depth{n}: RMSE: {rmse}, R2: {r2}')
    n+=10
  else:
    n=10


print("CSV file created successfully!")

Trees: 5; Depth10: RMSE: 0.9294637310613465, R2: 0.5614950826367888
Trees: 5; Depth20: RMSE: 0.9303877148536905, R2: 0.5605677784068284
Trees: 5; Depth30: RMSE: 0.929755231213916, R2: 0.5611936038479183
Trees: 5; Depth40: RMSE: 0.9300494187717941, R2: 0.5609823664604419
Trees: 5; Depth50: RMSE: 0.9289780599910383, R2: 0.5619998087156873
Trees: 5; Depth60: RMSE: 0.9312422633103707, R2: 0.5597930963865919
Trees: 5; Depth70: RMSE: 0.9308026956422054, R2: 0.5601947795744785
Trees: 5; Depth80: RMSE: 0.9294867083304693, R2: 0.5615543856975327
Trees: 5; Depth90: RMSE: 0.9289446464838396, R2: 0.5619213516313628
Trees: 5; Depth100: RMSE: 0.9302189322178351, R2: 0.5607528173443018
Trees: 6; Depth10: RMSE: 0.9278029211230662, R2: 0.5630434024237216
Trees: 6; Depth20: RMSE: 0.929263455392887, R2: 0.5616532390624266
Trees: 6; Depth30: RMSE: 0.9284556716870052, R2: 0.5624485175164966
Trees: 6; Depth40: RMSE: 0.9302243901037086, R2: 0.5608110789946422
Trees: 6; Depth50: RMSE: 0.9265460921124822, R2: 

### Save the best Model

In [None]:
import pickle
import os

MODEL_SAVE_FOLDER = "/content/drive/MyDrive/1:1_Neel Banga/Models"
MODEL_NAME = "best_model_KNN"

# Model with the lowest RMSE or highest r2; Comment the following line of code if best model is RF
best_model = KNeighborsRegressor(n_neighbors=10)
# Uncomment the following line of code if best model is RF and update the model name in the variable named "MODEL_NAME"
#best_model = RandomForestRegressor(n_estimators=50, max_depth=7)

# Fit the model on the full training dataset
best_model.fit(X, Y)

fh = open(os.path.join(MODEL_SAVE_FOLDER, MODEL_NAME), "wb")
pickle.dump(best_model, fh)
fh.close()