In [1]:
import os
import json
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
def load_data(path='C:/Users/Kimberly/Documents/Fall 2024/ML CLSE/Assignment 4/Lipophilicity.csv'):
    """Read the lipophilicity dataset from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the original author's local
        copy so existing ``load_data()`` calls keep working; pass an explicit
        path to run the notebook on another machine.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents, exactly as returned by ``pd.read_csv``.
    """
    return pd.read_csv(path)

In [3]:
def featurize_data(data):
    """Convert SMILES strings into Morgan fingerprints with matching targets.

    Each entry of ``data['smiles']`` is parsed with RDKit and turned into a
    radius-2, 1024-bit Morgan fingerprint. Rows that cannot be parsed are
    dropped from BOTH the features and the targets, so the two returned
    objects always have the same number of rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'smiles'`` column and an ``'exp'`` column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``np.array`` of the fingerprints and ``y``
        is the ``'exp'`` column restricted to the rows that were featurized
        (original index labels are preserved).
    """
    fingerprints = []
    parsed_ok = []  # one flag per input row, aligned with data['smiles']
    for smile in data['smiles']:
        # Missing values arrive as NaN (a float); only hand real text to RDKit.
        mol = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None
        if mol is None:
            parsed_ok.append(False)
            continue
        fingerprints.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024))
        parsed_ok.append(True)

    # Apply the same row filter to the targets; previously the full column was
    # returned, which went out of step with X when any SMILES failed to parse.
    targets = data['exp'][np.array(parsed_ok, dtype=bool)]
    return np.array(fingerprints), targets

In [4]:
def train_model(X_train, y_train, n_estimators, max_depth, random_state=42):
    """Fit a random-forest regressor on the training data.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix, passed straight to ``fit``.
    y_train : array-like
        Training targets, passed straight to ``fit``.
    n_estimators : int
        Forwarded to ``RandomForestRegressor(n_estimators=...)``.
    max_depth : int or None
        Forwarded to ``RandomForestRegressor(max_depth=...)``.
    random_state : int or None, optional
        Seed forwarded to the forest so repeated runs give the same model.
        Defaults to 42; pass ``None`` to get unseeded behaviour.

    Returns
    -------
    RandomForestRegressor
        The fitted model.
    """
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(X_train, y_train)
    return model

In [5]:
def main(n_estimators, max_depth, output_path="results.txt"):
    """Run the full pipeline: load, featurize, train, evaluate, and save.

    Parameters
    ----------
    n_estimators : int
        Passed through to ``train_model``.
    max_depth : int or None
        Passed through to ``train_model``.
    output_path : str or path-like, optional
        File that receives the JSON-encoded results. Defaults to
        ``"results.txt"`` in the current working directory.

    Returns
    -------
    dict
        The same dictionary that is written to ``output_path``: the test-set
        RMSE, the value of the ``CONDA_DEFAULT_ENV`` environment variable
        (``None`` when it is unset), and the hyperparameters used.
    """
    # Load and featurize data
    data = load_data()
    X, y = featurize_data(data)
    # Hold out 20% of the rows for evaluation; the split itself is seeded.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model
    model = train_model(X_train, y_train, n_estimators, max_depth)

    # Predict and calculate RMSE (cast to a plain float so the value is a
    # built-in type both in the returned dict and in the JSON output)
    y_pred = model.predict(X_test)
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

    # Save results
    env_name = os.getenv("CONDA_DEFAULT_ENV")
    results = {
        "RMSE": rmse,
        "Conda Environment": env_name,
        "Hyperparameters": {
            "n_estimators": n_estimators,
            "max_depth": max_depth
        }
    }

    with open(output_path, "w") as f:
        json.dump(results, f)

    # Hand the results back so a notebook cell can display or reuse them.
    return results

# Run configuration: the hyperparameters are set here directly instead of being
# parsed from command-line arguments. If you switch to input(), wrap the call in
# int(...) because input() returns a string.
n_estimators = 100  # forwarded to the random forest; or use int(input("Enter number of estimators: "))
max_depth = 10      # forwarded to the random forest; or use int(input("Enter max depth: "))
main(n_estimators, max_depth)

