In [1]:

import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split




In [2]:

# Function to read and preprocess data
def read_and_preprocess(file_path, header=None):
    """Load one Excel workbook into a DataFrame.

    Despite the name, no preprocessing happens here: the frame produced by
    ``pd.read_excel`` is handed back untouched.

    Parameters
    ----------
    file_path : path to the workbook, forwarded to ``pd.read_excel``.
    header : forwarded to ``pd.read_excel`` as ``header``; defaults to ``None``.

    Returns
    -------
    Whatever ``pd.read_excel(file_path, header=header)`` returns.
    """
    return pd.read_excel(file_path, header=header)

# Data location. It is assigned here, ahead of its first use, so that the
# notebook runs top-to-bottom on a fresh kernel (Restart Kernel -> Run All);
# without this the loads below raise NameError because the constants cell that
# also sets ROOT_DIR comes later in the notebook.
ROOT_DIR = "data"

# Load data
# Each workbook is read with header=1, i.e. the second spreadsheet row supplies
# the column names.
X_Dam = read_and_preprocess(os.path.join(ROOT_DIR, "Dam dispensing.xlsx"), header=1)
X_AutoClave = read_and_preprocess(os.path.join(ROOT_DIR, "Auto clave.xlsx"), header=1)
X_Fill1 = read_and_preprocess(os.path.join(ROOT_DIR, "Fill1 dispensing.xlsx"), header=1)
X_Fill2 = read_and_preprocess(os.path.join(ROOT_DIR, "Fill2 dispensing.xlsx"), header=1)

# Labels; joined to the features on "Set ID" further down.
y = pd.read_csv(os.path.join(ROOT_DIR, "train_y.csv"))


In [3]:
# Constants
# NOTE(review): ROOT_DIR is read by the data-loading code earlier in the
# notebook, so it has to exist before that code runs.
ROOT_DIR = "data"
RANDOM_STATE = 110  # seed used for the train/validation split and the forest


def _tag_stage_columns(frame, stage):
    """Return ``frame`` with `` - <stage>`` appended to every column but "Set ID".

    The join key "Set ID" is left as-is so the per-stage frames can be merged
    on it. Labels that already end with the suffix are left alone, which makes
    re-running this cell a no-op instead of producing "... - Dam - Dam" and a
    failing merge. Non-string labels are formatted with ``str`` rather than
    raising ``TypeError`` on ``label + suffix``.

    NOTE(review): a raw column whose name already ends in `` - <stage>`` is
    therefore not suffixed a second time; confirm no input file has such a name.
    """
    suffix = f" - {stage}"

    def _tag(label):
        if label == "Set ID" or str(label).endswith(suffix):
            return label
        return f"{label}{suffix}"

    return frame.rename(columns=_tag)


# Merge data
# Suffix each process stage's columns so they stay distinguishable after the join.
X_Dam = _tag_stage_columns(X_Dam, "Dam")
X_AutoClave = _tag_stage_columns(X_AutoClave, "AutoClave")
X_Fill1 = _tag_stage_columns(X_Fill1, "Fill1")
X_Fill2 = _tag_stage_columns(X_Fill2, "Fill2")

# Join the four stages on the shared key (pd.merge default join type).
X = pd.merge(X_Dam, X_AutoClave, on="Set ID")
X = pd.merge(X, X_Fill1, on="Set ID")
X = pd.merge(X, X_Fill2, on="Set ID")

# Merge with target variable
df = pd.merge(X, y, on="Set ID")


In [4]:

# Split into train and validation set
# 30% of the rows are held out; the split is stratified on the "target" column
# and seeded with RANDOM_STATE so it is repeatable.
df_train, df_val = train_test_split(
    df,
    test_size=0.3,
    stratify=df["target"],
    random_state=RANDOM_STATE,
)

# Function to preprocess data and extract features
def preprocess_and_extract_features(df_train, df_val):
    """Pick the integer-castable feature columns and split off the label.

    Every column of ``df_train`` except the identifier ("Set ID") and the label
    ("target") is cast to ``int`` in both frames. A column becomes a feature
    only if the cast succeeds for the training *and* the validation data;
    otherwise it is skipped and left unconverted in both.

    The label is excluded from the candidates explicitly: an integer-encoded
    "target" would otherwise pass the cast and leak into the feature matrix.

    The caller's frames are not modified; casts are applied to copies.

    NOTE(review): ``astype(int)`` drops the fractional part of float columns.
    Confirm that is the intended treatment for continuous measurements.

    Parameters
    ----------
    df_train, df_val : DataFrames that both contain a "target" column.

    Returns
    -------
    tuple
        ``(train_x, train_y, val_x, val_y, features)`` where ``features`` is the
        list of selected column names in ``df_train`` column order, the ``*_x``
        frames hold those columns cast to int, and the ``*_y`` series are the
        "target" columns.

    Raises
    ------
    KeyError
        If either frame has no "target" column.
    """
    non_feature_columns = ("Set ID", "target")

    # Work on copies so the caller's frames keep their original dtypes.
    train = df_train.copy()
    val = df_val.copy()

    features = []
    for col in df_train.columns:
        if col in non_feature_columns:
            continue
        try:
            # Cast both sides before assigning either, so a failure on the
            # validation side cannot leave the training side half-converted.
            train_as_int = train[col].astype(int)
            val_as_int = val[col].astype(int)
        except (KeyError, ValueError, TypeError, OverflowError):
            # Missing in df_val, non-numeric text, NaN/inf, or an unsupported
            # dtype: this column is not usable as an integer feature.
            continue
        train[col] = train_as_int
        val[col] = val_as_int
        features.append(col)

    return train[features], train["target"], val[features], val["target"], features

# Extract features and target
# Yields the model inputs/labels for both splits plus the list of column names
# that were kept as features.
(train_x, train_y, val_x, val_y, features) = preprocess_and_extract_features(
    df_train,
    df_val,
)


In [None]:

# Hyperparameter tuning using GridSearchCV
# The grid below spans 3 * 4 * 3 * 3 = 108 parameter combinations, each scored
# with 5-fold cross-validation on the training split. No `scoring` is passed,
# so the estimator's default scorer decides the winner.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=RANDOM_STATE),
    param_grid,
    cv=5,
    n_jobs=-1,  # use every available core
    verbose=2,
)
grid_search.fit(train_x, train_y)

# Best parameters and model
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_


In [None]:

# Evaluate on validation set
# Score the tuned model on the held-out rows; each metric is computed and then
# printed before moving on to the next one.
val_pred = best_model.predict(val_x)

val_accuracy = accuracy_score(val_y, val_pred)
print(f"Validation Accuracy: {val_accuracy}")

val_report = classification_report(val_y, val_pred)
print(f"Classification Report:\n{val_report}")

val_confusion = confusion_matrix(val_y, val_pred)
print(f"Confusion Matrix:\n{val_confusion}")


In [None]:

# Calculate F1 score
# f1_score comes from the notebook's top import cell, alongside the other
# sklearn.metrics helpers, rather than being imported mid-notebook.
# average='weighted' combines the per-class F1 values into one number.
f1 = f1_score(val_y, val_pred, average='weighted')
print(f"F1 Score: {f1}")


In [None]:

# Save the best model
# The tuned estimator is written to the current working directory.
model_filename = "best_random_forest_model.joblib"
joblib.dump(value=best_model, filename=model_filename)

# Output best parameters and validation accuracy
# Left as the cell's last expression so the notebook displays the pair.
(best_params, best_model.score(val_x, val_y))
    