In [1]:
# notebooks/model_experimentation.ipynb

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the two preprocessed .npy arrays (training set and test set) into memory.
# NOTE(review): both paths look like placeholders -- confirm the real locations.
train_data, test_data = (
    np.load('path/to/preprocessed_data.npy'),
    np.load('path/to/test_data.npy'),
)

In [3]:
# Wrap each raw array in a DataFrame. Every column except the last is named
# feature_<position>; the last column is labelled 'target'. The column count is
# taken from each array's own shape[1].
train_df = pd.DataFrame(
    train_data,
    columns=[*('feature_' + str(col_idx) for col_idx in range(train_data.shape[1] - 1)), 'target'],
)
test_df = pd.DataFrame(
    test_data,
    columns=[*('feature_' + str(col_idx) for col_idx in range(test_data.shape[1] - 1)), 'target'],
)

In [4]:
# Separate model inputs from labels for both splits.
# Feature matrices: everything except the 'target' column.
X_train = train_df.drop(columns='target')
X_test = test_df.drop(columns='target')

# Label vectors: the 'target' column on its own.
y_train = train_df['target']
y_test = test_df['target']

In [7]:
# Baseline Random Forest classifier (library-default hyperparameters, seed 42)

# Binarise the targets: values strictly greater than 0.5 become class 1,
# everything else becomes class 0.
y_train_encoded, y_test_encoded = (
    np.where(labels > 0.5, 1, 0) for labels in (y_train, y_test)
)

# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(random_state=42).fit(X=X_train, y=y_train_encoded)
rf_predictions = rf_model.predict(X=X_test)

# Report the four metrics, computed on the test split.
print("Random Forest Classifier:")
print("Accuracy:", accuracy_score(y_true=y_test_encoded, y_pred=rf_predictions))
print("Precision:", precision_score(y_true=y_test_encoded, y_pred=rf_predictions))
print("Recall:", recall_score(y_true=y_test_encoded, y_pred=rf_predictions))
print("F1 Score:", f1_score(y_true=y_test_encoded, y_pred=rf_predictions))

Random Forest Classifier:
Accuracy: 0.955
Precision: 0.9540229885057471
Recall: 0.9431818181818182
F1 Score: 0.9485714285714286


In [8]:
# Baseline XGBoost classifier (library-default hyperparameters, seed 42)

# Binarise the targets: values strictly greater than 0.5 become class 1,
# everything else becomes class 0.
y_train_encoded, y_test_encoded = (
    np.where(labels > 0.5, 1, 0) for labels in (y_train, y_test)
)

# fit() returns the estimator itself, so construction and training are chained.
xgb_model = XGBClassifier(random_state=42).fit(X=X_train, y=y_train_encoded)
xgb_predictions = xgb_model.predict(X=X_test)

# Report the four metrics, computed on the test split.
print("XGBoost Classifier:")
print("Accuracy:", accuracy_score(y_true=y_test_encoded, y_pred=xgb_predictions))
print("Precision:", precision_score(y_true=y_test_encoded, y_pred=xgb_predictions))
print("Recall:", recall_score(y_true=y_test_encoded, y_pred=xgb_predictions))
print("F1 Score:", f1_score(y_true=y_test_encoded, y_pred=xgb_predictions))

XGBoost Classifier:
Accuracy: 0.96
Precision: 0.9444444444444444
Recall: 0.9659090909090909
F1 Score: 0.9550561797752809


In [10]:
# Grid search over Random Forest hyperparameters

# Candidate values: 3 x 3 x 3 = 27 combinations, each scored with 5-fold CV.
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

# Binarise the training target: strictly above 0.5 -> 1, otherwise 0.
y_train_encoded = np.where(y_train > 0.5, 1, 0)

# Exhaustive search ranked by accuracy; fit() returns the fitted search object.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train_encoded)

# best_score_ is the cross-validated score on the training data, not a test-set score.
print("Best parameters for Random Forest Classifier:", rf_grid.best_params_)
print("Best score for Random Forest Classifier:", rf_grid.best_score_)

Best parameters for Random Forest Classifier: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
Best score for Random Forest Classifier: 0.9262499999999999


In [11]:
# Grid search over XGBoost hyperparameters

# Candidate values: 3 x 3 x 3 = 27 combinations, each scored with 5-fold CV.
xgb_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
)

# Binarise the training target: strictly above 0.5 -> 1, otherwise 0.
y_train_encoded = np.where(y_train > 0.5, 1, 0)

# Exhaustive search ranked by accuracy; fit() returns the fitted search object.
xgb_grid = GridSearchCV(
    XGBClassifier(random_state=42),
    xgb_params,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train_encoded)

# best_score_ is the cross-validated score on the training data, not a test-set score.
print("Best parameters for XGBoost Classifier:", xgb_grid.best_params_)
print("Best score for XGBoost Classifier:", xgb_grid.best_score_)

Best parameters for XGBoost Classifier: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Best score for XGBoost Classifier: 0.9287500000000002
