# Overview
- Train a basic logistic regression 
    - cross validation
    - Evaluate model accuracy score
- Train multiple models to identify best 
- Train winning model using grid search and cv

In [15]:
# Import necessary packages
from utils import (
    pd, 
    accuracy_score, f1_score, GridSearchCV,
    LogisticRegression,
    RandomForestClassifier, GradientBoostingClassifier,
    SVC, KNeighborsClassifier,
    DecisionTreeClassifier, GaussianNB
)


In [16]:
# Import processed data
train_df = pd.read_csv("../data/processed/train_data.csv")
test_df = pd.read_csv("../data/processed/test_data.csv")

In [17]:
# Split input features and target 
X_train = train_df.drop(['label', 'subject_id', 'trial_id'], axis=1) 
y_train = train_df['label']
X_test = test_df.drop(['label', 'subject_id', 'trial_id'], axis=1)
y_test = test_df['label']

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((168, 256), (168,), (42, 256), (42,))

In [4]:
# Train basic logistic regression model
from sklearn.model_selection import cross_val_score

# Initialize model
model = LogisticRegression()

# Perform 5-fold cross validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5)

# Print mean and standard deviation of CV scores
print(f"Cross-validation scores: {cv_scores}")
print(f"Average CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# Fit the model on full training data
model.fit(X_train, y_train)

Cross-validation scores: [0.5        0.52941176 0.52941176 0.51515152 0.51515152]
Average CV score: 0.518 (+/- 0.022)


In [5]:
# Make predictions
y_pred = model.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
#f1_score = f1_score(y_test, y_pred)
accuracy

0.38095238095238093

In [12]:
# Create dictionary of multiple models to try
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Naive Bayes': GaussianNB()
}

In [19]:
scores = []

for name, model in models.items(): 
    this_model = model
    this_model.fit(X_train, y_train)
    y_pred = this_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    #f1_score = f1_score(y_test, y_pred)
   
    scores.append(f"accuracy {name}: {accuracy}")
    
scores

['accuracy Logistic Regression: 0.38095238095238093',
 'accuracy Random Forest: 0.6428571428571429',
 'accuracy Gradient Boosting: 0.6666666666666666',
 'accuracy SVM: 0.5714285714285714',
 'accuracy KNN: 0.5476190476190477',
 'accuracy Decision Tree: 0.6190476190476191',
 'accuracy Naive Bayes: 0.5']

In [18]:

# Define parameter grids for both models
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10]
}

rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV for both models
gb_grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=gb_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=rf_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# Fit both grid searches
print("Fitting Gradient Boosting...")
gb_grid_search.fit(X_train, y_train)
print("Fitting Random Forest...")
rf_grid_search.fit(X_train, y_train)

# Print best parameters and scores for both models
print("\nGradient Boosting Results:")
print("Best parameters:", gb_grid_search.best_params_)
print("Best cross-validation score:", gb_grid_search.best_score_)

print("\nRandom Forest Results:")
print("Best parameters:", rf_grid_search.best_params_)
print("Best cross-validation score:", rf_grid_search.best_score_)

# Get best models and make predictions
best_gb_model = gb_grid_search.best_estimator_
best_rf_model = rf_grid_search.best_estimator_

gb_y_pred = best_gb_model.predict(X_test)
rf_y_pred = best_rf_model.predict(X_test)

print("\nTest accuracies:")
print("Gradient Boosting:", accuracy_score(y_test, gb_y_pred))
print("Random Forest:", accuracy_score(y_test, rf_y_pred))


Fitting Gradient Boosting...
Fitting Random Forest...

Gradient Boosting Results:
Best parameters: {'learning_rate': 0.2, 'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validation score: 0.7443850267379679

Random Forest Results:
Best parameters: {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best cross-validation score: 0.6844919786096257

Test accuracies:
Gradient Boosting: 0.6666666666666666
Random Forest: 0.7142857142857143


# Key Takeways
- Logistic Regression Accuracy Score: .39
- Best Performing Models: Gradient Boosting and Random Forest
- Winning Model after hyperparameter tuning: Random Forest with Accuracy of .71