# Baseline, LOGIT, Random Forest, LDA, Cross-Validation CART, Bagging, GBC

In [153]:
# Dependencies
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [154]:
# Read the train and test splits of the letters dataset from the local data folder
training_data, testing_data = (
    pd.read_csv(csv_path)
    for csv_path in ("data/Letters_train.csv", "data/Letters_test.csv")
)

# Question 2 (25 points)

In [155]:
# Add the binary target to both splits: isB is 1 when the letter is 'B', else 0
for letters_df in (training_data, testing_data):
    letters_df['isB'] = (letters_df['letter'] == 'B').astype(int)


In [156]:
# Split the training data into the feature matrix X and the binary target y.
# 'Unnamed: 0' is excluded along with the two label columns so that X matches
# the feature set used for X_train in Part B (presumably a saved row index
# rather than a real predictor -- confirm against the CSV).
y = training_data['isB']
X = training_data.drop(columns=['Unnamed: 0', 'letter', 'isB'])

### Part A: Baseline Model (3 points)

In [157]:
# Q1A: majority-class baseline
# Predict the most frequent training value of isB for every test row.
most_common_label = training_data['isB'].mode().iloc[0]
n_test_rows = len(testing_data)
baseline_predictions = [most_common_label for _row in range(n_test_rows)]

# Both names hold the same score; baseline_1_acc is the one printed here.
baseline_accuracy = accuracy_score(testing_data['isB'], baseline_predictions)
baseline_1_acc = baseline_accuracy
print(f'Baseline Test Accuracy: {baseline_1_acc:.4f}')

Baseline Test Accuracy: 0.7743


### Part B: Logistic Regression (5 points)

In [158]:
# Q1B: logistic regression on the binary isB target
# Columns that are not used as predictors: 'Unnamed: 0' and the two label columns.
non_feature_columns = ['Unnamed: 0', 'letter', 'isB']

X_train, y_train = training_data.drop(columns=non_feature_columns), training_data['isB']
X_test, y_test = testing_data.drop(columns=non_feature_columns), testing_data['isB']

# Fit on the training split (fit returns the fitted estimator itself)
logistic_model = LogisticRegression(max_iter=1500, random_state=2023).fit(X_train, y_train)

# Score the hard class predictions on the held-out test split
predictions = logistic_model.predict(X_test)
model_1b_acc = accuracy_score(y_test, predictions)
print(f'Logistic Regression Test Accuracy: {model_1b_acc:.4f}')

Logistic Regression Test Accuracy: 0.9401


### Part C: AUC (2 points)

In [159]:
# Q1C: test-set AUC of the logistic regression model
# Keep only the second predict_proba column, i.e. the score for the second
# entry of logistic_model.classes_ (isB == 1 for this 0/1 target).
class_probabilities = logistic_model.predict_proba(X_test)
probabilities = class_probabilities[:, 1]

model_1b_auc = roc_auc_score(y_true=y_test, y_score=probabilities)
print(f'Logistic Regression Test AUC: {model_1b_auc:.4f}')

Logistic Regression Test AUC: 0.9785


### Part D: Cross-validated CART (5 points)

**Written Answer**: 

* Cross-Validation for `ccp_alpha`: I used `GridSearchCV` from `sklearn.model_selection` to perform 5-fold cross-validation. I defined a grid of 100 `ccp_alpha` values (between 0.0001 and 0.01) and cross-validated each of them to determine the one that yields the best cross-validated metric (accuracy). I used `.best_params_` to retrieve that value and plugged it into the `DecisionTreeClassifier`.

* Train the CART model: After determining the best `ccp_alpha`, train the `DecisionTreeClassifier` from sklearn.tree with this hyperparameter on the full training set.

* Evaluate the model: Make predictions on the test set and calculate the accuracy of the CART model using the `accuracy_score` function.

**The best ccp_alpha is 0.0011 and the CV CART test accuracy is 94.01%**

In [160]:
# Q1D: CART with ccp_alpha chosen by 5-fold cross-validation

# Unfitted base tree; it is the estimator handed to GridSearchCV below
cart_model = DecisionTreeClassifier(random_state=2023)

# Candidate ccp_alpha values: 100 evenly spaced points in [0.0001, 0.01]
ccp_alpha_range = np.linspace(0.0001, 0.01, 100)

# Setup the grid to be searched over
param_grid = {'ccp_alpha': ccp_alpha_range}

# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible
cv = KFold(n_splits=5, random_state=2023, shuffle=True)

cv_cart_model = GridSearchCV(cart_model, param_grid, cv=cv, scoring='accuracy')
cv_cart_model.fit(X_train, y_train)

# Best ccp_alpha value
model_1d_best_ccp_alpha = cv_cart_model.best_params_['ccp_alpha']

# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on the full training set, so reuse that fitted tree instead of
# fitting an identical one (same seed, same ccp_alpha, same data) a second time.
cart_model = cv_cart_model.best_estimator_

# Make predictions on the test set and score them
cart_predictions = cart_model.predict(X_test)
model_1d_acc = accuracy_score(y_test, cart_predictions)

print(f'CV CART Test Accuracy: {model_1d_acc:.4f}')
print(f'Best ccp_alpha: {model_1d_best_ccp_alpha:.4f}')

CV CART Test Accuracy: 0.9401
Best ccp_alpha: 0.0011


### Part E: Random Forest (5 points)


In [161]:
# Q1E: random forest with default hyperparameters (seeded for reproducibility)
rf = RandomForestClassifier(random_state=2023).fit(X_train, y_train)

# Accuracy of the forest's class predictions on the test split
rf_predictions = rf.predict(X_test)
model_1e_acc = accuracy_score(y_test, rf_predictions)
print(f'Random Forest Test Accuracy: {model_1e_acc:.4f}')

Random Forest Test Accuracy: 0.9840


### Part F: Performance Comparison (5 points)

**Written Answer**:

The Baseline model performs the worst of all, so it is not a good model compared to the others. Logistic regression and CART are equivalent in performance at 94%, whereas the Random Forest model performs much better at 98.4%. Logistic regression and CART are more interpretable: logistic regression provides coefficients that tell you the effect size of each feature, and CART provides a decision tree that can be visualized and understood even by non-experts. Random Forest is usually more accurate than CART and logistic regression because it builds multiple trees and averages their predictions, which can capture more complex patterns in the data.

The relative importance of accuracy and interpretability depends on the situation and, in particular, on who the intended audience is. In the current problem, which is letter recognition, the model might be used to sort mail, so accuracy is more important than interpretability.


In [162]:
# Q1F: side-by-side test accuracy of the four binary models
model_scores = [
    ('Baseline', baseline_1_acc),
    ('Logistic Regression', model_1b_acc),
    ('CART', model_1d_acc),
    ('Random Forest', model_1e_acc),
]

# Column-oriented dict: one list of model names, one list of matching scores
accuracies = {
    'Model': [name for name, score in model_scores],
    'Test Accuracy': [score for name, score in model_scores],
}
accuracy_df = pd.DataFrame(accuracies)
print(accuracy_df)

                 Model  Test Accuracy
0             Baseline       0.774332
1  Logistic Regression       0.940107
2                 CART       0.940107
3        Random Forest       0.983957


***
# Question 3 (50 points)

In [163]:
# Switch to the multiclass target: the letter column itself, for each split
y_train_multiclass, y_test_multiclass = training_data['letter'], testing_data['letter']

### Part A: Baseline Model (5 points)

In [164]:
# Q2A: majority-class baseline for the multiclass problem
# Always predict the letter that occurs most often in the training labels.
most_common_class = y_train_multiclass.mode().iloc[0]
n_test_labels = len(y_test_multiclass)
baseline_predictions_multiclass = [most_common_class for _row in range(n_test_labels)]

baseline_2_acc = accuracy_score(y_test_multiclass, baseline_predictions_multiclass)
print(f'Baseline Test Accuracy: {baseline_2_acc:.4f}')

Baseline Test Accuracy: 0.2439


### Part B: LDA (8 points)

In [165]:
# Q2B: linear discriminant analysis on the multiclass letter target
# fit returns the fitted estimator, so construction and training are chained.
lda_model = LinearDiscriminantAnalysis().fit(X_train, y_train_multiclass)

# Accuracy of the predicted letters on the test split
lda_predictions = lda_model.predict(X_test)
model_2b_acc = accuracy_score(y_test_multiclass, lda_predictions)
print(f'LDA Test Accuracy: {model_2b_acc:.4f}')

LDA Test Accuracy: 0.9102


### Part C: Cross-validated CART (10 points)

**Written Answer**:

The best ccp_alpha value from CV for the CART model is approximately 0.0006. The highest cross-validated accuracy achieved with this ccp_alpha is around 91.94%.


* Cross-Validation for `ccp_alpha`: I used `cross_val_score` from `sklearn.model_selection` to perform 5-fold cross-validation. I defined a grid of 100 `ccp_alpha` values (between 0.0001 and 0.01) and computed the mean cross-validated accuracy for each of them. I used `np.argmax` on those mean scores to pick the best value and plugged it into the `DecisionTreeClassifier`.

* Train the CART model: After determining the best `ccp_alpha`, train the `DecisionTreeClassifier` from sklearn.tree with this hyperparameter on the full training set.

* Evaluate the model: Make predictions on the test set and calculate the accuracy of the CART model using the `accuracy_score` function.

**The best ccp_alpha is approximately 0.0006 and the CV CART test accuracy is 92.94%**

In [166]:
# Q2C: multiclass CART with ccp_alpha chosen by 5-fold cross-validation

# Candidate ccp_alpha values: 100 evenly spaced points in [0.0001, 0.01]
ccp_alpha_range_multiclass = np.linspace(0.0001, 0.01, 100)

# Mean 5-fold CV accuracy for each candidate, in the same order as the grid
cv_scores = [
    cross_val_score(
        DecisionTreeClassifier(random_state=2023, ccp_alpha=alpha),
        X_train, y_train_multiclass, cv=5, scoring='accuracy',
    ).mean()
    for alpha in ccp_alpha_range_multiclass
]

# np.argmax returns the first maximum, so ties go to the smallest ccp_alpha
best_index = np.argmax(cv_scores)
best_ccp_alpha_multiclass = ccp_alpha_range_multiclass[best_index]
best_cv_score = cv_scores[best_index]

# Train the CART model with the best ccp_alpha on the full training set
cart_model_multiclass = DecisionTreeClassifier(random_state=2023, ccp_alpha=best_ccp_alpha_multiclass)
cart_model_multiclass.fit(X_train, y_train_multiclass)

# Make predictions on the test set and score them
cart_predictions_multiclass = cart_model_multiclass.predict(X_test)
model_2c_acc = accuracy_score(y_test_multiclass, cart_predictions_multiclass)

print(f'CART Test Accuracy: {model_2c_acc:.4f}')
# Show the selected alpha and its CV score so the figures quoted in the
# written answer can be reproduced from this cell's output.
print(f'Best ccp_alpha: {best_ccp_alpha_multiclass:.4f}')
print(f'Best CV Accuracy: {best_cv_score:.4f}')

CART Test Accuracy: 0.9294


### Part D: Vanilla Bagging (8 points)

In [167]:
# Q2D: vanilla bagging, implemented as a random forest whose max_features
# equals the number of feature columns (m = p)
total_features = len(X_train.columns)

bagging_model = RandomForestClassifier(random_state=2023, max_features=total_features)
bagging_model.fit(X_train, y_train_multiclass)

# Accuracy of the predicted letters on the test split
bagging_predictions = bagging_model.predict(X_test)
model_2d_acc = accuracy_score(y_test_multiclass, bagging_predictions)
print(f'No CV Random Forest Test Accuracy: {model_2d_acc:.4f}')

No CV Random Forest Test Accuracy: 0.9476


### Part E: Cross-validated Random Forest (10 points)

**Written Answer**: 

* Parameter Grid Setup: I define a parameter grid for `max_features` ranging from 1 to the total number of features present in the data.

* Grid Search with Cross-Validation: For the RF model, I used GridSearchCV, 5-fold cross-validation, with scoring parameter of 'accuracy'.

* Performing Grid Search: I call `grid_search.fit()` on the training data (`X_train, y_train_multiclass`), to do cross-validation for each possible value of `max_features`.

* Selecting the Best `max_features`: Retrieve the optimal `max_features` from `grid_search.best_params_`. This value is the one that, on average, produces the highest cross-validated accuracy score.

* Training the Final Model: Train a new RF model on the whole training set using the best `max_features` value.

* Testing the Model: using the best RF model (from train) to predict the classes on the test set (X_test), and use the `accuracy_score` function to compare the predictions (`rf_predictions`) against the actual values (`y_test_multiclass`) to calculate the test set accuracy.
The test set accuracy for the Random Forest model after cross-validation is 97.65%.

In [168]:
# Q2E: random forest with max_features chosen by 5-fold cross-validation

# Candidate values: every integer from 1 up to the number of features
param_grid = {'max_features': range(1, total_features + 1)}

# Unfitted, seeded base forest handed to the grid search
rf_grid_search = RandomForestClassifier(random_state=2023)

# Set up the grid search with cross-validation
grid_search = GridSearchCV(estimator=rf_grid_search, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train_multiclass)

# Get the best max_features value
best_max_features = grid_search.best_params_['max_features']

# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on the full training set, so reuse that fitted forest instead
# of fitting an identical one (same seed, same max_features, same data) again.
best_rf_model = grid_search.best_estimator_

# Make predictions on test set with the best model.
# Note: this rebinds rf_predictions, which Q1E used for the binary forest.
rf_predictions = best_rf_model.predict(X_test)

model_2e_acc = accuracy_score(y_test_multiclass, rf_predictions)
print(f'CV Random Forest Test Accuracy: {model_2e_acc:.4f}')

CV Random Forest Test Accuracy: 0.9765


### Part F: Gradient Boosting Classifier (9 points)

In [169]:
# Q2F: gradient boosting on the multiclass letter target
# Hyperparameters collected in one place: 200 boosting stages, at most 10
# leaves per tree, fixed seed.
gbc_settings = {'n_estimators': 200, 'max_leaf_nodes': 10, 'random_state': 2023}
gbc_model = GradientBoostingClassifier(**gbc_settings).fit(X_train, y_train_multiclass)

# Accuracy of the predicted letters on the test split
gbc_predictions = gbc_model.predict(X_test)
model_2f_acc = accuracy_score(y_test_multiclass, gbc_predictions)
print(f'GBC Test Accuracy: {model_2f_acc:.4f}')

GBC Test Accuracy: 0.9701


In [6]:
%%capture
# Export the notebook to HTML with nbconvert; %%capture silences the cell's output.
# NOTE(review): the notebook filename is hard-coded -- confirm it matches this file's actual name.
!jupyter nbconvert --to html ieor_142_hw4_starter_code.ipynb