In [25]:
import pandas as pd
import re

import warnings
warnings.filterwarnings("ignore")

# Data Analysis
## Putting Data Together


Here, I put together some input data for my algorithm. The goal of this algorithm is to model classification using all homework, lab, and project data to predict the midterm letter grade (A+ to E).

In [26]:
data = pd.read_csv('pre_mt1_train.csv')

In [27]:
input_data = data.filter(regex='^(Homework|Lab|Project).*')

In [28]:
regr_label_data = data[['Midterm 1']]

In [30]:
test_data = pd.read_csv('pre_mt1_test.csv')

In [31]:
test_input_data = test_data.filter(regex='^(Homework|Lab|Project).*')

In [32]:
test_regr_label_data = test_data[['Midterm 1']]

In [38]:
combined_data = pd.concat([data, test_data], axis=0)

combined_data_encoded = pd.get_dummies(combined_data[['Midterm 1 Grade']])

class_label_data = combined_data_encoded[:len(data)]
test_class_label_data = combined_data_encoded[len(data):]

## Classification


Here, I train 3 different classification models to the data: 

In [39]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

X = input_data.to_numpy()
y = class_label_data.to_numpy()


param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'chebyshev']
}


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

knn = KNeighborsClassifier()

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_scaled, y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_knn_model = grid_search.best_estimator_

print("Best parameters:", best_params)
print("Best cross-validation score:", best_score)

Best parameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Best cross-validation score: 0.1603733815115929


In [42]:
from sklearn.metrics import accuracy_score

X_test = test_input_data.to_numpy()
y_test = test_class_label_data.to_numpy()
y_pred = best_knn_model.predict(X_test)

print("Test accuracy:", accuracy_score(y_test, y_pred))

Test accuracy: 0.10784313725490197


Overall, it looks like KNN isn't great for this dataset. The best I could get was 16% cross-validation accuracy and 10% for test set accuracy. We can do better.

In [46]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

X = input_data.to_numpy()
y = class_label_data.to_numpy()


param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


random_forest_clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(random_forest_clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

best_random_forest_clf = grid_search.best_estimator_

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation score: 0.02953929539295393


In [47]:
from sklearn.metrics import accuracy_score

X_test = test_input_data.to_numpy()
y_test = test_class_label_data.to_numpy()
y_pred = best_random_forest_clf.predict(X_test)

print("Test accuracy:", accuracy_score(y_test, y_pred))

Test accuracy: 0.058823529411764705


The Random Forest does even worse. Ugh.

In [49]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

X = input_data.to_numpy()
y = class_label_data.to_numpy()

random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
knn = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_neighbors=3, metric='manhattan', weights='distance'))])

ensemble = VotingClassifier(estimators=[('rf', random_forest), ('knn', knn)], voting='soft')

ensemble.fit(X, y)

NotImplementedError: Multilabel and multi-output classification is not supported.

In [None]:
X_test = test_input_data.to_numpy()
y_test = test_class_label_data.to_numpy()

accuracy = ensemble.score(X_test, y_test)
print("Ensemble accuracy:", accuracy)

## Regression


Here's where the juicy stuff happens. I used a random forest regressor to model this data, as I've never used it before and kinda wanted to try it. I also think it somewhat fits our data. I divided the data with a train-test split of 80/20. I did a grid search over the parameters in `param_grid` to get the best set of parameters for our model. You can see the calculated best parameters below.

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

X = input_data.to_numpy()
y = regr_label_data.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


random_forest_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=random_forest_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")

best_random_forest_model = grid_search.best_estimator_

It looks like it did pretty well. The values for R^2 and MSE aren't bad at all, and the sample prediction, taken from the test set, is pretty dang close. It would make more sense to divide predictions for Midterm 1, Midterm 2, and the Final grades into separate models, as those things are temporally dependent on each other. However, even without that consideration, this model did really well.

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

y_pred = best_random_forest_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")

mse = mean_squared_error(y_test, y_pred)
print(f"Mean squared error: {mse}")

print(f"Student 1 predicted values (Midterm 1, Midterm 2, Final Exam, Final Score): {best_random_forest_model.predict(X_test[0,:].reshape(1, -1))}")
print(f"Student 1 actual values (Midterm 1, Midterm 2, Final Exam, Final Score): {y_test[0,:]}")