In [1]:
import pandas as pd
import re

# Data Cleaning

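The cleaning steps for the raw data:
- Removed the weird parenthesized numbers after lab and homework names in the column headers, as I didn't like them.
- Removed rows with "(read only)" in them (I think it was only one). **Note:** this step is currently commented out in the cell below.
- Replaced grades with enumerated values from the `grade_mapping` dictionary. **Note:** this step is also currently commented out in the cell below.
- Filled empty values with 0s.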

In [2]:
# Load the raw training set.
data = pd.read_csv('Training_set.csv')

# Strip a whitespace character followed by a parenthesized group from every
# column name. The pattern is a raw string: in a normal string literal '\s'
# and '\(' are invalid escape sequences and trigger a warning on modern Python.
data = data.rename(columns=lambda name: re.sub(r'\s\([^)]*\)', '', name))

# The two cleaning steps below are disabled; they are kept for reference.

# data = data[~data.isin(['(read only)']).any(axis=1)]

# grade_mapping = {'A': 0, 'A-': 1, 'B+': 2, 'B': 3, 'B-': 4,
#                  'C+': 5, 'C': 6, 'C-': 7, 'D+': 8, 'D': 9,
#                  'D-': 10, 'E': 11}

# for column in data.columns:
#     if column.endswith('Grade'):
#         data[column] = data[column].map(grade_mapping)

# Treat missing entries as a score of 0. Reassigning (rather than mutating
# in place) keeps this cell safe to re-run.
data = data.fillna(0)
print(data.columns)

Index(['Lab 02', 'Lab 03', 'Lab 04', 'Lab 05', 'Lab 06', 'Lab 07', 'Lab 08',
       'Lab 09', 'Lab 10', 'Lab 11', 'Lab 12', 'Lab 13', 'Lab 14', 'Lab 15',
       'Lab 16', 'Lab 17', 'Lab 20', 'Lab 21', 'Lab 22', 'Lab 23',
       'Homework 1', 'Homework 2', 'Homework 3', 'Homework 4', 'Homework 5',
       'Homework 6', 'Homework 7', 'Project 1', 'Project 2A', 'Project 2B',
       'Project 3', 'Project 4', 'Final Exam'],
      dtype='object')


In [3]:
# Each row is one student; each column is one graded item.
n_students, n_columns = data.shape
print(f'Number of students in the class: {n_students}')
print(f'Number of columns: {n_columns}')

Number of students in the class: 406
Number of columns: 33


- This is only data about labs.

In [4]:
# Keep only the columns whose name starts with "Lab".
lab_data = data.filter(regex='^Lab', axis='columns')
print(lab_data.columns)

Index(['Lab 02', 'Lab 03', 'Lab 04', 'Lab 05', 'Lab 06', 'Lab 07', 'Lab 08',
       'Lab 09', 'Lab 10', 'Lab 11', 'Lab 12', 'Lab 13', 'Lab 14', 'Lab 15',
       'Lab 16', 'Lab 17', 'Lab 20', 'Lab 21', 'Lab 22', 'Lab 23'],
      dtype='object')


- This is only data about homework.

In [5]:
# Keep only the columns whose name starts with "Homework".
hw_data = data.filter(regex='^Homework', axis='columns')
print(hw_data.columns)

Index(['Homework 1', 'Homework 2', 'Homework 3', 'Homework 4', 'Homework 5',
       'Homework 6', 'Homework 7'],
      dtype='object')


- This is only data about projects.

In [6]:
# Keep only the columns whose name starts with "Project".
proj_data = data.filter(regex='^Project', axis='columns')
print(proj_data.columns)

Index(['Project 1', 'Project 2A', 'Project 2B', 'Project 3', 'Project 4'], dtype='object')


# Data Analysis

Here, I put together the input data for my algorithm. The goal of this algorithm is to predict the grades (Midterm 1, Midterm 2, Final Exam, and Final Score) from all of the homework, lab, and project data.

In [7]:
# Model inputs: every homework, lab, and project column.
input_data = data.filter(regex='^(Homework|Lab|Project).*', axis='columns')
print(input_data.columns)

Index(['Lab 02', 'Lab 03', 'Lab 04', 'Lab 05', 'Lab 06', 'Lab 07', 'Lab 08',
       'Lab 09', 'Lab 10', 'Lab 11', 'Lab 12', 'Lab 13', 'Lab 14', 'Lab 15',
       'Lab 16', 'Lab 17', 'Lab 20', 'Lab 21', 'Lab 22', 'Lab 23',
       'Homework 1', 'Homework 2', 'Homework 3', 'Homework 4', 'Homework 5',
       'Homework 6', 'Homework 7', 'Project 1', 'Project 2A', 'Project 2B',
       'Project 3', 'Project 4'],
      dtype='object')


In [15]:
# Model targets: the two midterms, the final exam, and the final score.
target_columns = ['Midterm 1', 'Midterm 2', 'Final Exam', 'Final Score']
label_data = data.loc[:, target_columns]
print(label_data.columns)

Index(['Midterm 1', 'Midterm 2', 'Final Exam', 'Final Score'], dtype='object')


Here's where the juicy stuff happens. I used a random forest regressor to model this data, as I've never used it before and kinda wanted to try it. I also think it somewhat fits our data. I divided the data with a train-test split of 80/20. I did a grid search over the parameters in `param_grid` to get the best set of parameters for our model. You can see the calculated best parameters below.

In [10]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Feature matrix: one row per student, one column per homework/lab/project.
X = input_data.to_numpy()
# Target matrix. Cast to float explicitly: at least one target column can
# hold numbers stored as strings (e.g. '99.81'), which would otherwise yield
# an object array mixing floats and strings. A value that is not numeric
# raises ValueError here instead of later inside the model.
y = label_data.astype(float).to_numpy()

# Hold out 20% of the students for the final evaluation; fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameter candidates: 3 * 3 * 3 * 3 = 81 combinations.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

random_forest_model = RandomForestRegressor(random_state=42)

# 5-fold cross-validated search on the training split only, scored by
# negative MSE (higher is better). n_jobs=-1 runs the 405 fits in parallel on
# all cores; the seeded estimator keeps the outcome deterministic.
grid_search = GridSearchCV(
    estimator=random_forest_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")

# Best model found by the search (GridSearchCV refits it on the whole
# training split by default).
best_random_forest_model = grid_search.best_estimator_

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}


It looks like it did pretty well. The values for R^2 and MSE aren't bad at all, and the sample prediction, taken from the test set, is pretty dang close. It would make more sense to divide predictions for Midterm 1, Midterm 2, and the Final grades into separate models, as those things are temporally dependent on each other. However, even without that consideration, this model did really well.

In [16]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict all targets for every held-out student.
y_pred = best_random_forest_model.predict(X_test)

# With several target columns, both metrics default to a uniform average
# across the columns.
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")

mse = mean_squared_error(y_test, y_pred)
print(f"Mean squared error: {mse}")

# Spot check: compare prediction and ground truth for the first test student.
first_student_features = X_test[0, :].reshape(1, -1)  # predict() expects a 2-D array
first_student_predicted = best_random_forest_model.predict(first_student_features)
first_student_actual = y_test[0, :]
print(f"Student 1 predicted values (Midterm 1, Midterm 2, Final Exam, Final Score): {first_student_predicted}")
print(f"Student 1 actual values (Midterm 1, Midterm 2, Final Exam, Final Score): {first_student_actual}")

R-squared: 0.6137633299246896
Mean squared error: 30.016585756739374
Student 1 predicted values (Midterm 1, Midterm 2, Final Exam, Final Score): [[36.61293053 43.24134006 53.95283325 97.24878178]]
Student 1 actual values (Midterm 1, Midterm 2, Final Exam, Final Score): [37.0 45.0 61.0 '99.81']
