In [2]:
# load libraries
import numpy as np
import pandas as pd

## Preprocessing Data

1. filter out useless columns
2. drop useless columns: `YearTerm`, `W`
3. filter out items with missing fields
4. compute `GPA` of each row
5. convert number of grade into percentage
6. one-hot encoding for categorical fields
7. split train and test dataset
8. split features and labels

In [3]:
# load data
data = pd.read_pickle('data/full.pkl')

# filter out useless columns
data = data.drop(columns=['YearTerm', 'W'])

# filter out missing fields
data = data.dropna()

# compute GPA of each course
data['Student Number'] = data['A+'] + data['A'] + data['A-'] + data['B+'] + data['B'] + data['B-'] + data['C+'] + data['C'] + data['C-'] + data['D+'] + data['D'] + data['D-'] + data['F']
grade_mapping = {
    'A+': 4.0,
    'A': 4.0,
    'A-': 3.7,
    'B+': 3.3,
    'B': 3.0,
    'B-': 2.7,
    'C+': 2.3,
    'C': 2.0,
    'C-': 1.7,
    'D+': 1.3,
    'D': 1.0,
    'D-': 0.7,
    'F': 0.0
}
data['GPA'] = 0
for col in ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D', 'D-', 'F']:
    data['GPA'] += grade_mapping[col] * data[col]
data['GPA'] /= data['Student Number']

# convert student number into percentage
for col in ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D', 'D-', 'F']:
    data[col] = data[col] / data['Student Number']

# split features and labels
feature_columns = ['Year', 'Term', 'Subject', 'Course Title', 'Number', 'Sched Type', 'Primary Instructor']
label_columns = ['GPA']
X = data[feature_columns]
y = data[label_columns]

# encoding features: numerical encoding
from sklearn.preprocessing import LabelEncoder
for col in feature_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

print(X.shape, y.shape)

# split train and test dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

(58915, 7) (58915, 1)
(53023, 7) (53023, 1)
(5892, 7) (5892, 1)


In [4]:
# linear regression
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)

y_pred = reg.predict(X_test)

# compute RMSE
from sklearn.metrics import root_mean_squared_error
print('RMSE of testing set:', root_mean_squared_error(y_test, y_pred))
print('RMSE of GPA in testing set:', root_mean_squared_error(y_test['GPA'], y_pred[:, -1]))

RMSE of testing set: 0.3740702119775465
RMSE of GPA in testing set: 0.3740702119775465


## Baseline: Random Forest and Results

Using a simple random forest model as the baseline.

The RMSE of testing set is 0.374. This model works poorly for the dataset and can not generate any meaningful prediction. We will try to improve the model in the future. 