# Lab 05 - Extended Exercises on Model Evaluation
## Predicting student performance

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import scipy as sp

# Data directory
DATA_DIR = "./../../data/"

In [124]:
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, ParameterGrid, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, auc, mean_squared_error
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, Normalizer
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.pipeline import Pipeline

In [2]:
import requests

exec(requests.get("https://courdier.pythonanywhere.com/get-send-code").content)

npt_config = {
    'session_name': 'lab-05',
    'session_owner': 'mlbd',
    'sender_name': input("Your name: "),
}

Your name:  Olivia


## Introduction
The data has already been cleaned and it comes from 29 students in 3 different groups in a course of 26 weeks.

You already used this data in week 03. 

In this lab you will explore different models to predict the quiz grade. 

In [3]:
# Load data
df= pd.read_csv(f'{DATA_DIR}grades_in_time.csv.gz')
send(len(df), 0)
df.head()

Unnamed: 0,student,week,studying_hours,group,quiz_grade
0,0,0,39.9,3,6.1
1,0,1,32.4,3,7.0
2,0,2,17.5,3,6.9
3,0,3,16.0,3,7.0
4,0,4,15.9,3,7.2


In [4]:
df.describe(include='all')

Unnamed: 0,student,week,studying_hours,group,quiz_grade
count,810.0,810.0,810.0,810.0,810.0
mean,14.5,13.0,10.050617,1.933333,6.931975
std,8.660789,7.793693,8.270041,0.772199,1.336888
min,0.0,0.0,1.0,1.0,1.2
25%,7.0,6.0,5.7,1.0,6.4
50%,14.5,13.0,7.8,2.0,7.2
75%,22.0,20.0,11.1,3.0,7.8
max,29.0,26.0,64.0,3.0,10.1


# Task 1: Predict the quiz grade using the studying hours and the group.
----------
### 1.1 Split the data. 80% to train and the rest to test. 


In [7]:
df[['quiz_grade']]

Unnamed: 0,quiz_grade
0,6.1
1,7.0
2,6.9
3,7.0
4,7.2
...,...
805,8.3
806,8.2
807,8.1
808,8.1


In [42]:
X =  df.drop(columns='quiz_grade')
y =  df['quiz_grade']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 1.2 Preprocess the data
Recall that group is a categorical feature.

Hint: Use ColumnTransformer.

In [43]:
preprocessor = ColumnTransformer(
    [
        ("categorical", OneHotEncoder(handle_unknown="ignore", drop='first'), ['group']),
        ("numerical", Normalizer(norm='l1'), ['studying_hours'])
    ]
)

preprocessor.fit_transform(X_train)


array([[0., 0., 1., 1.],
       [0., 0., 1., 1.],
       [1., 0., 0., 1.],
       ...,
       [0., 1., 0., 1.],
       [1., 0., 0., 1.],
       [0., 1., 0., 1.]])

### 1.3 Create a pipeline (including the preprocessing steps) to predict the quiz grade using the studying hours and the group.

1. Use the model ElasticNet for the regression task.
2. Calculate the mean sqaured error of the prediction. 


Hint: Integrate the ColumnTransformer as a pipeline step

In [73]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ElasticNet(random_state=0) )
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
send(f"Mean Squared Error = {error}", 13)
print(f"Mean Squared Error = {error}")

Mean Squared Error = 1.8601583333333334


### 1.4 Compute the cross validation score

In [118]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ElasticNet(random_state=0, alpha=1.0, l1_ratio=0.5) )
])

cvscore = 1# Missing code
send(cvscore, 14)

<Response [200]>

In [75]:
scores = cross_validate(pipe, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
print(f'neg MSE with cross-validation: {scores["test_score"].mean():.3f}')


neg MSE with cross-validation: -1.830


In [77]:
# dont use this because not all models have a CV variant

pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ElasticNetCV(random_state=0) )
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(error)

1.8601583333333334


### 1.5 Does the score in 1.3 differ from the score in 1.4? Why? 

In [78]:
answer = "Simetimes, depends on the seed. CV gets a better estimate as without it you are sensitive to random init "
send(answer, 15) 

<Response [200]>

### 1.6 What is wrong with data split?

In [112]:
answer = "Time series data should not be split randomly. Using the future to predict the past"
send(answer, 16) 

<Response [200]>

## Task 2: Time Validation

### 2.1 Train with the first 25 weeks and predict week 26.

Hint: You may re-use your pipeline

In [114]:
# df.query('week < 26')

In [62]:
X_train = df[df['week'] < 26].drop(columns=['quiz_grade'])
y_train = df[df['week'] < 26]['quiz_grade']

X_test = df[df['week'] == 26].drop(columns=['quiz_grade'])
y_test = df[df['week'] == 26]['quiz_grade']

In [64]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
send(f"Mean Squared Error = {error}", 21)
print(f"Mean Squared Error = {error}")

Mean Squared Error = 1.8601583333333334


### 2.2 Time splits
Would the model also be able to predict week 16 from all the previous weeks? 

What about week 5 from the previous weeks?

Create all the data splits so that the model predicts the next week given the information from the previous weeks. 

You may start taking the first 4 weeks to predict the 5th, then taking the first 5 weeks to predict the 6th, etc.


In [65]:
# from sklearn.model_selection import TimeSeriesSplit
# tscv = TimeSeriesSplit(test_size=1)

In [98]:
time_splits = [ (X_train[X_train['week'] < a].index, (X_train[X_train['week'] == a].index) ) for a in range(5,26)]

### 2.3 Using the previously created splits, calculate the cross validation score

Hint: You may pass the splits with the parameter cv to the cross vadation function

In [108]:
errors = (-1)*cross_val_score(pipe, X, y, cv = time_splits, scoring ='neg_mean_squared_error')
cvscore = np.mean(errors)
print(cvscore)
send(float(cvscore), 23)

1.9879575704154184


<Response [200]>

### 2.4 How does the error differ from the error of 2.1? Why?

In [109]:
answer = "More error "
send(answer, 24) 

<Response [200]>

## Task 3: Nested cross-validation

Now imagine we want to optimize the hyperparameters for the model.

We will "ignore" time for now and take the mean studying hours and quiz grade. 


In [110]:
df_agg = df.groupby('student').mean()

X = df_agg[['studying_hours', 'group']]
y = df_agg['quiz_grade']

### 3.1 Gridsearch with cross validation

ElasticNet has two interesting parameters: alpha and l1_ratio.

Run a GridSearch to explore the following values:
* alpha = 0.1 and 1
* l1_ratio = 0.1, 0.5 and 1

What is the best score (smallest error)? 

In [120]:
param_grid =  {
    'model__alpha': [0.1, 1],
    'model__l1_ratio': [0.1, 0.5,1]
}

search = GridSearchCV(pipe, param_grid, cv=10,scoring='neg_mean_squared_error', refit=True)
search.fit(X,y)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['group']),
                                                                        ('numerical',
                                                                         Normalizer(norm='l1'),
                                                                         ['studying_hours'])])),
                                       ('model', ElasticNet(random_state=0))]),
             param_grid={'model__alpha': [0.1, 1],
                         'model__l1_ratio': [0.1, 0.5, 1]},
             scoring='neg_mean_squared_error')

In [121]:
(-1)*search.best_score_

0.2822916485555311

### 3.2 Why is the error from the best model in 3.1 biased?


In [122]:
answer = " Ignoring the time means it uses future information to predict the past "
send(answer, 32) 

<Response [200]>

### 3.3 Improve 3.1 to have an unbiased estimation of the generalization error

Hint: Use nested cross-validation

In [126]:
inner_cv = KFold(n_splits=4, shuffle=True, random_state=123)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=123)

search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=inner_cv, scoring='neg_mean_squared_error')
errors = (-1)*cross_val_score(search,X=X,y=y,cv=outer_cv)

np.mean(errors)

0.275863991769547