# Lab 05.2 - Part 2

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
# Let wide DataFrames render up to 500 columns before pandas truncates the display
pd.set_option("display.max_columns", 500)

%matplotlib inline

## 1. Preliminaries

In [2]:
# Load the reduced footballer dataset from CSV and display it
model_data = pd.read_csv("footballer_reduced.csv")
model_data

Unnamed: 0,age,height_cm,weight_kg,work_rate_att,overall
0,20,175,70,Medium,58
1,29,183,80,High,65
2,35,183,78,High,67
3,24,178,72,Medium,69
4,23,173,73,Medium,70
...,...,...,...,...,...
355,25,180,77,Medium,66
356,23,180,75,Medium,71
357,22,180,72,Medium,63
358,22,180,72,Medium,62


In [3]:
# One-hot encode the categorical columns so every feature is numeric.
# drop_first=True omits the first level of each category, avoiding a redundant column.
model_data = pd.get_dummies(data=model_data, drop_first=True)
model_data

Unnamed: 0,age,height_cm,weight_kg,overall,work_rate_att_Low,work_rate_att_Medium
0,20,175,70,58,0,1
1,29,183,80,65,0,0
2,35,183,78,67,0,0
3,24,178,72,69,0,1
4,23,173,73,70,0,1
...,...,...,...,...,...,...
355,25,180,77,66,0,1
356,23,180,75,71,0,1
357,22,180,72,63,0,1
358,22,180,72,62,0,1


In [4]:
# Separate the target column (`overall`) from the predictor columns
y = model_data['overall']
X = model_data.drop(columns='overall')

In [5]:
# Hold out 50 rows as the test set; the remaining rows serve for training and validation.
# A fixed random_state makes the shuffled split reproducible, so every model
# below is fitted and cross-validated on exactly the same rows.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=50, random_state=0
)
print(Xtrain.shape, Xtest.shape)

(310, 5) (50, 5)


In [6]:
# Define training and testing loss
def mse(y, ypred):
    """Return the mean squared error between targets and predictions.

    Both inputs are converted to float NumPy arrays before subtracting, so
    values are always compared position by position. Without the conversion,
    two pandas Series carrying different indexes would be aligned by label
    (producing NaN or wrongly paired errors), and plain Python lists would
    raise a TypeError on subtraction.

    Parameters
    ----------
    y : array-like
        True target values (list, ndarray, or pandas Series).
    ypred : array-like
        Predicted values, in the same order as ``y``.

    Returns
    -------
    numpy.float64
        Mean of the squared element-wise differences. NaN values in either
        input propagate to the result.
    """
    targets = np.asarray(y, dtype=float)
    predictions = np.asarray(ypred, dtype=float)
    return np.mean((targets - predictions) ** 2)

## 2. Use cross-validation to select the best model 

In [7]:
# Model 1: ordinary least squares on the original features
model1 = LinearRegression().fit(Xtrain, ytrain)
trainloss = mse(ytrain, model1.predict(Xtrain))
print(f"Training loss: {trainloss}")

# Unshuffled 5-fold splitter and an MSE-based scorer for cross-validation
kf = KFold(n_splits=5, shuffle=False)
sc = make_scorer(mse)
cv_scores = cross_val_score(model1, Xtrain, ytrain, cv=kf, scoring=sc)
print(f"CV loss: {cv_scores.mean()}")

Training loss: 34.647804072402714
CV loss: 36.0088230581963


In [8]:
# Model 2: original features plus a quadratic age term
# `assign` returns a new frame, so Xtrain itself is left unchanged
Xtrain2 = Xtrain.assign(age2=Xtrain.age ** 2)
model2 = LinearRegression().fit(Xtrain2, ytrain)
trainloss = mse(ytrain, model2.predict(Xtrain2))
print(f"Training loss: {trainloss}")

cv_scores = cross_val_score(model2, Xtrain2, ytrain, cv=kf, scoring=sc)
print(f"CV loss: {cv_scores.mean()}")

Training loss: 31.95545717826095
CV loss: 33.4089109962452


In [9]:
# Model 3: expand the features to all polynomial terms up to degree 2
# (squares and pairwise products; no constant bias column)
PT = PolynomialFeatures(degree=2, include_bias=False)
Xtrain3 = PT.fit_transform(Xtrain)
print(Xtrain3.shape)

model3 = LinearRegression()
model3.fit(Xtrain3, ytrain)
trainloss = mse(ytrain, model3.predict(Xtrain3))
print(f"Training loss: {trainloss}")

cv_scores = cross_val_score(model3, Xtrain3, ytrain, cv=kf, scoring=sc)
print(f"CV loss: {cv_scores.mean()}")

(310, 20)
Training loss: 30.708788577785157
CV loss: 35.332946611635904


## 3. Pipelines 

In [10]:
# Re-express models 1 and 3 as pipelines, so any feature expansion is
# re-fitted inside each cross-validation fold rather than done up front
model1 = Pipeline(steps=[('linear_regression', LinearRegression())])
model3 = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('linear_regression', LinearRegression()),
])

In [11]:
# Model 2 with a custom transformer (a ColumnTransformer would also work)
class Age2(BaseEstimator, TransformerMixin):
    """Pipeline step that appends an ``age2`` column holding ``age`` squared.

    The input must support ``.assign`` and expose an ``age`` attribute,
    i.e. a pandas DataFrame with an ``age`` column.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a new frame equal to ``X`` with the extra ``age2`` column."""
        squared_age = X.age ** 2
        return X.assign(age2=squared_age)

# Model 2 as a pipeline: add the squared-age feature, then fit a linear regression
model2 = Pipeline(steps=[
    ('age2', Age2()),
    ('linear_regression', LinearRegression()),
])

In [12]:
# Compare the mean cross-validation loss of the three pipelines
# (these are CV losses, not training losses)
for candidate in (model1, model2, model3):
    print(cross_val_score(candidate, Xtrain, ytrain, cv=kf, scoring=sc).mean())


36.0088230581963
33.4089109962452
35.332946611635904


In [13]:
# Refit the selected model (model2) on the full training set, then
# report its loss on the held-out test set
model2.fit(Xtrain, ytrain)
test_predictions = model2.predict(Xtest)
testloss = mse(ytest, test_predictions)
print(f"Test loss: {testloss}")

Test loss: 30.64475044863152
