LASSO REGRESSION


In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Lasso
import datetime

df = pd.read_excel('/content/data_carradios.xlsx')
df

Unnamed: 0,perc_defec,bdate,team,training,datep,prizeq,prized
0,0.00,1981-07-02,8,1,2021-07-01,0,0
1,36.32,1992-06-14,6,0,2021-07-02,0,0
2,48.91,2003-05-28,7,0,2021-07-05,500,0
3,20.36,1992-06-14,10,0,2021-07-06,0,0
4,42.07,2003-05-28,7,0,2021-07-07,500,0
...,...,...,...,...,...,...,...
995,21.50,1981-07-02,1,0,2021-12-13,0,0
996,7.53,1981-07-02,2,0,2021-12-14,500,600
997,27.70,1997-12-05,10,0,2021-12-15,0,0
998,0.00,1986-12-23,5,0,2021-12-16,0,600


In [6]:
def get_ages(col):
  result = (datetime.datetime.now()-col).astype('<m8[Y]')
  result = pd.DataFrame(result)
  return result

ager = Pipeline([
    ('ages', FunctionTransformer(get_ages, feature_names_out='one-to-one')),
    ('scale', StandardScaler())
])

def get_weekdays(col):
  result = col.iloc[:,0].dt.weekday
  result = pd.DataFrame(result)
  return result

weeker = Pipeline([
    ('weekd', FunctionTransformer(get_weekdays, feature_names_out='one-to-one')),
    ('oneh', OneHotEncoder(drop='first'))
])

preprocessor = ColumnTransformer([
    ('ages_tr', ager, ['bdate']),
    ('weekd_tr', weeker, ['datep']),
    ('team_tr', OneHotEncoder(drop='first'), ['team']),
    ('scaler', StandardScaler(), ['prized', 'prizeq'])],
    remainder='passthrough')

X = df.drop('perc_defec', axis=1)
y = df['perc_defec']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

pipe = Pipeline([
    ('pre', preprocessor),
    ('lasso', Lasso(alpha=0.1))])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_train)
y_pred

array([ 1.95854979e+01,  2.07321659e+01,  1.13849310e+01,  2.30907362e+00,
        2.97110759e+01,  2.07321659e+01,  3.83644755e+01,  2.07321659e+01,
        2.97730735e+01,  2.07603466e+01,  2.97110759e+01,  3.83926562e+01,
        1.95318828e+00,  2.00033808e+01,  2.06983490e+01,  2.07603466e+01,
        3.87497320e+01,  2.07603466e+01,  2.33725437e+00,  2.06983490e+01,
        3.87497320e+01,  5.64440392e+01,  2.06983490e+01,  1.08333274e+01,
        1.08615081e+01,  2.07603466e+01,  1.92500754e+00,  2.24707599e+00,
        2.27525673e+00,  2.91594722e+01,  4.74053831e+01,  1.77057668e+01,
        1.08333274e+01,  2.00033808e+01,  3.87497320e+01,  1.13849310e+01,
        3.83926562e+01,  2.96828951e+01,  1.78565071e+00,  3.87497320e+01,
        2.27525673e+00,  2.30907362e+00,  2.07941635e+01,  3.87497320e+01,
        2.96828951e+01,  2.07603466e+01,  3.84546538e+01,  3.84264731e+01,
        1.78565071e+00,  3.86877344e+01,  1.14595725e-02,  2.06701683e+01,
        3.87779127e+01,  

## ERRORS

In [4]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [5]:
mae = mean_absolute_error(y_train, y_pred)
rmse = mean_squared_error(y_train, y_pred, squared=False)
r2 = r2_score(y_train, y_pred)

print(f'MAE = {mae}')
print(f'RMSE = {rmse}')
print(f'R2 = {r2}')

MAE = 3.3044484189593932
RMSE = 4.388575673373468
R2 = 0.9153057337184486


In [8]:
y_pred = pipe.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f'MAE = {mae}')
print(f'RMSE = {rmse}')
print(f'R2 = {r2}')

MAE = 3.7958035263812975
RMSE = 5.03412150010278
R2 = 0.8923672422785776
