In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

# Load the semicolon-separated wine-quality table.
df = pd.read_csv('/content/winequality-red.csv', sep=';')

# The target is 'quality'; every other column is a predictor.
y = df['quality']
X = df.drop(columns='quality')

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=45
)

# Standardise every predictor column, then fit an ordinary linear regression.
scaler = Pipeline(steps=[('any_name1', StandardScaler())])
preprocessor = ColumnTransformer(
    transformers=[('any_name_does', scaler, list(X.columns))],
    remainder='passthrough',
)
pipe = Pipeline(steps=[('pre', preprocessor), ('lm', LinearRegression())])
pipe.fit(X_train, y_train)

# In-sample predictions (made on the training rows); the bare name displays them.
y_pred = pipe.predict(X_train)
y_pred


array([6.48553273, 4.72694379, 5.84088531, ..., 5.48660099, 6.34047368,
       5.4922784 ])

In [None]:
# Observed target next to the model's prediction, indexed like y_train.
y_train.to_frame(name='y_true').assign(y_pred=y_pred)

Unnamed: 0,y_true,y_pred
1108,7,6.485533
709,5,4.726944
823,6,5.840885
4109,6,6.496332
1243,7,6.425209
...,...,...
4473,5,5.191853
580,5,5.176507
163,6,5.486601
4703,7,6.340474


scikit-learn provides a function to calculate the mean absolute error (MAE). We first need to import it.

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# MAE between the observed training target and the current predictions.
mean_absolute_error(y_train, y_pred)

0.5859156936493961

scikit-learn also provides a function to calculate the mean squared error (MSE). We first need to import it.

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# MSE between the observed training target and the current predictions.
mean_squared_error(y_train, y_pred)

0.5656942908707611

Since the errors are squared, the MSE is not on the same scale as the outcome variable. To make it comparable, we can use the square root of the mean squared error (RMSE):

In [None]:
# RMSE = sqrt(MSE). The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
np.sqrt(mean_squared_error(y_train, y_pred))

0.7521265125434424

In [None]:
# Mean of the training target, shown as a reference scale for the error metrics.
y_train.mean()

5.8797856049004595

In [None]:
# Manual cross-check of the RMSE: the literal is a hard-coded copy of the MSE
# value displayed earlier, so it must be updated by hand if the data or model change.
np.sqrt(0.5656942908707611)

0.7521265125434424

In [None]:
# Training-set error metrics.
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
# RMSE is derived from the MSE already computed: `squared=False` was deprecated
# in scikit-learn 1.4 and removed in 1.6, so it can no longer be relied on.
rmse = np.sqrt(mse)

In [None]:
# Report the three error metrics, one per line.
print(f'MAE= {mae}', f'MSE= {mse}', f'RMSE= {rmse}', sep='\n')

MAE= 0.5859156936493961
MSE= 0.5656942908707611
RMSE= 0.7521265125434424


To calculate $R^2$, we need to import the `r2_score` function.

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R^2 of the current predictions against the observed training target.
r2 = r2_score(y_train, y_pred)

In [None]:
# Report the R^2 value stored in `r2`.
print(f'R2= {r2}')

R2= 0.27577917074526737


The predictive performance of this model is not good, since the $R^2$ is low. This suggests that we should try another model, or that some important predictors are missing.

To get an idea of how our model performs on unseen data, we will use the test set, which has not been used for anything so far.

In [None]:
# Predict on the held-out test rows. NOTE: this overwrites `y_pred`, which until
# now held the training-set predictions; re-running earlier metric cells after
# this one would pair y_train with test predictions.
y_pred = pipe.predict(X_test)


In [None]:
# Test-set metrics: same measures as before, now against y_test.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is derived from the MSE: `squared=False` was deprecated in
# scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'MAE= {mae}')
print(f'MSE= {mse}')
print(f'RMSE= {rmse}')
print(f'R2= {r2}')


MAE= 0.5730106620077944
MSE= 0.5543180198119331
RMSE= 0.7445253654590508
R2= 0.30403261702378337


# Car radios problem

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
import datetime

# Load the car-radios workbook into a DataFrame. The path is absolute, so the
# file must already exist at this location before the cell is run.
df = pd.read_excel('/content/data_carradios.xlsx')

def get_ages(col, ref_date=None):
  """Convert datetime values into ages in whole years.

  Parameters
  ----------
  col : DataFrame (or Series) of datetime values
      Dates to measure from (e.g. the single-column frame that a
      ColumnTransformer passes in).
  ref_date : datetime-like, optional
      Date the ages are measured at. Defaults to the current time, as the
      original implementation did; pass a fixed date for reproducible features.

  Returns
  -------
  DataFrame
      Same shape and column labels as the input, holding float whole-year ages
      (NaN where the input date is missing).

  Notes
  -----
  The previous ``.astype('<m8[Y]')`` conversion is rejected by pandas >= 2.0,
  which only supports the s/ms/us/ns timedelta resolutions. The age is instead
  computed by dividing by the average Gregorian year of 365.2425 days (the
  factor NumPy uses for its 'Y' unit) and flooring the result.
  """
  if ref_date is None:
    ref_date = datetime.datetime.now()
  elapsed = pd.Timestamp(ref_date) - pd.DataFrame(col)
  # timedelta / Timedelta gives a plain float number of years.
  years = elapsed / pd.Timedelta(days=365.2425)
  return np.floor(years)

# Date column -> age (via get_ages) -> standardised value.
ager = Pipeline(steps=[
    ('ages', FunctionTransformer(func=get_ages, feature_names_out='one-to-one')),
    ('scale', StandardScaler()),
])

def get_weekdays(col):
  """Return the weekday number (Monday=0 ... Sunday=6) of the first column.

  `col` is a DataFrame whose first column holds datetime values; the result is
  a one-column DataFrame carrying that column's label.
  """
  first_column = col.iloc[:, 0]
  return pd.DataFrame(first_column.dt.weekday)

# Date column -> weekday number (via get_weekdays) -> one-hot columns,
# with the first category dropped.
weeker = Pipeline(steps=[
    ('weekd', FunctionTransformer(func=get_weekdays, feature_names_out='one-to-one')),
    ('oneh', OneHotEncoder(drop='first')),
])

# Route each column to its transformer; columns not listed pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('ages_tr', ager, ['bdate']),
        ('weekd_tr', weeker, ['datep']),
        ('team_tr', OneHotEncoder(drop='first'), ['team']),
        ('scaler', StandardScaler(), ['prized', 'prizeq']),
    ],
    remainder='passthrough',
)

# The target is 'perc_defec'; every other column is a predictor.
y = df['perc_defec']
X = df.drop(columns='perc_defec')

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=45
)

# Preprocess, then fit an ordinary linear regression on the training rows.
pipe = Pipeline(steps=[('pre', preprocessor), ('lm', LinearRegression())])
pipe.fit(X_train, y_train)

# In-sample predictions (made on the training rows); the bare name displays them.
y_pred = pipe.predict(X_train)
y_pred

array([20.08740234, 20.41943359, 10.64794922,  2.07958984, 30.55419922,
       20.18896484, 38.86279297, 20.26904297, 30.61669922, 20.64013672,
       30.55419922, 38.41552734,  1.99755859, 20.46630859, 20.10888672,
       20.63232422, 39.83154297, 20.32177734,  2.44287109, 20.34716797,
       39.83154297, 57.99560547, 20.10888672,  9.65576172,  9.78857422,
       20.40185547,  2.08740234,  1.78466797,  2.14990234, 29.56787109,
       48.71044922, 17.26318359,  9.65576172, 20.46630859, 39.83154297,
       11.29248047, 38.49560547, 30.18896484,  1.45849609, 39.60498047,
        1.91943359,  2.31787109, 21.30029297, 39.60498047, 30.18896484,
       20.56005859, 39.29052734, 39.15576172,  1.45849609, 39.55029297,
       -0.88525391, 20.20458984, 39.97998047,  1.24169922, 39.60498047,
       10.95654297,  1.10107422, 11.13623047, 10.64013672, 19.64599609,
       29.67529297, -0.76025391, 20.33935547,  2.31787109, 11.65966797,
       19.43310547, 19.13818359,  1.10107422, 20.84716797,  1.32