# Car radios problem - 20230320

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
import datetime

# Load the car-radios dataset. The absolute /content path is presumably a
# Colab upload location - adjust it when running elsewhere.
df = pd.read_excel('/content/data_carradios.xlsx')

def get_ages(col, ref_date=None):
  """Convert birth dates into ages expressed in whole years.

  Parameters
  ----------
  col : DataFrame or Series of datetime64 values
      Birth dates. Inside the ColumnTransformer this is a one-column frame.
  ref_date : datetime-like, optional
      Moment at which the ages are measured. Defaults to the current time
      (the original behaviour). Pass a fixed date, e.g. through
      FunctionTransformer(kw_args={'ref_date': ...}), for reproducible ages.

  Returns
  -------
  DataFrame
      Same index and column labels as the input, holding float ages in
      whole years (NaN where the birth date is missing).
  """
  if ref_date is None:
    ref_date = datetime.datetime.now()
  elapsed = pd.Timestamp(ref_date) - pd.DataFrame(col)
  # astype('<m8[Y]') raises on pandas >= 2.0 (only s/ms/us/ns resolutions are
  # supported), so divide by the mean Gregorian year (365.2425 days, the same
  # length numpy uses for the 'Y' unit) and keep the completed years.
  return np.floor(elapsed / pd.Timedelta(days=365.2425))

# Birth-date branch: turn the date into an age, then standardize it.
ager = Pipeline(
    steps=[
        ('ages', FunctionTransformer(get_ages, feature_names_out='one-to-one')),
        ('scale', StandardScaler()),
    ]
)

def get_weekdays(col):
  """Return the weekday number of the first column of `col`.

  `col` is a DataFrame whose first column holds datetimes. The result is a
  one-column DataFrame (same index, same column name) with pandas' weekday
  codes, where Monday is 0 and Sunday is 6.
  """
  first_column = col.iloc[:, 0]
  return first_column.dt.weekday.to_frame()

# Date branch: extract the weekday, then dummy-encode it (first level dropped
# to avoid a redundant column in the linear model).
weeker = Pipeline(
    steps=[
        ('weekd', FunctionTransformer(get_weekdays, feature_names_out='one-to-one')),
        ('oneh', OneHotEncoder(drop='first')),
    ]
)

# Column-wise preprocessing: age from `bdate`, weekday dummies from `datep`,
# dummies for `team`, standardized `prized`/`prizeq`. Any column not listed
# is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('ages_tr', ager, ['bdate']),
        ('weekd_tr', weeker, ['datep']),
        ('team_tr', OneHotEncoder(drop='first'), ['team']),
        ('scaler', StandardScaler(), ['prized', 'prizeq']),
    ],
    remainder='passthrough',
)

# `perc_defec` is the target; every other column is a predictor.
y = df['perc_defec']
X = df.drop(columns='perc_defec')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=45
)

pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('lm', LinearRegression()),
])
pipe.fit(X_train, y_train)

# In-sample predictions, scored in the metrics cell that follows.
y_pred = pipe.predict(X_train)


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# Error metrics on the training split (y_pred holds the in-sample predictions).
mae = mean_absolute_error(y_train, y_pred)
# Root mean squared error computed as sqrt(MSE): the `squared=False` shortcut
# was deprecated in scikit-learn 1.4 and removed in 1.6, while this form runs
# on every version and yields the same value.
rsme = np.sqrt(mean_squared_error(y_train, y_pred))
r2 = r2_score(y_train, y_pred)

print(f'MAE= {mae}')
print(f'RSME= {rsme}')
print(f'R2= {r2}')

MAE= 3.3389007324218745
RSME= 4.333033342195751
R2= 0.9174359688584746


In [None]:
# Error metrics on the held-out test split.
y_pred = pipe.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
# Root mean squared error computed as sqrt(MSE): the `squared=False` shortcut
# was deprecated in scikit-learn 1.4 and removed in 1.6, while this form runs
# on every version and yields the same value.
rsme = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'MAE= {mae}')
print(f'RSME= {rsme}')
print(f'R2= {r2}')

MAE= 3.8249537109375003
RSME= 5.0228109370622995
R2= 0.8928503531812346


Since the error measures on the test set are similar to those on the training set, we can expect the model to perform well on new, unseen cases.

# Lasso regression



1.   Sometimes some predictors are weak, in the sense that their coefficients are close to zero, but not exactly zero.
2.   When a predictor is weak, we should remove it, because:
    - The smaller the number of predictors, the easier the model is to interpret.
    - The larger the number of predictors, the more data are needed for the model to achieve good predictive performance.

Lasso regression automatically sets the coefficients of the weaker predictors to zero.



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
import datetime

# Rebind `df` to a freshly loaded copy of the car-radios Excel file for the
# Lasso fit. The absolute /content path is presumably a Colab upload location.
df = pd.read_excel('/content/data_carradios.xlsx')

def get_ages(col, ref_date=None):
  """Convert birth dates into ages expressed in whole years.

  Parameters
  ----------
  col : DataFrame or Series of datetime64 values
      Birth dates. Inside the ColumnTransformer this is a one-column frame.
  ref_date : datetime-like, optional
      Moment at which the ages are measured. Defaults to the current time
      (the original behaviour). Pass a fixed date, e.g. through
      FunctionTransformer(kw_args={'ref_date': ...}), for reproducible ages.

  Returns
  -------
  DataFrame
      Same index and column labels as the input, holding float ages in
      whole years (NaN where the birth date is missing).
  """
  if ref_date is None:
    ref_date = datetime.datetime.now()
  elapsed = pd.Timestamp(ref_date) - pd.DataFrame(col)
  # astype('<m8[Y]') raises on pandas >= 2.0 (only s/ms/us/ns resolutions are
  # supported), so divide by the mean Gregorian year (365.2425 days, the same
  # length numpy uses for the 'Y' unit) and keep the completed years.
  return np.floor(elapsed / pd.Timedelta(days=365.2425))

# Birth-date branch: turn the date into an age, then standardize it.
ager = Pipeline(
    steps=[
        ('ages', FunctionTransformer(get_ages, feature_names_out='one-to-one')),
        ('scale', StandardScaler()),
    ]
)

def get_weekdays(col):
  """Return the weekday number of the first column of `col`.

  `col` is a DataFrame whose first column holds datetimes. The result is a
  one-column DataFrame (same index, same column name) with pandas' weekday
  codes, where Monday is 0 and Sunday is 6.
  """
  first_column = col.iloc[:, 0]
  return first_column.dt.weekday.to_frame()

# Date branch: extract the weekday, then dummy-encode it (first level dropped).
weeker = Pipeline(
    steps=[
        ('weekd', FunctionTransformer(get_weekdays, feature_names_out='one-to-one')),
        ('oneh', OneHotEncoder(drop='first')),
    ]
)

# Column-wise preprocessing: age from `bdate`, weekday dummies from `datep`,
# dummies for `team`, standardized `prized`/`prizeq`. Any column not listed
# is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('ages_tr', ager, ['bdate']),
        ('weekd_tr', weeker, ['datep']),
        ('team_tr', OneHotEncoder(drop='first'), ['team']),
        ('scaler', StandardScaler(), ['prized', 'prizeq']),
    ],
    remainder='passthrough',
)

# `perc_defec` is the target; every other column is a predictor.
y = df['perc_defec']
X = df.drop(columns='perc_defec')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=45
)

# Same preprocessing as the linear model, but with an L1-penalized regressor.
pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('lasso', Lasso(alpha=0.1)),
])
pipe.fit(X_train, y_train)

# In-sample predictions from the fitted Lasso pipeline.
y_pred = pipe.predict(X_train)


The parameter alpha controls how strongly the weaker predictors are penalized. The larger the value of alpha, the more coefficients will be set to zero.

In [None]:
# Names of the columns produced by the fitted preprocessing step, in output order.
pipe.named_steps['pre'].get_feature_names_out()

array(['ages_tr__bdate', 'weekd_tr__datep_1', 'weekd_tr__datep_2',
       'weekd_tr__datep_3', 'weekd_tr__datep_4', 'team_tr__team_2',
       'team_tr__team_3', 'team_tr__team_4', 'team_tr__team_5',
       'team_tr__team_6', 'team_tr__team_7', 'team_tr__team_8',
       'team_tr__team_9', 'team_tr__team_10', 'scaler__prized',
       'scaler__prizeq', 'remainder__training'], dtype=object)

In [None]:
# Coefficients of the fitted Lasso step, one per preprocessed feature.
pipe.named_steps['lasso'].coef_

array([  0.03130011,  -0.55160365,  -0.        ,  -0.        ,
        17.69430722,  -0.        ,   9.01272689,  -0.        ,
         0.        ,  -0.        ,   8.55764264,  -2.29761405,
         0.        ,  -0.        ,  -8.59784934,   4.31942568,
       -18.42309227])

In [None]:
# Pair each preprocessed feature name with its Lasso coefficient.
results = pd.DataFrame({
    'names': pipe['pre'].get_feature_names_out(),
    'estimates': pipe['lasso'].coef_,
})

results

Unnamed: 0,names,estimates
0,ages_tr__bdate,0.0313
1,weekd_tr__datep_1,-0.551604
2,weekd_tr__datep_2,-0.0
3,weekd_tr__datep_3,-0.0
4,weekd_tr__datep_4,17.694307
5,team_tr__team_2,-0.0
6,team_tr__team_3,9.012727
7,team_tr__team_4,-0.0
8,team_tr__team_5,0.0
9,team_tr__team_6,-0.0


# KNN - Classification

In [None]:
# Toy classification data: two numeric features (X1, X2) and a class label Y.
# Note that this rebinds `df`, replacing the car-radios frame.
df = pd.read_excel('/content/knn01_clas.xlsx')
df

Unnamed: 0,X1,X2,Y
0,1,4,B
1,6,2,B
2,5,3,A
3,3,1,A
4,2,9,B
5,1,2,A


What is the prediction for the outcome variable at the point (X1, X2) = (3, 5), using KNN with k=3?

In [None]:
# Euclidean distance from every row to the query point (3, 5), stored as a
# new `dist` column; the sorted view lists the nearest neighbours first.
df['dist'] = np.sqrt(
    np.square(df['X1'].to_numpy() - 3) + np.square(df['X2'].to_numpy() - 5)
)
df.sort_values(by='dist')

Unnamed: 0,X1,X2,Y,dist
0,1,4,B,2.236068
2,5,3,A,2.828427
5,1,2,A,3.605551
3,3,1,A,4.0
4,2,9,B,4.123106
1,6,2,B,4.242641


# KNN - Regression

In [None]:
# Toy regression data: two numeric features (X1, X2) and a numeric target Y.
# Rebinds `df`, replacing the classification frame.
df = pd.read_excel('/content/knn01_reg.xlsx')
df

Unnamed: 0,X1,X2,Y
0,1,4,8
1,6,2,5
2,5,3,7
3,3,1,10
4,2,9,3
5,1,2,6


What is the prediction for the point (X1, X2) = (3, 5), for k=3?

In [None]:
# Euclidean distance from every row to the query point (3, 5), stored as a
# new `dist` column; the sorted view lists the nearest neighbours first.
df['dist'] = np.sqrt(
    np.square(df['X1'].to_numpy() - 3) + np.square(df['X2'].to_numpy() - 5)
)
df.sort_values(by='dist')

Unnamed: 0,X1,X2,Y,dist
0,1,4,8,2.236068
2,5,3,7,2.828427
5,1,2,6,3.605551
3,3,1,10,4.0
4,2,9,3,4.123106
1,6,2,5,4.242641


The prediction for (X1, X2) = (3, 5) will be the mean of Y over the three nearest neighbours:

(8+7+6)/3 = 7

How to implement this in Python?

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Predictors are the coordinates only: drop the target and the helper
# distance column added above.
y = df['Y']
X = df.drop(columns=['Y', 'dist'])

In [None]:
# Query point (X1, X2) = (3, 5) as a one-row frame with the training column names.
X_new = pd.DataFrame([[3, 5]], columns=['X1', 'X2'])

In [None]:
# 3-nearest-neighbour regressor; fit() returns the estimator itself, so the
# construction and the fit can be chained.
knn_model = KNeighborsRegressor(n_neighbors=3).fit(X, y)

knn_model.predict(X_new)

array([7.])

KNN - example classification

In [None]:
# Reload the toy classification data; `df` held the regression frame (with
# its extra `dist` column) up to this point.
df = pd.read_excel('/content/knn01_clas.xlsx')
df

Unnamed: 0,X1,X2,Y
0,1,4,B
1,6,2,B
2,5,3,A
3,3,1,A
4,2,9,B
5,1,2,A


In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Features are every column except the class label Y.
y = df['Y']
X = df.drop(columns=['Y'])

In [None]:
# 3-nearest-neighbour classifier; fit() returns the estimator itself.
knn_model = KNeighborsClassifier(n_neighbors=3).fit(X, y)

# X_new is the (3, 5) query point defined in the regression example.
knn_model.predict(X_new)

array(['A'], dtype=object)

KNN - Bank notes problem

In [None]:
# The banknote file has no header row: read it raw, then label the four
# feature columns and the class column.
df = pd.read_csv(
    '/content/data_banknote_authentication.txt', header=None
).set_axis(['X1', 'X2', 'X3', 'X4', 'Y'], axis=1)
df

Unnamed: 0,X1,X2,X3,X4,Y
0,3.62160,8.66610,-2.8073,-0.44699,0
1,4.54590,8.16740,-2.4586,-1.46210,0
2,3.86600,-2.63830,1.9242,0.10645,0
3,3.45660,9.52280,-4.0112,-3.59440,0
4,0.32924,-4.45520,4.5718,-0.98880,0
...,...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949,1
1368,-1.38870,-4.87730,6.4774,0.34179,1
1369,-3.75030,-13.45860,17.5932,-2.77710,1
1370,-3.56370,-8.38270,12.3930,-1.28230,1


In [None]:
# Features are the four numeric columns; Y is the class label.
y = df['Y']
X = df.drop(columns='Y')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=45
)

# KNN is distance-based, so every feature is standardized first.
scaler = Pipeline(steps=[('scale', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('scale_tr', scaler, ['X1', 'X2', 'X3', 'X4']),  # X.columns.to_list()
    ],
    remainder='passthrough',
)

pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('knn', KNeighborsClassifier(n_neighbors=3)),
])

pipe.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy on the training split.
y_pred = pipe.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_pred)


0.9981768459434822

In [None]:
# Accuracy on the held-out test split.
y_pred = pipe.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0