# **KNN REGRESSION - CAR RADIOS**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
# from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

import datetime

# Load the dataset from the Excel workbook at this hard-coded path.
df = pd.read_excel('/content/data_carradios.xlsx')

def get_ages(col, reference=None):
  """Convert datetime values into elapsed whole years.

  Parameters
  ----------
  col : pandas.DataFrame or pandas.Series
      Datetime values to convert.
  reference : datetime-like, optional
      Moment the ages are measured against. Defaults to the current local
      time, which is what the function always used before this argument
      existed. Pass a fixed value to get reproducible ages.

  Returns
  -------
  pandas.DataFrame
      Same labels as `col`, holding the number of completed years as floats
      (NaN where the input date is missing).
  """
  if reference is None:
    reference = datetime.datetime.now()

  elapsed = pd.Timestamp(reference) - col

  # Casting timedeltas with astype('<m8[Y]') is rejected by pandas >= 2.0
  # (only s/ms/us/ns resolutions are supported). Dividing by the mean
  # Gregorian year (365.2425 days) and flooring gives whole years on both
  # old and new pandas versions.
  years = np.floor(elapsed / pd.Timedelta(days=365.2425))
  return pd.DataFrame(years)

# Two-stage transformer: get_ages turns the dates into years, then the
# resulting age column is standardized.
ager = Pipeline(steps=[
    ('ages', FunctionTransformer(func=get_ages, feature_names_out='one-to-one')),
    ('scale', StandardScaler()),
])

def get_weekdays(col):
  """Return the weekday index of the first column of `col`.

  `col` is a DataFrame whose first column holds datetimes. The result is a
  one-column DataFrame with the same index and column name, where Monday
  is 0 and Sunday is 6.
  """
  first_column = col.iloc[:, 0]
  return first_column.dt.dayofweek.to_frame()

# Two-stage transformer: get_weekdays extracts the weekday index, which is
# then one-hot encoded. Every category keeps its own column (no drop='first').
weeker = Pipeline(steps=[
    ('weekd', FunctionTransformer(func=get_weekdays, feature_names_out='one-to-one')),
    ('oneh', OneHotEncoder()),
])

# We removed drop='first' because multicollinearity is not an issue for KNN

# Column-wise preprocessing:
#   bdate          -> age in years, standardized
#   datep          -> weekday, one-hot encoded
#   team           -> one-hot encoded (all categories kept)
#   prized, prizeq -> standardized
# Any other column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('ages_tr', ager, ['bdate']),
        ('weekd_tr', weeker, ['datep']),
        ('team_tr', OneHotEncoder(), ['team']),
        ('scaler', StandardScaler(), ['prized', 'prizeq']),
    ],
    remainder='passthrough',
)

# Features are every column except the target; the target is 'perc_defec'.
X = df.drop('perc_defec', axis=1)
y = df['perc_defec']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

# Preprocessing and the 3-nearest-neighbours regressor are fitted as one estimator.
pipe = Pipeline([
    ('pre', preprocessor),
    ('knn', KNeighborsRegressor(n_neighbors=3))])

pipe.fit(X_train, y_train)

# Report the same three metrics on the training split first, then on the test split.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
  y_pred = pipe.predict(X_part)

  mae = mean_absolute_error(y_part, y_pred)
  # RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated
  # in scikit-learn 1.4 and removed in 1.6.
  rsme = np.sqrt(mean_squared_error(y_part, y_pred))
  r2 = r2_score(y_part, y_pred)

  print(f'MAE= {mae}')
  print(f'RSME= {rsme}')
  print(f'R2= {r2}')

MAE= 3.0324041666666672
RSME= 4.198885925324585
R2= 0.922469072562689
MAE= 4.002416666666667
RSME= 5.326234989809094
R2= 0.879513683327535


# **GRID SEARCH CV**

In the case of KNN, applying GridSearchCV means finding the value of the hyperparameter `n_neighbors` that leads to the best predictive performance of the model.

With Lasso regression we searched for the best alpha for the model; now we want to find the best K.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
# from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV # new

import datetime

# Load the dataset from the Excel workbook at this hard-coded path.
df = pd.read_excel('/content/data_carradios.xlsx')

def get_ages(col, reference=None):
  """Convert datetime values into elapsed whole years.

  Parameters
  ----------
  col : pandas.DataFrame or pandas.Series
      Datetime values to convert.
  reference : datetime-like, optional
      Moment the ages are measured against. Defaults to the current local
      time, which is what the function always used before this argument
      existed. Pass a fixed value to get reproducible ages.

  Returns
  -------
  pandas.DataFrame
      Same labels as `col`, holding the number of completed years as floats
      (NaN where the input date is missing).
  """
  if reference is None:
    reference = datetime.datetime.now()

  elapsed = pd.Timestamp(reference) - col

  # Casting timedeltas with astype('<m8[Y]') is rejected by pandas >= 2.0
  # (only s/ms/us/ns resolutions are supported). Dividing by the mean
  # Gregorian year (365.2425 days) and flooring gives whole years on both
  # old and new pandas versions.
  years = np.floor(elapsed / pd.Timedelta(days=365.2425))
  return pd.DataFrame(years)

# Two-stage transformer: get_ages turns the dates into years, then the
# resulting age column is standardized.
ager = Pipeline(steps=[
    ('ages', FunctionTransformer(func=get_ages, feature_names_out='one-to-one')),
    ('scale', StandardScaler()),
])

def get_weekdays(col):
  """Map the first (datetime) column of `col` to weekday indices.

  Returns a one-column DataFrame that keeps the input's index and column
  name; values run from 0 (Monday) to 6 (Sunday).
  """
  dates = col.iloc[:, 0]
  weekday_index = dates.dt.dayofweek
  return weekday_index.to_frame()

# Two-stage transformer: get_weekdays extracts the weekday index, which is
# then one-hot encoded. Every category keeps its own column (no drop='first').
weeker = Pipeline(steps=[
    ('weekd', FunctionTransformer(func=get_weekdays, feature_names_out='one-to-one')),
    ('oneh', OneHotEncoder()),
])

# Column-wise preprocessing:
#   bdate          -> age in years, standardized
#   datep          -> weekday, one-hot encoded
#   team           -> one-hot encoded (all categories kept)
#   prized, prizeq -> standardized
# Any other column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('ages_tr', ager, ['bdate']),
        ('weekd_tr', weeker, ['datep']),
        ('team_tr', OneHotEncoder(), ['team']),
        ('scaler', StandardScaler(), ['prized', 'prizeq']),
    ],
    remainder='passthrough',
)

# Features are every column except the target; the target is 'perc_defec'.
X = df.drop('perc_defec', axis=1)
y = df['perc_defec']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

# Candidate values of K tried by the grid search.
hyperparameters = {
    'n_neighbors': [2, 3, 5, 6, 8, 10]
}

# The last step is a 5-fold grid search over K. With GridSearchCV's default
# refit=True, the best K is refitted on the training data, so pipe.predict
# uses that model.
# NOTE(review): the preprocessor sits outside the grid search, so it is fitted
# once on all of X_train before the CV folds are formed. Wrapping the whole
# pipeline in GridSearchCV (param name 'knn__n_neighbors') would avoid that,
# but would change how best_params_ is accessed afterwards.
pipe = Pipeline([
    ('pre', preprocessor),
    ('grid', GridSearchCV(KNeighborsRegressor(), hyperparameters, cv=5))])

pipe.fit(X_train, y_train)

# Report the same three metrics on the training split first, then on the test split.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
  y_pred = pipe.predict(X_part)

  mae = mean_absolute_error(y_part, y_pred)
  # RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated
  # in scikit-learn 1.4 and removed in 1.6.
  rsme = np.sqrt(mean_squared_error(y_part, y_pred))
  r2 = r2_score(y_part, y_pred)

  print(f'MAE= {mae}')
  print(f'RSME= {rsme}')
  print(f'R2= {r2}')

MAE= 3.0302549999999995
RSME= 4.062708178050695
R2= 0.927416470307902
MAE= 3.88707
RSME= 5.2391171603620394
R2= 0.8834228852593191


To find out which K leads to the best predictive performance of the model:

In [None]:
# Look up the hyperparameter values selected by the 'grid' step of the fitted pipeline.
pipe.named_steps['grid'].best_params_

{'n_neighbors': 5}

So, to get the best predictive performance from the model (among the values tried), we should use K=5 instead of K=3.