In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge
from sklearn.metrics import mean_squared_error, r2_score

In [9]:
data = pd.read_csv("https://www.dropbox.com/s/360xhh2d9lnaek3/allegro-api-transactions.csv?dl=1")
selected_columns = ['main_category', 'categories', 'it_location', 'price']
categorical = ['main_category', 'categories', 'it_location']
data = data[selected_columns]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420020 entries, 0 to 420019
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   main_category  420020 non-null  object 
 1   categories     420020 non-null  object 
 2   it_location    420020 non-null  object 
 3   price          420020 non-null  float64
dtypes: float64(1), object(3)
memory usage: 12.8+ MB


Przeprowadzam transformacje pre-processingowe analogiczne do tych z pracy domowej 2. 

In [10]:
data['it_location'] = data['it_location'].str.lower()

In [12]:
data[categorical].describe()

Unnamed: 0,main_category,categories,it_location
count,420020,420020,420020
unique,27,9020,7903
top,Dom i Ogród,"['Dom i Ogród', 'Ogród', 'Rośliny', 'Rośliny o...",warszawa
freq,91042,3753,27042


Pomimo zastosowanych sztuczek, ilość unikalnych kategorii zmiennych `categories` oraz `it_location` jest bardzo duża.

Dalej, aby mieć pewien układ odniesienia, wykorzystamy najprostszy model (`LinearRegression`) oraz standardowy encoding (`TargetEncoder`).

In [15]:
X_train, X_test, y_train, y_test = train_test_split(data[categorical], data["price"], test_size=0.25, random_state=37)

In [16]:
my_pipeline = Pipeline(steps=[('process', TargetEncoder()),  ('model', LinearRegression())])  
parameters = {
    'process__smoothing': [0.05*10**(i) for i in range(1,6)],
    'process__min_samples_leaf': [i for i in range(1,5)]
             }
model_linear = GridSearchCV(my_pipeline, parameters,
                          cv = 5, 
                          scoring = ['neg_root_mean_squared_error', 'r2'], 
                          refit = 'neg_root_mean_squared_error')

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('process', TargetEncoder()),
                                       ('model', LinearRegression())]),
             param_grid={'process__min_samples_leaf': [1, 2, 3, 4],
                         'process__smoothing': [0.5, 5.0, 50.0, 500.0, 5000.0]},
             refit='neg_root_mean_squared_error',
             scoring=['neg_root_mean_squared_error', 'r2'])

In [None]:
model_linear.fit(X_train, y_train)

In [32]:
def judge_model(model, x, y):
    my_predictions_y = model.predict(x)
    rmse_score = f'RMSE: {mean_squared_error(y, my_predictions_y, squared=False):.1f}\n'
    r2 = f'R^2 score: {r2_score(y, my_predictions_y):.4f}'
    print(rmse_score + r2)

In [33]:
judge_model(model_linear, X_test, y_test)

RMSE: 402.8
R^2 score: 0.0926


Dalej wykorzystamy dwie najprostsze metody regularyzacji oferowane przez pakiet `scikit-learn` - Ridge i Lasso.


1. [Ridge - opis](https://scikit-learn.org/stable/modules/linear_model.html#ridge-regression-and-classification)
2. [Lasso - opis](https://scikit-learn.org/stable/modules/linear_model.html#lasso)

## Ridge

In [22]:
pipeline_ridge = Pipeline(steps=[('process', TargetEncoder()),('model', Ridge())])  
params_ridge = parameters
params_ridge["model__alpha"] = [0.05*10**(i) for i in range(1,5)]
model_ridge = GridSearchCV(pipeline_ridge, params_ridge,
                          cv = 2, 
                          scoring = ['neg_root_mean_squared_error', 'r2'], 
                          refit = 'neg_root_mean_squared_error')


In [23]:
model_ridge.fit(X_train, y_train)

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('process', TargetEncoder()),
                                       ('model', Ridge())]),
             param_grid={'model__alpha': [0.5, 5.0, 50.0, 500.0],
                         'process__min_samples_leaf': [1, 2, 3, 4],
                         'process__smoothing': [0.5, 5.0, 50.0, 500.0, 5000.0]},
             refit='neg_root_mean_squared_error',
             scoring=['neg_root_mean_squared_error', 'r2'])

In [34]:
judge_model(model_ridge,X_test,y_test)

RMSE: 402.6
R^2 score: 0.0934


Możemy więc zauważyć, że wynik jest praktycznie ten sam. 

[A to ja gdy widzę ten rezultat.](https://www.google.com/search?q=smutna+%C5%BCabka&sxsrf=ALeKk03JNu2HRvbAWA72dTrRm-TD8VT8Eg:1619458577200&tbm=isch&source=iu&ictx=1&fir=BtRo85q8mlNvyM%252Ckl6Ur4-I2CVQXM%252C_&vet=1&usg=AI4_-kQGvvrTZnglbHW0OPSt0oZx6fS4HA&sa=X&ved=2ahUKEwjTtOOpuZzwAhWCi8MKHVI_AW0Q9QF6BAgKEAE&biw=1478&bih=759#imgrc=BtRo85q8mlNvyM)

## Lasso

In [35]:
pipeline_lasso = Pipeline(steps=[('process', TargetEncoder()),  
                     ('model', Lasso())])  
params_lasso = params_ridge
model_lasso = GridSearchCV(pipeline_lasso, params_lasso,
                          cv = 2, 
                          scoring = ['neg_root_mean_squared_error', 'r2'], 
                          refit = 'neg_root_mean_squared_error')

In [36]:
model_lasso.fit(X_train, y_train)

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('process', TargetEncoder()),
                                       ('model', Lasso())]),
             param_grid={'model__alpha': [0.5, 5.0, 50.0, 500.0],
                         'process__min_samples_leaf': [1, 2, 3, 4],
                         'process__smoothing': [0.5, 5.0, 50.0, 500.0, 5000.0]},
             refit='neg_root_mean_squared_error',
             scoring=['neg_root_mean_squared_error', 'r2'])

In [37]:
judge_model(model_lasso,X_test,y_test)

RMSE: 402.2
R^2 score: 0.0952


Możemy więc zauważyć, że wynik jest praktycznie ten sam. 

[A to ja gdy widzę ten rezultat.](https://www.google.com/search?q=smutna+%C5%BCabka&sxsrf=ALeKk03JNu2HRvbAWA72dTrRm-TD8VT8Eg:1619458577200&tbm=isch&source=iu&ictx=1&fir=BtRo85q8mlNvyM%252Ckl6Ur4-I2CVQXM%252C_&vet=1&usg=AI4_-kQGvvrTZnglbHW0OPSt0oZx6fS4HA&sa=X&ved=2ahUKEwjTtOOpuZzwAhWCi8MKHVI_AW0Q9QF6BAgKEAE&biw=1478&bih=759#imgrc=BtRo85q8mlNvyM)

### Inny model - BayesianRidge

In [52]:
pipeline_bayesian = Pipeline(steps=[('process', TargetEncoder()),  
                     ('model', BayesianRidge())])  
param_bayesian = {'model__alpha_1' : [6 , 7 , 8 , 9 , 10] , 
              'model__lambda_1' : [6 , 7 , 8 , 9 , 10]}
model_bayesian = GridSearchCV(pipeline_bayesian, param_bayesian, cv=2)

In [54]:
model_bayesian.fit(X_train,y_train)

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('process', TargetEncoder()),
                                       ('model', BayesianRidge())]),
             param_grid={'model__alpha_1': [6, 7, 8, 9, 10],
                         'model__lambda_1': [6, 7, 8, 9, 10]})

In [55]:
judge_model(model_bayesian,X_test,y_test)

RMSE: 402.5
R^2 score: 0.0936


Widzimy więc, że osiągnięty rezultat jest porównywalny z poprzednimi.

## Wnioski

Na podstawie przeprowadzonej pracy, możemy wysnuć hipotezę, że w sytuacji gdy model liniowy "nie odpowiada" danemu problemowi, jakiekolwiek sztuczki związane z regularyzacją czy parametryzacją nie pomagają. Musimy wtedy sięgnąć po bardziej złożone modele uczenia maszynowego.