# Zadanie Bonusowe - Wstęp do uczenia maszynowego
## Kinga Ułasik IiAD 27.04.2021

# Potrzebne importy

In [1]:
import warnings

import category_encoders as ce
import numpy as np
import pandas as pd
from numpy import log
from scipy.stats import uniform
from sklearn import metrics
from sklearn import svm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

warnings.filterwarnings('ignore')

# Wczytanie danych

In [2]:
# Read the transactions export and keep only the target ('price', deliberately
# first) plus the three categorical columns used as features.
selected_columns = ['price', 'main_category', 'categories', 'it_location']
allegro = pd.read_csv('allegro-api-transactions.csv')[selected_columns]
allegro.head()

Unnamed: 0,price,main_category,categories,it_location
0,59.99,Komputery,"['Komputery', 'Dyski i napędy', 'Nośniki', 'No...",Warszawa
1,4.9,"Odzież, Obuwie, Dodatki","['Odzież, Obuwie, Dodatki', 'Bielizna damska',...",Warszawa
2,109.9,Dom i Ogród,"['Dom i Ogród', 'Budownictwo i Akcesoria', 'Śc...",Leszno
3,18.5,Książki i Komiksy,"['Książki i Komiksy', 'Poradniki i albumy', 'Z...",Wola Krzysztoporska
4,19.9,"Odzież, Obuwie, Dodatki","['Odzież, Obuwie, Dodatki', 'Ślub i wesele', '...",BIAŁYSTOK


# Target Encoding

In [3]:
# Lower-case the location strings so that differently-cased spellings of the
# same place end up in one category.
lowercased_locations = allegro['it_location'].str.lower()
allegro['it_location'] = lowercased_locations

In [4]:
import category_encoders as ce

# Target-encode the location column against the price target.
target_encoder = ce.TargetEncoder()
location_target = target_encoder.fit_transform(
    allegro['it_location'],
    allegro['price'],
)
print(location_target)

        it_location
0         84.132898
1         84.132898
2         64.883187
3         35.433365
4         73.772916
...             ...
420015    26.346402
420016    63.645927
420017    18.682800
420018   104.254805
420019    78.136792

[420020 rows x 1 columns]


In [5]:
# Target-encode the full category path against the price target
# (a fresh encoder, so nothing is shared with the other columns).
target_encoder = ce.TargetEncoder()
categories_target = target_encoder.fit_transform(
    allegro['categories'],
    allegro['price'],
)
print(categories_target)

        categories
0        61.839771
1        12.375798
2       105.272597
3        24.072564
4        17.102344
...            ...
420015   76.811350
420016   23.312063
420017    9.630092
420018  134.824626
420019  965.619857

[420020 rows x 1 columns]


In [6]:
# Target-encode the top-level category against the price target
# (a fresh encoder, so nothing is shared with the other columns).
target_encoder = ce.TargetEncoder()
main_category_target = target_encoder.fit_transform(
    allegro['main_category'],
    allegro['price'],
)
print(main_category_target)

        main_category
0          121.810064
1           75.858049
2           72.434754
3           25.027497
4           75.858049
...               ...
420015     107.534272
420016      28.128577
420017      75.858049
420018      71.206386
420019     134.426801

[420020 rows x 1 columns]


In [8]:
# Overwrite each categorical column with its target-encoded counterpart.
encoded_columns = (
    ('it_location', location_target),
    ('categories', categories_target),
    ('main_category', main_category_target),
)
for column_name, encoded_values in encoded_columns:
    allegro[column_name] = encoded_values
allegro.head()

Unnamed: 0,price,main_category,categories,it_location
0,59.99,121.810064,61.839771,84.132898
1,4.9,75.858049,12.375798,84.132898
2,109.9,72.434754,105.272597,64.883187
3,18.5,25.027497,24.072564,35.433365
4,19.9,75.858049,17.102344,73.772916


# Smoothing

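Smoothing przy target encodingu jest istotny, ponieważ sam target encoding może prowadzić do overfittingu: opieranie się wyłącznie na średniej w grupie nie zawsze jest dobrym pomysłem, np. gdy liczba obserwacji użytych do jej wyliczenia jest niewielka. Smoothing polega na „wygładzeniu” średniej grupowej poprzez uwzględnienie średniej globalnej: $\text{smooth} = \dfrac{n \cdot \bar{y}_{\text{grupa}} + m \cdot \bar{y}_{\text{global}}}{n + m}$, gdzie $n$ to liczność grupy, a $m$ to waga średniej globalnej.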

In [9]:
def calc_smooth_mean(df, by, on, m):
    """Return ``df[by]`` re-encoded as a smoothed per-group mean of ``df[on]``.

    Each group's mean is pulled towards the global mean of ``df[on]``:

        smooth = (count * group_mean + m * global_mean) / (count + m)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    by : str
        Name of the categorical column that defines the groups.
    on : str
        Name of the target column that is averaged.
    m : number
        Weight of the global mean; ``m=0`` yields the plain group means.

    Returns
    -------
    pandas.Series
        One smoothed value per row of ``df``, aligned with ``df[by]``.
    """
    global_mean = df[on].mean()

    # Per-group size and mean of the target.
    group_stats = df.groupby(by)[on].agg(['count', 'mean'])
    group_size = group_stats['count']
    group_mean = group_stats['mean']

    # Blend each group mean with the global mean, weighted by group size vs. m.
    smoothed_by_group = (group_size * group_mean + m * global_mean) / (group_size + m)

    # Look up the smoothed value for every row's category.
    return df[by].map(smoothed_by_group)

In [10]:
# Replace every feature column by its smoothed mean price (global-mean weight m=10).
# NOTE(review): at this point the columns already hold the TargetEncoder output
# assigned earlier, so rows are grouped by encoded value, not by the raw label.
for column_name in ('it_location', 'categories', 'main_category'):
    allegro[column_name] = calc_smooth_mean(allegro, by=column_name, on='price', m=10)
allegro.head()

Unnamed: 0,price,main_category,categories,it_location
0,59.99,121.779033,68.968738,84.130192
1,4.9,75.858225,13.221409,84.130192
2,109.9,72.435235,102.00119,64.98673
3,18.5,25.072207,28.225224,39.063013
4,19.9,75.858225,25.171128,73.776217


# Tworzenie modelu

In [15]:
# Target is 'price' (column 0 of the frame); everything else is a feature.
# NOTE(review): the encoded features were computed from the prices of all rows
# before this split, so the test features already contain target information.
y = allegro.iloc[:, 0]
X = allegro.drop(columns='price')

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

## Random Forest regressor

In [38]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree forest on the training split and predict the held-out rows.
forestreg = RandomForestRegressor(random_state=0, n_estimators=100)
forestreg.fit(X_train, y_train)
y_pred = forestreg.predict(X_test)

# Hold-out metrics, with the target mean printed for scale.
print('RMSE : %s' % mean_squared_error(y_test, y_pred, squared=False))
print('R2 squared : %s' % r2_score(y_test, y_pred))
print('Mean of the target variable: %s' % y.mean())

RMSE : 278.82173583912515
R2 squared : 0.07205348545219736
Mean of the target variable: 76.81135036426836


In [39]:
from sklearn.model_selection import cross_validate

# One 20-fold run that scores both metrics, so the forest is refitted 20 times
# instead of 40. Multi-metric results are keyed as 'test_<scorer name>'.
# NOTE(review): the folds are drawn from the held-out (X_test, y_test) split,
# i.e. the model is re-trained and scored on test data only - confirm intent.
forest_cv = cross_validate(
    forestreg, X_test, y_test, cv=20,
    scoring=['neg_root_mean_squared_error', 'r2'],
)
# The RMSE scorer is negated by scikit-learn; flip the sign back for reporting.
print((-1)*np.mean(forest_cv['test_neg_root_mean_squared_error']))
print(np.mean(forest_cv['test_r2']))

214.11429308280972
0.26025343494510655


# Regularyzacja

## Ridge Regression

Krótki opis:
* Ridge Regression wykorzystuje technikę regularyzacji L2
* Podczas aktualizacji wag zmiennych funkcja straty (loss function) zawiera dodatkowy składnik kwadratowy: karę $\alpha \sum_j w_j^2$
* Zmniejsza ogólny rozmiar wartości wagi zmiennych podczas optymalizacji i zmniejsza szanse na overfitting

In [27]:
from sklearn.linear_model import Ridge

# Baseline: alpha=0 switches the L2 penalty off, i.e. an unregularised fit.
ridgereg = Ridge(normalize=True, alpha=0)
ridgereg.fit(X_train, y_train)
y_pred = ridgereg.predict(X_test)

# Hold-out metrics, with the target mean printed for scale.
print('RMSE : %s' % mean_squared_error(y_test, y_pred, squared=False))
print('R2 squared : %s' % r2_score(y_test, y_pred))
print('Mean of the target variable: %s' % y.mean())

RMSE : 255.45870778675553
R2 squared : 0.22104725225413357
Mean of the target variable: 76.81135036426836


In [29]:
from sklearn.linear_model import RidgeCV

# Search for the best alpha on the log grid 10^-2 ... 10^2
# (the higher the alpha, the stronger the regularisation).
alpha_range = 10.**np.arange(-2, 3)

ridgeregcv = RidgeCV(
    alphas=alpha_range,
    scoring='neg_mean_squared_error',
    normalize=True,
)
ridgeregcv.fit(X_train, y_train)
ridgeregcv.alpha_

0.01

In [30]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate

# Refit Ridge with the alpha selected by RidgeCV, read from the fitted search
# object instead of being pasted in as a literal.
ridgereg = Ridge(alpha=ridgeregcv.alpha_, normalize=True)
ridgereg.fit(X_train, y_train)
y_pred = ridgereg.predict(X_test)

# One 20-fold run that scores both metrics (previously the identical CV was
# executed twice). Multi-metric results are keyed as 'test_<scorer name>'.
# NOTE(review): the folds are drawn from the held-out (X_test, y_test) split,
# i.e. the model is re-trained and scored on test data only - confirm intent.
ridge_cv = cross_validate(
    ridgereg, X_test, y_test, cv=20,
    scoring=['neg_root_mean_squared_error', 'r2'],
)
# The RMSE scorer is negated by scikit-learn; flip the sign back for reporting.
print((-1)*np.mean(ridge_cv['test_neg_root_mean_squared_error']))
print(np.mean(ridge_cv['test_r2']))

207.58385346425817
0.36499445086244353


## Lasso Regression

Krótki opis:
* Wykorzystuje technikę regularyzacji L1
* Podczas aktualizacji wag zmiennych funkcja straty (loss function) zawiera dodatkowy element: normę L1 wektora wag, $\alpha \sum_j |w_j|$
* Wagi niektórych zmiennych w pewnym momencie spadają do zera, skutecznie eliminując w ten sposób te cechy, które powodują problemy z dużą wariancją i nadmiernym dopasowaniem modelu

In [32]:
from sklearn.linear_model import Lasso

# Baseline: alpha=0 removes the L1 penalty, i.e. an unpenalised fit.
lassoreg = Lasso(normalize=True, alpha=0)
lassoreg.fit(X_train, y_train)
y_pred = lassoreg.predict(X_test)

# Hold-out metrics, with the target mean printed for scale.
print('RMSE : %s' % mean_squared_error(y_test, y_pred, squared=False))
print('R2 squared : %s' % r2_score(y_test, y_pred))
print('Mean of the target variable: %s' % y.mean())

RMSE : 255.45870778675553
R2 squared : 0.22104725225413357
Mean of the target variable: 76.81135036426836


In [31]:
from sklearn.linear_model import LassoCV

# Let LassoCV pick alpha by cross-validation over 100 candidate values.
lassoregcv = LassoCV(normalize=True, random_state=1, n_alphas=100)
lassoregcv.fit(X_train, y_train)
best_lasso_alpha = lassoregcv.alpha_
print('alpha : ', best_lasso_alpha)

alpha :  0.00031014483724454517


In [34]:
from sklearn.linear_model import Lasso

# Refit Lasso with the alpha selected by LassoCV, read from the fitted search
# object instead of being pasted in as a 17-digit literal.
lassoreg = Lasso(alpha=lassoregcv.alpha_, normalize=True)
lassoreg.fit(X_train, y_train)
y_pred = lassoreg.predict(X_test)

# One 20-fold run that scores both metrics (previously the identical CV was
# executed twice). Multi-metric results are keyed as 'test_<scorer name>'.
# NOTE(review): the folds are drawn from the held-out (X_test, y_test) split,
# i.e. the model is re-trained and scored on test data only - confirm intent.
lasso_cv = cross_validate(
    lassoreg, X_test, y_test, cv=20,
    scoring=['neg_root_mean_squared_error', 'r2'],
)
# The RMSE scorer is negated by scikit-learn; flip the sign back for reporting.
print((-1)*np.mean(lasso_cv['test_neg_root_mean_squared_error']))
print(np.mean(lasso_cv['test_r2']))

207.66213049380139
0.3641233397688211


## (Classic) Linear Regression

In [36]:
from sklearn.linear_model import LinearRegression

# Rebuild features/target and the same seeded 80/20 split used for the other models.
y = allegro.iloc[:, 0]
X = allegro.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

# Ordinary least squares without any penalty term.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Hold-out metrics, with the target mean printed for scale.
print('RMSE : %s' % mean_squared_error(y_test, y_pred, squared=False))
print('R2 squared : %s' % r2_score(y_test, y_pred))
print('Mean of the target variable: %s' % y.mean())

RMSE : 255.45870778675499
R2 squared : 0.2210472522541368
Mean of the target variable: 76.81135036426836


In [37]:
# One 20-fold run that scores both metrics (previously the identical CV was
# executed twice). Multi-metric results are keyed as 'test_<scorer name>'.
# NOTE(review): the folds are drawn from the held-out (X_test, y_test) split,
# i.e. the model is re-trained and scored on test data only - confirm intent.
linear_cv = cross_validate(
    regressor, X_test, y_test, cv=20,
    scoring=['neg_root_mean_squared_error', 'r2'],
)
# The RMSE scorer is negated by scikit-learn; flip the sign back for reporting.
print((-1)*np.mean(linear_cv['test_neg_root_mean_squared_error']))
print(np.mean(linear_cv['test_r2']))

207.66929994660555
0.36404281542884526


# Ewaluacja

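Biorąc pod uwagę wyniki przeprowadzonej kroswalidacji i patrząc jednocześnie na błąd RMSE oraz współczynnik determinacji R2, najlepszym modelem okazał się 'ridgereg' (czyli model zbudowany za pomocą Ridge Regression), choć różnice względem Lasso i klasycznej regresji liniowej są niewielkie (RMSE ok. 207,58 wobec ok. 207,66).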