# PD4 - Adam Frej

## Import paczek i danych:

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib
from sklearn.svm import SVR,SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
import category_encoders as ce
import warnings
warnings.filterwarnings('ignore')

In [2]:
apartments_df = pd.read_csv("apartments.csv")

In [3]:
apartments_df.head()

Unnamed: 0,m2.price,construction.year,surface,floor,no.rooms,district
0,5897,1953,25,3,1,Srodmiescie
1,1818,1992,143,9,5,Bielany
2,3643,1937,56,1,2,Praga
3,3517,1995,93,7,3,Ochota
4,3013,1992,144,6,5,Mokotow


# Zadanie podstawowe

## Preprocessing

Sprawdźmy czy potrzeba dokonać normalizacji stringów.

In [4]:
np.unique(apartments_df['district'])

array(['Bemowo', 'Bielany', 'Mokotow', 'Ochota', 'Praga', 'Srodmiescie',
       'Ursus', 'Ursynow', 'Wola', 'Zoliborz'], dtype=object)

Jak widać dane są poprawne.\
Zastosujmy one hot encdoing na zmiennej "district".

In [5]:
encoder = ce.OneHotEncoder(cols="district")
apartments_df = encoder.fit_transform(apartments_df, apartments_df["m2.price"])
apartments_df.head()

Unnamed: 0,m2.price,construction.year,surface,floor,no.rooms,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10
0,5897,1953,25,3,1,1,0,0,0,0,0,0,0,0,0
1,1818,1992,143,9,5,0,1,0,0,0,0,0,0,0,0
2,3643,1937,56,1,2,0,0,1,0,0,0,0,0,0,0
3,3517,1995,93,7,3,0,0,0,1,0,0,0,0,0,0
4,3013,1992,144,6,5,0,0,0,0,1,0,0,0,0,0


Załadujmy nasz drugi zbiór danych, czyli pakiet wine z sklearn.datasets. Ten zbiór jest już przygotowany do modelowania.

In [6]:
wine = load_wine(as_frame=True)
X_wine = wine['data']
y_wine = wine['target']
disp = X_wine.copy()
disp['target']=y_wine
disp.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


## SVM

SVM dla apartments. Stosuję RMSE jako metrykę, ponieważ apartments wymaga regresji.

In [7]:
y = apartments_df['m2.price']
X = apartments_df.drop(['m2.price'], axis=1)
svm = SVR()
results=cross_val_score(svm, X, y, cv=4, scoring='neg_root_mean_squared_error')
print(np.mean(results), np.std(results))

-911.4198210912405 38.54245059829314


Zalinkowany artykuł zalecał skalowanie danych, dla tego tutaj je stosujemy. Jak widać wyniki się polepszyły.

In [8]:
pipe = Pipeline(
    [
        ('scale', StandardScaler()),
        ('SVM', SVR())
    ]
)
results=cross_val_score(pipe, X, y, cv=4, scoring='neg_root_mean_squared_error')
print(np.mean(results), np.std(results))

-889.5550811374401 38.118021399024464


SVM dla wine. Stosuję ROC AUC Over jako metrykę, ponieważ wine wymaga klasyfikacji.

In [9]:
svm = SVC(probability=True)
results_wine=cross_val_score(svm, X_wine, y_wine, scoring='roc_auc_ovr')
print(np.mean(results_wine), np.std(results_wine))

0.8649796315945384 0.013181747837235767


SVM dla wine ze skalowaniem. Ponownie wyniki się polepszyły.

In [10]:
pipe = Pipeline(
    [
        ('scale', StandardScaler()),
        ('SVM', SVC(probability=True))
    ]
)
results_wine=cross_val_score(pipe, X_wine, y_wine, cv=4, scoring='roc_auc_ovr')
print(np.mean(results_wine), np.std(results_wine))

0.9998285322359396 0.0002969908792128092


## Random Search

Dla apartments:

In [11]:
pipe = Pipeline(
    [
        ('scale', StandardScaler()),
        ('SVM', SVR())
    ]
)
random = RandomizedSearchCV(estimator=pipe, 
                            param_distributions=dict(SVM__C=[0.5, 1.0, 2.0], 
                                                     SVM__gamma=['scale', 'auto'], 
                                                     SVM__degree=[1, 2, 3, 4, 5]), 
                            cv = 4, n_jobs=-1, scoring='neg_root_mean_squared_error')
random_result = random.fit(X, y)
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))

Best: -867.778023 using {'SVM__gamma': 'auto', 'SVM__degree': 5, 'SVM__C': 2.0}


Dla wine:

In [12]:
pipe = Pipeline(
    [
        ('scale', StandardScaler()),
        ('SVM', SVC(probability=True))
    ]
)
random_wine = RandomizedSearchCV(estimator=pipe, 
                            param_distributions=dict(SVM__C=[0.5, 1.0, 2.0], 
                                                     SVM__gamma=['scale', 'auto'], 
                                                     SVM__degree=[1, 2, 3, 4, 5]), 
                            cv = 4, n_jobs=-1, scoring='roc_auc_ovr')
random_result_wine = random_wine.fit(X_wine, y_wine)
print("Best: %f using %s" % (random_result_wine.best_score_, random_result_wine.best_params_))

Best: 0.999829 using {'SVM__gamma': 'scale', 'SVM__degree': 3, 'SVM__C': 0.5}


Random Search dał najlepsze wyniki, ale nie są one znacząco polepszone.

# Zadanie bonusowe

## Preprocessing

In [13]:
allegro_df = pd.read_csv("allegro-api-transactions.csv")
allegro_df.head()

Unnamed: 0,lp,date,item_id,categories,pay_option_on_delivery,pay_option_transfer,seller,price,it_is_allegro_standard,it_quantity,it_is_brand_zone,it_seller_rating,it_location,main_category
0,0,2016-04-03 21:21:08,4753602474,"['Komputery', 'Dyski i napędy', 'Nośniki', 'No...",1,1,radzioch666,59.99,1,997,0,50177,Warszawa,Komputery
1,1,2016-04-03 15:35:26,4773181874,"['Odzież, Obuwie, Dodatki', 'Bielizna damska',...",1,1,InwestycjeNET,4.9,1,9288,0,12428,Warszawa,"Odzież, Obuwie, Dodatki"
2,2,2016-04-03 14:14:31,4781627074,"['Dom i Ogród', 'Budownictwo i Akcesoria', 'Śc...",1,1,otostyl_com,109.9,1,895,0,7389,Leszno,Dom i Ogród
3,3,2016-04-03 19:55:44,4783971474,"['Książki i Komiksy', 'Poradniki i albumy', 'Z...",1,1,Matfel1,18.5,0,971,0,15006,Wola Krzysztoporska,Książki i Komiksy
4,4,2016-04-03 18:05:54,4787908274,"['Odzież, Obuwie, Dodatki', 'Ślub i wesele', '...",1,1,PPHU_RICO,19.9,1,950,0,32975,BIAŁYSTOK,"Odzież, Obuwie, Dodatki"


Wycinamy potrzebne zmienne.

In [14]:
allegro_df = allegro_df[['main_category', 'categories', 'it_location', 'price']]
allegro_df.head()

Unnamed: 0,main_category,categories,it_location,price
0,Komputery,"['Komputery', 'Dyski i napędy', 'Nośniki', 'No...",Warszawa,59.99
1,"Odzież, Obuwie, Dodatki","['Odzież, Obuwie, Dodatki', 'Bielizna damska',...",Warszawa,4.9
2,Dom i Ogród,"['Dom i Ogród', 'Budownictwo i Akcesoria', 'Śc...",Leszno,109.9
3,Książki i Komiksy,"['Książki i Komiksy', 'Poradniki i albumy', 'Z...",Wola Krzysztoporska,18.5
4,"Odzież, Obuwie, Dodatki","['Odzież, Obuwie, Dodatki', 'Ślub i wesele', '...",BIAŁYSTOK,19.9


Normalizacja danych kategorycznych.

In [15]:
allegro_df['it_location'] = allegro_df['it_location'].str.lower()
allegro_df['main_category'] = allegro_df['main_category'].str.lower()
allegro_df['categories'] = allegro_df['categories'].str.lower()
allegro_df.head()

Unnamed: 0,main_category,categories,it_location,price
0,komputery,"['komputery', 'dyski i napędy', 'nośniki', 'no...",warszawa,59.99
1,"odzież, obuwie, dodatki","['odzież, obuwie, dodatki', 'bielizna damska',...",warszawa,4.9
2,dom i ogród,"['dom i ogród', 'budownictwo i akcesoria', 'śc...",leszno,109.9
3,książki i komiksy,"['książki i komiksy', 'poradniki i albumy', 'z...",wola krzysztoporska,18.5
4,"odzież, obuwie, dodatki","['odzież, obuwie, dodatki', 'ślub i wesele', '...",białystok,19.9


Target Encoding. Parametr smoothing jest istotny, ponieważ ma wpływ na regularyzację. Im wyższa jego wartość, tym mocniejsza regularyzacja. Jednak nie jestem pewien jak go zeskalować.

In [16]:
encoder = ce.TargetEncoder(cols=["main_category", "categories", "it_location"])
allegro_df = encoder.fit_transform(allegro_df, allegro_df["price"])
allegro_df

Unnamed: 0,main_category,categories,it_location,price
0,121.810064,61.839771,84.132898,59.99
1,75.858049,12.375798,84.132898,4.90
2,72.434754,105.272597,64.883187,109.90
3,25.027497,24.072564,35.433365,18.50
4,75.858049,17.102344,73.772916,19.90
...,...,...,...,...
420015,107.534272,76.811350,26.346402,180.00
420016,28.128577,23.312063,63.645927,14.99
420017,75.858049,9.630092,18.682800,5.99
420018,71.206386,134.824626,104.254805,200.00


## Modelowanie

Model liniowy z dwiema wymaganymi metrykami - RMSE i R2.

In [17]:
y = allegro_df['price']
X = allegro_df.drop(['price'], axis=1)
results=cross_validate(LinearRegression(), X, y, cv=4, scoring=['neg_root_mean_squared_error', 'r2'])
results_RMSE = results['test_neg_root_mean_squared_error']
results_R2 = results['test_r2']
print("RMSE: ", np.mean(results_RMSE), np.std(results_RMSE))
print("R2: ", np.mean(results_R2), np.std(results_R2))

RMSE:  -321.8969753137839 75.0087116513592
R2:  0.2869341607117164 0.14774291776015055


Regularyzacja l2. Model liniowy Ridge stosuje taką regularyzację.

In [18]:
y = allegro_df['price']
X = allegro_df.drop(['price'], axis=1)
results=cross_validate(Ridge(), X, y, cv=4, scoring=['neg_root_mean_squared_error', 'r2'])
results_RMSE = results['test_neg_root_mean_squared_error']
results_R2 = results['test_r2']
print("RMSE: ", np.mean(results_RMSE), np.std(results_RMSE))
print("R2: ", np.mean(results_R2), np.std(results_R2))

RMSE:  -321.89697531231684 75.00871165216113
R2:  0.2869341607193705 0.1477429177609861


Regularyzacja L1. Model liniowy Lasso stosuje taką regularyzację.

In [19]:
y = allegro_df['price']
X = allegro_df.drop(['price'], axis=1)
results=cross_validate(Lasso(), X, y, cv=4, scoring=['neg_root_mean_squared_error', 'r2'])
results_RMSE = results['test_neg_root_mean_squared_error']
results_R2 = results['test_r2']
print("RMSE: ", np.mean(results_RMSE), np.std(results_RMSE))
print("R2: ", np.mean(results_R2), np.std(results_R2))

RMSE:  -321.89664767171183 75.00982426928391
R2:  0.28693529648670124 0.1477501383654618


Jak widać regularyzacja bardzo nieznacznie wpłynęła na wyniki.

Dodatkowy model regresyjny - regresyjne drzewo decyzyjne. Osiąga gorsze wyniki.

In [20]:
y = allegro_df['price']
X = allegro_df.drop(['price'], axis=1)
results=cross_validate(DecisionTreeRegressor(), X, y, cv=4, scoring=['neg_root_mean_squared_error', 'r2'])
results_RMSE = results['test_neg_root_mean_squared_error']
results_R2 = results['test_r2']
print("RMSE: ", np.mean(results_RMSE), np.std(results_RMSE))
print("R2: ", np.mean(results_R2), np.std(results_R2))

RMSE:  -374.9019022883412 77.7420973780097
R2:  0.030705816219388893 0.14729123692550242
