In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt

In [2]:
allegro_df = pd.read_csv("https://www.dropbox.com/s/360xhh2d9lnaek3/allegro-api-transactions.csv?dl=1")

In [3]:
lokacje = allegro_df['it_location'].copy()
lokacje = lokacje.apply(lambda x : x.lower())
lokacje = lokacje.apply(lambda x : x.replace('ó', 'o').replace('ż', 'z').replace('ś','s').replace('ź','z').replace('ł','l').replace('ą','a').replace('ę','e').replace('ń','n'))
rzadkie_lokacje = lokacje.value_counts()[(lokacje.value_counts()==1) | (lokacje.value_counts()==2)].index
lokacje = pd.Series(np.where(lokacje.isin(rzadkie_lokacje), 'Other', lokacje))

In [4]:
allegro_df['it_location'] = lokacje

In [5]:
allegro_df = allegro_df.loc[:, ['categories','main_category','it_location', 'price']]

#### Target Encoding

In [6]:
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 16.1MB/s eta 0:00:01[K     |████████▏                       | 20kB 7.8MB/s eta 0:00:01[K     |████████████▏                   | 30kB 7.7MB/s eta 0:00:01[K     |████████████████▎               | 40kB 7.3MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 4.2MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 4.9MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 5.3MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 3.6MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [7]:
from category_encoders.target_encoder import TargetEncoder

  import pandas.util.testing as tm


In [8]:

te_smooth = TargetEncoder(smoothing=10)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [10]:
X = allegro_df.drop(['price'],axis=1)
y = allegro_df['price']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25)

Smoothing- pomaga szczególnie przy wartościach kategorycznych występująych rzadko, dodaje warunek ze jeśli wartości jest mniej niż (jakaś ustalona wartości parametru), to zostaną one zastąpione globalną średnią zamiast lokalnej

In [12]:
X_train = te_smooth.fit_transform(X_train,y_train)

  elif pd.api.types.is_categorical(cols):


In [13]:
X_test = te_smooth.fit_transform(X_test,y_test)

  elif pd.api.types.is_categorical(cols):


In [14]:
lr = LinearRegression()

In [15]:
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [16]:
from sklearn.metrics import mean_squared_error, r2_score

In [17]:
def scoring(X,y, model):
  print("RMSE: {}".format(mean_squared_error(y, model.predict(X), squared=False)))
  print("R2 score: {}".format(r2_score(y, model.predict(X))))

In [18]:
scoring(X_test,y_test,lr)

RMSE: 272.55902670515286
R2 score: 0.19449069992765355


#### Regularyzacja

In [19]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [20]:
ridge = Ridge()
lasso = Lasso()

In [21]:
ridge.fit(X_train,y_train)
lasso.fit(X_train,y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [22]:
scoring(X_test,y_test,ridge)

RMSE: 272.5590267034831
R2 score: 0.194490699937523


In [23]:
scoring(X_test,y_test,lasso)

RMSE: 272.5595051782089
R2 score: 0.19448787180752547


W obydwu rozwiązaniach do wyliczanych wartości dodajemy pewną "karę" mającą zapobiegać przeuczaniu.

*   Ridge- suma kwadratów wag
*   Lasso- suma wartości absolutnych wag



Jak widać, powyższe działania nie wpłynęły znacząco na wyniki.

#### Test innego modelu

In [24]:
from xgboost import XGBRegressor

In [25]:
xgbr = XGBRegressor(
                    learning_rate=0.02, 
                    booster='gbtree', 
                    nround = 200, 
                    max_depth=3)

In [26]:
xgbr.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.02, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nround=200, nthread=None, objective='reg:linear',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [27]:
scoring(X_test, y_test, xgbr)

RMSE: 275.1240183921943
R2 score: 0.17925842821900273
