# Praca domowa nr 2
## Jakub Lis

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from category_encoders import TargetEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Przygotowanie zbioru car_prices_poland

In [2]:
# wczytanie danych cars
cars = pd.read_csv('car_prices_poland/Car_Prices_Poland.csv')

cars.drop(columns=cars.columns[0], axis=1, inplace=True)

In [3]:
# przekształcenie danych cars do postaci uczącej

# generation_name posiada braki danych, to jest zmienna kategoryczna, więc braki uzupełnie stringiem 'BRAK'

cars.loc[cars['generation_name'].isna(), 'generation_name'] = 'BRAK'

# w province pojawiają się: Moravian-Silesian Region (35 wystapien), Berlin (3),
# Wiedeń (2), Trenczyn (1), Niedersachsen (1), Nordrhein-Westfalen (1), ( (1) - najpewniej błąd w danych
# Warto je zgrupowac do jednej kategorii

abroad = ['Trenczyn', 'Wiedeń', 'Niedersachsen', 'Nordrhein-Westfalen', 
          'Moravian-Silesian Region', 'Berlin', '(']
cars.loc[cars['province'].isin(abroad), 'province'] = 'abroad'

# mark, fuel, province - unikalnych wartosci jest akceptowalnie nieduzo, wiec uzylem one hot encodera

encoded = pd.get_dummies(cars[['mark', 'fuel', 'province']].astype(str))
encoded.drop(['mark_alfa-romeo', 'fuel_CNG', 'province_abroad'], axis=1, inplace=True)
cars.drop(['mark', 'fuel', 'province'], axis=1, inplace=True)
cars = pd.concat([cars, encoded], axis=1)

# w przypadku pozostalych zmiennych kategorycznych uzywam target encodera

y_cars = cars['price']
X_cars = cars.drop('price', axis=1)
encoder = TargetEncoder(cols=['model', 'generation_name', 'city'])
X_cars = encoder.fit_transform(X_cars, y_cars)

In [4]:
# podział na zbiory: treningowy, testowy i walidacyjny (w proporcjach 70%, 20%, 10%)

X_train_val_c, X_test_c, y_train_val_c, y_test_c = train_test_split(X_cars, y_cars, 
                                                                    random_state=420, test_size=0.2)
X_train_c, X_val_c, y_train_c, y_val_c = train_test_split(X_train_val_c, y_train_val_c, 
                                                          random_state=420, test_size=0.125)

# Przygotowanie zbioru airline_passenger_satisfaction

In [5]:
airline_train = pd.read_csv('airline_passenger_satisfaction/train.csv')
airline_test = pd.read_csv('airline_passenger_satisfaction/test.csv')

airline_train.drop(columns=airline_train.columns[0], axis=1, inplace=True)
airline_test.drop(columns=airline_test.columns[0], axis=1, inplace=True)

In [6]:
airline_train.info() # braki tylko w kolumnie Arrival Delay in Minutes, takze w zbiorze testowym

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 103904 non-null  int64  
 1   Gender                             103904 non-null  object 
 2   Customer Type                      103904 non-null  object 
 3   Age                                103904 non-null  int64  
 4   Type of Travel                     103904 non-null  object 
 5   Class                              103904 non-null  object 
 6   Flight Distance                    103904 non-null  int64  
 7   Inflight wifi service              103904 non-null  int64  
 8   Departure/Arrival time convenient  103904 non-null  int64  
 9   Ease of Online booking             103904 non-null  int64  
 10  Gate location                      103904 non-null  int64  
 11  Food and drink                     1039

In [7]:
# zlacze zbior treningowy z testowym na czas przeksztalcen, 
# nie bedzie potrzeby wykonywania tych samych czynnosci na obu zbiorach
# dodaje kolumne odpowiadajaca za info czy dany wiersz byl ze zbioru treningowego

airline_train['train'] = True
airline_test['train'] = False

airline = pd.concat([airline_train, airline_test], axis=0)

In [8]:
# transformacja kolumn z tylko dwiema kategoriami

airline['Gender'] = (airline['Gender'] == 'Female')*1
airline['Customer Type'] = (airline['Customer Type'] == 'Loyal Customer')*1
airline['Type of Travel'] = (airline['Type of Travel'] == 'Personal Travel')*1
airline['satisfaction'] = (airline['satisfaction'] == 'satisfied')*1

# uzupelnienie brakow danych srednia

airline.loc[airline['Arrival Delay in Minutes'].isna(), 
            'Arrival Delay in Minutes'] = airline['Arrival Delay in Minutes'].mean()

# one hot encoder

encoded = pd.get_dummies(airline[['Class']].astype(str))
encoded.drop(['Class_Eco Plus'], axis=1, inplace=True)
airline.drop(['Class'], axis=1, inplace=True)
airline = pd.concat([airline, encoded], axis=1)

# usuniecie kolumny id

airline.drop(['id'], axis=1, inplace=True)

In [9]:
# podział na zbiory: treningowy, testowy i walidacyjny (70%/20%/10%)
# zmienną predykowaną będzie satisfaction

airline_train = airline.loc[airline.train == True].drop(['train'], axis=1)
airline_test = airline.loc[airline.train == False].drop(['train'], axis=1)

X_test_a = airline_test.drop(['satisfaction'], axis=1)
y_test_a = airline_test['satisfaction']

# ze zbioru treningowego wydzielimy jeszcze zbior walidacyjny

X_train_val_a = airline_train.drop(['satisfaction'], axis=1)
y_train_val_a = airline_train['satisfaction']

X_train_a, X_val_a, y_train_a, y_val_a = train_test_split(X_train_val_a, y_train_val_a, 
                                                          random_state=420, test_size=0.125)

# Random Forest

## Zbiór car_prices (regresja)

### Ręczne testowanie różnych parametrów

In [10]:
# Będziemy szukać parametrów spośród: n_estimators, min_samples_split, max_depth
# Na początek ustawmy: n_estimators=100, min_samples_split=3, max_depth=5
rf1 = RandomForestRegressor(n_estimators=100, min_samples_split=3,
                           max_depth=5, random_state = 42)
rf1.fit(X_train_c, y_train_c)

RandomForestRegressor(max_depth=5, min_samples_split=3, random_state=42)

In [11]:
predictions = rf1.predict(X_val_c)
mse = mean_squared_error(y_val_c, predictions, squared=False)
print('RMSE:', round(mse, 2))

RMSE: 28622.32


In [12]:
# n_estimators=100, min_samples_split=6, max_depth=5
rf2 = RandomForestRegressor(n_estimators=100, min_samples_split=6,
                           max_depth=5, random_state = 42)
rf2.fit(X_train_c, y_train_c)

predictions = rf2.predict(X_val_c)
mse = mean_squared_error(y_val_c, predictions, squared=False)
print('RMSE:', round(mse, 2))

RMSE: 28617.04


In [13]:
# n_estimators=100, min_samples_split=3, max_depth=10
rf3 = RandomForestRegressor(n_estimators=100, min_samples_split=3,
                           max_depth=10, random_state = 42)
rf3.fit(X_train_c, y_train_c)

predictions = rf3.predict(X_val_c)
mse = mean_squared_error(y_val_c, predictions, squared=False)
print('RMSE:', round(mse, 2))

RMSE: 20434.13


In [14]:
# n_estimators=100, min_samples_split=3, max_depth=15
rf4 = RandomForestRegressor(n_estimators=100, min_samples_split=3,
                           max_depth=15, random_state = 42)
rf4.fit(X_train_c, y_train_c)

predictions = rf4.predict(X_val_c)
mse = mean_squared_error(y_val_c, predictions, squared=False)
print('RMSE:', round(mse, 2))

RMSE: 18608.68


In [15]:
# n_estimators=500, min_samples_split=3, max_depth=15
rf5 = RandomForestRegressor(n_estimators=500, min_samples_split=3,
                           max_depth=15, random_state = 42)
rf5.fit(X_train_c, y_train_c)

predictions = rf5.predict(X_val_c)
mse = mean_squared_error(y_val_c, predictions, squared=False)
print('RMSE:', round(mse, 2))

RMSE: 18705.27


In [16]:
# n_estimators=500, min_samples_split=2, max_depth=15
rf6 = RandomForestRegressor(n_estimators=500, min_samples_split=2,
                           max_depth=15, random_state = 42)
rf6.fit(X_train_c, y_train_c)

predictions = rf5.predict(X_val_c)
mse = mean_squared_error(y_val_c, predictions, squared=False)
print('RMSE:', round(mse, 2))

RMSE: 18705.27


Najniższe RMSE dał model rf4 na zbiorze walidacyjnym, policzymy więc dla tego modelu RMSE na zbiorze testowym.

In [17]:
predictions = rf4.predict(X_test_c)
mse = mean_squared_error(y_test_c, predictions, squared=False)
print('RMSE:', round(mse, 2))

RMSE: 20946.95


## Zbiór airline_passenger_satisfaction (klasyfikacja)

In [18]:
rf1 = RandomForestClassifier(n_estimators=100, min_samples_split=3,
                             max_depth=5, random_state = 42)
rf1.fit(X_train_a, y_train_a)

RandomForestClassifier(max_depth=5, min_samples_split=3, random_state=42)

In [19]:
predictions = rf1.predict(X_val_a)
f1 = f1_score(y_val_a, predictions)
print('F1-score:', round(f1, 4))

F1-score: 0.9079


In [20]:
# zwiekszmy n_estimators

rf2 = RandomForestClassifier(n_estimators=1000, min_samples_split=3,
                             max_depth=5, random_state = 42)
rf2.fit(X_train_a, y_train_a)

predictions = rf2.predict(X_val_a)
f1 = f1_score(y_val_a, predictions)
print('F1-score:', round(f1, 4))

F1-score: 0.909


In [21]:
# zwiekszenie n_estimators nie pomoglo znaczaco, zwiekszmy glebokosc drzew

rf3 = RandomForestClassifier(n_estimators=200, min_samples_split=3,
                             max_depth=10, random_state = 42)
rf3.fit(X_train_a, y_train_a)

predictions = rf3.predict(X_val_a)
f1 = f1_score(y_val_a, predictions)
print('F1-score:', round(f1, 4))

F1-score: 0.9338


In [22]:
# poniewaz przy 3-cim modelu zwiekszenie glebokosci polepszylo wynik, to sprobujmy jeszcze ja zwiekszyc

rf4 = RandomForestClassifier(n_estimators=200, min_samples_split=3,
                             max_depth=15, random_state = 42)
rf4.fit(X_train_a, y_train_a)

predictions = rf4.predict(X_val_a)
f1 = f1_score(y_val_a, predictions)
print('F1-score:', round(f1, 4))

F1-score: 0.9518


In [23]:
# pozostaje pozmieniac troche min_samples_split

rf5 = RandomForestClassifier(n_estimators=200, min_samples_split=2,
                             max_depth=15, random_state = 42)
rf5.fit(X_train_a, y_train_a)

predictions = rf5.predict(X_val_a)
f1 = f1_score(y_val_a, predictions)
print('F1-score:', round(f1, 4))

F1-score: 0.9522


In [24]:
# i jeszcze przy zwiekszonym min_samples_split

rf6 = RandomForestClassifier(n_estimators=200, min_samples_split=5,
                             max_depth=15, random_state = 42)
rf6.fit(X_train_a, y_train_a)

predictions = rf6.predict(X_val_a)
f1 = f1_score(y_val_a, predictions)
print('F1-score:', round(f1, 4))

F1-score: 0.9516


In [25]:
# najlepsze wyniki dal ostatni model, wiec dla niego wyznaczmy jeszcze wartosc na zbiorze testowym

predictions = rf5.predict(X_test_a)
f1 = f1_score(y_test_a, predictions)
print('F1-score:', round(f1, 4))

F1-score: 0.9527


In [26]:
# przy wyborze modelu sugerowalismy sie F1-score, ale mozemy jeszcze policzyc na zbiorze
# testowym wartosc accuracy

acc = accuracy_score(y_test_a, predictions)
print('Accuracy:', round(acc, 4))

Accuracy: 0.959


# XGBoost

## Zbiór car_prices

In [27]:
xgb1 = xgb.XGBRegressor(max_depth=3, seed=42, use_label_encoder=False,
                        learning_rate=0.1, min_child_weight=1)

xgb1.fit(X_train_c, y_train_c, verbose=True, early_stopping_rounds=5, eval_set=[(X_val_c, y_val_c)])

[0]	validation_0-rmse:102841.49219
[1]	validation_0-rmse:94502.55469
[2]	validation_0-rmse:86842.00781
[3]	validation_0-rmse:80047.47656
[4]	validation_0-rmse:73797.69531
[5]	validation_0-rmse:68418.00000
[6]	validation_0-rmse:63422.94531
[7]	validation_0-rmse:58945.41797
[8]	validation_0-rmse:55042.80859
[9]	validation_0-rmse:51486.74609
[10]	validation_0-rmse:48304.72656
[11]	validation_0-rmse:45474.54297
[12]	validation_0-rmse:43101.23047
[13]	validation_0-rmse:40919.07031
[14]	validation_0-rmse:38946.36328
[15]	validation_0-rmse:37162.58594
[16]	validation_0-rmse:35612.57812
[17]	validation_0-rmse:34287.75781
[18]	validation_0-rmse:33178.92188
[19]	validation_0-rmse:32166.65820
[20]	validation_0-rmse:31208.36719
[21]	validation_0-rmse:30351.55664
[22]	validation_0-rmse:29646.26953
[23]	validation_0-rmse:29075.87695
[24]	validation_0-rmse:28565.76562
[25]	validation_0-rmse:28027.60156
[26]	validation_0-rmse:27673.65430
[27]	validation_0-rmse:27290.28711
[28]	validation_0-rmse:26985.

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             subsample=1, tree_method='exact', use_label_encoder=False,
             validate_parameters=1, verbosity=None)

In [28]:
predictions = xgb1.predict(X_val_c)
mse = mean_squared_error(y_val_c, predictions, squared=False)
print('RMSE:', round(mse, 2))

RMSE: 24027.41


In [29]:
# zwiekszamy max_depth

xgb2 = xgb.XGBRegressor(max_depth=6, seed=42, use_label_encoder=False,
                        learning_rate=0.1, min_child_weight=1)

xgb2.fit(X_train_c, y_train_c, verbose=True, early_stopping_rounds=5, eval_set=[(X_val_c, y_val_c)])

[0]	validation_0-rmse:101793.05469
[1]	validation_0-rmse:92419.85938
[2]	validation_0-rmse:84038.06250
[3]	validation_0-rmse:76581.49219
[4]	validation_0-rmse:69928.48438
[5]	validation_0-rmse:63923.45312
[6]	validation_0-rmse:58616.49609
[7]	validation_0-rmse:53864.31250
[8]	validation_0-rmse:49663.16016
[9]	validation_0-rmse:45944.17578
[10]	validation_0-rmse:42626.22656
[11]	validation_0-rmse:39750.31641
[12]	validation_0-rmse:37181.19531
[13]	validation_0-rmse:34898.74219
[14]	validation_0-rmse:32949.61719
[15]	validation_0-rmse:31242.51758
[16]	validation_0-rmse:29754.50977
[17]	validation_0-rmse:28481.95117
[18]	validation_0-rmse:27389.55859
[19]	validation_0-rmse:26455.64648
[20]	validation_0-rmse:25631.84961
[21]	validation_0-rmse:24966.11328
[22]	validation_0-rmse:24339.56836
[23]	validation_0-rmse:23870.72461
[24]	validation_0-rmse:23432.08008
[25]	validation_0-rmse:23028.31055
[26]	validation_0-rmse:22705.52148
[27]	validation_0-rmse:22447.22070
[28]	validation_0-rmse:22176.

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             subsample=1, tree_method='exact', use_label_encoder=False,
             validate_parameters=1, verbosity=None)

In [30]:
predictions = xgb2.predict(X_val_c)
mse = mean_squared_error(y_val_c, predictions, squared=False)
print('RMSE:', round(mse, 2))

RMSE: 19880.54


In [31]:
# jeszcze raz zwiekszamy max_depth, bo poprzednio to poprawilo wyniki

xgb3 = xgb.XGBRegressor(max_depth=9, seed=42, use_label_encoder=False,
                        learning_rate=0.1, min_child_weight=1)

xgb3.fit(X_train_c, y_train_c, verbose=True, early_stopping_rounds=5, eval_set=[(X_val_c, y_val_c)])

[0]	validation_0-rmse:101638.07812
[1]	validation_0-rmse:92132.62500
[2]	validation_0-rmse:83614.71875
[3]	validation_0-rmse:75974.99219
[4]	validation_0-rmse:69118.21094
[5]	validation_0-rmse:63014.29297
[6]	validation_0-rmse:57624.79297
[7]	validation_0-rmse:52802.51953
[8]	validation_0-rmse:48463.77344
[9]	validation_0-rmse:44594.93359
[10]	validation_0-rmse:41199.75781
[11]	validation_0-rmse:38166.15625
[12]	validation_0-rmse:35524.33203
[13]	validation_0-rmse:33162.82422
[14]	validation_0-rmse:31135.09766
[15]	validation_0-rmse:29366.90039
[16]	validation_0-rmse:27804.28906
[17]	validation_0-rmse:26451.78516
[18]	validation_0-rmse:25324.10547
[19]	validation_0-rmse:24331.21289
[20]	validation_0-rmse:23455.57812
[21]	validation_0-rmse:22699.50391
[22]	validation_0-rmse:22047.32422
[23]	validation_0-rmse:21515.02930
[24]	validation_0-rmse:21046.53320
[25]	validation_0-rmse:20679.68359
[26]	validation_0-rmse:20371.77734
[27]	validation_0-rmse:20092.66992
[28]	validation_0-rmse:19866.

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=9, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             subsample=1, tree_method='exact', use_label_encoder=False,
             validate_parameters=1, verbosity=None)

In [32]:
predictions = xgb3.predict(X_val_c)
mse = mean_squared_error(y_val_c, predictions, squared=False)
print('RMSE:', round(mse, 2))

RMSE: 17999.16


In [33]:
# sprawdzmy jak zmiana learning_rate wplynie na trenowanie

xgb4 = xgb.XGBRegressor(max_depth=9, seed=42, use_label_encoder=False,
                        learning_rate=0.01, min_child_weight=1)

xgb4.fit(X_train_c, y_train_c, verbose=True, early_stopping_rounds=5, eval_set=[(X_val_c, y_val_c)])

[0]	validation_0-rmse:111201.11719
[1]	validation_0-rmse:110151.25781
[2]	validation_0-rmse:109108.21094
[3]	validation_0-rmse:108081.12500
[4]	validation_0-rmse:107060.84375
[5]	validation_0-rmse:106054.08594
[6]	validation_0-rmse:105058.42188
[7]	validation_0-rmse:104069.39844
[8]	validation_0-rmse:103091.55469
[9]	validation_0-rmse:102121.82812
[10]	validation_0-rmse:101164.39844
[11]	validation_0-rmse:100214.94531
[12]	validation_0-rmse:99275.51562
[13]	validation_0-rmse:98345.43750
[14]	validation_0-rmse:97428.00000
[15]	validation_0-rmse:96517.51562
[16]	validation_0-rmse:95617.39844
[17]	validation_0-rmse:94724.24219
[18]	validation_0-rmse:93843.76562
[19]	validation_0-rmse:92971.89844
[20]	validation_0-rmse:92107.11719
[21]	validation_0-rmse:91249.53125
[22]	validation_0-rmse:90403.40625
[23]	validation_0-rmse:89566.07812
[24]	validation_0-rmse:88735.72656
[25]	validation_0-rmse:87914.50000
[26]	validation_0-rmse:87104.66406
[27]	validation_0-rmse:86301.69531
[28]	validation_0-

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=9, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             subsample=1, tree_method='exact', use_label_encoder=False,
             validate_parameters=1, verbosity=None)

In [34]:
predictions = xgb4.predict(X_val_c)
mse = mean_squared_error(y_val_c, predictions, squared=False)
print('RMSE:', round(mse, 2))

RMSE: 46166.07


In [35]:
# wynik się znacząco pogorszył, więc wracamy do poprzedniego learning_rate
# zwiększymy jeszcze min_child_weight

xgb5 = xgb.XGBRegressor(max_depth=9, seed=42, use_label_encoder=False,
                        learning_rate=0.1, min_child_weight=3)

xgb5.fit(X_train_c, y_train_c, verbose=True, early_stopping_rounds=5, eval_set=[(X_val_c, y_val_c)])

[0]	validation_0-rmse:101632.64844
[1]	validation_0-rmse:92108.19531
[2]	validation_0-rmse:83596.77344
[3]	validation_0-rmse:75941.35938
[4]	validation_0-rmse:69129.64844
[5]	validation_0-rmse:62995.04297
[6]	validation_0-rmse:57518.46875
[7]	validation_0-rmse:52685.89062
[8]	validation_0-rmse:48333.33203
[9]	validation_0-rmse:44509.54297
[10]	validation_0-rmse:41120.36719
[11]	validation_0-rmse:38084.35156
[12]	validation_0-rmse:35441.97266
[13]	validation_0-rmse:33117.86719
[14]	validation_0-rmse:31091.70508
[15]	validation_0-rmse:29362.00781
[16]	validation_0-rmse:27847.28711
[17]	validation_0-rmse:26530.89648
[18]	validation_0-rmse:25395.98047
[19]	validation_0-rmse:24431.92188
[20]	validation_0-rmse:23630.35156
[21]	validation_0-rmse:22930.41016
[22]	validation_0-rmse:22308.59375
[23]	validation_0-rmse:21746.56445
[24]	validation_0-rmse:21296.05859
[25]	validation_0-rmse:20922.40820
[26]	validation_0-rmse:20585.98438
[27]	validation_0-rmse:20353.48633
[28]	validation_0-rmse:20130.

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=9, min_child_weight=3, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             subsample=1, tree_method='exact', use_label_encoder=False,
             validate_parameters=1, verbosity=None)

In [36]:
predictions = xgb5.predict(X_val_c)
mse = mean_squared_error(y_val_c, predictions, squared=False)
print('RMSE:', round(mse, 2))

RMSE: 18483.63


Pozostaje obliczyć RMSE na najlepszym modelu, tzn. xgb3

In [37]:
predictions = xgb3.predict(X_test_c)
mse = mean_squared_error(y_test_c, predictions, squared=False)
print('RMSE:', round(mse, 2))

RMSE: 19980.61


## Zbiór airline_passenger_satisfaction

In [38]:
xgb1 = xgb.XGBClassifier(objective="binary:logistic", seed=42, use_label_encoder=False,
                         max_depth=3, min_child_weight=2, learning_rate=0.01)

xgb1.fit(X_train_a, y_train_a, verbose=True, early_stopping_rounds=5,
         eval_metric="error", eval_set=[(X_val_a, y_val_a)])

[0]	validation_0-error:0.11749
[1]	validation_0-error:0.11749
[2]	validation_0-error:0.11749
[3]	validation_0-error:0.11749
[4]	validation_0-error:0.11749


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=3, min_child_weight=2, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [39]:
predictions = xgb1.predict(X_test_a)
f1 = f1_score(y_test_a, predictions)
print('F1-score:', round(f1, 4))

F1-score: 0.8728


In [40]:
# zwiekszamy learning rate

xgb2 = xgb.XGBClassifier(objective="binary:logistic", seed=42, use_label_encoder=False,
                         max_depth=3, min_child_weight=2, learning_rate=0.3)

xgb2.fit(X_train_a, y_train_a, verbose=True, early_stopping_rounds=5,
         eval_metric="error", eval_set=[(X_val_a, y_val_a)])

[0]	validation_0-error:0.11749
[1]	validation_0-error:0.11626
[2]	validation_0-error:0.10448
[3]	validation_0-error:0.09709
[4]	validation_0-error:0.09978
[5]	validation_0-error:0.08208
[6]	validation_0-error:0.08292
[7]	validation_0-error:0.08231
[8]	validation_0-error:0.07923
[9]	validation_0-error:0.07784
[10]	validation_0-error:0.07468
[11]	validation_0-error:0.07199
[12]	validation_0-error:0.07014
[13]	validation_0-error:0.06775
[14]	validation_0-error:0.06745
[15]	validation_0-error:0.06760
[16]	validation_0-error:0.06637
[17]	validation_0-error:0.06437
[18]	validation_0-error:0.06444
[19]	validation_0-error:0.06452
[20]	validation_0-error:0.06391
[21]	validation_0-error:0.06414
[22]	validation_0-error:0.06313
[23]	validation_0-error:0.06190
[24]	validation_0-error:0.06136
[25]	validation_0-error:0.06183
[26]	validation_0-error:0.06213
[27]	validation_0-error:0.06198
[28]	validation_0-error:0.06106
[29]	validation_0-error:0.06159
[30]	validation_0-error:0.06052
[31]	validation_0-

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.3, max_delta_step=0,
              max_depth=3, min_child_weight=2, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [41]:
predictions = xgb2.predict(X_test_a)
f1 = f1_score(y_test_a, predictions)
print('F1-score:', round(f1, 4))

F1-score: 0.9502


In [42]:
# zwiekszamy maksymalną głębokość

xgb3 = xgb.XGBClassifier(objective="binary:logistic", seed=42, use_label_encoder=False,
                         max_depth=6, min_child_weight=2, learning_rate=0.3)

xgb3.fit(X_train_a, y_train_a, verbose=True, early_stopping_rounds=5,
         eval_metric="error", eval_set=[(X_val_a, y_val_a)])

[0]	validation_0-error:0.08161
[1]	validation_0-error:0.07530
[2]	validation_0-error:0.06806
[3]	validation_0-error:0.07022
[4]	validation_0-error:0.06391
[5]	validation_0-error:0.06090
[6]	validation_0-error:0.05728
[7]	validation_0-error:0.05544
[8]	validation_0-error:0.05305
[9]	validation_0-error:0.05220
[10]	validation_0-error:0.05135
[11]	validation_0-error:0.05089
[12]	validation_0-error:0.04981
[13]	validation_0-error:0.04889
[14]	validation_0-error:0.04881
[15]	validation_0-error:0.04720
[16]	validation_0-error:0.04597
[17]	validation_0-error:0.04612
[18]	validation_0-error:0.04604
[19]	validation_0-error:0.04573
[20]	validation_0-error:0.04450
[21]	validation_0-error:0.04443
[22]	validation_0-error:0.04250
[23]	validation_0-error:0.04204
[24]	validation_0-error:0.04204
[25]	validation_0-error:0.04188
[26]	validation_0-error:0.04119
[27]	validation_0-error:0.04119
[28]	validation_0-error:0.04096
[29]	validation_0-error:0.04034
[30]	validation_0-error:0.03981
[31]	validation_0-

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.3, max_delta_step=0,
              max_depth=6, min_child_weight=2, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [43]:
predictions = xgb3.predict(X_test_a)
f1 = f1_score(y_test_a, predictions)
print('F1-score:', round(f1, 4))

F1-score: 0.9566


In [44]:
# zmieniamy min_child_weight

xgb4 = xgb.XGBClassifier(objective="binary:logistic", seed=42, use_label_encoder=False,
                         max_depth=6, min_child_weight=10, learning_rate=0.3)

xgb4.fit(X_train_a, y_train_a, verbose=True, early_stopping_rounds=5,
         eval_metric="error", eval_set=[(X_val_a, y_val_a)])

[0]	validation_0-error:0.08161
[1]	validation_0-error:0.07538
[2]	validation_0-error:0.06799
[3]	validation_0-error:0.06960
[4]	validation_0-error:0.06290
[5]	validation_0-error:0.06013
[6]	validation_0-error:0.05751
[7]	validation_0-error:0.05513
[8]	validation_0-error:0.05359
[9]	validation_0-error:0.05266
[10]	validation_0-error:0.05174
[11]	validation_0-error:0.05105
[12]	validation_0-error:0.05112
[13]	validation_0-error:0.05020
[14]	validation_0-error:0.04820
[15]	validation_0-error:0.04820
[16]	validation_0-error:0.04835
[17]	validation_0-error:0.04704
[18]	validation_0-error:0.04573
[19]	validation_0-error:0.04466
[20]	validation_0-error:0.04496
[21]	validation_0-error:0.04327
[22]	validation_0-error:0.04296
[23]	validation_0-error:0.04296
[24]	validation_0-error:0.04219
[25]	validation_0-error:0.04165
[26]	validation_0-error:0.04119
[27]	validation_0-error:0.04081
[28]	validation_0-error:0.04111
[29]	validation_0-error:0.04111
[30]	validation_0-error:0.04042
[31]	validation_0-

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.3, max_delta_step=0,
              max_depth=6, min_child_weight=10, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [45]:
predictions = xgb4.predict(X_test_a)
f1 = f1_score(y_test_a, predictions)
print('F1-score:', round(f1, 4))

F1-score: 0.9554


In [46]:
# zmiana min_child_weight nie zmieniła znacząco wyniku, zwiększmy jeszcze raz max_depth

xgb5 = xgb.XGBClassifier(objective="binary:logistic", seed=42, use_label_encoder=False,
                         max_depth=10, min_child_weight=1, learning_rate=0.3)

xgb5.fit(X_train_a, y_train_a, verbose=True, early_stopping_rounds=5,
         eval_metric="error", eval_set=[(X_val_a, y_val_a)])

[0]	validation_0-error:0.05682
[1]	validation_0-error:0.05135
[2]	validation_0-error:0.04889
[3]	validation_0-error:0.04704
[4]	validation_0-error:0.04427
[5]	validation_0-error:0.04273
[6]	validation_0-error:0.04142
[7]	validation_0-error:0.04042
[8]	validation_0-error:0.04050
[9]	validation_0-error:0.03950
[10]	validation_0-error:0.03988
[11]	validation_0-error:0.03880
[12]	validation_0-error:0.03827
[13]	validation_0-error:0.03780
[14]	validation_0-error:0.03780
[15]	validation_0-error:0.03726
[16]	validation_0-error:0.03757
[17]	validation_0-error:0.03757
[18]	validation_0-error:0.03803
[19]	validation_0-error:0.03765


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.3, max_delta_step=0,
              max_depth=10, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [47]:
predictions = xgb5.predict(X_test_a)
f1 = f1_score(y_test_a, predictions)
print('F1-score:', round(f1, 4))

F1-score: 0.9561


In [48]:
# Najlepszy F1 uzyskaliśmy na xgb3, sprawdzamy go na zbiorze testowym

predictions = xgb3.predict(X_val_a)
f1 = f1_score(y_val_a, predictions)
print('F1-score:', round(f1, 4))
acc = accuracy_score(y_val_a, predictions)
print('Accuracy:', round(acc, 4))

F1-score: 0.9577
Accuracy: 0.9639


Uzyskane wyniki są trochę lepsze niż w przypadku Random Forest.