In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from tqdm.notebook import tqdm

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, roc_curve
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, BaggingClassifier
import xgboost as xgb


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# ustawia domyślną wielkość wykresów
plt.rcParams['figure.figsize'] = (12,8)
# to samo tylko dla tekstu
plt.rcParams['font.size'] = 16

In [2]:
air_test = pd.read_csv("./data/airline_passenger_satisfaction/test.csv")

In [3]:
air_train = pd.read_csv("./data/airline_passenger_satisfaction/train.csv")

In [4]:
air_test[air_test["Arrival Delay in Minutes"].isna()] = 0
air_train[air_train["Arrival Delay in Minutes"].isna()] = 0

In [5]:
air_test.columns

Index(['Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
       'Class', 'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

In [6]:
air_test=air_test.drop(["Unnamed: 0","id"], axis=1)
air_test['Gender'] = (air_test['Gender'] == 'Female')*1
air_train=air_train.drop(["Unnamed: 0","id"], axis=1)
air_train['Gender'] = (air_train['Gender'] == 'Female')*1

In [7]:
air_test["Customer Type"].unique()# 0 ='disloyal Customer'

array(['Loyal Customer', 'disloyal Customer', 0], dtype=object)

In [8]:
air_train['Customer Type'] = (air_train['Customer Type'] == 'Loyal Customer')*1
air_test['Customer Type'] = (air_test['Customer Type'] == 'Loyal Customer')*1

In [9]:
air_test["Age"].unique() # zerowe wiersze

array([52, 36, 20, 44, 49, 16, 77, 43, 47, 46, 33, 60, 50, 31, 55, 25, 30,
       62, 24, 22, 51, 56, 41, 53, 12, 39, 32, 40, 42, 28, 59, 58, 27, 67,
       70, 66, 19, 69, 35, 54, 26, 61, 21, 37, 45, 38,  7, 34, 17, 15, 48,
       13, 11, 29,  8, 18, 23, 65, 57, 80, 64,  9, 14, 68, 10, 79, 63, 73,
       74,  0, 71, 72, 76, 78, 75, 85], dtype=int64)

In [10]:
air_test=air_test[air_test.Age!=0]
air_train=air_train[air_train.Age!=0]

In [11]:
air_train['satisfaction'] = (air_train['satisfaction'] == 'satisfied')*1
air_test['satisfaction'] = (air_test['satisfaction'] == 'satisfied')*1

In [12]:
air_train['Type of Travel'] = (air_train['Type of Travel'] == 'Personal Travel')*1
air_test['Type of Travel'] = (air_test['Type of Travel'] == 'Personal Travel')*1

In [13]:
air_test["Class"].unique()

array(['Eco', 'Business', 'Eco Plus'], dtype=object)

In [14]:
air_test["Class"] = (air_test['Class'] == 'Business')*2 + (air_test['Class'] == 'Eco Plus')*1
air_train["Class"] = (air_train['Class'] == 'Business')*2 + (air_train['Class'] == 'Eco Plus')*1

In [15]:
y_air_train = air_train['satisfaction']
X_air_train = air_train.drop("satisfaction", axis = 1)

X_air_test, X_air_valid, y_air_test, y_air_valid = train_test_split(air_test.drop("satisfaction", axis = 1), air_test['satisfaction'], random_state=420, train_size=0.7)

In [16]:
rf = RandomForestClassifier()
tree= rf.fit(X_air_train, y_air_train)

In [17]:
tree.score(X_air_test, y_air_test)

0.9629793103448275

In [18]:
tree.score(X_air_valid,y_air_valid)

0.9622811534500515

In [19]:
rf =RandomForestClassifier(max_depth=4,n_estimators=20)
tree= rf.fit(X_air_train, y_air_train)

In [20]:
tree.score(X_air_test, y_air_test)

0.8998068965517242

In [21]:
tree.score(X_air_valid,y_air_valid)

0.9025489186405767

In [22]:
rf =RandomForestClassifier(max_depth=10,n_estimators=200)
tree= rf.fit(X_air_train, y_air_train)

In [23]:
tree.score(X_air_test, y_air_test)

0.9456551724137932

In [24]:
tree.score(X_air_valid,y_air_valid)

0.9492790937178167

In [25]:
#nie poprawiliśmy wyniku

In [26]:
xgb_cls = xgb.XGBClassifier(objective="binary:logistic", seed = 42, use_label_encoder=False)

In [27]:
xgb_cls.fit(X_air_train, y_air_train, early_stopping_rounds=5, eval_metric="error", eval_set=[(X_air_test, y_air_test)])

[0]	validation_0-error:0.08055
[1]	validation_0-error:0.07382
[2]	validation_0-error:0.07189
[3]	validation_0-error:0.06720
[4]	validation_0-error:0.06168
[5]	validation_0-error:0.05975
[6]	validation_0-error:0.05699
[7]	validation_0-error:0.05302
[8]	validation_0-error:0.05203
[9]	validation_0-error:0.05087
[10]	validation_0-error:0.05004
[11]	validation_0-error:0.05048
[12]	validation_0-error:0.05037
[13]	validation_0-error:0.04833
[14]	validation_0-error:0.04739
[15]	validation_0-error:0.04668
[16]	validation_0-error:0.04601
[17]	validation_0-error:0.04557
[18]	validation_0-error:0.04469
[19]	validation_0-error:0.04436
[20]	validation_0-error:0.04397
[21]	validation_0-error:0.04392
[22]	validation_0-error:0.04342
[23]	validation_0-error:0.04188
[24]	validation_0-error:0.04116
[25]	validation_0-error:0.04083
[26]	validation_0-error:0.04044
[27]	validation_0-error:0.04011
[28]	validation_0-error:0.03917
[29]	validation_0-error:0.03890
[30]	validation_0-error:0.03884
[31]	validation_0-

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [28]:
xgb_cls.score(X_air_valid, y_air_valid)

0.9616374871266735

In [29]:
xgb_cls =xgb.XGBClassifier(max_depth=4,n_estimators =20)

In [30]:
xgb_cls.fit(X_air_train, y_air_train, early_stopping_rounds=5, eval_metric="error", eval_set=[(X_air_test, y_air_test)])

[0]	validation_0-error:0.11189
[1]	validation_0-error:0.09550
[2]	validation_0-error:0.09495




[3]	validation_0-error:0.08921
[4]	validation_0-error:0.09363
[5]	validation_0-error:0.09137
[6]	validation_0-error:0.08690
[7]	validation_0-error:0.07421
[8]	validation_0-error:0.07553
[9]	validation_0-error:0.07503
[10]	validation_0-error:0.07090
[11]	validation_0-error:0.07007
[12]	validation_0-error:0.06836
[13]	validation_0-error:0.06477
[14]	validation_0-error:0.06488
[15]	validation_0-error:0.06472
[16]	validation_0-error:0.06395
[17]	validation_0-error:0.06262
[18]	validation_0-error:0.06229
[19]	validation_0-error:0.06179


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=4, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=20, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [31]:
xgb_cls.score(X_air_valid, y_air_valid)

0.9407826982492276

In [32]:
# Najlepszy XGBoost z defaultowymi hiperparametrami

# Cars

In [86]:
cars = pd.read_csv("./data/car_prices_poland/Car_Prices_Poland.csv")
cars.head()

Unnamed: 0.1,Unnamed: 0,mark,model,generation_name,year,mileage,vol_engine,fuel,city,province,price
0,0,opel,combo,gen-d-2011,2015,139568,1248,Diesel,Janki,Mazowieckie,35900
1,1,opel,combo,gen-d-2011,2018,31991,1499,Diesel,Katowice,Śląskie,78501
2,2,opel,combo,gen-d-2011,2015,278437,1598,Diesel,Brzeg,Opolskie,27000
3,3,opel,combo,gen-d-2011,2016,47600,1248,Diesel,Korfantów,Opolskie,30800
4,4,opel,combo,gen-d-2011,2014,103000,1400,CNG,Tarnowskie Góry,Śląskie,35900


In [87]:
cars=cars.drop(["Unnamed: 0","generation_name","city","model","mark","province"], axis=1)

In [88]:
encoded = pd.get_dummies(cars[["fuel"]].astype(str))
encoded = encoded.drop(["fuel_CNG"], axis = 1)
cars = cars.drop(["fuel"], axis=1)
cars = pd.concat([cars, encoded], axis = 1)

In [90]:
cars

Unnamed: 0,year,mileage,vol_engine,price,fuel_Diesel,fuel_Electric,fuel_Gasoline,fuel_Hybrid,fuel_LPG
0,2015,139568,1248,35900,1,0,0,0,0
1,2018,31991,1499,78501,1,0,0,0,0
2,2015,278437,1598,27000,1,0,0,0,0
3,2016,47600,1248,30800,1,0,0,0,0
4,2014,103000,1400,35900,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
117922,2020,40000,1969,222790,0,0,0,1,0
117923,2017,51000,1969,229900,1,0,0,0,0
117924,2016,83500,1969,135000,0,0,1,0,0
117925,2017,174000,1969,154500,1,0,0,0,0


In [91]:
X_train_val, X_test, y_train_val, y_test = train_test_split(cars.drop("price", axis=1), cars["price"], random_state=21, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state=22, test_size=0.125)

In [93]:
rf =RandomForestRegressor()
tree= rf.fit(X_train, y_train)

In [94]:
tree.score(X_test, y_test)

0.8672005258322979

In [95]:
tree.score(X_val,y_val)

0.8817988172075896

In [96]:
rf =RandomForestRegressor(max_depth=10,n_estimators=200)
tree= rf.fit(X_train, y_train)

In [97]:
tree.score(X_test, y_test)

0.8681302187240776

In [98]:
tree.score(X_val,y_val)

0.8858420617186482

In [99]:
#troszke poprawiliśmy wynik

In [100]:
xgb_cls = xgb.XGBRegressor()

In [101]:
xgb_cls.fit(X_train, y_train, early_stopping_rounds=5, eval_metric="error", eval_set=[(X_test, y_test)])

[0]	validation_0-error:-70956.02344
[1]	validation_0-error:-70956.02344
[2]	validation_0-error:-70956.02344
[3]	validation_0-error:-70956.02344
[4]	validation_0-error:-70956.02344
[5]	validation_0-error:-70956.02344


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [102]:
xgb_cls.score(X_val, y_val)

0.08163262120809911

In [103]:
xgb_cls =xgb.XGBRegressor(n_estimators=1000, max_depth=7)

In [104]:
xgb_cls.fit(X_train, y_train, early_stopping_rounds=5, eval_metric="error", eval_set=[(X_test, y_test)])

[0]	validation_0-error:-70956.02344
[1]	validation_0-error:-70956.02344
[2]	validation_0-error:-70956.02344
[3]	validation_0-error:-70956.02344
[4]	validation_0-error:-70956.02344


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [105]:
xgb_cls.score(X_val, y_val)

0.08604491471252318

In [106]:
#coś nie wyszło z xgboostem