In [None]:
#import sys
#!{sys.executable} -m pip install xgboost

In [84]:
import xgboost as xgb

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
import matplotlib.pyplot as plt

In [93]:
from preprocessing_wrapper import load_preprocessed_data
from PCA import airbnb_PCA_n

# Switch off pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None  # default='warn'

# Load the preprocessed table (PCA disabled) and drop the Listing_ID,
# Host_ID and Postal_Code columns in one chained expression.
data = load_preprocessed_data(pca=False).drop(
    columns=["Listing_ID", "Host_ID", "Postal_Code"]
)

# Name of the column to predict, and every other column as candidate features.
target = 'Price'
features = data.columns.drop(target)

# Optional PCA reduction, currently disabled:
#data = airbnb_PCA_n(data, features, target, 80)

# Y is kept as a one-column DataFrame (double brackets), X is everything else.
Y = data[[target]]
X = data.drop(columns=[target])

In [85]:
from Reg import stratify

# Bin the target into 10 quantile buckets; the buckets are used only to
# stratify the train/test split.
stratified_price = pd.qcut(Y['Price'], 10)

# Stratify on the buckets directly instead of storing them in X.
# A 'strat_price' column left on X is derived from the target, and the
# grid-search cells further down fit on X itself, so it must never live there.
# errors="ignore" also copes with an X that already carries the column from
# an out-of-order run of another cell.
model_features = X.drop(columns=["strat_price"], errors="ignore")

# random_state pins the shuffle so re-running the cell gives the same split.
x_train, x_test, y_train, y_test = train_test_split(
    model_features,
    Y,
    stratify=stratified_price,
    test_size=0.2,
    random_state=42,
)

# Sanity check: the stratification helper must not be a training feature.
print('strat_price' in x_train.columns)


False


In [86]:
import time as t

In [96]:
# Evaluate the same XGBoost configuration on `nb_iter` different stratified
# train/test splits, record the test RMSE of each one, and remember the split
# that produced the lowest RMSE.
#
# NOTE(review): keeping the split with the lowest *test* RMSE selects for a
# favourable test set, so figures computed on `best_sets` are likely
# optimistic - confirm this is the intent.
error = []                  # test RMSE of every split, in iteration order
nb_iter = 50                # number of splits to try
max_exec_time = 0           # slowest iteration (split + fit + predict), seconds
best_rmse = float("inf")    # any real RMSE beats this; no arbitrary 1000 cap
best_sets = {}

# Loop-invariant work, hoisted out of the loop.
# The price buckets are passed straight to `stratify` rather than written
# into X: a target-derived column on X would leak into later fits on X.
# errors="ignore" tolerates an X that already carries the column.
stratified_price = pd.qcut(Y['Price'], 20)
split_features = X.drop(columns=["strat_price"], errors="ignore")

# The loop variable doubles as the split seed: every iteration gets a
# different but reproducible split. (It was previously named `iter`, which
# rebinds the builtin for the rest of the kernel session.)
for split_seed in range(nb_iter):

    begin = t.time()

    x_train, x_test, y_train, y_test = train_test_split(
        split_features,
        Y,
        stratify=stratified_price,
        test_size=0.2,
        random_state=split_seed,
    )

    # NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`,
    # and the two are given different values here (0.1 vs 0.05). Which one
    # takes effect is not visible from this notebook - keep only one.
    xgb_model = xgb.XGBRegressor(objective="reg:squarederror",
                                 random_state=42,
                                 max_depth=5,
                                 subsample=1,
                                 colsample_bytree=0.4,
                                 learning_rate=0.05,
                                 gamma=0,
                                 eta=0.1,
                                 n_estimators=100
    ).fit(x_train, y_train)
    predictions = xgb_model.predict(x_test)

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` argument is deprecated and removed in recent scikit-learn.
    # float() keeps the printed list free of numpy scalar reprs.
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))

    if rmse < best_rmse:
        best_rmse = rmse
        best_sets = {'x_train':x_train, 'x_test':x_test, 'y_train':y_train, 'y_test':y_test}
    error.append(rmse)

    chrono = t.time() - begin
    max_exec_time = max(max_exec_time, chrono)

print("Total tries: ", error)
print("Average RMSE: ", sum(error)/len(error))
print("Worst RMSE: ", max(error))
print("Best RMSE: ", min(error))
print("Worst execution time : ", max_exec_time)


Total tries:  [36.548916787916, 36.017738140514474, 36.762124284573865, 37.36552693939761, 38.21841199256934, 32.66318398359303, 38.18023383620734, 40.71372335469315, 39.38558247756914, 39.44296235149488, 37.790419952708405, 42.05910719581591, 37.903556803809096, 34.90523783866643, 42.37784091399633, 37.570728093025124, 36.429693169943086, 37.808614267624804, 44.104468895318256, 40.9151595850172, 31.924255077120673, 33.32194002098821, 42.16948039560166, 33.62340174882225, 40.68513775777263, 43.222282516319225, 38.72281312298202, 36.75015323489749, 35.28238987612584, 39.838775239313435, 36.90344265813433, 30.1495536892566, 38.86747472784078, 33.43663974656662, 36.67873790546211, 37.24370970151971, 38.50298817643256, 38.63925043573927, 41.28836486588299, 35.072148228943526, 41.24153218681415, 38.64660820096492, 34.679108271137586, 41.02332800567444, 37.816899381239764, 40.04881435689359, 36.93475898726861, 38.31927295822426, 35.10785201213139, 34.73875903964591]
Average RMSE:  37.7608620

In [97]:
# Write each frame of the best split (keys of `best_sets`: x_train, x_test,
# y_train, y_test) to its own CSV under Data/.
# The loop variable is deliberately not called `set`: notebook loop variables
# are kernel globals, so that name would rebind the builtin `set` for every
# cell executed afterwards.
for set_name, split_frame in best_sets.items():
    split_frame.to_csv(f"Data/best_dataset_{set_name}.csv")

In [74]:
print("Parameter optimization")
xgb_model = xgb.XGBRegressor(n_jobs=1)

# Exhaustive grid over four hyper-parameters, eta included:
# 4 * 3 * 5 * 4 = 240 candidates.
params = {
        'n_estimators': [50, 100, 200, 500],
        'gamma': [0.1, 1, 5],
        'max_depth': [3, 4, 5, 6, 7],
        'eta': [0.01, 0.1, 0.3, 0.7]
}

# Search on the real features only. The split cells above add a
# target-derived 'strat_price' column to X; fitting on it would leak the
# target into the model. errors="ignore" makes this a no-op when the column
# is absent.
grid_features = X.drop(columns=["strat_price"], errors="ignore")

# No `scoring` is given, so best_score_ is the estimator's default score
# (R^2 for a regressor), not an RMSE.
clf = GridSearchCV(xgb_model,
                   params,
                   verbose=1, n_jobs=1,)
clf.fit(grid_features, Y)
print(clf.best_score_)
print(clf.best_params_)

Parameter optimization
Fitting 5 folds for each of 240 candidates, totalling 1200 fits


KeyboardInterrupt: 

In [21]:
print("Parameter optimization")
xgb_model = xgb.XGBRegressor(n_jobs=1)

# Reduced grid: eta is fixed at 0.1 and max_depth at 3 for now, so only
# n_estimators and gamma vary (4 * 3 = 12 candidates).
params = {
        #'eta': [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
        'n_estimators': [50, 100, 200, 500],
        'gamma': [0.00001, 0.001, 0.1],
        'max_depth': [3],
        'eta': [0.1]
}

# Search on the real features only. The split cells above add a
# target-derived 'strat_price' column to X; fitting on it would leak the
# target into the model. errors="ignore" makes this a no-op when the column
# is absent.
grid_features = X.drop(columns=["strat_price"], errors="ignore")

# No `scoring` is given, so best_score_ is the estimator's default score
# (R^2 for a regressor), not an RMSE.
clf = GridSearchCV(xgb_model,
                   params,
                   verbose=1, n_jobs=1,)
clf.fit(grid_features, Y)
print(clf.best_score_)
print(clf.best_params_)

Parameter optimization
Fitting 5 folds for each of 12 candidates, totalling 60 fits
0.37640462366046945
{'eta': 0.1, 'gamma': 1e-05, 'max_depth': 3, 'n_estimators': 100}
