In [21]:
# Imports
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [22]:
# Fonctions de preprocessing
def organize_dates(data, dates_columns):
    """Expand date columns into calendar features and drop the raw dates.

    For every column named in ``dates_columns`` the values are parsed with
    ``pd.to_datetime`` and four derived columns are appended:
    ``<col>_year``, ``<col>_month``, ``<col>_day_name`` and
    ``<col>_is_weekend`` (1 when the day name is Saturday or Sunday, else 0).

    The input frame is left untouched: all work happens on a copy, so
    re-running the calling cell always starts from the same raw data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns listed in ``dates_columns``.
    dates_columns : list of str
        Names of the columns to parse and expand.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns added and the original date
        columns removed.
    """
    # Work on a copy so the caller's frame is not half-transformed as a side effect.
    expanded = data.copy()
    for col in dates_columns:
        parsed = pd.to_datetime(expanded[col])
        day_names = parsed.dt.day_name()
        expanded[f'{col}_year'] = parsed.dt.year
        expanded[f'{col}_month'] = parsed.dt.month
        expanded[f'{col}_day_name'] = day_names
        expanded[f'{col}_is_weekend'] = np.where(day_names.isin(['Saturday', 'Sunday']), 1, 0)
    return expanded.drop(dates_columns, axis=1)

def replace_bool(data, boolean_cols):
    """Recode the listed columns as 0/1 integers.

    A cell becomes 1 when it compares equal to ``True`` and 0 otherwise
    (so missing values end up as 0). The columns are overwritten on the
    frame that was passed in, and that same frame is returned.
    """
    for name in boolean_cols:
        is_true = data[name].eq(True)
        data[name] = np.where(is_true, 1, 0)
    return data

def replace_brand(data, brand_positionning):
    """Fill in unknown brand positionings from the brand code.

    Rows whose ``hotel_brand_positionning`` equals ``'Unknown Positioning'``
    first take the value of ``hotel_brand_code``; the column is then passed
    through ``brand_positionning`` so that known brand codes are replaced by
    their positioning label. Values that are not keys of the mapping are
    left as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``hotel_brand_positionning`` and ``hotel_brand_code`` columns.
        The positioning column is overwritten on this frame.
    brand_positionning : dict
        Mapping from brand code to positioning label.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the positioning column updated.
    """
    column = 'hotel_brand_positionning'
    is_unknown = data[column] == 'Unknown Positioning'
    data[column] = np.where(is_unknown, data['hotel_brand_code'], data[column])
    # Assign the result back instead of calling replace(..., inplace=True) on
    # the column selection: the chained in-place form is deprecated and does
    # not update the frame when pandas Copy-on-Write is active.
    data[column] = data[column].replace(brand_positionning)
    return data

def preprocessing_data(data, dates_columns, boolean_cols, drop_columns_na, drop_columns, brand_mapping=None):
    """Run the full preprocessing chain and return the cleaned frame.

    Steps, in order (a progress line is printed before each one):
      1. expand the date columns with ``organize_dates``;
      2. recode the boolean columns with ``replace_bool``;
      3. add ``gap_bkg_stay_nights`` = ``bkg_nbroomnights`` - ``stay_nbroomnights``;
      4. resolve brand positioning with ``replace_brand``;
      5. drop ``drop_columns_na`` then ``drop_columns``;
      6. drop every row that still contains a missing value;
      7. drop rows whose ``declared_stay_type`` is the string ``'None'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input frame.
    dates_columns, boolean_cols, drop_columns_na, drop_columns : list of str
        Column names used by the corresponding steps above.
    brand_mapping : dict, optional
        Brand code -> positioning label mapping handed to ``replace_brand``.
        When omitted, the module-level ``brand_positionning`` dict is used,
        which keeps existing five-argument calls working.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame.
    """
    if brand_mapping is None:
        # Backward-compatible fallback: resolve the notebook-level mapping up
        # front so a missing definition fails before any work is done.
        brand_mapping = brand_positionning

    print("**** PREPROCESSING *****")
    print("Organize dates")
    data = organize_dates(data, dates_columns)
    print("Replace booleans")
    data = replace_bool(data, boolean_cols)
    print("Add gap stay night")
    # Must run before the drops below if the two source columns are listed there.
    data['gap_bkg_stay_nights'] = data['bkg_nbroomnights'] - data['stay_nbroomnights']
    print("Replace brand positionning")
    data = replace_brand(data, brand_mapping)
    print("Drop colums NA") 
    data = data.drop(drop_columns_na, axis=1)
    print("Drop colums") 
    data = data.drop(drop_columns, axis=1)
    print("Dropna")
    data = data.dropna()
    print("Drop None target")
    # The target uses the literal string 'None' (not a missing value) for undeclared stays.
    data = data[data['declared_stay_type']!='None']
    return data

In [23]:
# Load the raw dataset (semicolon-separated CSV); preprocessing is applied later in this cell
data = pd.read_csv("Fichier_Projet_DS_new.csv", delimiter=";")

# Columns to drop because of missing values (NA). Each label is listed once.
# 'bkg_nbroomnights' and 'stay_nbroomnights' are still read by
# preprocessing_data to build 'gap_bkg_stay_nights' before this list is applied.
drop_columns_na = [
    'initialmaincro', 'initialsubcro', 'finalmaincro', 'final_subcro', 'bad_tr_ nb_nights',
    'bad_tr_ turn_over_eur', 'rcu_codemarque',
    'hotel_zip_code', 'flag_is_ota', 'bad_tr_nbroomnights', 'flag_aberrante_value',
    'bkg_catotal_eur', 'stay_cor_catotal_eur',
    'bkg_caroom_eur', 'stay_caroom_eur', 'stay_cor_caroom_eur', 'sb_caroom_eur',
    'nb_child',
    'bkg_nbnights', 'stay_nbnights',
    'bkg_nbroomnights', 'stay_nbroomnights',
]

# Columns to drop because they show no correlation or are duplicates
# (values too specific to be of interest here).
# 'booking_date_day_name' is one of the columns created by organize_dates.
drop_columns = [
    'hotelcode', 'hotel_name', 'hotel_brand_code', 'hotel_country', 'hotel_city', 'hotel_country_code',
    'sb_catotal_eur', 'tr_turn_over_eur', 'checkout_date_index', 'sb_nbnights', 'tr_nbnights', 'sb_nbroomnights',
    'tr_nbroomnights', 'booking_date_day_name', 'calculated_stay_type',
]

# Date columns to parse and expand
dates_columns = ['booking_date', 'checkin_date', 'checkout_date']

# Boolean columns to recode as 0/1
boolean_columns = ['is_web_direct', 'child_presence', 'eligible_to_earn_points']

# Brand codes grouped by positioning label
brand_codes_by_positionning = {
    'Economy': [
        'IBS', 'BKF', 'IBH', 'IBB', 'GRE', 'ADG', 'HOF',
    ],
    'Midscale': [
        'NOV', 'SUI', 'MER', 'SAM', 'AHM', 'TRI', 'MTA', '21C', 'MSH', 'BME', 'MOL', 'NOL',
    ],
    'Luxury and Upscale': [
        'MGS', 'ADA', 'SOL', 'SEB', 'MGA', 'SOF', 'FAI', 'SWI', 'PUL', 'RAF', 'ART', 'CAS',
        'MOV', 'MEI', 'ANG', 'PEP', 'SO', 'SO/', 'SOR', 'SWL', 'MEL', 'TOR', 'FAE',
    ],
}

# Brand positioning: flat brand code -> positioning label lookup
brand_positionning = {
    code: positionning
    for positionning, codes in brand_codes_by_positionning.items()
    for code in codes
}

# Run the preprocessing chain on the loaded frame
data = preprocessing_data(
    data,
    dates_columns,
    boolean_columns,
    drop_columns_na,
    drop_columns,
)

# Target, encoded numerically: 'business' -> 1, 'leisure' -> 0
target = (
    data['declared_stay_type']
    .replace(['business'], 1)
    .replace(['leisure'], 0)
)

# Features: every column except the target, one-hot encoded
features = pd.get_dummies(data.drop('declared_stay_type', axis=1))

# Train / test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=321,
)

**** PREPROCESSING *****
Organize dates
Replace booleans
Add gap stay night
Replace brand positionning
Drop colums NA
Drop colums
Dropna
Drop None target


In [25]:
# Pipeline

# Step 1 - scaling
scaler = StandardScaler()

# Step 2 - feature selection; per the original note, k comes from a grid search (not shown here)
selector = SelectKBest(k=145)

# Candidate models; hyper-parameters per the original note come from a grid search (not shown here)
# NOTE: `clf` is defined but not used by the pipeline below; the 'clf' step holds the XGBoost model.
clf = LogisticRegression()
xgboost = xgb.XGBClassifier(
    booster="gbtree",
    n_estimators=200,
    learning_rate=0.1,
    max_depth=12,
    min_child_weight=1,
    subsample=1,
    colsample_bytree=1,
    eval_metric="error",
)

## Pipeline definition: scale -> select features -> classify
pycor_pipe = Pipeline(
    steps=[
        ('scaler', scaler),
        ('selector', selector),
        ('clf', xgboost),
    ]
)

# Fit on the training set, then score on the held-out test set
score_pipe = pycor_pipe.fit(X_train, y_train).score(X_test, y_test)
print("le score de la pipeline est: ", score_pipe)

  f = msb / msw


le score de la pipeline est:  0.8227114716106605
