In [155]:
import pandas as pd
import os
import sklearn
from sklearn import ensemble 
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import impute
from sklearn import pipeline
from sklearn import compose
from sklearn import feature_selection
from sklearn import neighbors
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from feature_engine.selection import DropDuplicateFeatures
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn import metrics
import joblib
warnings.filterwarnings('ignore')
sklearn.set_config(display="diagram")


In [156]:
dir = "../data"
cachedir = os.path.join(dir, "pipeline_cache")


In [157]:
train_data = pd.read_csv(os.path.join(dir, "train.csv"))
train_data.shape
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42100 entries, 0 to 42099
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   id                                    42100 non-null  int64  
 1   no_of_adults                          42100 non-null  int64  
 2   no_of_children                        42100 non-null  int64  
 3   no_of_weekend_nights                  42100 non-null  int64  
 4   no_of_week_nights                     42100 non-null  int64  
 5   type_of_meal_plan                     42100 non-null  int64  
 6   required_car_parking_space            42100 non-null  int64  
 7   room_type_reserved                    42100 non-null  int64  
 8   lead_time                             42100 non-null  int64  
 9   arrival_year                          42100 non-null  int64  
 10  arrival_month                         42100 non-null  int64  
 11  arrival_date   

In [158]:
def cont_selector(df):
    return df.select_dtypes(include=['number']).columns

def cat_selector(df):
    return df.select_dtypes(exclude=['number']).columns

def cast_to_cat(df, features):
    for feature in features:
        df[feature] = df[feature].astype('category')

In [159]:
def get_features_to_drop_on_missingdata(df, threshold) :
    tmp = df.isnull().sum()
    return list(tmp[tmp/float(df.shape[0]) > threshold].index)

def drop_features(df, features):
    return df.drop(features, axis=1, inplace=True)

In [160]:
train_data['FamilySize'] = train_data['no_of_adults'] +  train_data['no_of_children'] + 1
def convert_familysize(size):
    if(size == 1): 
        return 'Single'
    elif(size <= 4):
        return 'Small'
    elif(size <= 6):
        return 'Medium'
    else: 
        return 'Large'
train_data['FamilyGroup'] = train_data['FamilySize'].map(convert_familysize)

In [161]:
features_to_cast = ["no_of_adults", "no_of_children", "no_of_week_nights","no_of_weekend_nights",
                   "type_of_meal_plan", "required_car_parking_space","room_type_reserved",
                   "market_segment_type","repeated_guest","no_of_special_requests", "arrival_month",
                   "arrival_date","booking_status", "arrival_year"]
features_to_cast.extend(cat_selector(train_data))
cast_to_cat(train_data, features_to_cast)
print(cont_selector(train_data))
print(cat_selector(train_data))

Index(['id', 'lead_time', 'no_of_previous_cancellations',
       'no_of_previous_bookings_not_canceled', 'avg_price_per_room',
       'FamilySize'],
      dtype='object')
Index(['no_of_adults', 'no_of_children', 'no_of_weekend_nights',
       'no_of_week_nights', 'type_of_meal_plan', 'required_car_parking_space',
       'room_type_reserved', 'arrival_year', 'arrival_month', 'arrival_date',
       'market_segment_type', 'repeated_guest', 'no_of_special_requests',
       'booking_status', 'FamilyGroup'],
      dtype='object')


In [162]:
train_data.head()

Unnamed: 0,id,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,...,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,FamilySize,FamilyGroup
0,0,2,0,0,2,1,0,0,9,2018,...,14,1,1,11,0,67.5,0,0,3,Small
1,1,2,0,1,2,0,0,0,117,2018,...,29,0,0,0,0,72.25,0,0,3,Small
2,2,2,0,0,1,0,0,0,315,2018,...,2,0,0,0,0,52.0,0,0,3,Small
3,3,1,0,0,2,1,0,0,32,2018,...,1,1,0,0,0,56.0,0,0,2,Small
4,4,2,0,1,0,0,0,0,258,2018,...,16,0,0,0,0,100.0,0,1,3,Small


In [163]:
target = train_data['booking_status']
features_to_drop = ['id', 'booking_status']
drop_features(train_data, features_to_drop)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42100 entries, 0 to 42099
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   no_of_adults                          42100 non-null  category
 1   no_of_children                        42100 non-null  category
 2   no_of_weekend_nights                  42100 non-null  category
 3   no_of_week_nights                     42100 non-null  category
 4   type_of_meal_plan                     42100 non-null  category
 5   required_car_parking_space            42100 non-null  category
 6   room_type_reserved                    42100 non-null  category
 7   lead_time                             42100 non-null  int64   
 8   arrival_year                          42100 non-null  category
 9   arrival_month                         42100 non-null  category
 10  arrival_date                          42100 non-null  category
 11  ma

In [164]:


#define train, target data and metric
X_train = train_data
y_train = target
scoring = metrics.make_scorer(metrics.roc_auc_score)



In [165]:
cat_features = cat_selector(train_data)
# steps = [('imp', impute.SimpleImputer(strategy="most_frequent")), ('ord', preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))]
steps = [('imp', impute.SimpleImputer(strategy="most_frequent")), ('ohe', preprocessing.OneHotEncoder(handle_unknown='ignore'))]
cat_pipe = pipeline.Pipeline(steps)

In [166]:
cont_features = cont_selector(train_data)
steps = [('imp', impute.SimpleImputer()),('scaler', preprocessing.StandardScaler())]
ord_pipe = pipeline.Pipeline(steps)


In [167]:

pre_pipe = compose.ColumnTransformer(
    [ ("categorical", cat_pipe, cat_features),("continous", ord_pipe, cont_features)]
)

In [168]:
estimators = [
     # ('knn', neighbors.KNeighborsClassifier()),
     ('rf', ensemble.RandomForestClassifier()),
     # ('lsvm', svm.LinearSVC())
]

In [169]:
pipe = pipeline.Pipeline([  
                    ('preprocess', pre_pipe),
                    ('zv_filter', feature_selection.VarianceThreshold()),
                    ('de-duplicated', DropDuplicateFeatures()),
                    ('vt', ensemble.VotingClassifier(estimators=estimators, voting='hard'))
                ], memory=cachedir)
pipe

In [170]:
pipe_grid = {
             # 'vt__lsvm__C':[0.1,0.5,1.0],
    # 'vt__lsvm__penalty':['l1', 'l2'], 
    # 'vt__knn__n_neighbors':[3, 5], 
    'vt__rf__n_estimators':[10, 20] 
}


cv = model_selection.KFold(10)
cv = model_selection.KFold(10)
clf = model_selection.GridSearchCV(pipe, pipe_grid, cv=cv, scoring="accuracy",return_train_score=True, n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print(clf.best_index_)
print(clf.best_estimator_)



ValueError: 
All the 20 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/lakshmanv/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/lakshmanv/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/lakshmanv/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
  File "/Users/lakshmanv/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/lakshmanv/opt/anaconda3/lib/python3.9/site-packages/joblib/memory.py", line 573, in __call__
    return self._cached_call(args, kwargs, shelving=False)
  File "/Users/lakshmanv/opt/anaconda3/lib/python3.9/site-packages/joblib/memory.py", line 530, in _cached_call
    return self._call(call_id, args, kwargs, shelving)
  File "/Users/lakshmanv/opt/anaconda3/lib/python3.9/site-packages/joblib/memory.py", line 762, in _call
    output = self.func(*args, **kwargs)
  File "/Users/lakshmanv/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/Users/lakshmanv/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 313, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/Users/lakshmanv/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 1101, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/Users/lakshmanv/opt/anaconda3/lib/python3.9/site-packages/feature_engine/selection/drop_duplicate_features.py", line 138, in fit
    X = check_X(X)
  File "/Users/lakshmanv/opt/anaconda3/lib/python3.9/site-packages/feature_engine/dataframe_checks.py", line 76, in check_X
    raise TypeError("This transformer does not support sparse matrices.")
TypeError: This transformer does not support sparse matrices.


In [None]:
feature_names = clf.best_estimator_[0].get_feature_names_out()
print(len(feature_names), feature_names)
feature_names = clf.best_estimator_[1].get_feature_names_out(input_features = feature_names)
print(len(feature_names), feature_names)

In [None]:
test_data = pd.read_csv(os.path.join(dir, "test.csv"))
print(test_data.shape)
test_data.head()

In [None]:
test_data['FamilySize'] = test_data['no_of_adults'] +  test_data['no_of_children'] + 1
test_data['FamilyGroup'] = test_data['FamilySize'].map(convert_familysize)

In [None]:
X_test = test_data
test_data['booking_status'] = clf.predict(X_test)
test_data.head()

In [None]:
test_data.to_csv(os.path.join(dir, "submission1.csv"), columns=["id", "booking_status"], index=False)

In [None]:
#persist model using joblib
objects = {
    'clf':clf
}
joblib.dump(objects, os.path.join(dir, "model_bagging_voting.pkl"))