In [1]:
import os
import sys
from dotenv import load_dotenv

from sqlalchemy import create_engine

import pandas as pd


from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

In [2]:
load_dotenv()
database_url = os.getenv('MYSQL_ENGINE_URL')

In [3]:
engine = create_engine(database_url)

In [15]:
query = "SELECT * FROM hotel_booking LIMIT 10000;"

In [16]:
data = pd.read_sql(query, engine)
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,Transient,0.0,0,0,Check-Out,2015-07-01,Ernest Barnes,Ernest.Barnes31@outlook.com,669-792-1661,************4322
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,Transient,0.0,0,0,Check-Out,2015-07-01,Andrea Baker,Andrea_Baker94@aol.com,858-637-6955,************9157
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,Transient,75.0,0,0,Check-Out,2015-07-02,Rebecca Parker,Rebecca_Parker@comcast.net,652-885-2745,************3734
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,Transient,75.0,0,0,Check-Out,2015-07-02,Laura Murray,Laura_M@gmail.com,364-656-8427,************5677
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,Transient,98.0,0,1,Check-Out,2015-07-03,Linda Hines,LHines@verizon.com,713-226-5883,************5498


In [17]:
# Dropping the features
data = data.drop(['reservation_status',
              'reservation_status_date',
              'assigned_room_type', 
              'arrival_date_year', 
              'country', 
              'agent', 
              'company', 
              'name', 
              'email', 
              'phone-number', 
              'credit_card'], 
              axis=1)

In [18]:
data.shape

(10000, 25)

In [19]:
data.isna().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests         0
dtype: int64

In [20]:
# Analyze noisy data
noisy_data = {
    'adr':      data[data['adr'] < 0],
    'adults':   data[data['adults'] == 0],
    'children': data[data['children'] == 10],
    'babies':   data[data['babies'] == 10],
}

noisy_data_count = {key: len(value) for key, value in noisy_data.items()}
noisy_data_count

{'adr': 0, 'adults': 7, 'children': 1, 'babies': 0}

In [21]:
data = data[data['children'] != 10]
data = data[data['adults'] != 0]

In [22]:
# Check if the noisy data has been handled
noisy_data_handled = {
    'adr': data[data['adr'] < 0],
    'adults': data[data['adults'] == 0],
    'children': data[data['children'] == 10],
    'babies': data[data['babies'] == 10],
}

noisy_data_handled_count = {key: len(value) for key, value in noisy_data_handled.items()}
noisy_data_handled_count

{'adr': 0, 'adults': 0, 'children': 0, 'babies': 0}

In [23]:
# categorical features
cat_col = [_ for _ in data.columns if data[_].dtype == 'O']
cat_df = data[cat_col]

# print unique values in categorical features
for i in cat_df.columns:
    print('{} :\n{} \n'.format(i,cat_df[i].unique()))

hotel :
['Resort Hotel'] 

arrival_date_month :
['July' 'August' 'September' 'October' 'November' 'December' 'January'
 'February' 'March' 'April' 'May' 'June'] 

meal :
['BB' 'FB' 'HB' 'SC' 'Undefined'] 

market_segment :
['Direct' 'Corporate' 'Online TA' 'Offline TA/TO' 'Complementary' 'Groups'] 

distribution_channel :
['Direct' 'Corporate' 'TA/TO'] 

reserved_room_type :
['C' 'A' 'D' 'E' 'G' 'F' 'H' 'L'] 

deposit_type :
['No Deposit' 'Refundable' 'Non Refund'] 

customer_type :
['Transient' 'Contract' 'Transient-Party' 'Group'] 



In [38]:
train_df, test_df = train_test_split(data, test_size=0.25, random_state=23)
train_df.shape, test_df.shape

((7494, 25), (2498, 25))

In [39]:
train_df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
4754,Resort Hotel,0,169,March,14,31,0,3,2,0.0,...,0,0,A,1,Refundable,0,Transient-Party,66.0,0,0
1399,Resort Hotel,0,32,August,35,28,1,2,2,0.0,...,0,0,A,0,No Deposit,0,Transient,122.0,0,1
439,Resort Hotel,0,7,July,29,16,0,1,2,2.0,...,0,0,C,1,No Deposit,0,Transient,169.0,1,1
1163,Resort Hotel,1,143,August,34,16,2,0,2,0.0,...,0,0,A,0,No Deposit,0,Transient,108.54,0,1
3324,Resort Hotel,1,66,December,49,5,1,1,2,0.0,...,0,0,A,0,No Deposit,0,Transient-Party,64.0,0,0


In [40]:
test_df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
699,Resort Hotel,1,31,July,30,25,3,6,2,0.0,...,0,0,D,0,No Deposit,0,Transient,154.0,0,1
4618,Resort Hotel,1,106,March,13,24,1,3,2,0.0,...,0,0,A,0,Non Refund,0,Transient,84.0,0,0
5456,Resort Hotel,1,101,April,18,29,2,5,2,0.0,...,0,0,A,0,No Deposit,0,Transient-Party,47.0,0,0
2564,Resort Hotel,0,0,October,43,22,2,5,2,0.0,...,0,0,A,0,No Deposit,0,Transient,42.5,0,1
5237,Resort Hotel,1,83,April,17,22,2,5,2,0.0,...,0,0,A,0,Non Refund,0,Transient,46.0,0,0


In [41]:
# for preprocessing
one_hot_cols = ['hotel', 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type']
label_cols = ['arrival_date_month']
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

In [43]:
oh_enc = OneHotEncoder(sparse=False, drop='first')

label_enc = LabelEncoder()

In [42]:
def label_encoding_function(data: pd.DataFrame) -> pd.DataFrame:
    month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    label_cols = ['arrival_date_month']
    columns = label_cols if isinstance(label_cols, list) else [label_cols]
    for col in columns:
        data[col] = data[col].apply(lambda x: month_order.index(x) + 1)
    return data

def onehot_encoding_function(data: pd.DataFrame) -> pd.DataFrame:
    one_hot_cols = ['hotel', 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type']
    data = pd.get_dummies(data, columns=one_hot_cols, drop_first=True)
    data = data.astype(int)
    return data

In [43]:
train_df_encoded = label_encoding_function(train_df)
train_df_encoded = onehot_encoding_function(train_df_encoded)

In [44]:
train_df_encoded.head()

Unnamed: 0,is_canceled,lead_time,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
4754,0,169,3,14,31,0,3,2,0,0,...,0,0,0,0,0,0,1,0,0,1
1399,0,32,8,35,28,1,2,2,0,0,...,0,0,0,0,0,0,0,0,1,0
439,0,7,7,29,16,0,1,2,2,0,...,0,0,0,0,0,0,0,0,1,0
1163,1,143,8,34,16,2,0,2,0,0,...,0,0,0,0,0,0,0,0,1,0
3324,1,66,12,49,5,1,1,2,0,0,...,0,0,0,0,0,0,0,0,0,1


In [45]:
scaler = MinMaxScaler()

In [None]:
train_df_encoded

In [None]:
def get_preprocessing_functions(self) -> Pipeline: #tuple:
    try:
        # Fetch schema config
        onehot_encoding_columns = one_hot_cols
        label_encoding_columns = label_cols
        scaling_columns = self._schema_config.get('scaling_columns', [])

        # Define individual transformer functions
        def label_encoding_function(data: pd.DataFrame) -> pd.DataFrame:
            month_order = months
            columns = label_encoding_columns if isinstance(label_encoding_columns, list) else [label_encoding_columns]
            for col in columns:
                data[col] = data[col].apply(lambda x: month_order.index(x) + 1)
            return data

        def onehot_encoding_function(data: pd.DataFrame) -> pd.DataFrame:
            data = pd.get_dummies(data, columns=onehot_encoding_columns, drop_first=True)
            data = data.astype(int)
            return data
        
        def scaling_function(data: pd.DataFrame) -> pd.DataFrame:
            data[scaling_columns] = scaler.fit_transform(data[scaling_columns])
            return data

        # Initialize transformers
        label_encoder = FunctionTransformer(label_encoding_function)
        onehot_encoder = FunctionTransformer(onehot_encoding_function)
        scaler = MinMaxScaler()

        # Combine transformers in ColumnTransformer
        transformers = []
        if label_encoding_columns:
            transformers.append(('label_encoder', label_encoder, label_encoding_columns))
        if onehot_encoding_columns:
            transformers.append(('onehot_encoder', onehot_encoder, onehot_encoding_columns))
        if scaling_columns:
            transformers.append(('scaler', scaler, scaling_columns))
        
        preprocessor = ColumnTransformer(transformers=transformers, remainder='passthrough')

        # Create pipeline
        data_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])


        # Save the pipeline
        # save_object(self.data_preprocessing_config.preprocessed_object_file_path, data_pipeline)

        return data_pipeline
        # return label_encoding_function, onehot_encoding_function, scaling_function

    except Exception as e:
        raise BaseException(f"Error in get_data_preprocessor_object: {str(e)}", sys) from e
    

### Ingested Data checks 

In [2]:
import pandas as pd

In [3]:
# columns to drop
drop_cols = ['reservation_status',
              'reservation_status_date',
              'assigned_room_type', 
              'arrival_date_year', 
              'country', 
              'agent', 
              'company']

#### Train set

In [4]:
# check info about datasets
train_df = pd.read_csv('../artifacts/data/interim/train.csv')
test_df = pd.read_csv('../artifacts/data/interim/test.csv')
val_df = pd.read_csv('../artifacts/data/interim/val.csv')

print(f"Train set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Validation set shape: {val_df.shape}")

Train set shape: (83573, 32)
Test set shape: (17908, 32)
Validation set shape: (17909, 32)


In [5]:
# Dropping the features from the datasets

train_df = train_df.drop(drop_cols, axis=1)
test_df = test_df.drop(drop_cols, axis=1)
val_df = val_df.drop(drop_cols, axis=1)

print(f"Train set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Validation set shape: {val_df.shape}")

Train set shape: (83573, 25)
Test set shape: (17908, 25)
Validation set shape: (17909, 25)


In [6]:
def check_missing_values(df: pd.DataFrame):
    # check for missing values in datasets
    missing_values = df.isnull().sum()
    print(f"Missing values:  {missing_values[missing_values > 0]}")
    missing_values_percentage = df.isnull().mean() * 100
    print(f"Missing values percentage:  {missing_values_percentage[missing_values_percentage > 0]}")

check_missing_values(train_df)
check_missing_values(test_df)
check_missing_values(val_df)

Missing values:  children    1
dtype: int64
Missing values percentage:  children    0.001197
dtype: float64
Missing values:  children    3
dtype: int64
Missing values percentage:  children    0.016752
dtype: float64
Missing values:  Series([], dtype: int64)
Missing values percentage:  Series([], dtype: float64)


In [7]:
# Impute missing values in 'children' column with 0
train_df['children'] = train_df['children'].fillna(0)
test_df['children'] = test_df['children'].fillna(0)
val_df['children'] = val_df['children'].fillna(0)

# Check if there are any missing values left in 'children' column
print(train_df['children'].isna().sum())
print(test_df['children'].isna().sum())
print(val_df['children'].isna().sum())

0
0
0


In [8]:
def analyze_noisy_data(df: pd.DataFrame):
    # Analyze noisy data
    noisy_data = {
        'adr':      df[df['adr'] < 0],
        'adults':   df[df['adults'] == 0],
        'children': df[df['children'] == 10],
        'babies':   df[df['babies'] == 10],
    }

    noisy_data_count = {key: len(value) for key, value in noisy_data.items()}
    print(noisy_data_count)

analyze_noisy_data(train_df)
analyze_noisy_data(test_df)
analyze_noisy_data(val_df)

{'adr': 0, 'adults': 298, 'children': 1, 'babies': 0}
{'adr': 0, 'adults': 47, 'children': 0, 'babies': 1}
{'adr': 1, 'adults': 58, 'children': 0, 'babies': 0}


In [9]:
def handle_noisy_data(df: pd.DataFrame):
    # Replace negative adr with median of adr column
    df.loc[df['adr'] < 0, 'adr'] = df['adr'].median()

    # Remove rows with 0 adults
    df = df[df['adults'] != 0]

    # Remove rows with 10 children or 10 babies
    df = df[df['children'] != 10]
    df = df[df['babies'] != 10]

    # Reset the index
    df.reset_index(drop=True, inplace=True)

    # Check if the noisy data has been handled
    noisy_data_handled = {
        'adr': df[df['adr'] < 0],
        'adults': df[df['adults'] == 0],
        'children': df[df['children'] == 10],
        'babies': df[df['babies'] == 10],
    }

    noisy_data_handled_count = {key: len(value) for key, value in noisy_data_handled.items()}
    print(noisy_data_handled_count)


handle_noisy_data(train_df)
handle_noisy_data(test_df)
handle_noisy_data(val_df)


{'adr': 0, 'adults': 0, 'children': 0, 'babies': 0}
{'adr': 0, 'adults': 0, 'children': 0, 'babies': 0}
{'adr': 0, 'adults': 0, 'children': 0, 'babies': 0}


In [12]:
# encoding categorical values
def check_categorical_values(df: pd.DataFrame):
    # categorical features
    cat_col = [_ for _ in df.columns if df[_].dtype == 'O']
    cat_df = df[cat_col]

    # print unique values in categorical features
    for i in cat_df.columns:
        print(f'{i} : {cat_df[i].nunique()}\n{cat_df[i].unique()} \n')

In [13]:
check_categorical_values(train_df)

hotel : 2
['Resort Hotel' 'City Hotel'] 

arrival_date_month : 12
['August' 'September' 'October' 'December' 'July' 'February' 'May'
 'November' 'June' 'April' 'March' 'January'] 

meal : 5
['BB' 'HB' 'SC' 'Undefined' 'FB'] 

market_segment : 7
['Direct' 'Offline TA/TO' 'Groups' 'Online TA' 'Corporate' 'Complementary'
 'Aviation'] 

distribution_channel : 5
['Direct' 'TA/TO' 'Corporate' 'GDS' 'Undefined'] 

reserved_room_type : 10
['A' 'D' 'B' 'E' 'C' 'H' 'F' 'G' 'L' 'P'] 

deposit_type : 3
['No Deposit' 'Non Refund' 'Refundable'] 

customer_type : 4
['Transient' 'Contract' 'Transient-Party' 'Group'] 



In [14]:
check_categorical_values(test_df)

hotel : 2
['Resort Hotel' 'City Hotel'] 

arrival_date_month : 12
['December' 'January' 'May' 'March' 'September' 'June' 'April' 'August'
 'July' 'February' 'November' 'October'] 

meal : 5
['BB' 'SC' 'HB' 'Undefined' 'FB'] 

market_segment : 8
['Online TA' 'Corporate' 'Direct' 'Offline TA/TO' 'Complementary' 'Groups'
 'Aviation' 'Undefined'] 

distribution_channel : 5
['TA/TO' 'Corporate' 'Direct' 'GDS' 'Undefined'] 

reserved_room_type : 10
['A' 'P' 'D' 'F' 'E' 'G' 'B' 'C' 'H' 'L'] 

deposit_type : 3
['No Deposit' 'Non Refund' 'Refundable'] 

customer_type : 4
['Transient-Party' 'Transient' 'Contract' 'Group'] 



In [15]:
check_categorical_values(val_df)

hotel : 2
['Resort Hotel' 'City Hotel'] 

arrival_date_month : 12
['April' 'March' 'October' 'June' 'July' 'August' 'May' 'September'
 'January' 'February' 'November' 'December'] 

meal : 5
['BB' 'HB' 'FB' 'SC' 'Undefined'] 

market_segment : 7
['Online TA' 'Offline TA/TO' 'Direct' 'Groups' 'Corporate' 'Complementary'
 'Aviation'] 

distribution_channel : 4
['TA/TO' 'Direct' 'Corporate' 'GDS'] 

reserved_room_type : 10
['A' 'D' 'E' 'H' 'C' 'L' 'G' 'F' 'B' 'P'] 

deposit_type : 3
['No Deposit' 'Non Refund' 'Refundable'] 

customer_type : 4
['Transient' 'Transient-Party' 'Contract' 'Group'] 



In [16]:
# encoding features
def encoding_features(df: pd.DataFrame):
    # label encoding the 'arrival_date_month' feature
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    df['arrival_date_month'] = df['arrival_date_month'].apply(lambda x: months.index(x) + 1)

    # one-hot encoding the remaining categorical features
    one_hot_cols = ['hotel', 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type']
    df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

    # Check the encoded features
    df.info()
    return df

In [17]:
encoded_train_df = encoding_features(train_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83573 entries, 0 to 83572
Data columns (total 47 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   is_canceled                     83573 non-null  int64  
 1   lead_time                       83573 non-null  int64  
 2   arrival_date_month              83573 non-null  int64  
 3   arrival_date_week_number        83573 non-null  int64  
 4   arrival_date_day_of_month       83573 non-null  int64  
 5   stays_in_weekend_nights         83573 non-null  int64  
 6   stays_in_week_nights            83573 non-null  int64  
 7   adults                          83573 non-null  int64  
 8   children                        83573 non-null  float64
 9   babies                          83573 non-null  int64  
 10  is_repeated_guest               83573 non-null  int64  
 11  previous_cancellations          83573 non-null  int64  
 12  previous_bookings_not_canceled  

In [18]:
encoded_test_df = encoding_features(test_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17908 entries, 0 to 17907
Data columns (total 48 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   is_canceled                     17908 non-null  int64  
 1   lead_time                       17908 non-null  int64  
 2   arrival_date_month              17908 non-null  int64  
 3   arrival_date_week_number        17908 non-null  int64  
 4   arrival_date_day_of_month       17908 non-null  int64  
 5   stays_in_weekend_nights         17908 non-null  int64  
 6   stays_in_week_nights            17908 non-null  int64  
 7   adults                          17908 non-null  int64  
 8   children                        17908 non-null  float64
 9   babies                          17908 non-null  int64  
 10  is_repeated_guest               17908 non-null  int64  
 11  previous_cancellations          17908 non-null  int64  
 12  previous_bookings_not_canceled  

In [19]:
encoded_val_df = encoding_features(val_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17909 entries, 0 to 17908
Data columns (total 46 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   is_canceled                     17909 non-null  int64  
 1   lead_time                       17909 non-null  int64  
 2   arrival_date_month              17909 non-null  int64  
 3   arrival_date_week_number        17909 non-null  int64  
 4   arrival_date_day_of_month       17909 non-null  int64  
 5   stays_in_weekend_nights         17909 non-null  int64  
 6   stays_in_week_nights            17909 non-null  int64  
 7   adults                          17909 non-null  int64  
 8   children                        17909 non-null  float64
 9   babies                          17909 non-null  int64  
 10  is_repeated_guest               17909 non-null  int64  
 11  previous_cancellations          17909 non-null  int64  
 12  previous_bookings_not_canceled  

In [26]:
encoded_train_df.var()

is_canceled                           0.233286
lead_time                         11451.891421
arrival_date_month                    9.539554
arrival_date_week_number            184.949404
arrival_date_day_of_month            76.940419
stays_in_weekend_nights               1.001360
stays_in_week_nights                  3.653198
adults                                0.324908
children                              0.162840
babies                                0.008830
is_repeated_guest                     0.030412
previous_cancellations                0.761033
previous_bookings_not_canceled        2.411991
booking_changes                       0.416797
days_in_waiting_list                316.861142
adr                                2661.828209
required_car_parking_spaces           0.060474
total_of_special_requests             0.629455
hotel_Resort Hotel                    0.223038
meal_FB                               0.006585
meal_HB                               0.106403
meal_SC      

In [19]:
import os
os.chdir('../')
import sys

from src.core.logger.data_logger import logging
from src.core.exception import HotelBookingException

from src.core.entities.config_entity import (DataIngestionConfig,
                                             DataValidationConfig,
                                             DataPreprocessingConfig,
                                             DataSplitConfig) 
from src.core.entities.artifact_entity import (DataIngestionArtifact,
                                               DataValidationArtifact,
                                               DataPreprocessingArtifact,
                                               DataSplitArtifact)

from src.data.data_ingestion import DataIngestion
from src.data.data_validation import DataValidation
from src.data.data_preprocessing import DataPreprocessing
from src.data.data_split import DataSplit

In [8]:
df = pd.read_csv('artifacts/data/interim/data.csv')
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [12]:
df_sample = df.head(10000)

In [13]:
df_sample.shape

(10000, 32)

In [14]:
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           10000 non-null  object 
 1   is_canceled                     10000 non-null  int64  
 2   lead_time                       10000 non-null  int64  
 3   arrival_date_year               10000 non-null  int64  
 4   arrival_date_month              10000 non-null  object 
 5   arrival_date_week_number        10000 non-null  int64  
 6   arrival_date_day_of_month       10000 non-null  int64  
 7   stays_in_weekend_nights         10000 non-null  int64  
 8   stays_in_week_nights            10000 non-null  int64  
 9   adults                          10000 non-null  int64  
 10  children                        10000 non-null  float64
 11  babies                          10000 non-null  int64  
 12  meal                            1