In [None]:
# Import the required packages

import os
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the pre-processed dataset produced by the category-transformation step

# Location of the input CSV (relative to the notebook's working directory)
original_file = 'category_transformation.csv'

# Read the CSV, using its first column as the row index
dfo = pd.read_csv(filepath_or_buffer=original_file, index_col=0)

In [None]:
# Sanity check: display the (rows, columns) tuple of the freshly loaded frame
dfo.shape

(78547, 37)

In [None]:
# Summarize each column's dtype and non-null count for the loaded frame
dfo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78547 entries, 0 to 79329
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   LeadTime                     78547 non-null  float64
 1   ArrivalDateYear              78547 non-null  int64  
 2   ArrivalDateMonth             78547 non-null  object 
 3   ArrivalDateWeekNumber        78547 non-null  int64  
 4   ArrivalDateDayOfMonth        78547 non-null  int64  
 5   StaysInWeekendNights         78547 non-null  object 
 6   StaysInWeekNights            78547 non-null  object 
 7   Adults                       78547 non-null  float64
 8   Children                     78547 non-null  float64
 9   Babies                       78547 non-null  float64
 10  Meal                         78547 non-null  object 
 11  Country                      78547 non-null  object 
 12  MarketSegment                78547 non-null  object 
 13  DistributionChan

In [None]:
# Cast every non-float64 column (ints, objects, ...) to the categorical dtype
cat_cols = dfo.select_dtypes(exclude='float64').columns
dfo[cat_cols] = dfo[cat_cols].astype('category')

# The guest-count columns are stored as float64, so the selection above skips
# them; cast them to categorical explicitly.
count_cols = ['Adults', 'Children', 'Babies']
dfo[count_cols] = dfo[count_cols].astype('category')

In [None]:
# Columns excluded from modeling
to_drop = ['ReservationDate', 'ReservationStatusDate', 'CheckOutDate',
           'ArrivalDate', 'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth',
           'ArrivalDateYear', 'AssignedRoomType', 'DaysInWaitingList', 'TotalStay']

# Remove the excluded features.
# Rebinding `dfo` (instead of `inplace=True`) together with errors='ignore'
# keeps this cell idempotent: re-running it on the same kernel, after the
# columns are already gone, no longer raises a KeyError.
# NOTE: errors='ignore' also skips misspelled labels silently, so keep the
# list above in sync with the dataset's column names.
dfo = dfo.drop(columns=to_drop, errors='ignore')

In [None]:
# Use the arrival month as the stratification label (y); X is every other column
y = dfo['ArrivalDateMonth']
X = dfo.drop(columns=['ArrivalDateMonth'])

# Single stratified shuffle split that holds out 20% of the observations;
# the fixed random_state makes the draw repeatable
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=45)
split = sss.split(X, y)

# n_splits=1, so the generator yields exactly one (train, test) pair of
# positional indices; the 20% "test" side is kept as the working sample
train_index, test_index = next(split)
sample_df = dfo.iloc[test_index]

In [None]:
# Display the (rows, columns) tuple of the stratified sample drawn with test_size=0.2
sample_df.shape

(15710, 27)

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each variable so every variable contributes k-1 indicator columns
encoded_df = pd.get_dummies(data=sample_df, drop_first=True)

# Display the (rows, columns) tuple of the encoded frame
encoded_df.shape

(15710, 91)

In [None]:
# Create the scaler
scaler = StandardScaler()

# Fit on the encoded frame and transform it in a single step
scaled = scaler.fit_transform(encoded_df)

In [None]:
# Wrap the scaled values in a DataFrame, carrying over both the column names
# and the row labels of `encoded_df`. Without `index=`, pandas assigns a fresh
# 0..n-1 index and the scaled rows can no longer be matched back to the
# observations they came from.
df_scaled = pd.DataFrame(scaled, columns=encoded_df.columns, index=encoded_df.index)

In [None]:
# Preview 10 random rows of the final dataset. The fixed seed keeps the
# displayed rows identical across "Restart & Run All" executions.
df_scaled.sample(10, random_state=45)

Unnamed: 0,LeadTime,ADR,ArrivalDateMonth_August,ArrivalDateMonth_December,ArrivalDateMonth_February,ArrivalDateMonth_January,ArrivalDateMonth_July,ArrivalDateMonth_June,ArrivalDateMonth_March,ArrivalDateMonth_May,ArrivalDateMonth_November,ArrivalDateMonth_October,ArrivalDateMonth_September,StaysInWeekendNights_1,StaysInWeekendNights_2,StaysInWeekendNights_3+,StaysInWeekNights_1,StaysInWeekNights_2,StaysInWeekNights_3,StaysInWeekNights_4,StaysInWeekNights_5,StaysInWeekNights_6+,Adults_1.0,Adults_2.0,Adults_3.0,Adults_4.0,Children_1.0,Babies_1.0,Meal_FB,Meal_HB,Meal_SC,Country_Germany,Country_National,Country_Other_Africa,Country_Other_Americas,Country_Other_Asia,Country_Other_Europe,Country_Other_Oceania,Country_Spain,Country_United_Kingdom,...,PreviousCancellations_0,PreviousCancellations_1,PreviousCancellations_2+,PreviousBookingsNotCanceled_0,PreviousBookingsNotCanceled_1-2,PreviousBookingsNotCanceled_3+,ReservedRoomType_D,ReservedRoomType_Other,BookingChanges_1,BookingChanges_2+,DepositType_Non Refund,Agent_Agent_B,Agent_Agent_C,Agent_Agent_D,Agent_Agent_E,Agent_no_Agent,Agent_other_Agent,Company_no_Company,Company_other_Company,CustomerType_Group,CustomerType_Transient,CustomerType_Transient-Party,RequiredCarParkingSpaces_1,TotalOfSpecialRequests_1,TotalOfSpecialRequests_2+,ReservationStatus_Check-Out,ReservationStatus_No-Show,ChangedRoom_1,StayChanges_No Changes,ReservationMonth_August,ReservationMonth_December,ReservationMonth_February,ReservationMonth_January,ReservationMonth_July,ReservationMonth_June,ReservationMonth_March,ReservationMonth_May,ReservationMonth_November,ReservationMonth_October,ReservationMonth_September
9702,-0.992581,0.472995,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,-0.241094,-0.325737,-0.320692,1.649808,-0.571029,-0.097521,-0.594161,-0.717191,-0.507845,-0.285498,-0.213683,-0.116116,-0.504174,0.604168,-0.256106,-0.021113,-0.262952,-0.074191,-0.023942,-0.301742,-0.388262,-0.292993,1.260383,-0.112109,-0.227178,-0.190113,-0.481225,-0.070182,-0.249983,-0.273317,...,-0.740124,-0.263367,-0.061398,0.869536,-0.085873,-0.1189,-0.419612,-0.265571,3.130056,-0.196317,-0.433994,-0.313641,-0.220435,-0.216114,-0.186306,3.012801,-0.635162,0.213683,-0.180935,-0.065445,0.586318,-0.535004,-0.158922,-0.602696,-0.379201,0.835141,-0.104281,-0.311925,0.011284,-0.271024,-0.301365,-0.358936,-0.397545,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,-0.340129,-0.267899
5094,-0.856918,-0.4355,-0.357239,4.258544,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,-0.241094,-0.325737,-0.320692,-0.606131,1.751225,-0.097521,-0.594161,-0.717191,-0.507845,-0.285498,4.679837,-0.116116,1.983443,-1.655169,-0.256106,-0.021113,-0.262952,-0.074191,-0.023942,-0.301742,-0.388262,-0.292993,-0.79341,-0.112109,-0.227178,-0.190113,2.078031,-0.070182,-0.249983,-0.273317,...,-0.740124,-0.263367,-0.061398,-1.150039,-0.085873,-0.1189,-0.419612,-0.265571,-0.319483,-0.196317,-0.433994,-0.313641,-0.220435,-0.216114,-0.186306,-0.331917,-0.635162,0.213683,-0.180935,-0.065445,0.586318,-0.535004,-0.158922,-0.602696,-0.379201,-1.197402,-0.104281,-0.311925,0.011284,-0.271024,3.318238,-0.358936,-0.397545,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,-0.340129,-0.267899
12833,-0.983537,1.927608,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,4.147763,-0.325737,-0.320692,-0.606131,-0.571029,-0.097521,1.683045,-0.717191,-0.507845,-0.285498,-0.213683,-0.116116,-0.504174,0.604168,-0.256106,-0.021113,3.802972,-0.074191,-0.023942,-0.301742,-0.388262,-0.292993,1.260383,-0.112109,-0.227178,-0.190113,-0.481225,-0.070182,-0.249983,-0.273317,...,1.351125,-0.263367,-0.061398,0.869536,-0.085873,-0.1189,-0.419612,3.765467,-0.319483,-0.196317,-0.433994,-0.313641,-0.220435,-0.216114,-0.186306,-0.331917,-0.635162,0.213683,-0.180935,-0.065445,0.586318,-0.535004,-0.158922,-0.602696,2.637125,0.835141,-0.104281,3.205904,0.011284,-0.271024,-0.301365,-0.358936,-0.397545,-0.30872,-0.213357,-0.296055,-0.262814,3.455531,-0.340129,-0.267899
9359,-0.19669,1.213061,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,3.004251,-0.297325,-0.339896,-0.241094,-0.325737,-0.320692,-0.606131,1.751225,-0.097521,-0.594161,-0.717191,-0.507845,-0.285498,4.679837,-0.116116,-0.504174,0.604168,-0.256106,-0.021113,-0.262952,-0.074191,-0.023942,-0.301742,-0.388262,-0.292993,-0.79341,-0.112109,-0.227178,-0.190113,-0.481225,-0.070182,-0.249983,-0.273317,...,-0.740124,-0.263367,-0.061398,0.869536,-0.085873,-0.1189,2.383154,-0.265571,-0.319483,-0.196317,-0.433994,-0.313641,-0.220435,-0.216114,-0.186306,-0.331917,-0.635162,0.213683,-0.180935,-0.065445,0.586318,-0.535004,-0.158922,1.659213,-0.379201,0.835141,-0.104281,-0.311925,0.011284,-0.271024,-0.301365,-0.358936,-0.397545,-0.30872,-0.213357,3.377745,-0.262814,-0.289391,-0.340129,-0.267899
7491,1.467446,-1.109216,-0.357239,-0.234822,-0.258917,-0.222963,2.978975,-0.332862,-0.297325,-0.339896,-0.241094,-0.325737,-0.320692,-0.606131,-0.571029,-0.097521,-0.594161,1.394329,-0.507845,-0.285498,-0.213683,-0.116116,-0.504174,0.604168,-0.256106,-0.021113,-0.262952,-0.074191,-0.023942,-0.301742,-0.388262,-0.292993,1.260383,-0.112109,-0.227178,-0.190113,-0.481225,-0.070182,-0.249983,-0.273317,...,-0.740124,3.796982,-0.061398,0.869536,-0.085873,-0.1189,-0.419612,-0.265571,-0.319483,-0.196317,-0.433994,3.188354,-0.220435,-0.216114,-0.186306,-0.331917,-0.635162,0.213683,-0.180935,-0.065445,-1.705559,-0.535004,-0.158922,-0.602696,-0.379201,-1.197402,-0.104281,-0.311925,0.011284,-0.271024,-0.301365,-0.358936,-0.397545,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,2.94006,-0.267899
15298,0.029415,1.581819,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,2.942075,-0.241094,-0.325737,-0.320692,-0.606131,1.751225,-0.097521,-0.594161,-0.717191,-0.507845,-0.285498,4.679837,-0.116116,-0.504174,0.604168,-0.256106,-0.021113,3.802972,-0.074191,-0.023942,-0.301742,-0.388262,-0.292993,-0.79341,-0.112109,-0.227178,-0.190113,2.078031,-0.070182,-0.249983,-0.273317,...,-0.740124,-0.263367,-0.061398,-1.150039,-0.085873,-0.1189,-0.419612,3.765467,-0.319483,-0.196317,-0.433994,-0.313641,-0.220435,-0.216114,-0.186306,-0.331917,-0.635162,0.213683,-0.180935,-0.065445,0.586318,-0.535004,-0.158922,-0.602696,-0.379201,-1.197402,-0.104281,-0.311925,0.011284,-0.271024,-0.301365,-0.358936,2.515437,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,-0.340129,-0.267899
5585,-0.187646,-1.109216,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,-0.241094,3.069965,-0.320692,-0.606131,1.751225,-0.097521,-0.594161,1.394329,-0.507845,-0.285498,-0.213683,-0.116116,-0.504174,0.604168,-0.256106,-0.021113,-0.262952,-0.074191,-0.023942,-0.301742,-0.388262,-0.292993,1.260383,-0.112109,-0.227178,-0.190113,-0.481225,-0.070182,-0.249983,-0.273317,...,-0.740124,-0.263367,-0.061398,-1.150039,-0.085873,-0.1189,-0.419612,-0.265571,-0.319483,-0.196317,2.304178,3.188354,-0.220435,-0.216114,-0.186306,-0.331917,-0.635162,0.213683,-0.180935,-0.065445,0.586318,-0.535004,-0.158922,-0.602696,-0.379201,-1.197402,-0.104281,-0.311925,0.011284,-0.271024,-0.301365,-0.358936,-0.397545,3.239185,-0.213357,-0.296055,-0.262814,-0.289391,-0.340129,-0.267899
8044,1.585021,0.041715,2.799245,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,-0.241094,-0.325737,-0.320692,-0.606131,-0.571029,-0.097521,-0.594161,-0.717191,-0.507845,-0.285498,4.679837,-0.116116,-0.504174,0.604168,-0.256106,-0.021113,-0.262952,-0.074191,-0.023942,-0.301742,-0.388262,-0.292993,-0.79341,-0.112109,-0.227178,-0.190113,-0.481225,-0.070182,4.000271,-0.273317,...,-0.740124,-0.263367,-0.061398,-1.150039,-0.085873,-0.1189,-0.419612,-0.265571,-0.319483,-0.196317,-0.433994,-0.313641,-0.220435,-0.216114,-0.186306,-0.331917,-0.635162,0.213683,-0.180935,-0.065445,0.586318,-0.535004,-0.158922,1.659213,-0.379201,-1.197402,-0.104281,-0.311925,0.011284,-0.271024,-0.301365,-0.358936,-0.397545,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,2.94006,-0.267899
2080,-0.36853,0.078208,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,-0.241094,-0.325737,-0.320692,1.649808,-0.571029,-0.097521,-0.594161,-0.717191,-0.507845,3.502653,-0.213683,-0.116116,-0.504174,0.604168,-0.256106,-0.021113,3.802972,-0.074191,-0.023942,-0.301742,-0.388262,-0.292993,1.260383,-0.112109,-0.227178,-0.190113,-0.481225,-0.070182,-0.249983,-0.273317,...,1.351125,-0.263367,-0.061398,0.869536,-0.085873,-0.1189,2.383154,-0.265571,-0.319483,-0.196317,-0.433994,-0.313641,-0.220435,4.627185,-0.186306,-0.331917,-0.635162,0.213683,-0.180935,-0.065445,0.586318,-0.535004,-0.158922,-0.602696,2.637125,0.835141,-0.104281,-0.311925,0.011284,-0.271024,-0.301365,2.78601,-0.397545,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,-0.340129,-0.267899
10283,1.684507,-0.168821,-0.357239,-0.234822,-0.258917,-0.222963,2.978975,-0.332862,-0.297325,-0.339896,-0.241094,-0.325737,-0.320692,1.649808,-0.571029,-0.097521,-0.594161,-0.717191,1.969103,-0.285498,-0.213683,-0.116116,-0.504174,0.604168,-0.256106,-0.021113,-0.262952,-0.074191,-0.023942,-0.301742,-0.388262,-0.292993,-0.79341,-0.112109,-0.227178,-0.190113,-0.481225,-0.070182,-0.249983,3.658749,...,-0.740124,-0.263367,-0.061398,0.869536,-0.085873,-0.1189,-0.419612,3.765467,3.130056,-0.196317,-0.433994,-0.313641,-0.220435,-0.216114,-0.186306,-0.331917,-0.635162,0.213683,-0.180935,-0.065445,0.586318,-0.535004,-0.158922,1.659213,-0.379201,0.835141,-0.104281,3.205904,0.011284,-0.271024,-0.301365,-0.358936,-0.397545,-0.30872,-0.213357,-0.296055,-0.262814,-0.289391,2.94006,-0.267899


In [None]:
# Persist the scaled dataset first, so the file is written regardless of the
# environment the notebook runs in
df_scaled.to_csv('preprocessed.csv')

# The browser download helper only exists inside Google Colab; guard the
# import so the cell still succeeds (file saved locally) in other environments
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'preprocessed.csv' was saved to the working directory.")
else:
    files.download('preprocessed.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>