## 1. Business Understanding

#### Q3: based on the features available in the data, can we predict whether a cancellation will occur or not?

## 2. Data Understanding

### Gather

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, precision_score, recall_score, f1_score, accuracy_score 


%matplotlib inline

df = pd.read_csv('./hotel_bookings.csv')

### Assess

In [2]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [3]:
df.describe()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119386.0,119390.0,119390.0,119390.0,119390.0,119390.0,103050.0,6797.0,119390.0,119390.0,119390.0,119390.0
mean,0.370416,104.011416,2016.156554,27.165173,15.798241,0.927599,2.500302,1.856403,0.10389,0.007949,0.031912,0.087118,0.137097,0.221124,86.693382,189.266735,2.321149,101.831122,0.062518,0.571363
std,0.482918,106.863097,0.707476,13.605138,8.780829,0.998613,1.908286,0.579261,0.398561,0.097436,0.175767,0.844336,1.497437,0.652306,110.774548,131.655015,17.594721,50.53579,0.245291,0.792798
min,0.0,0.0,2015.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,-6.38,0.0,0.0
25%,0.0,18.0,2016.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,62.0,0.0,69.29,0.0,0.0
50%,0.0,69.0,2016.0,28.0,16.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,179.0,0.0,94.575,0.0,0.0
75%,1.0,160.0,2017.0,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,229.0,270.0,0.0,126.0,0.0,1.0
max,1.0,737.0,2017.0,53.0,31.0,19.0,50.0,55.0,10.0,10.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0


In [4]:
print('proportion of cancellations in the dataset:')
df[df['is_canceled']==1].shape[0]/df.shape[0] 

proportion of cancellations in the dataset:


0.37041628277075134

## 3. Data Preperation

### Clean

In [5]:

# Create a logistic regression model and score the model against the test data
# use functions that are useful from previous notebooks
def find_cat(col):
    
    a=set(col)
    a.discard(np.nan)
    try:
        if isinstance(list(a)[0], str): 
            return True
        else:
            return False
    except:
        return False


In [6]:
def create_dummy_df(df, cat_cols, dummy_na):
    '''
    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not
    
    OUTPUT:
    df
    '''
    
    cat_df= df[cat_cols]
    dummy_df= pd.get_dummies(cat_df, dummy_na=dummy_na)
    
    numeric_df= df.drop(labels= cat_cols, axis=1)
    
    df= pd.concat([numeric_df, dummy_df], axis=1) #put numeric cols and dummies in one dataframe
    
    
    

    return df

In [7]:
def clean_data(df):
    '''
    INPUT
    df - pandas dataframe 
    
    OUTPUT
    X - A matrix holding all of the variables you want to consider when predicting the response
    y - the corresponding response vector
    '''
    new_df= df.dropna(subset=['is_canceled']) # droping all rows with nans from response
    new_df= new_df.dropna(how='all', axis=1)
    # Cancellation can be inferred from 'reservation_status' and 'reservation_status_date'. Therefore remove 
    new_df=new_df.drop(labels=['reservation_status', 'reservation_status_date'], axis=1) 
    
    a=new_df.apply(find_cat, axis=0)
    cols=new_df[new_df.columns[a]].columns #categorical columns
    cat_df= create_dummy_df(new_df, cat_cols=cols, dummy_na=False)
    fill_mean= lambda col: col.fillna(value= col.mean()) 
    f_df= cat_df.apply(fill_mean, axis=0) # now missing numeric values are filled
    
    
    X = f_df.drop(labels=['is_canceled'], axis=1)
    y = f_df['is_canceled']

    
    return X, y
    
#Use the function to create X and y
X, y = clean_data(df) 

In [8]:
X.head()

Unnamed: 0,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,...,assigned_room_type_K,assigned_room_type_L,assigned_room_type_P,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
0,342,2015,27,1,0,0,2,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,737,2015,27,1,0,0,2,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,7,2015,27,1,0,1,1,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,13,2015,27,1,0,1,1,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,14,2015,27,1,0,2,2,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [9]:
# checking column values
print(set(X['babies']))
print(set(X['children']))
print(set(X['adults'])) 

# arrival_date_year, arrival_date_week_number, arrival_date_day_of_month don't have nans as well as adults and babies

X_new=X.replace(0.10388990333874994, 0) # replace fraction with 0
print(set(X_new['children'])) 
#all other numeric cols are good


{0, 1, 2, 9, 10}
{0.0, 1.0, 2.0, 3.0, 0.10388990333874994, 10.0}
{0, 1, 2, 3, 4, 5, 6, 40, 10, 50, 20, 55, 26, 27}
{0.0, 1.0, 2.0, 3.0, 10.0}


## 4. Modeling

### model

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = .30, random_state=42)

#fit the model and obtain pred response
lr_model = LogisticRegression(max_iter=400) #logistic regression model 400 iters for better convergence
lr_model.fit(X_train, y_train)
y_test_preds = lr_model.predict(X_test)
y_train_preds = lr_model.predict(X_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [11]:
#score model
print('model score on train data', lr_model.score(X_train, y_train))
print('model score on test data', lr_model.score(X_test, y_test))

model score on train data 0.8026396084859942
model score on test data 0.8047295976770807


In [12]:
# model predicts cancellation with 80% accuracy, even though model did not converge

In [13]:
# compute recall, precision and f1 score to further evaluate model
print('recall of model=',recall_score(y_test, y_test_preds))
print('precision of model=',precision_score(y_test, y_test_preds))
print('f1 score of model=',f1_score(y_test, y_test_preds))

recall of model= 0.607691731014319
precision of model= 0.8215262997871694
f1 score of model= 0.6986124278203913


In [14]:
'''lets see if a linear model can do well on this data'''

def to_zero_or_one(val): # predictions will go through this method to extract a classifaciton
    
    if val> 0.5:
        return 1
    else:
        return 0
    
lm_model = LinearRegression(normalize=True)
lm_model.fit(X_train, y_train)
y_test_preds_lm = lm_model.predict(X_test)
y_train_preds_lm = lm_model.predict(X_train)

In [15]:
vfunc = np.vectorize(to_zero_or_one) # mapping numpy array 
y_train_preds_lm =vfunc(y_train_preds_lm)
y_test_preds_lm =vfunc(y_test_preds_lm)

In [16]:
 y_test_preds_lm

array([0, 1, 0, ..., 0, 0, 1])

In [17]:
#score model
print('score of linear model on train data', accuracy_score(y_train, y_train_preds_lm))
print('score of linear model on test data', accuracy_score(y_test, y_test_preds_lm))

score of linear model on train data 0.8058344202074833
score of linear model on test data 0.8042828824301309


In [18]:
# compute recall, precision and f1 score to further evaluate model
print('recall of model=',recall_score(y_test, y_test_preds_lm))
print('precision of model=',precision_score(y_test, y_test_preds_lm))
print('f1 score of model=',f1_score(y_test, y_test_preds_lm))

recall of model= 0.5890246645175801
precision of model= 0.8371870005327651
f1 score of model= 0.691515578243267


## 5. Evaluate the Results

In [19]:
'''After creating and fitting the logistic regression model, one can assert that predicting cancellations can be done
with a high degree of accuracy using the booking information and a logistic regression model. 
In addition, linear model predicts approximately just as wellas the logistic regression model'''

'After creating and fitting the logistic regression model, one can assert that predicting cancellations can be done\nwith a high degree of accuracy using the booking information and a logistic regression model. \nIn addition, linear model predicts approximately just as wellas the logistic regression model'

### Further Analysis of Model

In [20]:
# method obtained from Putting it All Together

def coef_weights(coefficients, X_train):
    '''
    INPUT:
    coefficients - the coefficients of the linear model 
    X_train - the training data, so the column names can be used
    OUTPUT:
    coefs_df - a dataframe holding the coefficient, estimate, and abs(estimate)
    
    Provides a dataframe that can be used to understand the most influential coefficients
    in a linear model by providing the coefficient estimates along with the name of the 
    variable attached to the coefficient.
    '''
    coefs_df = pd.DataFrame()
    coefs_df['est_int'] = X_train.columns
    coefs_df['coefs'] = lm_model.coef_
    coefs_df['abs_coefs'] = np.abs(lm_model.coef_)
    coefs_df = coefs_df.sort_values('abs_coefs', ascending=False)
    return coefs_df

#Use the function
coef_df = coef_weights(lm_model.coef_, X_train)

#A quick look at the top results
coef_df.head(20)

Unnamed: 0,est_int,coefs,abs_coefs
46,country_ASM,-96714160000000.0,96714160000000.0
156,country_MYT,76970900000000.0,76970900000000.0
82,country_DJI,60177350000000.0,60177350000000.0
108,country_HND,-38739330000000.0,38739330000000.0
134,country_LCA,8945376000000.0,8945376000000.0
152,country_MRT,7471886000000.0,7471886000000.0
170,country_PLW,-4861419000000.0,4861419000000.0
186,country_SMR,2292340000000.0,2292340000000.0
36,meal_SC,342204000000.0,342204000000.0
34,meal_FB,342204000000.0,342204000000.0


In [21]:
# judging from table above, model gives weight to customer's country, meal plan, customer type, and deposit type