<h1 id="header">BookMe Company</h1>

### Supervised Project

In [2]:
# Relevant Package Importing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFE
import scipy.stats as stats
from scipy.stats import chi2_contingency
# Scaling
from sklearn.preprocessing import MinMaxScaler
# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
# Model Assessment
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

### Pre-Processing


In [22]:
#Import Data from cluster.csv
db = pd.read_csv('train.csv')
db[db.duplicated(keep = False)] # We don't have duplicates

Unnamed: 0,Cust_ID,Churn,Name,Longevity,Year_Birth,TypeTravel,RoomType,RewardPoints,Comfort,ReceptionSchedule,...,Wifi,Amenities,Staff,OnlineBooking,PriceQuality,RoomSpace,CheckOut,Checkin,Cleanliness,BarService


##### There aren't any duplicates in our data

##### Set Customer ID as Index

In [23]:
db.set_index('Cust_ID', inplace = True)

##### Devide Target from data

In [24]:
data = db.drop(['Churn'], axis=1)
target = db['Churn']

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15589 entries, 1 to 15589
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               15589 non-null  object 
 1   Longevity          15589 non-null  object 
 2   Year_Birth         15394 non-null  float64
 3   TypeTravel         15589 non-null  object 
 4   RoomType           15589 non-null  object 
 5   RewardPoints       15589 non-null  int64  
 6   Comfort            15589 non-null  int64  
 7   ReceptionSchedule  15589 non-null  int64  
 8   FoodDrink          15589 non-null  int64  
 9   Location           15589 non-null  int64  
 10  Wifi               15589 non-null  int64  
 11  Amenities          15589 non-null  int64  
 12  Staff              15589 non-null  int64  
 13  OnlineBooking      15589 non-null  int64  
 14  PriceQuality       15589 non-null  int64  
 15  RoomSpace          15589 non-null  int64  
 16  CheckOut           155

Data types:
- We'll need to change the data types of all the int64 to int4 or int8 after normalization
- We have nulls in the year of birth variable

##### Separate Numerical from Categorical Data

In [55]:
num_var = data.select_dtypes(include=np.number).set_index(data.index).columns
car_var = data.select_dtypes(exclude=np.number).set_index(data.index).columns
print(num_var)
print(cat_var)

Index(['Year_Birth', 'RewardPoints', 'Comfort', 'ReceptionSchedule',
       'FoodDrink', 'Location', 'Wifi', 'Amenities', 'Staff', 'OnlineBooking',
       'PriceQuality', 'RoomSpace', 'CheckOut', 'Checkin', 'Cleanliness',
       'BarService'],
      dtype='object')
Index(['Name', 'Longevity', 'TypeTravel', 'RoomType'], dtype='object')


##### Replace incorrect performance values

In [27]:
data[num_var].describe()

Unnamed: 0,Year_Birth,RewardPoints,Comfort,ReceptionSchedule,FoodDrink,Location,Wifi,Amenities,Staff,OnlineBooking,PriceQuality,RoomSpace,CheckOut,Checkin,Cleanliness,BarService
count,15394.0,15589.0,15589.0,15589.0,15589.0,15589.0,15589.0,15589.0,15589.0,15589.0,15589.0,15589.0,15589.0,15589.0,15589.0,15589.0
mean,1981.706444,5022.593816,2.841619,2.997242,2.84457,2.986016,3.245109,3.374816,3.506383,3.454231,3.459683,3.470845,3.700558,3.327282,3.692347,3.34736
std,15.179042,1027.962379,1.388624,1.518994,1.436948,1.299438,1.327026,1.352417,1.319565,1.310343,1.26813,1.293873,1.158644,1.266872,1.154437,1.300452
min,1936.0,409.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
25%,1970.0,4445.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,2.0
50%,1981.0,5088.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0
75%,1994.0,5649.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,4.0,5.0,5.0,4.0,5.0,4.0
max,2014.0,6950.0,5.0,5.0,5.0,5.0,6.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [53]:
print((data.loc[:, 'Comfort':] == 0).sum().sum())
print((data.loc[:, 'Comfort':] == 1).sum().sum())
print((data.loc[:, 'Comfort':] == 2).sum().sum())
print((data.loc[:, 'Comfort':] == 3).sum().sum())
print((data.loc[:, 'Comfort':] == 4).sum().sum())
print((data.loc[:, 'Comfort':] == 5).sum().sum())
print((data.loc[:, 'Comfort':] == 6).sum().sum())

2525
25447
34395
45349
62009
48485
36


About valuation of Customer Satisfaction
- 0.01% of valuation 6 --> To remove the 6
- 1.2% of valuation 0 --> seams legit. Keep the 0

##### Transformations to be made
- Remove 6 from Wifi Valuations
- Transform all 'y' to 'yes' in Longevity
- Create bins from Reward Points
- Create variable Gender from the name variable & Drop name variable

##### Make transformations onle after separating training from validation

In [11]:
data[num_var]['Wifi'] = data[num_var]['Wifi'].replace(6,5)
data[car_var]['Longevity'] = data[car_var]['Longevity'].replace('y','yes')

In [None]:
data[num_var]['RewardPoints_Bins'] = pd.cut(data[num_var]['RewardPoints'], 4, labels=['Bronze', 'Silver', 'Gold', 'Platinum'])
data[num_var]['RewardPoints_Bins'] = data[num_var]['RewardPoints_Bins'].replace('Bronze',0).replace('Silver',1).replace('Gold',2).replace('Platinum',3)
data[num_var].drop(columns = ['RewardPoints'], axis=1, inplace=True)

In [59]:
gender = data[car_var]['Name'].str.split(' ',expand = True,n=1)[0]
gender

Cust_ID
1        Ms.
2        Mr.
3        Mr.
4        Ms.
5        Mr.
        ... 
15585    Ms.
15586    Ms.
15587    Mr.
15588    Mr.
15589    Mr.
Name: 0, Length: 15589, dtype: object

In [56]:
gender = data[car_var]['Name'].str.split(' ',expand = True,n=1)[0]
data[car_var]['Gender'] = gender
data[car_var]['Gender'] = data[car_var]['Gender'].replace('Ms.',0).replace('Mr.',1)
data[car_var].drop(columns = ['Name'], axis = 1, inplace = True)

# gender = test['Name'].str.split(' ',expand = True,n=1)[0]
# test['Gender'] = gender
# test['Gender'] = test['Gender'].replace('Ms.',0).replace('Mr.',1)
# test.drop(columns = ['Name'], axis = 1, inplace = True)

KeyError: 'Gender'

In [None]:
# from datetime import date
# db['Age'] = date.today().year - db['Year_Birth']
# db.drop(columns = ['Year_Birth'], axis=1, inplace=True)
# test['Age'] = date.today().year - test['Year_Birth']
# test.drop(columns = ['Year_Birth'], axis=1, inplace=True)

In [None]:
# Fill year of brith NAs with median
db['Year_Birth'] = db['Year_Birth'].fillna(db['Year_Birth'].median()).astype(int).astype(np.int64)

In [13]:
# Converting every column that is int64 to int8 for performance issues
db.iloc[:,(db.dtypes=='int64').values] = db.iloc[:,(db.dtypes=='int64').values].astype(np.int16)
test.iloc[:,(test.dtypes=='int64').values] = test.iloc[:,(test.dtypes=='int64').values].astype(np.int16)

##### Change Year of Birth with Date

In [14]:
# from datetime import date
# db['Age'] = date.today().year - db['Year_Birth']
# db.drop(columns = ['Year_Birth'], axis=1, inplace=True)
# test['Age'] = date.today().year - test['Year_Birth']
# test.drop(columns = ['Year_Birth'], axis=1, inplace=True)

In [16]:
#db.describe().T

##### Notes from initial pre-processing
- We have a minimum age of 8 years and max of 81 in the test file
- Min 8 and max 86 in the train file

##### We should't make new variables, dummy variables, nor replace categorical values for numerical, since we need the end variables to be the same as the test variables

----
##### Start Division between Data and Target

In [21]:
#print(target.info())
#print(data.info())

##### If we want to use Hold-out method

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_val, y_train, y_val = train_test_split(data, 
                                                    target, 
                                                    test_size=0.2, 
                                                    random_state=15, 
                                                    shuffle=True, 
                                                    stratify=target
                                                   )

In [24]:
print('train:{}% | validation:{}%'.format(round(len(y_train)/len(target),2),
                                                     round(len(y_val)/len(target),2)
                                                    ))

train:0.8% | validation:0.2%


##### Dividing DataSet between Numerical and Categorical data

In [25]:
# Numerical data
X_train_num = X_train.select_dtypes(include=np.number).set_index(X_train.index)
X_val_num = X_val.select_dtypes(include=np.number).set_index(X_val.index)

# Categorical data
X_train_cat = X_train.select_dtypes(exclude=np.number).set_index(X_train.index)
X_val_cat = X_val.select_dtypes(exclude=np.number).set_index(X_val.index)

##### Scaling numerical data between 0 and 1

In [26]:
scaler = MinMaxScaler()
scaler.fit(X_train_num)
X_train_num_scaled = scaler.transform(X_train_num) # this will return an array
X_train_num_scaled = pd.DataFrame(X_train_num_scaled, columns = X_train_num.columns, index = X_train.index) # Convert the array to a pandas dataframe
X_train_num_scaled.head(5)

Unnamed: 0_level_0,Year_Birth,RewardPoints,Comfort,ReceptionSchedule,FoodDrink,Location,Wifi,Amenities,Staff,OnlineBooking,PriceQuality,RoomSpace,CheckOut,Checkin,Cleanliness,BarService
Cust_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9587,0.846154,0.508026,0.6,0.4,0.6,0.5,0.8,0.8,0.75,0.8,0.75,0.8,1.0,0.5,0.75,0.8
4281,0.551282,0.695459,0.2,0.2,0.2,0.75,0.2,0.2,0.0,0.6,1.0,0.4,0.75,0.0,0.75,0.2
6256,0.576923,0.15334,0.6,0.6,0.6,0.5,0.6,0.2,0.0,0.6,1.0,0.8,0.75,0.0,1.0,0.2
10584,0.358974,0.941752,0.6,1.0,1.0,1.0,0.2,0.6,0.75,0.6,0.5,0.6,0.5,0.5,0.5,0.6
9339,0.384615,0.912399,1.0,1.0,1.0,1.0,1.0,1.0,0.75,1.0,1.0,1.0,1.0,0.75,1.0,0.6


In [27]:
X_val_num_scaled = scaler.transform(X_val_num)
X_val_num_scaled = pd.DataFrame(X_val_num_scaled, columns = X_val_num.columns, index = X_val.index)
X_val_num_scaled.head(5)

Unnamed: 0_level_0,Year_Birth,RewardPoints,Comfort,ReceptionSchedule,FoodDrink,Location,Wifi,Amenities,Staff,OnlineBooking,PriceQuality,RoomSpace,CheckOut,Checkin,Cleanliness,BarService
Cust_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
6798,0.346154,0.423024,0.6,0.8,0.8,0.75,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.25,0.5,0.6
7661,0.564103,0.753096,0.4,0.4,0.4,0.25,0.8,1.0,1.0,0.8,0.75,0.8,0.75,0.75,0.75,0.6
5965,0.602564,0.735514,0.6,0.6,0.4,0.5,1.0,0.8,1.0,0.6,0.5,0.6,0.5,0.5,0.5,1.0
1171,0.974359,0.689344,0.2,0.8,0.2,0.5,0.4,0.2,0.25,0.4,1.0,0.8,0.75,0.75,0.75,0.4
11840,0.75641,0.604953,0.4,0.4,0.4,0.75,0.8,0.4,0.75,0.8,0.5,1.0,0.75,0.75,1.0,0.8


### Feature Selection

----

#### Apply Logistic Regression

In [77]:
from sklearn.linear_model import LogisticRegression

In [None]:
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

In [None]:
print('Train:', log_model.score(X_train, y_train))
print('Validation:', log_model.score(X_val, y_val))

----

##### K-Fold Cross Validation

In [None]:
from sklearn.linear_model import LogisticRegression