## **Bruno Andrade Schiavone 2142546**
## **Pedro Bernardi Alves 1914618**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics  as sts
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## 1) Carregamento da base de dados

In [None]:
# Load the reservations dataset from the CSV file in the working directory
# and display the resulting DataFrame.
df=pd.read_csv('hotel-reservations.csv')
df

In [None]:
# Print the column names, dtypes and non-null counts of df.
df.info()

In [None]:
# Summary statistics of df as loaded (before any pre-processing).
df.describe()

In [None]:
# Number of rows per booking_status value (the column later used as target).
df['booking_status'].value_counts()

In [None]:
# (rows, columns) of the loaded DataFrame.
df.shape

In [None]:
# Count of null entries in each column.
df.isnull().sum()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Number of rows per booking_status value.
# NOTE(review): same expression as an earlier cell in this section.
df['booking_status'].value_counts()

## 2) Pré-processamento

In [None]:
# Remove the Booking_ID column from df in place.
# NOTE(review): presumably a per-row identifier with no predictive value — confirm.
df.drop(columns=['Booking_ID'], inplace=True)

In [None]:
# Number of distinct values in each object-dtype column.
df.select_dtypes(include='object').nunique()

In [None]:
# Print the dtype / non-null summary, then show the per-column null counts.
df.info()
df.isnull().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace the booking_status labels with integer codes.
# The fitted encoder is kept so the label <-> code mapping can be read back
# from labelencoder.classes_ (codes follow the sorted order of the labels).
labelencoder = LabelEncoder().fit(df["booking_status"])
df["booking_status"] = labelencoder.transform(df["booking_status"])

In [None]:
# Display df to inspect the integer-encoded booking_status column.
df

In [None]:
# One-hot encode the three categorical columns listed below: each one is
# replaced by one indicator column per category value.
categorical_cols = [
    'market_segment_type',
    'room_type_reserved',
    'type_of_meal_plan',
]
df = pd.get_dummies(data=df, columns=categorical_cols)

In [None]:
# Display df to inspect the indicator columns created by get_dummies.
df

In [None]:
# Remove the arrival_date and arrival_year columns from df in place,
# then display the reduced DataFrame.
cols_to_drop = ['arrival_date', 'arrival_year']
df.drop(columns=cols_to_drop, inplace=True)
df

### Normalização dos dados

In [None]:
# Number of distinct values per column after encoding and column removal.
df.nunique()

In [None]:
# Standardize the listed feature columns of df with StandardScaler,
# overwriting the original values.
# NOTE(review): the scaler is fitted on the whole DataFrame, before the
# train/test split and the cross-validation folds are created, so held-out
# rows contribute to the scaling statistics — consider fitting on training
# data only.
std = StandardScaler()

columns = [
    'lead_time',
    'avg_price_per_room',
    'no_of_adults',
    'no_of_children',
    'no_of_weekend_nights',
    'no_of_week_nights',
    'required_car_parking_space',
    'arrival_month',
    'repeated_guest',
    'no_of_previous_cancellations',
    'no_of_previous_bookings_not_canceled',
    'no_of_special_requests',
]
df[columns] = std.fit_transform(df[columns])

In [None]:
# Display df to inspect the standardized columns.
df

In [None]:
# Summary statistics after standardization.
df.describe()

In [None]:
# Pairwise correlation between all columns of df, drawn as an annotated
# heatmap on a 15x15 figure.
cor = df.corr()
fig, ax = plt.subplots(figsize=(15, 15), layout='constrained')
ax.set_title('Matriz de correlação')
sns.heatmap(cor, annot=True, ax=ax)
plt.show()

In [None]:
# Correlation of every feature with booking_status, sorted from the most
# positive to the most negative and drawn as a single-column heatmap.
cor = df.corr()
# Drop the target's own row so its self-correlation is not plotted.
target = cor.drop(index='booking_status')['booking_status']
target_s = target.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 10), layout='constrained')
ax.set_title('Variáveis importantes de acordo com a variável alvo')
sns.heatmap(target_s.to_frame(), annot=True, ax=ax)
plt.show()

## 3) Algoritmos de Classificação
* Árvore de Decisão
* KNN

In [None]:
# Feature matrix: every column except the target, as a float array.
# The dtype is requested explicitly because df mixes float columns with the
# indicator columns from get_dummies; on pandas versions where those are
# bool, a plain .values call returns an object-dtype array that every
# estimator then has to convert again on each fit/score call.
X = df.drop(columns=['booking_status']).to_numpy(dtype=float)
# Target vector: the integer-encoded booking_status column.
y = df['booking_status'].to_numpy()

In [None]:
# Hold out 30% of the rows for testing.
# random_state fixes the shuffle so the split (and every score derived from
# it) is the same on each run; stratify=y keeps the booking_status class
# proportions equal in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

### Decision Tree Classifier

In [None]:
# Fit a decision tree on the training split and evaluate it on the hold-out.
# random_state is pinned because scikit-learn randomly permutes the features
# at each split, so an unseeded tree can give different scores on every run.
DT = DecisionTreeClassifier(random_state=1)
DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)

DT_score = DT.score(X_train, y_train)  # accuracy on the training data
DT_test = DT.score(X_test, y_test)     # accuracy on the hold-out data

# Confusion matrix: rows are the true classes, columns the predicted ones.
cm = confusion_matrix(y_test,y_pred)
print('Training Score',DT_score)
print('Testing Score \n',DT_test)
print(cm)

### KNN Classifier

In [None]:
# k-nearest-neighbours classifier (k = 5) trained on the training split and
# evaluated on the hold-out split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

train_score = knn.score(X_train, y_train)

# The test accuracy is computed from the predictions that also feed the
# confusion matrix (the classifier's score() is the accuracy of predict()).
y_pred = knn.predict(X_test)
test_score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

for label, value in (('Training Score', train_score),
                     ('Testing Score \n', test_score)):
    print(label, value)
print(cm)

## 4) Validação cruzada

In [None]:
from sklearn.model_selection import StratifiedKFold

#### Decision Tree - Cross-Validation

In [None]:
# Stratified 10-fold cross-validation of the decision tree over the full
# feature matrix; one accuracy value is collected per fold.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
lst_accu_stratified = []

for train_index, test_index in skf.split(X, y):
    x_train_fold, y_train_fold = X[train_index], y[train_index]
    x_test_fold, y_test_fold = X[test_index], y[test_index]
    # fit() retrains DT from scratch, so no state is carried between folds.
    fold_accuracy = DT.fit(x_train_fold, y_train_fold).score(x_test_fold, y_test_fold)
    lst_accu_stratified.append(fold_accuracy)

# Summary statistics over the per-fold accuracies (best, worst and mean are
# reported as percentages, the standard deviation as a raw fraction).
highest_pct = max(lst_accu_stratified) * 100
lowest_pct = min(lst_accu_stratified) * 100
mean_pct = sts.mean(lst_accu_stratified) * 100
spread = sts.stdev(lst_accu_stratified)

print('Lista de ACC:', lst_accu_stratified)
print('\nMaior ACC:', highest_pct, '%')
print('\nMenor ACC:', lowest_pct, '%')
print('\nMédia ACC:', mean_pct, '%')
print('\nDesvio Padrão:', spread)

#### KNN - Cross-Validation

In [None]:
# Stratified 10-fold cross-validation of the KNN classifier over the full
# feature matrix; one accuracy value is collected per fold.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
lst_accu_stratified = []

for train_index, test_index in skf.split(X, y):
    x_train_fold, y_train_fold = X[train_index], y[train_index]
    x_test_fold, y_test_fold = X[test_index], y[test_index]
    # fit() replaces the stored training set, so folds are independent.
    fold_accuracy = knn.fit(x_train_fold, y_train_fold).score(x_test_fold, y_test_fold)
    lst_accu_stratified.append(fold_accuracy)

# Summary statistics over the per-fold accuracies (best, worst and mean are
# reported as percentages, the standard deviation as a raw fraction).
highest_pct = max(lst_accu_stratified) * 100
lowest_pct = min(lst_accu_stratified) * 100
mean_pct = sts.mean(lst_accu_stratified) * 100
spread = sts.stdev(lst_accu_stratified)

print('Lista de ACC:', lst_accu_stratified)
print('\nMaior ACC:', highest_pct, '%')
print('\nMenor ACC:', lowest_pct, '%')
print('\nMédia ACC:', mean_pct, '%')
print('\nDesvio Padrão:', spread)

## 5) Balanceamento das classes

In [None]:
# Rows per (encoded) booking_status class, shown before the oversampling
# experiments that follow.
df['booking_status'].value_counts()

In [None]:
from imblearn.over_sampling import SMOTE

#### Decision Tree - Balanceamento e Validação Cruzada

In [None]:
# Stratified 10-fold cross-validation of the decision tree, with SMOTE
# oversampling applied to each training fold.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
# Seeded so the synthetic samples are the same on every run; an unseeded
# SMOTE draws different samples each time, which makes the oversampling step
# unrepeatable. Built once because its configuration does not change per fold.
sm = SMOTE(random_state=1)
lst_accu_stratified = []

for train_index, test_index in skf.split(X, y):
    x_train_fold, x_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]
    # Only the training fold is resampled; the test fold keeps its original
    # class distribution so the score reflects unbalanced data.
    x_train_oversampled, y_train_oversampled = sm.fit_resample(x_train_fold, y_train_fold)
    DT.fit(x_train_oversampled, y_train_oversampled)
    lst_accu_stratified.append(DT.score(x_test_fold, y_test_fold))

# Report the per-fold accuracies and their best / worst / mean (as
# percentages) plus the standard deviation (as a raw fraction).
print('Lista de ACC:', lst_accu_stratified)
print('\nMaior ACC:',
      max(lst_accu_stratified)*100, '%')
print('\nMenor ACC:',
      min(lst_accu_stratified)*100, '%')
print('\nMédia ACC:',
      sts.mean(lst_accu_stratified)*100, '%')
print('\nDesvio Padrão:', sts.stdev(lst_accu_stratified))

#### KNN - Balanceamento e Validação Cruzada

In [None]:
# Stratified 10-fold cross-validation of the KNN classifier, with SMOTE
# oversampling applied to each training fold.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
# Seeded so the synthetic samples are the same on every run; an unseeded
# SMOTE draws different samples each time, which makes the reported
# accuracies change between executions. Built once because its configuration
# does not change per fold.
sm = SMOTE(random_state=1)
lst_accu_stratified = []

for train_index, test_index in skf.split(X, y):
    x_train_fold, x_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]
    # Only the training fold is resampled; the test fold keeps its original
    # class distribution so the score reflects unbalanced data.
    x_train_oversampled, y_train_oversampled = sm.fit_resample(x_train_fold, y_train_fold)
    knn.fit(x_train_oversampled, y_train_oversampled)
    lst_accu_stratified.append(knn.score(x_test_fold, y_test_fold))

# Report the per-fold accuracies and their best / worst / mean (as
# percentages) plus the standard deviation (as a raw fraction).
print('Lista de ACC:', lst_accu_stratified)
print('\nMaior ACC:',
      max(lst_accu_stratified)*100, '%')
print('\nMenor ACC:',
      min(lst_accu_stratified)*100, '%')
print('\nMédia ACC:',
      sts.mean(lst_accu_stratified)*100, '%')
print('\nDesvio Padrão:', sts.stdev(lst_accu_stratified))