## Preparing for model training

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, classification_report, precision_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
from xgboost import XGBClassifier
from sklearn.svm import SVC 
from lightgbm import LGBMClassifier
import numpy as np

In [2]:
df = pd.read_csv(r'C:/Users/user/OneDrive/Desktop/data-science-internship/data/cleaned/cleaned_dataset.csv')

In [3]:
df = df.drop(columns = ['hotel', 'arrival_date_month', 'assigned_room_type', 'reserved_room_type', 'season'])

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9218 entries, 0 to 9217
Data columns (total 60 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0                      9218 non-null   int64  
 1   is_canceled                     9218 non-null   int64  
 2   lead_time                       9218 non-null   float64
 3   arrival_date_year               9218 non-null   int64  
 4   arrival_date_week_number        9218 non-null   int64  
 5   arrival_date_day_of_month       9218 non-null   int64  
 6   stays_in_weekend_nights         9218 non-null   int64  
 7   stays_in_week_nights            9218 non-null   int64  
 8   adults                          9218 non-null   int64  
 9   children                        9218 non-null   int64  
 10  babies                          9218 non-null   int64  
 11  is_repeated_guest               9218 non-null   int64  
 12  previous_cancellations          92

In [5]:
# Curățăm toate numele de coloane (înlocuim caracterele speciale cu _)
df.columns = df.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)


In [6]:
x = df.drop(columns = 'is_canceled')
y = df['is_canceled']

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9218 entries, 0 to 9217
Data columns (total 60 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed_0                       9218 non-null   int64  
 1   is_canceled                     9218 non-null   int64  
 2   lead_time                       9218 non-null   float64
 3   arrival_date_year               9218 non-null   int64  
 4   arrival_date_week_number        9218 non-null   int64  
 5   arrival_date_day_of_month       9218 non-null   int64  
 6   stays_in_weekend_nights         9218 non-null   int64  
 7   stays_in_week_nights            9218 non-null   int64  
 8   adults                          9218 non-null   int64  
 9   children                        9218 non-null   int64  
 10  babies                          9218 non-null   int64  
 11  is_repeated_guest               9218 non-null   int64  
 12  previous_cancellations          92

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

## Logistic Regression

In [9]:
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter = 1000))

])

pipeline.fit(x_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [10]:
y_pred = pipeline.predict(x_test)
print('Accuracy of Logistic Regression', accuracy_score(y_test, y_pred))
print('Classification raport', classification_report(y_test, y_pred))
print('Precision score', precision_score(y_test, y_pred))
print('F1-Score', f1_score(y_test, y_pred))

Accuracy of Logistic Regression 0.8237527114967462
Classification raport               precision    recall  f1-score   support

           0       0.83      0.91      0.87      1195
           1       0.81      0.66      0.72       649

    accuracy                           0.82      1844
   macro avg       0.82      0.79      0.80      1844
weighted avg       0.82      0.82      0.82      1844

Precision score 0.8068181818181818
F1-Score 0.723874256584537


## Random Forest

In [11]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state = 42))

])

pipeline.fit(x_train , y_train)

0,1,2
,steps,"[('scaler', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
y_pred = pipeline.predict(x_test)
print('Accuracy of Logistic Regression', accuracy_score(y_test, y_pred))
print('Classification raport', classification_report(y_test, y_pred))
print('Precision score', precision_score(y_test, y_pred))
print('F1-Score', f1_score(y_test, y_pred))

Accuracy of Logistic Regression 0.8714750542299349
Classification raport               precision    recall  f1-score   support

           0       0.87      0.95      0.91      1195
           1       0.88      0.73      0.80       649

    accuracy                           0.87      1844
   macro avg       0.87      0.84      0.85      1844
weighted avg       0.87      0.87      0.87      1844

Precision score 0.8800738007380073
F1-Score 0.801007556675063


## XGBoost

In [13]:
pipeline = Pipeline([
    ('sclaer', StandardScaler()),
    ('classifier', XGBClassifier(eval_metric = 'logloss', random_state = 42))
])

pipeline.fit(x_train, y_train)

0,1,2
,steps,"[('sclaer', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [14]:
y_pred = pipeline.predict(x_test)
print('Accuracy of Logistic Regression', accuracy_score(y_test, y_pred))
print('Classification raport', classification_report(y_test, y_pred))
print('Precision score', precision_score(y_test, y_pred))
print('F1-Score', f1_score(y_test, y_pred))

Accuracy of Logistic Regression 0.868763557483731
Classification raport               precision    recall  f1-score   support

           0       0.88      0.92      0.90      1195
           1       0.84      0.78      0.81       649

    accuracy                           0.87      1844
   macro avg       0.86      0.85      0.85      1844
weighted avg       0.87      0.87      0.87      1844

Precision score 0.8363636363636363
F1-Score 0.8070175438596491


## SVM(Support Vector Machine)

In [15]:
pipeline =Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel = 'rbf', random_state = 42))
])

pipeline.fit(x_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('svm', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [16]:
y_pred = pipeline.predict(x_test)
print('Accuracy of Logistic Regression', accuracy_score(y_test, y_pred))
print('Classification raport', classification_report(y_test, y_pred))
print('Precision score', precision_score(y_test, y_pred))
print('F1-Score', f1_score(y_test, y_pred))

Accuracy of Logistic Regression 0.8373101952277657
Classification raport               precision    recall  f1-score   support

           0       0.84      0.93      0.88      1195
           1       0.84      0.67      0.74       649

    accuracy                           0.84      1844
   macro avg       0.84      0.80      0.81      1844
weighted avg       0.84      0.84      0.83      1844

Precision score 0.8362235067437379
F1-Score 0.7431506849315068


## LightGBM

In [17]:
df.columns = df.columns.str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)
x_train.columns = x_train.columns.str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)
x_test.columns = x_test.columns.str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)


In [18]:
model = LGBMClassifier(random_state = 42)
model.fit(x_train, y_train)

[LightGBM] [Info] Number of positive: 2714, number of negative: 4660
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002761 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1191
[LightGBM] [Info] Number of data points in the train set: 7374, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.368050 -> initscore=-0.540592
[LightGBM] [Info] Start training from score -0.540592


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [19]:
y_pred = model.predict(x_test)
print('Accuracy of Logistic Regression', accuracy_score(y_test, y_pred))
print('Classification raport', classification_report(y_test, y_pred))
print('Precision score', precision_score(y_test, y_pred))
print('F1-Score', f1_score(y_test, y_pred))

Accuracy of Logistic Regression 0.8671366594360087
Classification raport               precision    recall  f1-score   support

           0       0.88      0.92      0.90      1195
           1       0.84      0.77      0.80       649

    accuracy                           0.87      1844
   macro avg       0.86      0.85      0.85      1844
weighted avg       0.87      0.87      0.87      1844

Precision score 0.8366666666666667


F1-Score 0.8038430744595677


## Cross-Validation

In [22]:
models = {
    'Logistic Regression': LogisticRegression(max_iter = 100000),
    'Random Forest': RandomForestClassifier(random_state = 42),
    'XGBoost': XGBClassifier(eval_metric = 'logloss', random_state = 42),
    'SVM': SVC(kernel = 'rbf', max_iter = -1, random_state = 42),
    'LightGBM': LGBMClassifier(random_state = 42)
}
scoring = ['accuracy', 'f1', 'roc_auc']

In [23]:
results_df = pd.DataFrame([
    {
        'Model': name,
        'Accuracy': np.mean(scores['test_accuracy']),
        'F1 Score': np.mean(scores['test_f1']),
        'ROC AUC': np.mean(scores['test_roc_auc'])
    }
    for name, model in models.items()
    for scores in [cross_validate(model, x, y, cv=5, scoring=scoring)]
])

# Afișează tabelul final sortat
print(results_df.sort_values(by='F1 Score', ascending=False))

STOP: TOTAL NO. OF F,G EVALUATIONS EXCEEDS LIMIT

You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF F,G EVALUATIONS EXCEEDS LIMIT

You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF F,G EVALUATIONS EXCEEDS LIMIT

You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.

[LightGBM] [Info] Number of positive: 2690, number of negative: 4684
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000861 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1183
[LightGBM] [Info] Number of data points in the train set: 7374, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364795 -> initscore=-0.554611
[LightGBM] [Info] Start training from score -0.554611
[LightGBM] [Info] Number of positive: 2690, number of negative: 4684
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000971 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1196
[LightGBM] [Info] Number of data points in the train set: 7374, number of used features: 57
[LightGBM] [Info] [binar