## Preparing for model training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, precision_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
from xgboost import XGBClassifier
from sklearn.svm import SVC 
from lightgbm import LGBMClassifier
import numpy as np
import pickle
import matplotlib.pyplot as plt
import os

In [None]:
# Load the cleaned dataset CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
cleaned_csv_path = r'C:/Users/user/OneDrive/Desktop/data-science-internship/data/cleaned/cleaned_dataset.csv'
df = pd.read_csv(cleaned_csv_path)

In [None]:
# Columns removed from the DataFrame before building the feature matrix.
excluded_columns = [
    'hotel',
    'arrival_date_month',
    'assigned_room_type',
    'reserved_room_type',
    'season',
]
df = df.drop(columns=excluded_columns)

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts) after the column drop above.
df.info()

In [None]:
# Collapse every run of characters outside [A-Za-z0-9_] into a single underscore,
# so column names contain only letters, digits and underscores.
sanitized_columns = df.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
df.columns = sanitized_columns

In [None]:
# Separate the target column from the remaining feature columns.
target_column = 'is_canceled'
y = df[target_column]
x = df.drop(columns=target_column)

In [None]:
# Print the DataFrame summary again (columns, dtypes, non-null counts).
# NOTE(review): `df` still includes the target column here; if the goal was to
# inspect the feature matrix, this should presumably be `x.info()` - confirm.
df.info()

In [None]:
# Hold out 20% of the rows as the test split; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Logistic Regression

In [None]:
# Baseline model: standardise the features, then fit a logistic regression.
logreg_steps = [
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000)),
]
pipeline = Pipeline(steps=logreg_steps)

pipeline.fit(x_train, y_train)

In [None]:
# Score the logistic-regression pipeline on the held-out test split.
y_pred = pipeline.predict(x_test)

# (label, metric function) pairs, printed in this order.
logreg_report = (
    ('Accuracy of Logistic Regression', accuracy_score),
    ('Classification raport', classification_report),
    ('Precision score', precision_score),
    ('F1-Score', f1_score),
)
for label, metric_fn in logreg_report:
    print(label, metric_fn(y_test, y_pred))

## Random Forest

In [None]:
# Random Forest, run through the same scaler + classifier pipeline layout as the baseline.
rf_steps = [
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(rf_steps)

pipeline.fit(x_train, y_train)

In [None]:
# Score the Random Forest pipeline on the held-out test split.
y_pred = pipeline.predict(x_test)
# Label fixed: this cell evaluates the Random Forest, not Logistic Regression.
print('Accuracy of Random Forest', accuracy_score(y_test, y_pred))
print('Classification raport', classification_report(y_test, y_pred))
print('Precision score', precision_score(y_test, y_pred))
print('F1-Score', f1_score(y_test, y_pred))

## XGBoost

In [None]:
# XGBoost, run through the same scaler + classifier pipeline layout as the other models.
pipeline = Pipeline([
    # Step name fixed ('sclaer' -> 'scaler') so it matches the other pipelines and
    # can be addressed consistently (e.g. 'scaler__with_mean' in set_params / searches).
    ('scaler', StandardScaler()),
    ('classifier', XGBClassifier(eval_metric='logloss', random_state=42))
])

pipeline.fit(x_train, y_train)

In [None]:
# Score the XGBoost pipeline on the held-out test split.
y_pred = pipeline.predict(x_test)
# Label fixed: this cell evaluates XGBoost, not Logistic Regression.
print('Accuracy of XGBoost', accuracy_score(y_test, y_pred))
print('Classification raport', classification_report(y_test, y_pred))
print('Precision score', precision_score(y_test, y_pred))
print('F1-Score', f1_score(y_test, y_pred))

## SVM (Support Vector Machine)

In [None]:
# RBF-kernel support vector classifier on standardised features.
svm_steps = [
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', random_state=42)),
]
pipeline = Pipeline(svm_steps)

pipeline.fit(x_train, y_train)

In [None]:
# Score the SVM pipeline on the held-out test split.
y_pred = pipeline.predict(x_test)
# Label fixed: this cell evaluates the SVM, not Logistic Regression.
print('Accuracy of SVM', accuracy_score(y_test, y_pred))
print('Classification raport', classification_report(y_test, y_pred))
print('Precision score', precision_score(y_test, y_pred))
print('F1-Score', f1_score(y_test, y_pred))

## LightGBM

In [None]:
# Re-apply the column-name clean-up to the full frame and to both splits:
# any run of characters outside [A-Za-z0-9_] becomes a single underscore.
for frame in (df, x_train, x_test):
    frame.columns = frame.columns.str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)


In [None]:
# LightGBM is fitted directly on the training split, with no scaling pipeline.
lgbm_settings = {'random_state': 42}
model = LGBMClassifier(**lgbm_settings)
model.fit(x_train, y_train)

In [None]:
# Score the LightGBM model on the held-out test split.
y_pred = model.predict(x_test)
# Label fixed: this cell evaluates LightGBM, not Logistic Regression.
print('Accuracy of LightGBM', accuracy_score(y_test, y_pred))
print('Classification raport', classification_report(y_test, y_pred))
print('Precision score', precision_score(y_test, y_pred))
print('F1-Score', f1_score(y_test, y_pred))

## Cross-Validation

In [None]:
# Candidate estimators for the cross-validated comparison.
# Logistic Regression and the RBF SVM are scale-sensitive, so they are wrapped in
# a Pipeline with StandardScaler, as in their individual sections above. Inside a
# Pipeline the scaler is re-fitted on each training fold, so no statistics leak
# from the validation fold. The tree-based models do not depend on feature scale
# and are used as-is.
models = {
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=100000)),
    ]),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('svm', SVC(kernel='rbf', max_iter=-1, random_state=42)),
    ]),
    'LightGBM': LGBMClassifier(random_state=42)
}
# Metrics collected for every fold by cross_validate.
scoring = ['accuracy', 'f1', 'roc_auc']

In [None]:
# Run 5-fold cross-validation for every candidate and keep the mean of each metric.
# The loop variables are deliberately not called `model`, so the fitted global
# `model` from the earlier cell is not rebound.
cv_rows = []
for cv_name, cv_estimator in models.items():
    cv_scores = cross_validate(cv_estimator, x, y, cv=5, scoring=scoring)
    cv_rows.append({
        'Model': cv_name,
        'Accuracy': np.mean(cv_scores['test_accuracy']),
        'F1 Score': np.mean(cv_scores['test_f1']),
        'ROC AUC': np.mean(cv_scores['test_roc_auc']),
    })
results_df = pd.DataFrame(cv_rows)

# Show the final table, best F1 score first.
print(results_df.sort_values(by='F1 Score', ascending=False))

In [None]:
# Exhaustive search over a small Random Forest grid, scored by F1 with 5-fold CV.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 5, 10],
)

rf_for_grid = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf_for_grid,
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,  # use all available cores
)
grid_search.fit(x_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best F1 score:", grid_search.best_score_)


In [None]:
# Rebuild a Random Forest from the grid-search winner and fit it on the training split.
best_params = grid_search.best_params_

rf_kwargs = {**best_params, 'random_state': 42}
final_model = RandomForestClassifier(**rf_kwargs)
final_model.fit(x_train, y_train)


In [None]:

# Evaluate the grid-searched Random Forest on the held-out test split.
y_pred = final_model.predict(x_test)
# ROC AUC needs scores rather than hard labels, so take the second
# predict_proba column (the probability of the positive class).
y_proba = final_model.predict_proba(x_test)[:, 1]

test_scores = (
    ("Test F1 score:", f1_score(y_test, y_pred)),
    ("Test Accuracy:", accuracy_score(y_test, y_pred)),
    ("Test ROC AUC:", roc_auc_score(y_test, y_proba)),
)
for caption, value in test_scores:
    print(caption, value)


In [None]:
# Randomised hyperparameter search for the Random Forest (100 sampled
# configurations, 5-fold CV, scored by F1).
param_dist = {
    'n_estimators': [50, 100, 150, 200, 300, 500],
    'max_depth': [None, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [2, 5, 10, 20],
    # 'auto' was dropped: scikit-learn deprecated it in 1.1 and removed it in 1.3,
    # where every candidate using it fails to fit. For classifiers it was an
    # alias of 'sqrt', so the effective search space is unchanged.
    'max_features': ['sqrt', 'log2']
}


rf = RandomForestClassifier(random_state=42)


random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    scoring='f1',
    cv=5,
    verbose=2,
    random_state=42,  # reproducible sampling of candidates
    n_jobs=-1
)


random_search.fit(x_train, y_train)


print("Best parameters:", random_search.best_params_)
print("Best F1 score:", random_search.best_score_)

In [None]:
# This import also brings `recall_score` into scope, which the notebook's
# top-level import does not provide.
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Hard-coded Random Forest hyperparameters.
best_params = dict(
    n_estimators=100,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    max_depth=20,
    random_state=42,
)

best_rf = RandomForestClassifier(**best_params)
best_rf.fit(x_train, y_train)

# Evaluate on the held-out test split.
y_pred = best_rf.predict(x_test)

best_rf_report = (
    ("Test Accuracy:", accuracy_score),
    ("Test F1 Score:", f1_score),
    ("Test Precision:", precision_score),
    ("Test Recall:", recall_score),
)
for caption, scorer in best_rf_report:
    print(caption, scorer(y_test, y_pred))


In [None]:
from sklearn.metrics import f1_score

# Overfitting check for the tuned Random Forest: compare F1 on the data it was
# fitted on with F1 on the held-out split.
# Fixed: this previously called `model.predict`, but when the notebook runs top
# to bottom `model` is still the LightGBM classifier, not the forest (`best_rf`)
# fitted in the preceding cell.
y_train_pred = best_rf.predict(x_train)
y_test_pred = best_rf.predict(x_test)

f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

print(f"F1 Score - Train: {f1_train:.4f}")
print(f"F1 Score - Test:  {f1_test:.4f}")


In [None]:
# Random Forest with hand-picked hyperparameters.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=35,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=42
)


model.fit(x_train, y_train)


y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Fixed: the metrics previously compared the full target `y` with `y_pred`, the
# stale test-split predictions of an earlier model. The lengths differ, so the
# metric calls fail, and the values would not describe this model anyway.
# Score this model's own held-out predictions against `y_test`.
f1 = f1_score(y_test, y_test_pred)
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)

print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
# Train-split F1 for comparison with the test F1 above (overfitting check).
print(f"F1 Score - Train: {f1_score(y_train, y_train_pred):.4f}")

## XGBoost Tuning

In [None]:
# Randomised hyperparameter search for XGBoost (30 sampled configurations,
# 5-fold CV, scored by F1).
# `use_label_encoder` was dropped: it is deprecated in XGBoost and current
# releases only warn that it is unused. The other XGBClassifier constructions in
# this notebook already omit it.
model = XGBClassifier(eval_metric='logloss', random_state=42)


param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [3, 5, 7, 10, 15],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}


random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=30,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42  # reproducible sampling of candidates
)


random_search.fit(x_train, y_train)


print("Best parameters:", random_search.best_params_)
print("Best F1 score (cv):", random_search.best_score_)

In [None]:
# Take the estimator the search refitted with its best configuration and
# compare F1 on the training split with F1 on the held-out split.
best_model = random_search.best_estimator_

y_train_pred, y_test_pred = (
    best_model.predict(split) for split in (x_train, x_test)
)

f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

print(
    f"F1 Score - Train: {f1_train:.4f}",
    f"F1 Score - Test:  {f1_test:.4f}",
    sep="\n",
)

In [None]:
# Second XGBoost search: same setup, but with L1/L2 regularisation terms
# (reg_alpha / reg_lambda) in the search space instead of min_child_weight.
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# `use_label_encoder` was dropped: it is deprecated in XGBoost and current
# releases only warn that it is unused.
xgb = XGBClassifier(random_state=42, eval_metric='logloss')

random_search = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=30,
                                   scoring='f1', cv=5, verbose=1, n_jobs=-1, random_state=42)

random_search.fit(x_train, y_train)

print("Best parameters:", random_search.best_params_)
print("Best F1 score (CV):", random_search.best_score_)


# Estimator refitted by the search with the best configuration found.
best_model = random_search.best_estimator_

y_train_pred = best_model.predict(x_train)
y_test_pred = best_model.predict(x_test)

# Train vs. test F1 as an overfitting check.
print(f"F1 Score - Train: {f1_score(y_train, y_train_pred):.4f}")
print(f"F1 Score - Test:  {f1_score(y_test, y_test_pred):.4f}")

In [None]:

best_params = {
    'subsample': 0.6,
    'reg_lambda': 2,
    'reg_alpha': 1,
    'n_estimators': 200,
    'max_depth': 6,
    'learning_rate': 0.05,
    'colsample_bytree': 0.8,
    'random_state': 42
}


model = XGBClassifier(**best_params)


model.fit(x_train, y_train)


y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)


acc_train = accuracy_score(y_train, y_train_pred)
prec_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)


acc_test = accuracy_score(y_test, y_test_pred)
prec_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

print("Train set metrics:")
print(f"Accuracy:  {acc_train:.4f}")
print(f"Precision: {prec_train:.4f}")
print(f"Recall:    {recall_train:.4f}")
print(f"F1 Score:  {f1_train:.4f}")

print("\nTest set metrics:")
print(f"Accuracy:  {acc_test:.4f}")
print(f"Precision: {prec_test:.4f}")
print(f"Recall:    {recall_test:.4f}")
print(f"F1 Score:  {f1_test:.4f}")



model.fit(x, y)


y_pred = model.predict(x)


metrics = {
    'Accuracy': accuracy_score(y, y_pred),
    'Precision': precision_score(y, y_pred),
    'Recall': recall_score(y, y_pred),
    'F1 Score': f1_score(y, y_pred)
}


results_df = pd.DataFrame([metrics])
print(results_df.round(4))



In [None]:

pred_df = pd.DataFrame({'prediction': y_pred})
pred_df.to_csv('C:/Users/user/OneDrive/Desktop/data-science-internship/data/results/test_predictions.csv', index=False)


In [None]:

with open('C:/Users/user/OneDrive/Desktop/data-science-internship/outputs/best_model.pkl', 'wb') as file:
    pickle.dump(model, file)



In [None]:
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

print(accuracy)
print(precision)
print(recall)
print(f1)

In [None]:
output_folder = 'C:/Users/user/OneDrive/Desktop/data-science-internship/outputs/plots/model_comparison'


cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.savefig(os.path.join(output_folder, "confusion_matrix.png"))
plt.close()

y_prob = model.predict_proba(x)[:, 1]  

fpr, tpr, thresholds = roc_curve(y, y_prob)
auc = roc_auc_score(y, y_prob)

plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {auc:.4f})")
plt.plot([0,1], [0,1], 'k--')  
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()

plt.savefig(os.path.join(output_folder, "roc_curve.png"))
plt.close()


metrics = {
    'Accuracy': accuracy,
    'F1 Score': f1,
    'Precision': precision,
    'Recall': recall
}

plt.bar(metrics.keys(), metrics.values(), color=['blue', 'green', 'orange', 'red'])
plt.ylim(0,1)
plt.title("Model Performance Metrics")
plt.savefig(os.path.join(output_folder, "performane_metrics.png"))
plt.close()


In [None]:



# Grouped bar chart comparing XGBoost and Random Forest.
# NOTE(review): these values are hard-coded rather than computed in this
# notebook. The Random Forest F1 (0.8010) does not match the harmonic mean of
# its listed precision and recall (about 0.836) - verify before publishing.
xgb_metrics = {
    'Accuracy': 0.8928,
    'Precision': 0.8854,
    'Recall': 0.8112,
    'F1 Score': 0.8467
}

rf_metrics = {
    'Accuracy': 0.8714,
    'Precision': 0.8800,
    'Recall': 0.7970,
    'F1 Score': 0.8010
}


metrics_names = list(xgb_metrics.keys())
xgb_values = list(xgb_metrics.values())
rf_values = list(rf_metrics.values())

# Fixed: this was assigned to `x`, overwriting the global feature matrix `x`,
# so any cell using `x` that was re-run afterwards operated on tick positions.
bar_positions = np.arange(len(metrics_names))
width = 0.35

# savefig does not create missing directories.
os.makedirs(output_folder, exist_ok=True)

plt.figure(figsize=(10, 6))
# Shift each series half a bar width so the two bars sit side by side per metric.
plt.bar(bar_positions - width/2, xgb_values, width, label='XGBoost', color='royalblue')
plt.bar(bar_positions + width/2, rf_values, width, label='Random Forest', color='orange')

plt.xlabel('Metrics')
plt.ylabel('Value')
plt.title('Performance comparison: XGBoost vs Random Forest')
plt.xticks(bar_positions, metrics_names)
plt.ylim(0, 1)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.savefig(os.path.join(output_folder, "XGBoost_vs_RDF.png"))
plt.close()

