# Predictions, part IV
- drop columns: no
- scaling: yes
- hyperparameter tuning: yes
- one-hot encoding: yes, the dataset was found encoded
- **resampling: yes**

In this session, I'm addressing class imbalance in the target category.\
Target column "is_canceled" has 37% vs 63% for canceled vs not canceled bookings.\
I'm using 3 techniques:
1. oversampling,
2. undersampling,
3. SMOTE.

The main takeaway from this session is that resampling doesn't improve performance of my models.

# preprocessing

In [1]:
# import libraries
%run common_imports.py

# load and split data
%run load_and_split_data.py
X_train, X_test, y_train, y_test = load_and_split_data()

# scale data
%run minmaxscaler.py
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)

Unnamed: 0,lead_time,arrival_date_week_number,arrival_date_day_of_month,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,...,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,deposit_type_No_Deposit,deposit_type_Non_Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
104182,23,2,11,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
110320,102,17,24,4,1,3,2,0,0,0,...,0,0,0,1,0,0,0,0,0,1
60388,489,46,10,11,0,2,2,0,0,0,...,0,0,0,0,1,0,0,0,1,0
105591,36,7,12,2,2,1,2,0,0,0,...,0,0,0,1,0,0,0,0,1,0
73207,101,33,17,8,1,3,2,0,0,0,...,0,0,0,1,0,0,0,0,1,0







104182    0
110320    0
60388     1
105591    0
73207     1
Name: is_canceled, dtype: int64

Unnamed: 0,lead_time,arrival_date_week_number,arrival_date_day_of_month,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,...,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,deposit_type_No_Deposit,deposit_type_Non_Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
0,0.031208,0.019231,0.333333,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.138399,0.307692,0.766667,0.272727,0.052632,0.06,0.036364,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.663501,0.865385,0.3,0.909091,0.0,0.04,0.036364,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.048847,0.115385,0.366667,0.090909,0.105263,0.02,0.036364,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.137042,0.615385,0.533333,0.636364,0.052632,0.06,0.036364,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0







Unnamed: 0,lead_time,arrival_date_week_number,arrival_date_day_of_month,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,...,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,deposit_type_No_Deposit,deposit_type_Non_Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
0,0.028494,0.423077,0.1,0.454545,0.105263,0.08,0.036364,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.005427,0.019231,0.433333,0.0,0.0,0.02,0.036364,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.02578,0.769231,0.166667,0.818182,0.0,0.06,0.018182,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.005427,0.019231,0.3,0.0,0.0,0.04,0.018182,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.187246,0.480769,0.866667,0.454545,0.0,0.06,0.036364,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


# oversampling

In [None]:
# Make a copy of X_train to avoid altering the original DataFrame
X_train_cl = X_train.copy()

# Add the 'is_canceled' column to X_train_cl
X_train_cl["is_canceled"] = y_train.values

# Separate canceled and not_canceled instances
canceled = X_train_cl[X_train_cl["is_canceled"] == 1]
not_canceled = X_train_cl[X_train_cl["is_canceled"] == 0]

# Visualize class distribution
canceled_plt = X_train_cl["is_canceled"].value_counts()
canceled_plt.plot(kind="bar")
plt.title("Class Distribution Before Oversampling")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

# Oversample the minority class
canceled_oversampled = resample(canceled,
                                replace=True,
                                n_samples=len(not_canceled),
                                random_state=0)

# Concatenate oversampled minority class with majority class
train_over = pd.concat([canceled_oversampled, not_canceled])

# Update X_train and y_train with the oversampled data
X_train = train_over.drop(columns=["is_canceled"])
y_train = train_over["is_canceled"]

# Standardize the oversampled data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Verify the shapes of X_train and y_train
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")

# Visualize class distribution after oversampling
canceled_plt = train_over["is_canceled"].value_counts()
canceled_plt.plot(kind="bar")
plt.title("Class Distribution After Oversampling")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

In [None]:
# Call the function to process the hyperparameter tuning results
from train_with_best_hyperparameters import process_hyperparameter_tuning_results

process_hyperparameter_tuning_results(
    input_file="../data/hyperparameter_tuning_results.csv", 
    output_file="../data/accuracies_with_parameters_oversampled.csv",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    X_train_scaled=X_train_scaled,
    X_test_scaled=X_test_scaled,
    y_test=y_test
)

# undersampling

In [None]:
# Make a copy of X_train to avoid altering the original DataFrame
X_train_copy = X_train.copy()

# Add the 'is_canceled' column to X_train_copy
X_train_copy["is_canceled"] = y_train.values

# Separate canceled and not_canceled instances
canceled = X_train_copy[X_train_copy["is_canceled"] == 1]
not_canceled = X_train_copy[X_train_copy["is_canceled"] == 0]

# Visualize class distribution
canceled_plt = X_train_copy["is_canceled"].value_counts()
canceled_plt.plot(kind="bar")
plt.title("Class Distribution Before Undersampling")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

# Undersample the majority class
not_canceled_undersampled = resample(not_canceled,
                                     replace=False,
                                     n_samples=len(canceled),
                                     random_state=0)

# Concatenate undersampled majority class with minority class
train_under = pd.concat([not_canceled_undersampled, canceled])

# Update X_train and y_train with the undersampled data
X_train = train_under.drop(columns=["is_canceled"])
y_train = train_under["is_canceled"]

# Standardize the undersampled data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Verify the shapes of X_train and y_train
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")

# Visualize class distribution after undersampling
canceled_plt = train_under["is_canceled"].value_counts()
canceled_plt.plot(kind="bar")
plt.title("Class Distribution After Undersampling")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

In [None]:
# Call the function to process the hyperparameter tuning results
from train_with_best_hyperparameters import process_hyperparameter_tuning_results

process_hyperparameter_tuning_results(
    input_file="../data/hyperparameter_tuning_results.csv", 
    output_file="../data/accuracies_with_parameters_undersampled.csv",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    X_train_scaled=X_train_scaled,
    X_test_scaled=X_test_scaled,
    y_test=y_test
)

# SMOTE

In [None]:
# Visualize class distribution before SMOTE
canceled_plt_before = y_train.value_counts()
canceled_plt_before.plot(kind="bar")
plt.title("Class Distribution Before SMOTE")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

# Apply SMOTE and reassign to the original variable names
sm = SMOTE(random_state=1, sampling_strategy=1.0)
X_train, y_train = sm.fit_resample(X_train, y_train)

# Standardize the resampled data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Assuming X_test needs to be standardized similarly

# Visualize class distribution after SMOTE
canceled_plt_after = y_train.value_counts()
canceled_plt_after.plot(kind="bar")
plt.title("Class Distribution After SMOTE")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

In [None]:
# Call the function to process the hyperparameter tuning results
from train_with_best_hyperparameters import process_hyperparameter_tuning_results

process_hyperparameter_tuning_results(
    input_file="../data/hyperparameter_tuning_results.csv", 
    output_file="../data/accuracies_with_parameters_smote.csv",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    X_train_scaled=X_train_scaled,
    X_test_scaled=X_test_scaled,
    y_test=y_test
)

In [11]:
# Load the data
accuracies_with_parameters = pd.read_csv("../data/accuracies_with_parameters.csv")
accuracies_with_parameters_oversampled = pd.read_csv("../data/accuracies_with_parameters_oversampled.csv")
accuracies_with_parameters_undersampled = pd.read_csv("../data/accuracies_with_parameters_undersampled.csv")
accuracies_with_parameters_smote = pd.read_csv("../data/accuracies_with_parameters_smote.csv")

# Identify the best accuracy for each model in each dataframe
idx = accuracies_with_parameters.groupby("model")["accuracy_in_%"].idxmax()
best_params = accuracies_with_parameters.loc[idx, ["model", "best_parameters", "accuracy_in_%", "source"]].reset_index(drop=True)
best_params["dataset"] = "original"

idx_oversampled = accuracies_with_parameters_oversampled.groupby("model")["accuracy_in_%"].idxmax()
best_params_oversampled = accuracies_with_parameters_oversampled.loc[idx_oversampled, ["model", "best_parameters", "accuracy_in_%", "source"]].reset_index(drop=True)
best_params_oversampled["dataset"] = "oversampled"

idx_undersampled = accuracies_with_parameters_undersampled.groupby("model")["accuracy_in_%"].idxmax()
best_params_undersampled = accuracies_with_parameters_undersampled.loc[idx_undersampled, ["model", "best_parameters", "accuracy_in_%", "source"]].reset_index(drop=True)
best_params_undersampled["dataset"] = "undersampled"

idx_smote = accuracies_with_parameters_smote.groupby("model")["accuracy_in_%"].idxmax()
best_params_smote = accuracies_with_parameters_smote.loc[idx_smote, ["model", "best_parameters", "accuracy_in_%", "source"]].reset_index(drop=True)
best_params_smote["dataset"] = "smote"

# Combine the results into a single dataframe
combined_results = pd.concat([best_params, best_params_oversampled, best_params_undersampled, best_params_smote], ignore_index=True)

# Sort the combined results
accuracies_sorted = combined_results.sort_values(by=["model", "accuracy_in_%", "dataset"], ascending=[True, False, True])

# Save the sorted dataframe
accuracies_sorted.to_csv("../data/accuracies_resampled.csv", index=False)

# Display the dataframe
accuracies_sorted

Unnamed: 0,model,best_parameters,accuracy_in_%,source,dataset
0,AdaBoostClassifier,"{'n_estimators': 70, 'learning_rate': 1.0, 'algorithm': 'SAMME'}",80.86,unscaled,original
21,AdaBoostClassifier,"{'n_estimators': 70, 'learning_rate': 1.0, 'algorithm': 'SAMME'}",80.86,unscaled,smote
14,AdaBoostClassifier,"{'n_estimators': 70, 'learning_rate': 1.0, 'algorithm': 'SAMME'}",80.73,unscaled,undersampled
7,AdaBoostClassifier,"{'algorithm': 'SAMME', 'learning_rate': 1.0, 'n_estimators': 40}",80.71,unscaled,oversampled
1,BaggingClassifier,"{'n_estimators': 100, 'max_samples': 0.5, 'max_features': 1.0, 'bootstrap_features': True, 'bootstrap': False}",86.4,scaled,original
22,BaggingClassifier,"{'bootstrap': True, 'bootstrap_features': True, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 80}",86.34,unscaled,smote
8,BaggingClassifier,"{'bootstrap': True, 'bootstrap_features': True, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 80}",85.94,unscaled,oversampled
15,BaggingClassifier,"{'n_estimators': 100, 'max_samples': 0.5, 'max_features': 1.0, 'bootstrap_features': True, 'bootstrap': False}",85.36,scaled,undersampled
23,DecisionTreeClassifier,"{'min_samples_split': 3, 'min_samples_leaf': 2, 'max_depth': 16}",82.98,scaled,smote
2,DecisionTreeClassifier,"{'min_samples_split': 3, 'min_samples_leaf': 2, 'max_depth': 16}",82.93,unscaled,original
