In [None]:
# Model comparison using only the bank-provided information available before the campaign starts, excluding external indicators

import pandas as pd
import time
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Read the semicolon-delimited campaign data into a DataFrame
MktgCampaigns = pd.read_csv(
    'data/bank-additional-full.csv',
    sep=';',
)

# Partition the columns into groups according to how each will be preprocessed.
# Numeric column that will be standardised
features_bank_stdScaler = MktgCampaigns.loc[:, ['age']]
# Categorical columns that will be one-hot encoded
features_bank_oneHot = MktgCampaigns.loc[
    :, ['job', 'marital', 'default', 'housing', 'loan']
]
# Categorical column that will be integer-encoded
features_bank_labelEncoder = MktgCampaigns.loc[:, ['education']]
# Contact/campaign-related columns (not referenced in the rest of this cell)
features_nobank = MktgCampaigns.loc[
    :,
    ['contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome'],
]
# External indicator columns (not referenced in the rest of this cell)
features_external = MktgCampaigns.loc[
    :,
    ['emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'],
]
# Target column to predict
target = MktgCampaigns.loc[:, 'y']

# Decide which rows belong to the train and test sets BEFORE fitting any
# transformer. Fitting a scaler/encoder on all rows would let information from
# the test rows (mean/std of age, the set of observed categories) leak into the
# features the models are trained on and make the test score optimistic.
train_idx, test_idx = train_test_split(target.index, test_size=0.2, random_state=42)

# Apply StandardScaler to features_bank_stdScaler
# (statistics are learned from the training rows only, then applied to every row)
scaler = StandardScaler()
scaler.fit(features_bank_stdScaler.loc[train_idx])
scaled_age = scaler.transform(features_bank_stdScaler)
scaled_age_df = pd.DataFrame(scaled_age, columns=['age_scaled'], index=features_bank_stdScaler.index)

# Apply OneHotEncoder to features_bank_oneHot
# (categories are learned from the training rows only; handle_unknown='ignore'
# encodes a category unseen during fit as all zeros instead of raising)
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(features_bank_oneHot.loc[train_idx])
encoded_features = encoder.transform(features_bank_oneHot)
encoded_feature_names = encoder.get_feature_names_out(features_bank_oneHot.columns)
encoded_features_df = pd.DataFrame(encoded_features, columns=encoded_feature_names, index=features_bank_oneHot.index)

# Apply LabelEncoder to features_bank_labelEncoder
# The encoder only builds a label -> integer vocabulary (no statistics), and its
# transform() raises on labels unseen during fit, so it is fitted on the whole column.
# NOTE(review): LabelEncoder is documented for target labels and assigns codes in
# sorted label order; consider OrdinalEncoder with an explicit category order if
# the integer codes are meant to carry an ordering of education levels.
label_encoder = LabelEncoder()
encoded_education = label_encoder.fit_transform(features_bank_labelEncoder['education'])
encoded_education_df = pd.DataFrame(encoded_education, columns=['education_encoded'], index=features_bank_labelEncoder.index)

# Combine transformed features with other features and target
final_data_only_bank = pd.concat([target, scaled_age_df, encoded_features_df, encoded_education_df], axis=1)

# Split the data into train and test sets using the row labels chosen above
X = final_data_only_bank.drop('y', axis=1)
y = final_data_only_bank['y']
X_train, X_test = X.loc[train_idx], X.loc[test_idx]
y_train, y_test = y.loc[train_idx], y.loc[test_idx]

# Initialize models, keyed by the display name used in the results table.
# The decision tree is seeded so that repeated runs give the same scores;
# 42 matches the seed used for the train/test split.
# NOTE(review): Logistic Regression and SVM use class_weight='balanced' while
# the other two models do not - confirm this asymmetry is intended when
# comparing their accuracies.
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Support Vector Machine": SVC(class_weight='balanced')
}

# Collect one result dict per model in a plain list and build the DataFrame
# once at the end (DataFrame.append was removed in pandas 2.0).
results_only_bank = []

# Fit and score each model
for model_name, model in models.items():
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed time; unlike time.time() it is not affected by system clock changes.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    end_time = time.perf_counter()

    # Only the fit() call is timed; prediction time is excluded.
    train_time = end_time - start_time
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, model.predict(X_test))

    results_only_bank.append({
        "Model": model_name,
        "Train Time": train_time,
        "Train Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy
    })

# One row per model, columns taken from the dict keys above
results_only_bank_df = pd.DataFrame(results_only_bank)
print(results_only_bank_df)