# Uploading Raw Data

In [1]:
# Colab-only step: prompts for a file upload. The recorded output of this cell
# shows the CSV being saved under its own name, which is the path read later.
# NOTE(review): this needs manual interaction, so "Run All" pauses here.
from google.colab import files
uploaded = files.upload()

Saving term-deposit-marketing-2020.csv to term-deposit-marketing-2020.csv


In [2]:
import pandas as pd
def read_data(file_path):
  """Load the CSV at ``file_path`` with pandas and return the resulting DataFrame."""
  return pd.read_csv(file_path)

# Processing Data

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def preprocess_data(data, target):
  """Clean, encode, balance and split the marketing data for modelling.

  Steps, in order:
    1. Treat the literal string "unknown" as missing, fill missing ``job`` and
       ``education`` values with the column mode, and drop ``contact``.
    2. One-hot encode the categorical columns and map a yes/no ``target``
       column to 1/0 (a numeric ``target`` is left as it is).
    3. Balance the classes by randomly under-sampling the larger class
       (fixed seed 42, drawn from a private generator).
    4. Split 70/15/15 into train/validation/test (``random_state=42``).
    5. Standardize the numeric columns with statistics fitted on the training
       split only, so validation/test rows do not leak into the scaler.

  The input frame is not modified.

  NOTE(review): under-sampling happens before the split, so the validation and
  test splits are class-balanced as well.

  Args:
    data: DataFrame with the columns ``job``, ``marital``, ``education``,
      ``default``, ``housing``, ``loan``, ``month``, ``contact``, ``age``,
      ``balance``, ``day``, ``duration``, ``campaign`` and ``target``.
    target: Name of the label column.

  Returns:
    ``(X_train, y_train, X_val, y_val, X_test, y_test)``. The ``X_*`` values are
    feature DataFrames; the ``y_*`` values are single-column DataFrames holding
    ``target``.
  """
  categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'month']
  numeric_columns = ['age', 'balance', 'day', 'duration', 'campaign']

  # replace() without inplace returns a new frame, so the caller's data is untouched.
  cleaned = data.replace("unknown", np.nan)
  for column in ("job", "education"):
    modes = cleaned[column].mode()
    if not modes.empty:
      # Assign the result back: an in-place fillna on a selected column is
      # chained assignment and is not guaranteed to update the frame.
      cleaned[column] = cleaned[column].fillna(modes.iloc[0])
  cleaned = cleaned.drop(columns=["contact"])

  encoded = pd.get_dummies(cleaned, columns=categorical_columns)

  # Encode the label named by `target`. map() yields integers directly instead
  # of relying on replace() downcasting an object column. Labels other than
  # yes/no become NaN and are excluded by the class selection below.
  if not pd.api.types.is_numeric_dtype(encoded[target]):
    encoded[target] = encoded[target].map({'yes': 1, 'no': 0})

  positive_index = encoded.index[encoded[target] == 1]
  negative_index = encoded.index[encoded[target] == 0]
  if len(positive_index) <= len(negative_index):
    minority_index, majority_index = positive_index, negative_index
  else:
    minority_index, majority_index = negative_index, positive_index

  # A private RandomState(42) draws the same sample as np.random.seed(42)
  # followed by np.random.choice, without reseeding the global generator.
  rng = np.random.RandomState(42)
  sampled_majority = rng.choice(majority_index, len(minority_index), replace=False)
  balanced = encoded.loc[np.concatenate([minority_index, sampled_majority])]

  X = balanced.drop(columns=[target])
  y = balanced[[target]]  # kept as a one-column DataFrame, as callers receive today

  X_train, X_temp, y_train, y_temp = train_test_split(
      X, y, test_size=0.3, random_state=42
  )
  X_val, X_test, y_val, y_test = train_test_split(
      X_temp, y_temp, test_size=0.5, random_state=42
  )

  # Fit the scaler on the training split only, then apply it to every split.
  scaler = StandardScaler().fit(X_train[numeric_columns])

  def _standardize(split):
    scaled = split.copy()  # avoid writing into a slice of the balanced frame
    scaled[numeric_columns] = scaler.transform(split[numeric_columns])
    return scaled

  return (_standardize(X_train), y_train,
          _standardize(X_val), y_val,
          _standardize(X_test), y_test)

# Model Training

In [4]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def train_model(X_train, y_train, classifier):
  """Build a baseline estimator, cross-validate it, then fit it on the training data.

  Args:
    X_train: Training features.
    y_train: Training labels.
    classifier: Estimator class (not an instance); it is called with
      ``random_state=42``.

  Returns:
    ``(estimator, fold_scores)``: the fitted estimator and the value returned
    by ``cross_val_score`` with ``cv=5`` and ``scoring='f1'``.
  """
  estimator = classifier(random_state=42)
  cv_settings = {'cv': 5, 'scoring': 'f1'}
  # Score first, then fit the returned estimator on the training data.
  fold_scores = cross_val_score(estimator, X_train, y_train, **cv_settings)
  estimator.fit(X_train, y_train)
  return estimator, fold_scores

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

def tune_hyperparameter(X_train, y_train, classifier, param_grid=None, cv=5, scoring='f1', n_jobs=None):
  """Run an exhaustive grid search and return the best parameter combination.

  Args:
    X_train: Training features.
    y_train: Training labels.
    classifier: Estimator class (not an instance); called with no arguments to
      build the base model.
    param_grid: Optional mapping of parameter name to candidate values. When
      omitted, the default grid below is used. It contains boosting-style
      parameters (``learning_rate``, ``subsample``), so pass an explicit grid
      for estimators that do not accept them.
    cv: Cross-validation setting forwarded to ``GridSearchCV`` (default 5).
    scoring: Scoring name forwarded to ``GridSearchCV`` (default ``'f1'``).
    n_jobs: Parallelism setting forwarded to ``GridSearchCV`` (default ``None``).

  Returns:
    The ``best_params_`` attribute of the fitted search.
  """
  if param_grid is None:
    # Default grid: 5 * 3 * 6 * 2 = 180 combinations.
    param_grid = {
        # five integers evenly spaced between 100 and 1000
        'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=5)],
        'random_state': [42],
        'max_depth': [5, 15, 25, 35, 45, None],
        'learning_rate': [0.1, 0.005, 0.001],
        'subsample': [0.5, 1],
    }

  grid_search = GridSearchCV(
      estimator=classifier(),
      param_grid=param_grid,
      cv=cv,
      scoring=scoring,
      n_jobs=n_jobs,
  )
  grid_search.fit(X_train, y_train)
  return grid_search.best_params_

# Model Evaluation on Validation Set

In [5]:
from sklearn.metrics import f1_score

def evaluate_model(X_train, y_train, X_val, y_val, classifier, best_params):
  """Cross-validate a tuned estimator, fit it, and score it on the validation split.

  Args:
    X_train: Training features.
    y_train: Training labels.
    X_val: Validation features.
    y_val: Validation labels.
    classifier: Estimator class (not an instance).
    best_params: Keyword arguments used to construct the estimator.

  Returns:
    ``(fold_scores, validation_score, validation_f1)``: the result of
    ``cross_val_score`` (``cv=5``, ``scoring='f1'``) on the training data, the
    estimator's own ``score`` on the validation split, and ``f1_score`` of its
    validation predictions.
  """
  estimator = classifier(**best_params)
  fold_scores = cross_val_score(estimator, X_train, y_train, cv=5, scoring='f1')

  estimator.fit(X_train, y_train)
  validation_score = estimator.score(X_val, y_val)
  validation_f1 = f1_score(y_val, estimator.predict(X_val))

  return fold_scores, validation_score, validation_f1

# Final Model Evaluation

In [6]:
def evaluate_final_model(X_train, y_train, X_test, y_test, classifier, best_params):
  """Fit a tuned estimator on the training split and score it on the test split.

  Args:
    X_train: Training features.
    y_train: Training labels.
    X_test: Test features.
    y_test: Test labels.
    classifier: Estimator class (not an instance).
    best_params: Keyword arguments used to construct the estimator.

  Returns:
    ``(test_score, test_f1)``: the estimator's own ``score`` on the test split
    and ``f1_score`` of its test predictions.
  """
  final_estimator = classifier(**best_params)
  final_estimator.fit(X_train, y_train)

  held_out_score = final_estimator.score(X_test, y_test)
  held_out_f1 = f1_score(y_test, final_estimator.predict(X_test))
  return held_out_score, held_out_f1

# Call Functions

In [7]:
# End-to-end run: load the CSV, build the train/validation/test splits with "y"
# as the label, then train, (optionally) tune and evaluate an XGBoost classifier.
data = read_data("term-deposit-marketing-2020.csv")
X_train, y_train, X_val, y_val, X_test, y_test = preprocess_data(data, "y")
# Baseline model: XGBClassifier built with random_state=42, scored with 5-fold CV F1.
trained_model, cv_score = train_model(X_train, y_train, XGBClassifier)
print("Cross Validation Scores:", cv_score)
# The grid search call is disabled here; its result is hard-coded on the next line.
# Uncomment the call (and remove the hard-coded dict) to re-run the search.
#best_params = tune_hyperparameter(X_train, y_train, XGBClassifier)
best_params = {'learning_rate': 0.1, 'max_depth': None, 'n_estimators': 100, 'random_state': 42, 'subsample': 1} ### ideally, delete this line and run line above
print("\nBest Hyperparameters:", best_params)
# Validation metrics for the model built from best_params.
cv_scores_tuned, val_score, f1_val_tuned = evaluate_model(X_train, y_train, X_val, y_val, XGBClassifier, best_params)
print(f"\nCross-Validation Scores: {cv_scores_tuned:}\nMean F1 Score: {cv_scores_tuned.mean():}\n\nAccuracy on Validation Set: {val_score:}\nF1 Score on Validation Set: {f1_val_tuned:}")
# Test metrics: the model is refit on the training split and scored on the test split.
test_score, f1_test_tuned = evaluate_final_model(X_train, y_train, X_test, y_test, XGBClassifier, best_params)
print(f"\nAccuracy on Test Set: {test_score:}\nF1 Score on Test Set: {f1_test_tuned:}")

Cross Validation Scores: [0.87148103 0.85679904 0.86625767 0.8808933  0.882494  ]

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': None, 'n_estimators': 100, 'random_state': 42, 'subsample': 1}

Cross-Validation Scores: [0.87667888 0.85644769 0.87970838 0.88532676 0.88461538]
Mean F1 Score: 0.8765554181819736

Accuracy on Validation Set: 0.8642117376294591
F1 Score on Validation Set: 0.8685968819599108

Accuracy on Test Set: 0.8768699654775605
F1 Score on Test Set: 0.8838219326818675
