In [1]:
def is_wthn_k_days(x, k):
    """Return 1 when ``x`` lies strictly between 0 and ``k``, otherwise 0.

    Both bounds are exclusive: ``x == 0`` and ``x == k`` yield 0.
    """
    return 1 if 0 < x < k else 0

def predict_results(model, X_test, y_test):
    """Score an already-fitted classifier against a labelled data set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test : feature matrix to score.
    y_test : true labels for ``X_test``.

    Returns
    -------
    tuple
        ``(class name of the model, ROC AUC, confusion matrix,
        classification report as a dict, classification report as text)``.
    """
    predicted_labels = model.predict(X_test)
    # The AUC is computed from column index 1 of predict_proba, which is
    # treated here as the positive-class score.
    positive_scores = model.predict_proba(X_test)[:, 1]

    return (
        model.__class__.__name__,
        roc_auc_score(y_test, positive_scores),
        confusion_matrix(y_test, predicted_labels),
        classification_report(y_test, predicted_labels, output_dict=True),
        classification_report(y_test, predicted_labels, output_dict=False),
    )

def train_models(df, target_col, feat_cols, n_splits = 10, rand_state = 42023):
    """Fit an XGBoost and a Gaussian Naive Bayes classifier over stratified folds.

    For every stratified fold each model is fitted on the training part and
    then scored (ROC AUC) on the held-out part, so the cross-validation loop
    actually produces a validation signal. The per-fold scores are printed.

    Parameters
    ----------
    df : DataFrame holding both the feature columns and the target column.
    target_col : name of the label column.
    feat_cols : list of feature column names.
    n_splits : number of stratified folds.
    rand_state : seed used for the fold shuffling and for XGBoost.

    Returns
    -------
    list
        ``[xgb_model, naive_bayes_model]``. Each ``fit`` call replaces the
        previous fit, so the returned estimators are the ones trained on the
        training part of the *last* fold (unchanged from the earlier
        behaviour).
    """
    X = df[feat_cols].values
    y = df[target_col].values

    naive_bayes_model = GaussianNB(var_smoothing = 1e-8)

    # NOTE(review): scale_pos_weight=99 is hard-coded; presumably it mirrors
    # the negative/positive class ratio of the data -- confirm.
    xgb_model = xgb.XGBClassifier(scale_pos_weight=99,
                                  objective = 'binary:logistic',
                                  eval_metric = 'auc',
                                  random_state = rand_state)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=rand_state)

    model_list = [xgb_model, naive_bayes_model]

    for fold_no, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        for model in model_list:
            model.fit(X[train_idx], y[train_idx])

            # Use the held-out fold, which was previously ignored, to report
            # how well this fold's fit generalises.
            try:
                fold_auc = roc_auc_score(y[test_idx],
                                         model.predict_proba(X[test_idx])[:, 1])
                auc_txt = f"{fold_auc:.4f}"
            except ValueError:
                # roc_auc_score raises when the held-out fold contains a
                # single class; report that instead of aborting training.
                auc_txt = "undefined"
            print(f"FOLD {fold_no}/{n_splits} -- {model.__class__.__name__} held-out AUC: {auc_txt}")

    return model_list

def test_models(valid_df, target_col, feat_cols, model_list):
    """Evaluate every fitted model on a validation frame.

    For each model the text classification report is printed between two
    banner lines, and a flat result record is collected.

    Parameters
    ----------
    valid_df : DataFrame with the feature and target columns.
    target_col : name of the label column.
    feat_cols : feature column names, also stored in each result record.
    model_list : iterable of fitted estimators.

    Returns
    -------
    list of dict
        One record per model containing the model name, features used, AUC,
        the four confusion-matrix cells, and the entries of the dict-form
        classification report.
    """
    features = valid_df[feat_cols].values
    labels = valid_df[target_col].values
    banner = "##########################################"

    def _evaluate(fitted_model):
        # Score one model, print its report and build its result record.
        name, auc_value, cm, report_dict, report_text = predict_results(fitted_model, features, labels)

        print(banner)
        print(report_text)
        print(banner)

        record = {
            "MODEL_NAME": name,
            "features_used": feat_cols,
            "AUC": auc_value,
            "True_Neg": cm[0][0],
            "False_Pos": cm[0][1],
            "False_Neg": cm[1][0],
            "True_Pos": cm[1][1],
        }
        # Report entries are merged last, exactly like dict.update would do.
        return {**record, **report_dict}

    return [_evaluate(fitted_model) for fitted_model in model_list]

def feature_selection(df, feature_cols, target_col, k=10):
    """Rank features with a chi-squared test and keep the ``k`` best.

    Parameters
    ----------
    df : DataFrame holding the candidate features and the target.
    feature_cols : list of candidate feature column names.
    target_col : name of the label column.
    k : number of features to keep.

    Returns
    -------
    tuple
        ``(kbest_feat_cols, removed_columns, feat_imp_df)`` where
        ``kbest_feat_cols`` is a pandas Index of the selected columns,
        ``removed_columns`` is the list of rejected columns, and
        ``feat_imp_df`` has columns ``Variable`` / ``feat_imp`` sorted by
        score in descending order.
    """
    candidates = df[feature_cols]
    labels = df[target_col].values

    # Features are min-max scaled before scoring -- presumably because chi2
    # expects non-negative inputs.
    scaled = MinMaxScaler().fit_transform(candidates.values)

    selector = SelectKBest(score_func=chi2, k=k)
    selector.fit(scaled, labels)

    kbest_feat_cols = candidates.columns[selector.get_support(indices=True)]
    removed_columns = [name for name in feature_cols if name not in kbest_feat_cols]

    # One row per feature: name in the first column, chi2 score in the second.
    feat_imp_df = pd.Series(dict(zip(feature_cols, selector.scores_))).reset_index()
    feat_imp_df.columns = ["Variable", "feat_imp"]
    feat_imp_df = feat_imp_df.sort_values("feat_imp", ascending=False)

    return kbest_feat_cols, removed_columns, feat_imp_df
    
def plot_chi2_scores(df, var_col, score_col, normalize = False, top_k = None, figsize = (25, 20)):
    """Draw a horizontal bar chart of chi-squared scores per variable.

    Parameters
    ----------
    df : DataFrame with one row per variable.
    var_col : column holding the variable names (bar labels).
    score_col : column holding the chi-squared scores (bar lengths).
    normalize : when true, min-max scale the scores to the 0-1 range first.
    top_k : when given, plot only the ``top_k`` highest-scoring variables.
    figsize : matplotlib figure size.

    The chart title and x-axis label state that the scores are scaled only
    when ``normalize`` is enabled; raw scores are labelled as such.
    """
    # sort_values returns a new frame, so the caller's frame is not modified.
    sorted_df = df.sort_values(by=score_col, ascending=False)

    if normalize:
        min_score = sorted_df[score_col].min()
        max_score = sorted_df[score_col].max()
        score_range = max_score - min_score
        if score_range == 0:
            # All scores identical: avoid 0/0 producing NaN bars.
            sorted_df[score_col] = 0.0
        else:
            sorted_df[score_col] = (sorted_df[score_col] - min_score) / score_range

    if top_k is not None:
        sorted_df = sorted_df.iloc[0:top_k]

    if normalize:
        chart_title = 'Scaled Chi-Squared Scores for Variables'
        x_label = 'Scaled Chi-Squared Score (0 to 1)'
    else:
        chart_title = 'Chi-Squared Scores for Variables'
        x_label = 'Chi-Squared Score'

    # Create a horizontal bar chart
    plt.figure(figsize = figsize)
    plt.barh(sorted_df[var_col], sorted_df[score_col], color='skyblue', edgecolor='k', linewidth=1)

    # Customize the chart
    plt.title(chart_title, fontsize=18)
    plt.ylabel('Variable Names', fontsize=14)
    plt.xlabel(x_label, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.grid(axis='x', linestyle='--', alpha=0.7)

    # Display the chart
    plt.tight_layout()
    plt.show()





In [2]:
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, auc
import pandas as pd
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import datetime


# Folder prefix for the input CSV subsets (home-relative path).
fold_pth = "~/ds7900_spring23_team2/data_process/data/subsets/"

# df_pth is later read into `df`, val_pth into `valid_df`.
df_pth = fold_pth + "final_email_subdf.csv"
val_pth = fold_pth + "final_validation_subdf.csv"
# Column-name -> dtype mapping passed as `dtype=` to pd.read_csv for both CSVs.
full_email = {"HASH_NBR": "int64", "CAMPAIGN_NBR": "int64", "UNSUB_IND": "int8", "SEND_DT": "object", "CLICK": "int8", "MBR_PRGM_ACTV": "int8", "week_of_year": "int16", "day_of_month": "int16", "day_of_year": "int16", "is_end_month": "int8", "is_start_month": "int8", "is_end_quarter": "int8", "is_start_quarter": "int8", "CC_2": "int8", "CC_3": "int8", "CC_4": "int8", "TIER_2": "int8", "TIER_3": "int8", "TIER_4": "int8", "quarter_1": "int8", "quarter_3": "int8", "quarter_4": "int8", "GNDR_1": "int8", "GNDR_2": "int8", "EN_1": "int8", "EN_2": "int8", "EN_3": "int8", "EN_4": "int8", "EN_6": "int8", "AGE_1": "int8", "AGE_2": "int8", "AGE_3": "int8", "AGE_4": "int8", "AGE_5": "int8", "INCOME_1": "int8", "INCOME_2": "int8", "INCOME_3": "int8", "INCOME_4": "int8", "INCOME_5": "int8", "January": "int8", "February": "int8", "April": "int8", "May": "int8", "June": "int8", "July": "int8", "August": "int8", "September": "int8", "October": "int8", "November": "int8", "December": "int8", "Monday": "int8", "Tuesday": "int8", "Wednesday": "int8", "Thursday": "int8", "Friday": "int8", "Saturday": "int8", "Fall": "int8", "Summer": "int8", "Winter": "int8", "Oceania": "int8", "Asia_Africa": "int8", "AUSTRALASIA_JAPAN": "int8", "CANADA": "int8", "CARIBBEAN": "int8", "EUROPE": "int8", "HONG_KONG": "int8", "INDIA_MIDDLE_EAST_AFRICA": "int8", "LATIN_AMERICA": "int8", "MAINLAND_CHINA": "int8", "MEXICO": "int8", "SOUTHEAST_ASIA_KOREA": "int8", "TAIWAN": "int8", "MACAU": "int8", "CONF_HASH_NBR": "int64", "HTL_HASH_NBR": "int64", "NUM_UNIQUE_CONF_HASH_NBR": "int16", "NUM_UNIQUE_HTL_HASH_NBR": "int16", "BUS_LEIS_IND": "int8", "STAY_REVENUE_USD": "float64", "TIME_BWTN_CONF_CKIN_DT": "int16", "TIME_BWTN_CKIN_CKOUT_DT": "int16", "CONF_DT_daysto_holiday": "int16", "CK_IN_DT_daysto_holiday": "int16", "CK_OUT_DT_daysto_holiday": "int16", "CONF_DT_nearest_holiday_dayofyear": "int16", "CK_IN_DT_nearest_holiday_dayofyear": "int16", "CK_OUT_DT_nearest_holiday_dayofyear": "int16", "DIFFERENT_CONTINENT": 
"int8", "MBR_TENURE_ASOF_CONF_DT": "int16", "MBR_TENURE_ASOF_CK_IN_DT": "int16", "MBR_TENURE_ASOF_CK_OUT_DT": "int16", "HTL_RGN_EMEAA": "int8", "HTL_RGN_GCHINA": "int8", "HTL_CHAIN_CATEGORY_CHN_CAT_1": "int8", "HTL_CHAIN_CATEGORY_CHN_CAT_2": "int8", "HTL_CHAIN_CATEGORY_CHN_CAT_3": "int8", "REWARD_NT_RN_1": "int8", "REWARD_NT_RN_2": "int8", "HTL_CHAIN_CHN_2": "int8", "HTL_CHAIN_CHN_3": "int8", "HTL_CHAIN_CHN_4": "int8", "HTL_CHAIN_CHN_5": "int8", "HTL_CHAIN_CHN_6": "int8", "HTL_CHAIN_CHN_7": "int8", "HTL_CHAIN_CHN_9": "int8", "HTL_CHAIN_CHN_10": "int8", "HTL_CHAIN_CHN_11": "int8", "HTL_CHAIN_CHN_12": "int8", "HTL_CHAIN_CHN_13": "int8", "HTL_CHAIN_CHN_14": "int8", "HTL_CHAIN_CHN_15": "int8", "HTL_CHAIN_CHN_18": "int8", "HTL_CHAIN_CHN_20": "int8", "HTL_CHAIN_CHN_22": "int8", "HTL_CHAIN_CHN_23": "int8", "NUM_STAYS": "int16", "TOTAL_STAY_REVENUE": "float64", "TOTAL_ROOMS": "int16", "TOTAL_GUEST_QTY": "int16", "TOTAL_TIME_BWTN_CONF_CKIN_DT": "int16", "TOTAL_TIME_BWTN_CKIN_CKOUT_DT": "int16", "TOTAL_CONF_DT_daysto_holiday": "int16", "TOTAL_CK_IN_DT_daysto_holiday": "int16", "TOTAL_CK_OUT_DT_daysto_holiday": "int16", "TOTAL_CONF_DT_nearest_holiday_dayofyear": "int16", "TOTAL_CK_IN_DT_nearest_holiday_dayofyear": "int16", "TOTAL_CK_OUT_DT_nearest_holiday_dayofyear": "int16", "NUM_BUS_LEIS_IND": "int16", "TOTAL_BUS_LEIS_IND_REVENUE": "float64", "TOTAL_BUS_LEIS_IND_ROOMS": "int16", "TOTAL_BUS_LEIS_IND_GUEST_QTY": "int16", "NUM_DIFFERENT_CONTINENT": "int16", "TOTAL_DIFFERENT_CONTINENT_REVENUE": "float64", "TOTAL_DIFFERENT_CONTINENT_ROOMS": "int16", "TOTAL_DIFFERENT_CONTINENT_GUEST_QTY": "int16", "NUM_HTL_RGN_EMEAA": "int16", "TOTAL_HTL_RGN_EMEAA_REVENUE": "float64", "TOTAL_HTL_RGN_EMEAA_ROOMS": "int16", "TOTAL_HTL_RGN_EMEAA_GUEST_QTY": "int16", "NUM_HTL_RGN_GCHINA": "int16", "TOTAL_HTL_RGN_GCHINA_REVENUE": "float64", "TOTAL_HTL_RGN_GCHINA_ROOMS": "int16", "TOTAL_HTL_RGN_GCHINA_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CATEGORY_CHN_CAT_1": "int16", 
"TOTAL_HTL_CHAIN_CATEGORY_CHN_CAT_1_REVENUE": "float64", "TOTAL_HTL_CHAIN_CATEGORY_CHN_CAT_1_ROOMS": "int16", "TOTAL_HTL_CHAIN_CATEGORY_CHN_CAT_1_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CATEGORY_CHN_CAT_2": "int16", "TOTAL_HTL_CHAIN_CATEGORY_CHN_CAT_2_REVENUE": "float64", "TOTAL_HTL_CHAIN_CATEGORY_CHN_CAT_2_ROOMS": "int16", "TOTAL_HTL_CHAIN_CATEGORY_CHN_CAT_2_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CATEGORY_CHN_CAT_3": "int16", "TOTAL_HTL_CHAIN_CATEGORY_CHN_CAT_3_REVENUE": "float64", "TOTAL_HTL_CHAIN_CATEGORY_CHN_CAT_3_ROOMS": "int16", "TOTAL_HTL_CHAIN_CATEGORY_CHN_CAT_3_GUEST_QTY": "int16", "NUM_REWARD_NT_RN_1": "int16", "TOTAL_REWARD_NT_RN_1_REVENUE": "float64", "TOTAL_REWARD_NT_RN_1_ROOMS": "int16", "TOTAL_REWARD_NT_RN_1_GUEST_QTY": "int16", "NUM_REWARD_NT_RN_2": "int16", "TOTAL_REWARD_NT_RN_2_REVENUE": "float64", "TOTAL_REWARD_NT_RN_2_ROOMS": "int16", "TOTAL_REWARD_NT_RN_2_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_2": "int16", "TOTAL_HTL_CHAIN_CHN_2_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_2_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_2_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_3": "int16", "TOTAL_HTL_CHAIN_CHN_3_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_3_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_3_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_4": "int16", "TOTAL_HTL_CHAIN_CHN_4_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_4_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_4_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_5": "int16", "TOTAL_HTL_CHAIN_CHN_5_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_5_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_5_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_6": "int16", "TOTAL_HTL_CHAIN_CHN_6_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_6_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_6_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_7": "int16", "TOTAL_HTL_CHAIN_CHN_7_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_7_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_7_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_9": "int16", "TOTAL_HTL_CHAIN_CHN_9_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_9_ROOMS": "int16", 
"TOTAL_HTL_CHAIN_CHN_9_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_10": "int16", "TOTAL_HTL_CHAIN_CHN_10_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_10_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_10_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_11": "int16", "TOTAL_HTL_CHAIN_CHN_11_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_11_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_11_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_12": "int16", "TOTAL_HTL_CHAIN_CHN_12_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_12_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_12_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_13": "int16", "TOTAL_HTL_CHAIN_CHN_13_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_13_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_13_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_14": "int16", "TOTAL_HTL_CHAIN_CHN_14_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_14_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_14_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_15": "int16", "TOTAL_HTL_CHAIN_CHN_15_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_15_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_15_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_18": "int16", "TOTAL_HTL_CHAIN_CHN_18_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_18_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_18_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_20": "int16", "TOTAL_HTL_CHAIN_CHN_20_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_20_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_20_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_22": "int16", "TOTAL_HTL_CHAIN_CHN_22_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_22_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_22_GUEST_QTY": "int16", "NUM_HTL_CHAIN_CHN_23": "int16", "TOTAL_HTL_CHAIN_CHN_23_REVENUE": "float64", "TOTAL_HTL_CHAIN_CHN_23_ROOMS": "int16", "TOTAL_HTL_CHAIN_CHN_23_GUEST_QTY": "int16", "MBR_TENURE": "int16"}


In [3]:
%%time

# Read both CSV subsets with the declared dtypes; SEND_DT is parsed as a date.
df = pd.read_csv(df_pth, parse_dates=["SEND_DT"], dtype=full_email)
valid_df = pd.read_csv(val_pth, parse_dates=["SEND_DT"], dtype=full_email)

# For each window, add DAYS_SINCE_LAST_CLICK multiplied by the matching
# CLICKED_WTHN_{k}_DAYS column (presumably a 0/1 flag -- confirm), applied
# identically to both frames.
for frame in (df, valid_df):
    for k in [60, 90, 180]:
        windowed_col = f"DAYS_SINCE_LAST_CLICK_WTHN_{k}_DAYS"
        frame[windowed_col] = frame["DAYS_SINCE_LAST_CLICK"] * frame[f"CLICKED_WTHN_{k}_DAYS"]

print("ROWS:", f"{df.shape[0]:,}")
df.head(3)


ROWS: 18,060,378
Wall time: 3min 17s


Unnamed: 0,HASH_NBR,CAMPAIGN_NBR,UNSUB_IND,SEND_DT,CLICK,MBR_PRGM_ACTV,week_of_year,day_of_month,day_of_year,is_end_month,...,CLICK_THROUGH_RATE,CLICKED_WTHN_7_DAYS,CLICKED_WTHN_14_DAYS,CLICKED_WTHN_30_DAYS,CLICKED_WTHN_60_DAYS,CLICKED_WTHN_90_DAYS,CLICKED_WTHN_180_DAYS,DAYS_SINCE_LAST_CLICK_WTHN_60_DAYS,DAYS_SINCE_LAST_CLICK_WTHN_90_DAYS,DAYS_SINCE_LAST_CLICK_WTHN_180_DAYS
0,-8491742361351297106,-6189659794402972413,0,2022-01-11,0,1,2,11,11,0,...,0.006757,0,0,0,0,0,0,0,0,0
1,-772995889030949650,1601139767253584975,0,2022-01-31,0,1,5,31,31,1,...,0.0,0,0,0,0,0,0,0,0,0
2,-263305046243702610,-5169751979471054759,0,2022-01-25,0,1,4,25,25,0,...,0.0,0,0,0,0,0,0,0,0,0


In [None]:
target_col = "CLICK"

# Columns never offered to the selector: the send date, the unsubscribe flag,
# the target itself, the two shortest click windows, and the derived
# days-since-last-click columns. Any column whose name contains "_NBR" is
# dropped as well.
excluded_cols = {
    "SEND_DT",
    "UNSUB_IND",
    "CLICK",
    "CLICKED_WTHN_7_DAYS",
    "CLICKED_WTHN_14_DAYS",
    'DAYS_SINCE_LAST_CLICK_WTHN_60_DAYS',
    'DAYS_SINCE_LAST_CLICK_WTHN_90_DAYS',
    'DAYS_SINCE_LAST_CLICK_WTHN_180_DAYS',
}
feature_cols = [col for col in df.columns if "_NBR" not in col and col not in excluded_cols]

print(f"NUMBER OF FEATURES: {len(feature_cols)}")

# Keep a quarter of the candidate features (rounded down).
k = int(len(feature_cols) * 0.25)
print(f"k == {k}")

kbest_feat_cols, removed_columns, feat_imp_df = feature_selection(
    df, feature_cols, target_col, k=k
)

plot_chi2_scores(
    feat_imp_df,
    'Variable',
    'feat_imp',
    normalize=False,
    top_k=20,
    figsize=(24, 12),
)


In [None]:


# Experiment configuration shared by the model-building and evaluation cells.
rand_state = 42023
n_splits = 5

# Features fed to the models. The click-window flag is named
# CLICKED_WTHN_60_DAYS in the data (see the loaded frame's columns); the
# previous spelling 'CLICK_WTHN_60_DAYS' would raise a KeyError on selection.
chosen_feats = ['CLICK_THROUGH_RATE', 'DAYS_SINCE_LAST_CLICK', 'CLICKED_WTHN_60_DAYS', 'GNDR_1', 'GNDR_2', 'EN_1', 'EN_2', 'EN_3', 'EN_4', 'EN_6', 'AGE_1', 'AGE_2', 'AGE_3', 'AGE_4', 'AGE_5', 'INCOME_1', 'INCOME_2', 'INCOME_3', 'INCOME_4', 'INCOME_5']
target_col = "CLICK"

# Model hyper-parameters. These must be plain scalars: a trailing comma after
# the value would silently turn it into a one-element tuple.
var_smoothing = 1e-8
scale_pos_weight = 99
objective = 'binary:logistic'
eval_metric = 'auc'

In [None]:
%%time

from sklearn.model_selection import StratifiedKFold

from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, auc
import pandas as pd
import numpy as np


# Cross-validation settings for this cell (seed and number of folds).
rand_state = 42023
n_splits = 20

# Candidate classifiers. Hyper-parameters not listed here keep their library
# defaults; the seed is shared wherever the estimator accepts one.
xgb_model = xgb.XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    objective=objective,
    eval_metric=eval_metric,
    random_state=rand_state,
)
naive_bayes_model = GaussianNB(var_smoothing=var_smoothing)
decision_tree_model = DecisionTreeClassifier(criterion="entropy", random_state=rand_state)
logreg = LogisticRegression(random_state=rand_state)
rf_clf = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    random_state=rand_state,
)

# The order here is the order in which models are trained and reported.
model_list = [
    xgb_model,
    naive_bayes_model,
    decision_tree_model,
    logreg,
    rf_clf,
]

# Shuffled, stratified fold generator used by the training cell.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=rand_state)



In [None]:
%%time

X = df[chosen_feats].values
y = df[target_col].values

# Fold counter and total come from the splitter itself; the earlier progress
# line referenced undefined names and raised a NameError on the first fold.
total_folds = skf.get_n_splits()
for fold_no, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    print(f"FOLD: {fold_no}/{total_folds} -- {datetime.datetime.now()}")
    for model in model_list:
        model.fit(X[train_idx], y[train_idx])

        # Score the fold on its held-out rows so the cross-validation loop
        # yields a validation signal. Each fit replaces the previous one, so
        # the models end up trained on the last fold's training part.
        try:
            fold_auc = roc_auc_score(y[test_idx], model.predict_proba(X[test_idx])[:, 1])
            auc_txt = f"{fold_auc:.4f}"
        except ValueError:
            # Raised by roc_auc_score when the held-out fold has one class.
            auc_txt = "undefined"
        print(f"    {model.__class__.__name__} held-out AUC: {auc_txt}")

In [None]:
%%time

# Evaluate every fitted model on the validation frame and collect one flat
# result record per model.
X = valid_df[chosen_feats].values
y = valid_df[target_col].values

res_list = []
for model in model_list:
    model_name, auc_score, conf_matrix, class_report, cr = predict_results(model, X, y)

    print("##########################################")
    print(f"MODEL: {model_name}")
    print(cr)
    print("##########################################")

    # Record the features actually used for X (chosen_feats); `feat_cols`
    # is not defined at notebook scope and raised a NameError here.
    res = {"MODEL_NAME": model_name, "features_used": chosen_feats, "AUC": auc_score,
           "True_Neg": conf_matrix[0][0], "False_Pos": conf_matrix[0][1], 
           "False_Neg": conf_matrix[1][0], "True_Pos": conf_matrix[1][1]
    }
    res.update(class_report)
    res_list.append(res)


In [None]:
# Flatten the per-model result records (including the nested classification
# report dicts) into one table row per model, joining nested keys with "_".
pd.json_normalize(res_list, sep = "_")