In [None]:
#DATA PREPROCESSING

import pandas as pd
import numpy as np
import datetime

''' Feature Selection with drop'''
def drop(df):
    """Drop a fixed list of columns from ``df`` in place and return it.

    Columns from the list that are not present in ``df`` are skipped, so
    the function can be applied to files with slightly different schemas.
    The printed summary reports how many columns were actually removed
    out of the number of columns the frame had before the drop.

    Args:
        df: Frame to prune. It is modified in place.

    Returns:
        The same ``df`` object, without the listed columns.
    """
    drop_cols = [
        "email",
        'Current_membership_service_ID',
        'Cust_Subsidiary',
        'Subsidiary_ID',
        'Subsidiary',
        'billaddr_city',
        'billaddr_zip',
        'billaddr_country',
        'billaddr_country_desc',
        'Customer_ExtID',
        'probability_lapsed',
    ]
    # Only count (and drop) what is really there; the configured list may be
    # longer than the set of columns this particular file contains.
    present = [col for col in drop_cols if col in df.columns]
    print(f"dropped columns: {len(present)} out of {len(df.columns)}")
    df.drop(columns=present, inplace=True)
    return df

def read_csv(file_path: str) -> pd.DataFrame:
    """
    Load the CSV located at ``file_path`` and hand back the parsed DataFrame.

    The file is read with ``low_memory=False``.
    """
    frame = pd.read_csv(file_path, low_memory=False)
    return frame

def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn the ``-1000`` sentinel in the ``distance`` column back into NaN.

    Args:
        df: Frame containing a ``distance`` column. Modified in place.

    Returns:
        The same ``df`` object.

    Raises:
        KeyError: If ``df`` has no ``distance`` column.
    """
    # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias no longer
    # exists in NumPy 2.x.
    df['distance'] = df['distance'].replace(-1000, np.nan)
    return df

def categorical_imputer(df: pd.DataFrame) -> pd.DataFrame:
    """
    Impute missing values in categorical columns and drop unused ones.

    Three steps, applied in place:

    1. Columns in ``new_category_cols`` get the placeholder category
       ``'new_cate'`` wherever a value is missing.
    2. Columns in ``mode_cols`` get their most frequent value. A column
       with no non-missing value at all is left untouched instead of
       raising, because there is no mode to fill with.
    3. Columns in ``cols_to_drop`` are removed when present.

    Args:
        df: Frame to impute. Modified in place.

    Returns:
        The same ``df`` object.

    Raises:
        KeyError: If a column listed for imputation is absent from ``df``.
    """
    # NOTE(review): "Stadard_Job_Title" is spelled this way on purpose here -
    # presumably it mirrors the header in the source file; confirm before renaming.
    new_category_cols = ["Stadard_Job_Title", "gender", "ethnicity", "education",
                         "organization_unit", "organization_type",
                         "native_language",
                         "Industry_Category",
                         "Emp_Exemption_Status"]

    mode_cols = ["department_size", "company_size", "employee_oversee", "Monthly_Donor"]  # MODE

    # Dropped after imputation; absent columns are simply skipped.
    cols_to_drop = ["billaddr_state",
                    "Customer_ID",
                    "native_language",
                    "employee_oversee"]

    for c in new_category_cols:
        df[c] = df[c].fillna('new_cate')

    for c in mode_cols:
        counts = df[c].value_counts()
        if counts.empty:
            # Every value is missing, so there is no majority category.
            continue
        df[c] = df[c].fillna(counts.index[0])

    present = [c for c in cols_to_drop if c in df.columns]
    df.drop(columns=present, inplace=True)

    return df


def datetime_imputer(df: pd.DataFrame, date_cols: list, reference_date=None) -> pd.DataFrame:
    """
    Convert date columns to "days before the reference date" and impute gaps.

    Every column in ``date_cols`` is parsed with ``pd.to_datetime`` and
    replaced by the whole number of days between ``reference_date`` and
    the parsed date (missing dates stay missing). Four columns then get a
    fixed imputation rule:

    * ``membership_paid_through``: column maximum + 365
    * ``SHRM_Join_Date``: -365
    * ``Current_Start_Date``: column minimum - 365
    * ``likely_next_transaction``: column minimum - 365

    Other date columns keep their missing values.

    Args:
        df: Frame to transform. Modified in place.
        date_cols: Names of the columns to convert.
        reference_date: Point in time the day counts are measured from.
            Defaults to the current local time, read once so that all
            columns share the same reference.

    Returns:
        The same ``df`` object.

    Raises:
        KeyError: If one of the four imputed columns is absent from ``df``.
    """
    if reference_date is None:
        reference_date = datetime.datetime.now()

    for c in date_cols:
        # Vectorised day count; missing dates come out as NaN.
        df[c] = (reference_date - pd.to_datetime(df[c])).dt.days

    # Computed after the conversion above, i.e. on day counts, not on dates.
    fill_values = {
        'membership_paid_through': df['membership_paid_through'].max() + 365,
        'SHRM_Join_Date': -365,
        'Current_Start_Date': df['Current_Start_Date'].min() - 365,
        'likely_next_transaction': df['likely_next_transaction'].min() - 365,
    }
    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

    return df

def define_cols(df: pd.DataFrame) -> tuple:
    """
    Return the column-role lists used by the pipeline.

    ``df`` is accepted for interface reasons but not inspected; the lists
    are fixed. The result is the tuple
    ``(cate_cols, date_cols, num_cols, label, ID)``.
    """
    communication_flags = [
        'DISALLOW_ALL_COMMUNICATION',
        'disallow_email_communication',
        'disallow_phone_communication',
        'disallow_regular_mail_communi',
        'disallow_third_party_communic',
    ]
    profile_cols = [
        'department_size', 'gender', 'ethnicity', 'education', 'company_size',
        'native_language', 'organization_unit', 'organization_type',
        'employee_oversee', 'shrm_organization_multination',
        'shrm_organization_unionized', 'billaddr_state',
    ]
    account_cols = [
        'Is_Person', 'IsInActive', 'LoginAccess', 'Monthly_Donor',
        'Customer_Segment', 'if_member', 'if_active_member',
        'Emp_Exemption_Status', 'Industry_Category',
    ]
    cate_cols = ['Stadard_Job_Title'] + communication_flags + profile_cols + account_cols

    date_cols = [
        'membership_paid_through',
        'SHRM_Join_Date',
        'Current_Start_Date',
        'first_order_date',
        'last_order_date',
        'likely_next_transaction',
    ]

    # Both retina metric families are listed over the same horizons, in this order.
    horizons = ('10yr', '1yr', '2yr', '3yr', '5yr', '3mo', '6mo', '9mo')
    num_cols = [f'retina_clv_{h}' for h in horizons]
    num_cols += [f'retina_residual_value_{h}' for h in horizons]
    num_cols += [
        'total_num_orders', 'total_order_value', 'typical_order_value',
        'probability_alive',  # 'distance',  ### undo for annual conference
        'SHRM_ENTERED_PROFESSION',
    ]

    label = ['attend']
    ID = ['Customer_ID']

    return cate_cols, date_cols, num_cols, label, ID

# This function supports feature selection / dimensionality reduction: it removes some features manually before feature selection is performed. This manual pre-filtering gave the best results.
def data_clean(df: pd.DataFrame) -> tuple:
    """
    Clean a raw frame and split it into model inputs, label and IDs.

    Steps: drop the manually excluded columns, set ``Customer_ID`` aside,
    impute categorical and date columns, encode the ``attend`` label
    ("Yes"/"No" -> 1/0) and classify every remaining column as
    categorical (listed in ``define_cols``) or numeric (everything else,
    including the date columns, which are day counts by then).

    Args:
        df: Raw frame with at least ``Customer_ID``, ``attend`` and the
            columns the imputers require. It is modified in place.

    Returns:
        ``(X, y, numeric_features, categorical_features, customer_ids)``
        where ``X`` holds every column except ``attend``, ``y`` is the
        encoded ``attend`` column and ``customer_ids`` is a copy of the
        original ``Customer_ID`` column.

    Raises:
        KeyError: If ``Customer_ID`` or ``attend`` is missing.
    """
    df = drop(df)  # Drop specified columns

    # Keep the IDs for the caller, then remove them so they are not used as a feature.
    customer_ids = df['Customer_ID'].copy()
    df.drop('Customer_ID', axis=1, inplace=True)

    cate_cols, date_cols, *_ = define_cols(df)
    df = categorical_imputer(df)
    df = datetime_imputer(df, date_cols)

    # Scope the pandas option to this one statement. Setting it globally
    # would make every later call in the same process (e.g. cleaning a test
    # file after a training file) run its imputers under different
    # downcasting rules than the first call did.
    with pd.option_context('future.no_silent_downcasting', True):
        df['attend'] = df['attend'].replace({"Yes": 1, "No": 0})

    cols = [col for col in df.columns if col != "attend"]

    known_categorical = set(cate_cols)
    categorical_features = [col for col in cols if col in known_categorical]
    numeric_features = [col for col in cols if col not in known_categorical]

    X = df[cols]

    return X, df['attend'], numeric_features, categorical_features, customer_ids

In [None]:
#MODEL EVALUATION

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, recall_score, confusion_matrix, roc_curve, precision_recall_curve, f1_score, precision_score

def evaluate_basic_metrics(y_true, y_pred, propensity_scores):
    """
    Compute and print AUC, recall and the confusion matrix.

    Args:
        y_true: True binary labels.
        y_pred: Hard 0/1 predictions; used for recall and the confusion matrix.
        propensity_scores: Predicted scores for the positive class; used
            for the AUC, which ranks by score rather than by thresholded label.

    Returns:
        ``(auc, recall, confusion)``.
    """
    # AUC must be computed from the continuous scores; feeding it the
    # thresholded labels collapses the ROC curve to a single operating point.
    auc = roc_auc_score(y_true, propensity_scores)
    recall = recall_score(y_true, y_pred)
    confusion = confusion_matrix(y_true, y_pred)

    print(f"*****  AUC = {auc:.2%}    Recall ={recall:.2%}  ***** \n")
    print('\nConfusion Matrix:\n', confusion)

    return auc, recall, confusion


def plot_roc_curve(y_true, propensity_scores):
    """
    Draw the ROC curve for the given labels and scores and show the figure.

    The legend carries the area under the curve; the dashed diagonal marks
    the line where true-positive rate equals false-positive rate.
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(y_true, propensity_scores)

    plt.figure(figsize=(8, 6))
    area = roc_auc_score(y_true, propensity_scores)
    plt.plot(false_positive_rate, true_positive_rate,
             label=f'ROC curve (area = {area:0.2f})')
    plt.plot([0, 1], [0, 1], 'k--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()


def plot_precision_recall_curve(y_true, propensity_scores):
    """
    Draw precision against recall for the given labels and scores and show the figure.
    """
    precision_values, recall_values, _ = precision_recall_curve(y_true, propensity_scores)

    plt.figure(figsize=(8, 6))
    plt.plot(recall_values, precision_values, label='Precision-Recall curve')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.show()


def evaluate_model(clf, X_test, y_test):
    """
    Run the full evaluation: headline metrics plus ROC and precision-recall plots.

    Scores are the positive-class probabilities from ``clf.predict_proba``;
    hard labels are obtained with a 0.5 cut-off.

    Returns:
        ``(auc, recall, confusion)`` as produced by ``evaluate_basic_metrics``.
    """
    positive_proba = clf.predict_proba(X_test)[:, 1]
    hard_labels = (positive_proba > 0.5).astype('float')

    # Headline numbers first, then the two diagnostic plots.
    auc_value, recall_value, confusion = evaluate_basic_metrics(y_test, hard_labels, positive_proba)
    plot_roc_curve(y_test, positive_proba)
    plot_precision_recall_curve(y_test, positive_proba)

    # Additional metrics can be added here as needed
    return auc_value, recall_value, confusion

# Additional evaluation functions and visualizations can be added here based on further requirements.

In [None]:
#MODEL TRAINING

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, recall_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


def full_imputer(X, numeric_features, categorical_features, preprocessor=None, return_preprocessor=False):
    """Impute, scale and one-hot encode ``X``.

    Numeric columns are coerced to numbers (unparseable values become
    NaN), median-imputed and standardised. Categorical columns get
    missing values replaced by ``-1`` and are one-hot encoded with the
    first category of each column dropped.

    Args:
        X: Input frame. It is copied first, so the caller's frame is not
            modified.
        numeric_features: Names of the numeric columns.
        categorical_features: Names of the categorical columns.
        preprocessor: Optional, already fitted transformer returned by an
            earlier call with ``return_preprocessor=True``. When given,
            ``X`` is only transformed, so it gets exactly the same columns,
            medians and scaling as the data the transformer was fitted on.
            The feature lists must be the ones used for fitting.
        return_preprocessor: When True, the fitted transformer is returned
            as a third element.

    Returns:
        ``(encoded, features)`` or, with ``return_preprocessor=True``,
        ``(encoded, features, preprocessor)``. ``features`` lists the output
        column names: the numeric features followed by the one-hot names.
    """
    # Work on a copy: X is typically a slice produced by train_test_split,
    # and writing into it would both mutate the caller's data and trigger
    # pandas chained-assignment warnings.
    X = X.copy()
    for col in numeric_features:
        X[col] = pd.to_numeric(X[col], errors='coerce')

    if preprocessor is None:
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
            # handle_unknown has no effect while fitting; it only lets a
            # reused encoder transform data containing unseen categories.
            ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ])
        encoded = preprocessor.fit_transform(X)
    else:
        encoded = preprocessor.transform(X)

    onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
    onehot_features = onehot.get_feature_names_out(categorical_features)
    features = list(numeric_features) + onehot_features.tolist()

    if return_preprocessor:
        return encoded, features, preprocessor
    return encoded, features


def train(X_encoded, y, features= None, model="Gradient Boosting", test=True, return_importances = False):
    """Train a classifier of the requested kind.

    Args:
        X_encoded: Encoded feature matrix.
        y: Labels; cast to ``int`` before fitting.
        features: Feature names matching the columns of ``X_encoded``.
            Only used when ``return_importances`` is True.
        model: One of ``'Decision Tree'``, ``'Logistic Regression'`` or
            ``'Gradient Boosting'``.
        test: Only affects ``'Gradient Boosting'``. When True the model is
            fitted on all rows. When False, 20% of the rows are held out
            and passed as ``eval_set`` so AUC is tracked during fitting;
            no early-stopping callback is passed.
        return_importances: When True, also return the result of
            ``get_feature_importances``.

    Returns:
        The fitted classifier, or ``(clf, importances)`` when
        ``return_importances`` is True.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    supported_models = ('Decision Tree', 'Logistic Regression', 'Gradient Boosting')
    if model not in supported_models:
        # Fail with a clear message instead of reaching the return statement
        # with no classifier bound.
        raise ValueError(
            f"Unsupported model {model!r}; expected one of {supported_models}"
        )

    y = y.astype(int)

    if model == 'Decision Tree':
        clf = DecisionTreeClassifier(max_depth=None, random_state=42)
        clf.fit(X_encoded, y)

    elif model == "Logistic Regression":
        clf = LogisticRegression(max_iter=1000, random_state=42)
        clf.fit(X_encoded, y)

    else:  # 'Gradient Boosting'
        clf = LGBMClassifier(learning_rate=0.09, max_depth=-1, random_state=42, n_estimators=1000, num_leaves=40)

        if test:
            clf.fit(X_encoded, y)
        else:
            from sklearn.model_selection import train_test_split
            X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
            clf.fit(X_train, y_train, eval_set=[(X_test, y_test), (X_train, y_train)], eval_metric='auc')

    if return_importances:
        return clf, get_feature_importances(clf, features)

    return clf

def evaluate_model(clf, X_test, y_test):
    """Print headline metrics and a per-threshold summary table.

    The headline AUC uses the positive-class probabilities and the
    headline recall uses a 0.5 cut-off. The table then repeats the
    metrics on the thresholded 0/1 predictions for a series of
    decreasing cut-offs, together with the share of rows flagged
    positive ("Target %") and the confusion matrix. Nothing is returned.
    """
    positive_proba = clf.predict_proba(X_test)[:, 1]
    default_predictions = (positive_proba > 0.5).astype(int)

    y_test = y_test.astype(int)
    auc_score = roc_auc_score(y_test, positive_proba)
    recall = recall_score(y_test, default_predictions)

    print(f"***** AUC = {auc_score:.2%}    Recall = {recall:.2%}  ***** \n")

    # One table row per cut-off, from the default 0.5 down to 0.0001.
    thresholds = [0.5, 0.4, 0.3, 0.2, 0.1, 0.01, 0.005, 0.001, 0.0001]
    rows = []
    for cutoff in thresholds:
        flagged = (positive_proba > cutoff).astype(int)

        # Note: this "AUC" is computed on the binary predictions for this cut-off.
        cutoff_auc = roc_auc_score(y_test, flagged)
        cutoff_recall = recall_score(y_test, flagged)
        tn, fp, fn, tp = confusion_matrix(y_test, flagged).ravel()
        flagged_share = (fp + tp) / len(y_test)

        rows.append({
            'Threshold': cutoff,
            'AUC:': f"{cutoff_auc:.2%}",
            'Recall:': f"{cutoff_recall:.2%}",
            'Target %': f"{flagged_share:.3%}",
            'Confusion Matrix': f"[{tn},{fp}],[{fn},{tp}]",
        })

    summary_table = pd.DataFrame(rows)
    print(summary_table)


# Additional functions and logic can be added here based on further requirements.
def get_feature_importances(clf, features):
    """Return feature names and importances, ordered from most to least important.

    Reads ``clf.feature_importances_`` and returns the pair
    ``(names, importances)`` as arrays sorted by descending importance.
    """
    scores = clf.feature_importances_
    # Ascending argsort flipped end-to-end gives the descending order.
    descending = np.flip(np.argsort(scores))
    names = np.array(features)
    return names[descending], scores[descending]


def apply_lda(X_train, y_train, n_components=None):  # <-- New function
    """Fit Linear Discriminant Analysis on the training data and project it.

    Args:
        X_train: Feature matrix, either a sparse matrix (anything exposing
            ``toarray``) or an already dense array.
        y_train: Class labels.
        n_components: Forwarded to ``LinearDiscriminantAnalysis``.

    Returns:
        ``(X_train_lda, lda)``: the projected data and the fitted estimator.
    """
    lda = LinearDiscriminantAnalysis(n_components=n_components)
    # The encoder may hand back either a sparse or a dense matrix; only
    # densify when there is something to densify.
    dense = X_train.toarray() if hasattr(X_train, "toarray") else X_train
    X_train_lda = lda.fit_transform(dense, y_train)
    return X_train_lda, lda


In [None]:
#RUN MODEL NO TEST - NO PREDICTIONS

'''This is for Feature Selection'''

from sklearn.model_selection import train_test_split
from data_preprocessing import read_csv, data_clean
from model_training import full_imputer, train, evaluate_model
from output_handling import save_propensity_scores

def main():
    """Feature-selection run: pick top features, retrain on them, evaluate, save scores.

    A decision tree ranks the encoded features, the top 65 are kept, a
    gradient-boosting model is retrained on them and evaluated on a 20%
    hold-out, and the hold-out propensity scores are saved.

    Raises:
        ValueError: If a selected feature does not exist in the test-set
            encoding (see the note in the body).
    """
    df = read_csv('files/ANN24.csv')

    # Preprocess the data
    X, y, numeric_features, categorical_features, customer_ids = data_clean(df)
    print(1)
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test, customer_ids_train, customer_ids_test = train_test_split(X, y, customer_ids, test_size=0.2, random_state=42)
    print(2)

    # Preprocess the training data
    X_train_encoded, features = full_imputer(X_train, numeric_features, categorical_features)

    print("y_train type: ",y_train.dtype)
    y_train = y_train.astype(int)

    # Train the model and get feature importances
    clf, (sorted_features, importances) = train(X_train_encoded, y_train, features=features, model="Decision Tree", test=False, return_importances=True)

    # Select top N features (currently the top 65; adjust as needed)
    selected_features = sorted_features[:65]
    print(selected_features)

    # Restrict the training matrix to the selected features
    train_indices = [features.index(feat) for feat in selected_features]
    X_train_encoded = X_train_encoded[:, train_indices]

    # The test set is encoded by a separate fit, so its one-hot columns can
    # differ from the training ones (a category may be absent, or a different
    # first category may be dropped). Look the selected features up by NAME
    # in the test encoding rather than reusing training column positions,
    # which would silently pick the wrong columns.
    # NOTE(review): the scaling/medians of the test encoding still come from
    # the test data itself; the clean fix is to fit the preprocessing once on
    # the training data and reuse it for the test data.
    X_test_encoded, test_features = full_imputer(X_test, numeric_features, categorical_features)
    missing = [feat for feat in selected_features if feat not in test_features]
    if missing:
        raise ValueError(
            f"Selected features missing from the test-set encoding: {missing}"
        )
    test_indices = [test_features.index(feat) for feat in selected_features]
    X_test_encoded = X_test_encoded[:, test_indices]

    # Retrain the model using only the selected features
    clf = train(X_train_encoded, y_train, model="Gradient Boosting", test=False)

    # Evaluate the model
    evaluate_model(clf, X_test_encoded, y_test)

    # Save propensity scores
    propensity_scores = clf.predict_proba(X_test_encoded)[:, 1]
    save_propensity_scores(customer_ids_test, propensity_scores)

if __name__ == "__main__":
    main()
'''
#This is for the normal model without feature selection and manual dropping of fields
from sklearn.model_selection import train_test_split
from data_preprocessing import read_csv, data_clean
from model_training import full_imputer, train, evaluate_model
#from model_evaluation import evaluate_model
from output_handling import save_propensity_scores

def main():

    df = read_csv('trainDataSet.csv')

    # Preprocess the data
    X, y, numeric_features, categorical_features, customer_ids = data_clean(df)

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test, customer_ids_train, customer_ids_test = train_test_split(X, y, customer_ids, test_size=0.2, random_state=42)

    # Preprocess the training data
    X_train_encoded, features = full_imputer(X_train, numeric_features, categorical_features)

    # Train the model
    clf = train(X_train_encoded, y_train, model="Gradient Boosting", test=False)

    # Preprocess the test data
    X_test_encoded, _ = full_imputer(X_test, numeric_features, categorical_features)

    # Evaluate the model
    evaluate_model(clf, X_test_encoded, y_test)

    # Save propensity scores
    # propensity_scores = clf.predict_proba(X_test_encoded)[:, 1]
    # save_propensity_scores(customer_ids_test, propensity_scores)

if __name__ == "__main__":
    main()
'''

In [None]:
#RUN MODEL YES TEST - OUTPUT IS PREDICTIONS

# To incorporate Feature Selection in this file we have the following code
import pandas as pd
from data_preprocessing import read_csv, data_clean
from model_training import full_imputer, train, evaluate_model
from output_handling import save_propensity_scores

def main():
    """Prediction run: train on one file, score a separate test file, save the scores.

    A gradient-boosting model trained on all encoded features ranks them,
    the top 65 are kept, the model is retrained on those and then evaluated
    on, and used to score, the separate test file.

    Raises:
        ValueError: If a selected feature does not exist in the test-file
            encoding (see the note in the body).
    """
    # Load training and testing data
    train_file_path = "files/ANN23.csv"
    test_file_path = "files/TestANN24.csv"

    # Use the shared reader so both run scripts load files the same way.
    train_df = read_csv(train_file_path)
    test_df = read_csv(test_file_path)
    print("Files read")

    # Preprocess training data
    X_train, y_train, numeric_features_train, categorical_features_train, _ = data_clean(train_df)
    X_train_encoded, features_train = full_imputer(X_train, numeric_features_train, categorical_features_train)

    print("preprocess complete")

    y_train = y_train.astype(int)
    # Train the model on training data and get feature importances
    clf, (sorted_features, importances) = train(X_train_encoded, y_train, features=features_train, model="Gradient Boosting",return_importances=True)
    print("training - training complete")
    # Select top N features (currently the top 65; adjust as needed)
    selected_features = sorted_features[:65]  # <-- This number can be adjusted based on your needs
    print(selected_features)

    # Update X_train_encoded to only include selected features
    train_indices = [features_train.index(feat) for feat in selected_features]
    X_train_encoded = X_train_encoded[:, train_indices]

    # Preprocess testing data
    X_test, y_test, numeric_features_test, categorical_features_test, customer_ids = data_clean(test_df)
    X_test_encoded, features_test = full_imputer(X_test, numeric_features_test, categorical_features_test)

    # The test file is encoded by a separate fit, so its columns need not
    # line up with the training columns. Select by feature NAME instead of
    # by training column position, and fail loudly if a selected feature
    # does not exist in the test encoding.
    # NOTE(review): scaling/medians of the test encoding still come from the
    # test file itself; the clean fix is to fit the preprocessing once on the
    # training data and reuse it here.
    missing = [feat for feat in selected_features if feat not in features_test]
    if missing:
        raise ValueError(
            f"Selected features missing from the test-set encoding: {missing}"
        )
    test_indices = [features_test.index(feat) for feat in selected_features]
    X_test_encoded = X_test_encoded[:, test_indices]

    # Retrain the model on training data using only the selected features
    clf = train(X_train_encoded, y_train)

    y_test = y_test.astype(int)
    # Evaluate the model on testing data
    evaluate_model(clf, X_test_encoded, y_test)

    # Save and visualize propensity scores
    propensity_scores = clf.predict_proba(X_test_encoded)[:, 1]
    save_propensity_scores(customer_ids, propensity_scores)

if __name__ == "__main__":
    main()

In [None]:
#SHARING WITH MARKETING TEAM - FILE PREPARATION

import pandas as pd
from datetime import datetime

from datetime import timezone

# Build the list shared with the marketing team: bucket each customer's
# propensity score into a level, attach the e-mail address and a run date,
# and write the result to a dated CSV file.

# Read the scored customers
df = pd.read_csv('propensity_scores.csv')

# Define bins and labels; scores below the lowest edge (0.004) fall outside
# every bin and are removed below.
bins = [0.004, 0.01, 0.1, 0.2, 0.4, 1]
labels = ["5_Next_Best", "4_Very_Low", "3_Low", "2_Medium", "1_High"]

# Create a new 'Propensity_Level' column in df
df['Propensity_Level'] = pd.cut(df['Propensity'], bins=bins, labels=labels, include_lowest=True, right=True)

# Filter out rows with NaN in 'Propensity_Level' column
df = df.dropna(subset=['Propensity_Level'])

# Order the DataFrame by 'Propensity_Level'; the categories are ordered as
# listed in `labels`, so descending order puts "1_High" first.
df = df.sort_values(by='Propensity_Level', ascending=False)

# Read XFILE; only the join key and the e-mail address are used below.
xfile_df = pd.read_csv('files/ANN24.csv', usecols=['Customer_ID', 'email'])

# Merge df with xfile_df on 'Customer_ID'
merged_df = pd.merge(df, xfile_df, on='Customer_ID', how='inner')

# Read the clock once so the date column and the file name always agree,
# even when the script runs across midnight UTC.
run_time_utc = datetime.now(timezone.utc)

# Get the current UTC date in the format required by Marketo
#current_date_marketo_format = run_time_utc.strftime('%Y-%m-%dT%H:%M:%SZ')
current_date_marketo_format = run_time_utc.strftime('%Y-%m-%d')

# Convert the string to a datetime object and assign it to your DataFrame
merged_df['Propensity_Date'] = pd.to_datetime(current_date_marketo_format)

# Drop 'Customer_ID' column and keep 'email' and 'Propensity_Level' columns
final_df = merged_df[['email', 'Propensity_Level', 'Propensity_Date']]

# Save to CSV
folder_path = "S:/Collaboration/Business Intelligence/Muzammil-E-Haque Mahmud/feb23-June23/CAR/PropensityListsSendsDirectory/"

current_date_formatted = run_time_utc.strftime('%Y_%m_%d')

# Create the file name by concatenating the string "PropensityList_" with the formatted date
file_name = f"{folder_path}PropensityList_{current_date_formatted}.csv"

# Export your DataFrame to CSV using the constructed file name
final_df.to_csv(file_name, index=False)