In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, PrecisionRecallDisplay, precision_score, recall_score, roc_auc_score, RocCurveDisplay, roc_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression
from datetime import timedelta
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV #GridSearch is for hyperparameter tuning
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, PrecisionRecallDisplay, RocCurveDisplay
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import load_iris

In [8]:
# Read the four source tables (customers, engagement, marketing, transactions) from GitHub.
# NOTE(review): nothing in this cell configures SSL certificates (certifi is not used);
# the recorded run failed with CERTIFICATE_VERIFY_FAILED, so the certificate store has to
# be fixed in the environment before these reads can succeed.
customers = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/customers_final.csv')
engagement = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/engagements_final.csv')
marketing = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/marketing_final.csv')
transactions = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/transactions_final.csv')

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)>

MERGE DATA

In [None]:
# step 1: per-customer count of campaigns answered with response == 'Yes'
# (customers without any 'Yes' response do not appear in this frame)
responded_yes = marketing['response'].eq('Yes')
marketing_agg = marketing.loc[responded_yes].groupby('customer_id')[['campaign_id']].count()
# step 2: per-customer number of transactions and total amount spent
transactions_agg = transactions.groupby('customer_id').agg(
    {'transaction_id': 'count', 'transaction_amount': 'sum'}
)

In [None]:
# step 3: index customers and engagement by customer_id.
# Guarded so the cell can be re-run: once customer_id has been moved into the index it is
# no longer a column, and a second set_index('customer_id') would raise KeyError.
for table in (customers, engagement):
    if 'customer_id' in table.columns:
        table.set_index('customer_id', inplace=True)
# step 4: left-join engagement and the per-customer aggregates onto customers;
# customers missing from an aggregate get NaN in its columns
joint_data = customers.join(engagement).join(transactions_agg).join(marketing_agg)
joint_data.head()

## DATA CLEANING & FEATURE ENGINEERING

LTV Calculation

In [None]:
# LTV (lifetime value) = total transaction amount per customer.
# The grouped sum skips NaN, so a customer whose transaction_amount is missing after the
# left join ends up with LTV 0 rather than NaN.
joint_data['LTV'] = joint_data.groupby('customer_id')['transaction_amount'].sum()
joint_data.head()


In [None]:
# Summary statistics of LTV across all customers
joint_data['LTV'].describe()
# note: the 75th percentile of LTV is the cut-off for the binary target in all the models

In [None]:
# Number of missing age values (the author observed about 10% of entries are blank)
null_counts = joint_data['age'].isna().sum()
null_counts

In [None]:
# Create KNNImputer instance - this is to clean the age data since it will be used in modelling
# NOTE(review): the imputer is fitted on the 'age' column alone, so no other feature is
# available for measuring neighbour distance — confirm this is more than a mean fill.
imputer = KNNImputer(n_neighbors=2)

In [None]:
# Fit the imputer on the age column and overwrite the column with the imputed values
joint_data[['age']] = imputer.fit_transform(joint_data[['age']])

In [None]:
# After the join, campaign_id holds each customer's count of campaigns answered 'Yes'
# (it comes from marketing_agg), not a campaign identifier.
joint_data['campaign_id'].value_counts()

In [None]:
# A missing campaign_id means the customer is absent from marketing_agg, i.e. has no
# campaign with response 'Yes' (the author observed this for over 20% of customers).
campaign_null_counts = joint_data['campaign_id'].isna().sum()
campaign_null_counts

In [None]:
# campaign_id is the per-customer count of campaigns answered 'Yes' (built in marketing_agg).
# A missing value means the customer has no such campaign, so the correct count is 0;
# imputing from other customers would invent positive responses that never happened.
joint_data['campaign_id'] = joint_data['campaign_id'].fillna(0)

In [None]:
# Convert transaction_date to datetime so date arithmetic and comparisons work later on
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date'])

In [None]:
# Customer_Join_Time: timedelta from the customer's join date to their last purchase date
last_purchase_ts = pd.to_datetime(joint_data['last_purchase_date'])
join_ts = pd.to_datetime(joint_data['join_date'])
joint_data['Customer_Join_Time'] = last_purchase_ts - join_ts
joint_data['Customer_Join_Time']

In [None]:
# Average transaction amount = lifetime value divided by the number of transactions
# (NaN for customers with no transaction count)
joint_data['Avg_Transaction_Amount'] = joint_data['LTV'].div(joint_data['transaction_id'])
joint_data.head()

In [None]:
# Most Recent Purchase: time elapsed between each customer's last purchase and the latest
# last-purchase date in the data set (a timedelta, converted to days in a later cell).
# Parse to datetime BEFORE taking the max: max() on the raw column compares the stored
# values as-is, which for text dates is only chronological with ISO-style formatting.
last_purchase_dates = pd.to_datetime(joint_data['last_purchase_date'])
joint_data['most_recent_purchase_date'] = last_purchase_dates.max() - last_purchase_dates

In [None]:
# Whole-day component of the most_recent_purchase_date timedelta
joint_data['most_recent_purchase_in_days'] = joint_data['most_recent_purchase_date'].dt.days

In [None]:
# Whole-day component of Customer_Join_Time (days between joining and last purchase)
joint_data['Customer_Jointime_in_days']=joint_data['Customer_Join_Time'].dt.days

In [None]:
# Integer encoding of the gender column
def Gender_Categorical(x):
    """Encode a gender label as an integer code.

    Returns 1 for 'Male', 2 for 'Female' and 0 for any other value
    (including missing values).
    """
    for label, code in (('Male', 1), ('Female', 2)):
        if x == label:
            return code
    return 0

In [None]:
# Encode every gender value with Gender_Categorical (1 = Male, 2 = Female, 0 = other/missing)
joint_data['Gender_Categorical'] = joint_data['gender'].apply(Gender_Categorical)
joint_data.head()

In [None]:
# Engagement scoring: each engagement counter is cut into quartiles scored 1 (lowest)
# to 4 (highest), so higher values are better
engagement_quartile_labels = [1, 2, 3, 4]
joint_data['SiteVisit_Score'] = pd.qcut(
    joint_data['number_of_site_visits'], q=4, labels=engagement_quartile_labels)
joint_data['EmailOpen_Score'] = pd.qcut(
    joint_data['number_of_emails_opened'], q=4, labels=engagement_quartile_labels)
joint_data['Click_Score'] = pd.qcut(
    joint_data['number_of_clicks'], q=4, labels=engagement_quartile_labels)

# Engagement_Score = sum of the three quartile scores (range 3-12)
site_points = joint_data['SiteVisit_Score'].astype(int)
email_points = joint_data['EmailOpen_Score'].astype(int)
click_points = joint_data['Click_Score'].astype(int)
joint_data['Engagement_Score'] = site_points + email_points + click_points
joint_data.head()

In [None]:
# Avg_Transaction_Time: days between joining and last purchase, divided by the number of
# transactions (average number of days per transaction)
joint_data['Avg_Transaction_Time'] = joint_data['Customer_Jointime_in_days'].div(
    joint_data['transaction_id'])
joint_data.head()

In [None]:
# Number of customers per location value
joint_data['location'].value_counts()

In [None]:
# Create the LabelEncoder instance used to turn location labels into integer codes
label_encoder = LabelEncoder()

In [None]:
# Fit the encoder on the location column and store the resulting integer codes
joint_data['location_encoded'] = label_encoder.fit_transform(joint_data['location'])

In [None]:
# Convert the encoded integers to floats
joint_data['location_encoded'] = joint_data['location_encoded'].astype(float)
# Author's finding: using location_encoded as an input variable hurt all of the models
# (the accuracy scores for class '1' were well under 0.7), so it is not used as a feature.

In [None]:
# Preview the customer-level table with all engineered features
joint_data.head()

## RF (1 MONTH, 3 MONTHS, 6 MONTHS, 12 MONTHS)

RF PREP

In [None]:
# Convert the transaction_date column to datetime type (repeats the earlier conversion,
# so the RF section also works when run on its own)
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date'])

In [None]:
# Reference date for all recency windows: the latest transaction date in the data
last_date = transactions['transaction_date'].max()

In [None]:
def calculate_rf(data, end_date, days_label):
    """Aggregate transactions into per-customer recency/frequency/monetary columns.

    Parameters
    ----------
    data : DataFrame with customer_id, transaction_date, transaction_id and
        transaction_amount columns.
    end_date : reference timestamp that recency is measured back from.
    days_label : suffix appended to the output column names.

    Returns
    -------
    DataFrame indexed by customer_id with columns Recency_<label> (whole days between
    end_date and the customer's latest transaction), Frequency_<label> (transaction
    count) and Monetary_<label> (summed transaction amount).
    """
    def _days_since_latest(dates):
        return (end_date - dates.max()).days

    named_aggregations = {
        f'Recency_{days_label}': ('transaction_date', _days_since_latest),
        f'Frequency_{days_label}': ('transaction_id', 'count'),
        f'Monetary_{days_label}': ('transaction_amount', 'sum'),
    }
    return data.groupby('customer_id').agg(**named_aggregations)

1 MONTH

In [None]:
# 30-day window: transactions dated after (last_date - 30 days), up to and including last_date
days_30 = last_date - timedelta(days=30)
after_window_start_30 = transactions['transaction_date'].gt(days_30)
up_to_last_date_30 = transactions['transaction_date'].le(last_date)
last_30_days = transactions[after_window_start_30 & up_to_last_date_30]

In [None]:
# Recency / frequency / monetary per customer for the last 30 days
rf_30 = calculate_rf(last_30_days, last_date, '30')
rf_30
# Conclusion: not enough data to conduct any modeling for rf on last 30 days

3 MONTHS

In [None]:
# 90-day window: transactions dated after (last_date - 90 days), up to and including last_date
days_90 = last_date - timedelta(days=90)
after_window_start_90 = transactions['transaction_date'].gt(days_90)
up_to_last_date_90 = transactions['transaction_date'].le(last_date)
last_90_days = transactions[after_window_start_90 & up_to_last_date_90]
# Recency / frequency / monetary per customer inside that window
rf_90 = calculate_rf(last_90_days, last_date, '90')
rf_90

In [None]:
# Quartile-based RF scores for the 3-month window
ascending_labels_90 = [1, 2, 3, 4]
descending_labels_90 = [4, 3, 2, 1]

# Recency: fewer days since the last purchase is better, so the lowest quartile scores 4
rf_90['R_Score'] = pd.qcut(rf_90['Recency_90'], q=4, labels=descending_labels_90)

# Frequency and Monetary: larger is better, so the highest quartile scores 4
rf_90['F_Score'] = pd.qcut(rf_90['Frequency_90'], q=4, labels=ascending_labels_90)
rf_90['M_Score'] = pd.qcut(rf_90['Monetary_90'], q=4, labels=ascending_labels_90)

# Combined score adds Recency and Frequency only; M_Score stays a separate column
rf_90['rf_Score'] = rf_90['R_Score'].astype(int).add(rf_90['F_Score'].astype(int))
rf_90.head()

In [None]:
# Attach the customer-level features from joint_data to the 3-month RF table
# (left merge on customer_id, so every RF row is kept)
customer_features_90 = [
    'LTV', 'age', 'Customer_Join_Time', 'Avg_Transaction_Amount',
    'Customer_Jointime_in_days', 'SiteVisit_Score', 'EmailOpen_Score', 'Click_Score',
    'Engagement_Score', 'Gender_Categorical', 'Avg_Transaction_Time', 'campaign_id',
    'transaction_id',
]
rf_90 = rf_90.merge(joint_data[customer_features_90], on='customer_id', how='left')
rf_90.head()

6 MONTHS

In [None]:
# 180-day window: transactions dated after (last_date - 180 days), up to and including last_date
days_180 = last_date - timedelta(days=180)
after_window_start_180 = transactions['transaction_date'].gt(days_180)
up_to_last_date_180 = transactions['transaction_date'].le(last_date)
last_180_days = transactions[after_window_start_180 & up_to_last_date_180]
# Recency / frequency / monetary per customer inside that window
rf_180 = calculate_rf(last_180_days, last_date, '180')
rf_180.head()

In [None]:
# Quartile-based RF scores for the 6-month window
ascending_labels_180 = [1, 2, 3, 4]
descending_labels_180 = [4, 3, 2, 1]

# Recency: fewer days since the last purchase is better, so the lowest quartile scores 4
rf_180['R_Score'] = pd.qcut(rf_180['Recency_180'], q=4, labels=descending_labels_180)

# Frequency and Monetary: larger is better, so the highest quartile scores 4
rf_180['F_Score'] = pd.qcut(rf_180['Frequency_180'], q=4, labels=ascending_labels_180)
rf_180['M_Score'] = pd.qcut(rf_180['Monetary_180'], q=4, labels=ascending_labels_180)

# Combined score adds Recency and Frequency only; M_Score stays a separate column
rf_180['rf_Score'] = rf_180['R_Score'].astype(int).add(rf_180['F_Score'].astype(int))
rf_180.head()

In [None]:
# Attach the customer-level features from joint_data to the 6-month RF table
# (left merge on customer_id, so every RF row is kept)
customer_features_180 = [
    'LTV', 'age', 'Customer_Join_Time', 'Avg_Transaction_Amount',
    'Customer_Jointime_in_days', 'SiteVisit_Score', 'EmailOpen_Score', 'Click_Score',
    'Engagement_Score', 'Gender_Categorical', 'Avg_Transaction_Time', 'campaign_id',
    'transaction_id',
]
rf_180 = rf_180.merge(joint_data[customer_features_180], on='customer_id', how='left')
rf_180.head()

1 YEAR

In [None]:
# 365-day window: transactions dated after (last_date - 365 days), up to and including last_date
days_365 = last_date - timedelta(days=365)
after_window_start_365 = transactions['transaction_date'].gt(days_365)
up_to_last_date_365 = transactions['transaction_date'].le(last_date)
last_365_days = transactions[after_window_start_365 & up_to_last_date_365]
# Recency / frequency / monetary per customer inside that window
rf_365 = calculate_rf(last_365_days, last_date, '365')
rf_365.head()

In [None]:
# Quartile-based RF scores for the 1-year window
ascending_labels_365 = [1, 2, 3, 4]
descending_labels_365 = [4, 3, 2, 1]

# Recency: fewer days since the last purchase is better, so the lowest quartile scores 4
rf_365['R_Score'] = pd.qcut(rf_365['Recency_365'], q=4, labels=descending_labels_365)

# Frequency and Monetary: larger is better, so the highest quartile scores 4
rf_365['F_Score'] = pd.qcut(rf_365['Frequency_365'], q=4, labels=ascending_labels_365)
rf_365['M_Score'] = pd.qcut(rf_365['Monetary_365'], q=4, labels=ascending_labels_365)

# Combined score adds Recency and Frequency only; M_Score stays a separate column
rf_365['rf_Score'] = rf_365['R_Score'].astype(int).add(rf_365['F_Score'].astype(int))
rf_365.head()

In [None]:
# Attach the customer-level features from joint_data to the 1-year RF table
# (left merge on customer_id, so every RF row is kept)
customer_features_365 = [
    'LTV', 'age', 'Customer_Join_Time', 'Avg_Transaction_Amount',
    'Customer_Jointime_in_days', 'SiteVisit_Score', 'EmailOpen_Score', 'Click_Score',
    'Engagement_Score', 'Gender_Categorical', 'Avg_Transaction_Time', 'campaign_id',
    'transaction_id',
]
rf_365 = rf_365.merge(joint_data[customer_features_365], on='customer_id', how='left')
rf_365.head()

## MODEL BUILDING AND EVALUATION

3 MONTHS RF - LOGISTIC REGRESSION (THIS WAS CHOSEN AS THE MODEL OF CHOICE FOR RF)

In [None]:
# LTV summary again: its 75% row is the cut-off used for the binary target below
joint_data['LTV'].describe()

In [None]:
# Binary target: 1 when the customer's LTV is at or above the 75th percentile of
# joint_data['LTV'] (the highest-value customers over the lifetime of transactions), else 0.
# The cut-off is computed from the data rather than pasted in from describe() output,
# so it stays correct when the underlying data changes.
ltv_cutoff_90 = joint_data['LTV'].quantile(0.75)
rf_90['binary_output'] = (rf_90['LTV'] >= ltv_cutoff_90).astype(int)
rf_90['binary_output'].value_counts()

In [None]:
# Inputs and target for the 3-month model, which predicts whether a customer is
# high-value (binary_output == 1)
feature_columns_90 = [
    'age', 'Customer_Jointime_in_days', 'Engagement_Score', 'transaction_id',
    'Gender_Categorical', 'Avg_Transaction_Time', 'campaign_id',
    'R_Score', 'F_Score',
]
X_90 = rf_90[feature_columns_90]
y_90 = rf_90['binary_output']

In [None]:
# reserve 30% for testing; random_state is fixed so the split is reproducible
X_train_90, X_test_90, y_train_90, y_test_90 = train_test_split(X_90,y_90, test_size=0.3, random_state=42)

In [None]:
# Column groups for the preprocessing step: numeric columns are scaled,
# categorical columns are one-hot encoded
categorical_columns_90 = ['Gender_Categorical']
numeric_columns_90 = [
    'age',
    'Customer_Jointime_in_days',
    'Engagement_Score',
    'transaction_id',
    'Avg_Transaction_Time',
    'campaign_id',
    'R_Score',
    'F_Score',
]

In [None]:
# Pre-processing: min-max scale the numeric columns and one-hot encode the categorical
# ones (categories unseen during fit are ignored at transform time)
numeric_transformer_90 = ('num', MinMaxScaler(), numeric_columns_90)
categorical_transformer_90 = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns_90)
preprocessor_90 = ColumnTransformer(
    transformers=[numeric_transformer_90, categorical_transformer_90]
)

In [None]:
# Three candidate classifiers (all with default settings) to compare with cross validation
knn_90 = KNeighborsClassifier()
logreg_90 = LogisticRegression()
nb_90 = GaussianNB()

In [None]:
# Baseline comparison: mean 5-fold F1 of each candidate model on the training split.
# NOTE(review): the models are scored on the raw feature frame here; preprocessor_90
# (scaling / one-hot encoding) is not part of these fits.
cv_settings_90 = {'scoring': 'f1', 'cv': 5}
knn_scores_90 = cross_val_score(knn_90, X_train_90, y_train_90, **cv_settings_90)
logreg_scores_90 = cross_val_score(logreg_90, X_train_90, y_train_90, **cv_settings_90)
nb_scores_90 = cross_val_score(nb_90, X_train_90, y_train_90, **cv_settings_90)
print(f"knn_scores_90: {np.mean(knn_scores_90)}")
print(f"logreg_scores_90: {np.mean(logreg_scores_90)}")
print(f"nb_scores_90: {np.mean(nb_scores_90)}")

# KNN ruled out because it has the lowest score; LogReg will be put through the pipeline

In [None]:
# Pipeline to tune: preprocessing followed by logistic regression
pipeline_steps_90 = [
    ('preprocessor', preprocessor_90),
    ('classifier', LogisticRegression(max_iter=1000, solver='lbfgs')),
]
pipeline_90 = Pipeline(steps=pipeline_steps_90)

In [None]:
# Hyperparameter grid for the logistic-regression step of pipeline_90.
# It is split into sub-grids because not every solver supports every penalty: a single
# cross-product grid spends most of its fits on combinations that raise and score NaN.
#   - newton-cg / lbfgs / sag / saga support 'l2' and no penalty
#   - liblinear supports 'l1' and 'l2'
#   - saga also supports 'l1'
# 'elasticnet' is omitted: it needs an l1_ratio, which this search never supplied.
# No penalty is spelled None (the 'none' string is rejected by recent scikit-learn);
# C is pinned to 1 in that sub-grid because it is ignored when there is no penalty.
# Every sub-grid defines the same four keys, so best_params_ always contains them.
shared_grid_90 = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'classifier__max_iter': [100, 200, 300],  # Maximum number of solver iterations
}
param_grid_90 = [
    {**shared_grid_90,
     'classifier__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
     'classifier__penalty': ['l2']},
    {**shared_grid_90,
     'classifier__solver': ['liblinear'],
     'classifier__penalty': ['l1', 'l2']},
    {**shared_grid_90,
     'classifier__solver': ['saga'],
     'classifier__penalty': ['l1']},
    {'classifier__C': [1],
     'classifier__max_iter': [100, 200, 300],
     'classifier__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
     'classifier__penalty': [None]},
]

In [None]:
# 5-fold grid search over param_grid_90, selecting the combination with the best F1
grid_search_90 = GridSearchCV(pipeline_90, param_grid_90, cv=5, verbose=1, scoring='f1')
grid_search_90.fit(X_train_90,y_train_90)

In [None]:
# Logistic regression configured with the best hyperparameters found by the grid search
best_params_90 = grid_search_90.best_params_
final_classifier_90 = LogisticRegression(
    C=best_params_90['classifier__C'],
    solver=best_params_90['classifier__solver'],
    penalty=best_params_90['classifier__penalty'],
    max_iter=best_params_90['classifier__max_iter'],
)

In [None]:
# Final 3-month pipeline: same preprocessing, tuned classifier
final_steps_90 = [
    ('preprocessor', preprocessor_90),
    ('classifier', final_classifier_90),
]
final_pipeline_90 = Pipeline(steps=final_steps_90)

In [None]:
# Fit the final pipeline on the 3-month training split
final_pipeline_90.fit(X_train_90,y_train_90)

In [None]:
# Class predictions and class probabilities for the held-out test split
pred_90 = final_pipeline_90.predict(X_test_90)
probs_90 = final_pipeline_90.predict_proba(X_test_90)

In [None]:
# Logistic Regression - Using RF Scores over 3 month period (after building pipeline and using GridSearch)
# Precision / recall / F1 per class on the held-out test split
report_90 = classification_report(y_test_90, pred_90)
print(report_90)

RandomForestRegressor - 3 MONTH RF

In [None]:
# Create a RandomForestRegressor object (100 trees, fixed seed for reproducibility)
model_90RFR = RandomForestRegressor(n_estimators=100, random_state=42)

# Splitting the data into training and testing sets
# (same inputs, test_size and random_state as the logistic-regression split)
X_train_90RFR, X_test_90RFR, y_train_90RFR, y_test_90RFR = train_test_split(X_90, y_90, test_size=0.3, random_state=42)

In [None]:
# Fit the model to the training data (the target is the 0/1 binary_output column)
model_90RFR.fit(X_train_90RFR, y_train_90RFR)

In [None]:
# Continuous regressor predictions for the test split; converted to classes in the next cell
y_pred_90RFR= model_90RFR.predict(X_test_90RFR)
y_pred_90RFR

In [None]:
# Convert predictions to binary by applying a threshold:
# predictions strictly above 0.5 become class 1, everything else class 0
threshold_90RFR = 0.5
y_pred_binary_90RFR = (y_pred_90RFR > threshold_90RFR).astype(int)


#  RandomForestRegressor Scores - Using RF Scores over 3 month period
print(classification_report(y_test_90RFR, y_pred_binary_90RFR))

6 MONTHS

In [None]:
# Binary target: 1 when the customer's LTV is at or above the 75th percentile of
# joint_data['LTV'] (the highest-value customers over the lifetime of transactions), else 0.
# The cut-off is computed from the data rather than pasted in from describe() output,
# so it stays correct when the underlying data changes.
ltv_cutoff_180 = joint_data['LTV'].quantile(0.75)
rf_180['binary_output'] = (rf_180['LTV'] >= ltv_cutoff_180).astype(int)
rf_180['binary_output'].value_counts()

In [None]:
# Inputs and target for the 6-month model, which predicts whether a customer is
# high-value (binary_output == 1)
feature_columns_180 = [
    'age', 'Customer_Jointime_in_days', 'Engagement_Score', 'transaction_id',
    'Gender_Categorical', 'Avg_Transaction_Time', 'campaign_id',
    'R_Score', 'F_Score',
]
X_180 = rf_180[feature_columns_180]
y_180 = rf_180['binary_output']

In [None]:
# reserve 30% for testing; random_state is fixed so the split is reproducible
X_train_180, X_test_180, y_train_180, y_test_180 = train_test_split(X_180,y_180, test_size=0.3, random_state=42)

In [None]:
# Column groups for the preprocessing step: numeric columns are scaled,
# categorical columns are one-hot encoded
categorical_columns_180 = ['Gender_Categorical']
numeric_columns_180 = [
    'age',
    'Customer_Jointime_in_days',
    'Engagement_Score',
    'transaction_id',
    'Avg_Transaction_Time',
    'campaign_id',
    'R_Score',
    'F_Score',
]

In [None]:
# Pre-processing: min-max scale the numeric columns and one-hot encode the categorical
# ones (categories unseen during fit are ignored at transform time)
numeric_transformer_180 = ('num', MinMaxScaler(), numeric_columns_180)
categorical_transformer_180 = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns_180)
preprocessor_180 = ColumnTransformer(
    transformers=[numeric_transformer_180, categorical_transformer_180]
)

In [None]:
# Three candidate classifiers (all with default settings) to compare with cross validation
knn_180 = KNeighborsClassifier()
logreg_180 = LogisticRegression()
nb_180 = GaussianNB()

In [None]:
# Baseline comparison: mean 5-fold F1 of each candidate model on the training split.
# NOTE(review): the models are scored on the raw feature frame here; preprocessor_180
# (scaling / one-hot encoding) is not part of these fits.
cv_settings_180 = {'scoring': 'f1', 'cv': 5}
knn_scores_180 = cross_val_score(knn_180, X_train_180, y_train_180, **cv_settings_180)
logreg_scores_180 = cross_val_score(logreg_180, X_train_180, y_train_180, **cv_settings_180)
nb_scores_180 = cross_val_score(nb_180, X_train_180, y_train_180, **cv_settings_180)
print(f"knn_scores_180: {np.mean(knn_scores_180)}")
print(f"logreg_scores_180: {np.mean(logreg_scores_180)}")
print(f"nb_scores_180: {np.mean(nb_scores_180)}")

In [None]:
# Pipeline to tune: preprocessing followed by logistic regression
pipeline_steps_180 = [
    ('preprocessor', preprocessor_180),
    ('classifier', LogisticRegression(max_iter=1000, solver='lbfgs')),
]
pipeline_180 = Pipeline(steps=pipeline_steps_180)

In [None]:
# Hyperparameter grid for the logistic-regression step of pipeline_180.
# It is split into sub-grids because not every solver supports every penalty: a single
# cross-product grid spends most of its fits on combinations that raise and score NaN.
#   - newton-cg / lbfgs / sag / saga support 'l2' and no penalty
#   - liblinear supports 'l1' and 'l2'
#   - saga also supports 'l1'
# 'elasticnet' is omitted: it needs an l1_ratio, which this search never supplied.
# No penalty is spelled None (the 'none' string is rejected by recent scikit-learn);
# C is pinned to 1 in that sub-grid because it is ignored when there is no penalty.
# Every sub-grid defines the same four keys, so best_params_ always contains them.
shared_grid_180 = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'classifier__max_iter': [100, 200, 300],  # Maximum number of solver iterations
}
param_grid_180 = [
    {**shared_grid_180,
     'classifier__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
     'classifier__penalty': ['l2']},
    {**shared_grid_180,
     'classifier__solver': ['liblinear'],
     'classifier__penalty': ['l1', 'l2']},
    {**shared_grid_180,
     'classifier__solver': ['saga'],
     'classifier__penalty': ['l1']},
    {'classifier__C': [1],
     'classifier__max_iter': [100, 200, 300],
     'classifier__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
     'classifier__penalty': [None]},
]

In [None]:
# 5-fold grid search over param_grid_180, selecting the combination with the best F1
grid_search_180 = GridSearchCV(pipeline_180, param_grid_180, cv=5, verbose=1, scoring='f1')
grid_search_180.fit(X_train_180,y_train_180)

In [None]:
# Logistic regression configured with the best hyperparameters found by the grid search
best_params_180 = grid_search_180.best_params_
final_classifier_180 = LogisticRegression(
    C=best_params_180['classifier__C'],
    solver=best_params_180['classifier__solver'],
    penalty=best_params_180['classifier__penalty'],
    max_iter=best_params_180['classifier__max_iter'],
)

In [None]:
# Final 6-month pipeline: same preprocessing, tuned classifier
final_steps_180 = [
    ('preprocessor', preprocessor_180),
    ('classifier', final_classifier_180),
]
final_pipeline_180 = Pipeline(steps=final_steps_180)

In [None]:
# Fit the final pipeline on the 6-month training split
final_pipeline_180.fit(X_train_180,y_train_180)

In [None]:
# Class predictions and class probabilities for the held-out test split
pred_180 = final_pipeline_180.predict(X_test_180)
probs_180 = final_pipeline_180.predict_proba(X_test_180)

In [None]:
# Logistic Regression - Using RF Scores over 6 month period (after building pipeline and using GridSearch)
# Precision / recall / F1 per class on the held-out test split
report_180 = classification_report(y_test_180, pred_180)
print(report_180)

RandomForestRegressor - 6 MONTH RF

In [None]:
# Create a RandomForestRegressor object (100 trees, fixed seed for reproducibility)
model_180RFR = RandomForestRegressor(n_estimators=100, random_state=42)

# Splitting the data into training and testing sets
# (same inputs, test_size and random_state as the logistic-regression split)
X_train_180RFR, X_test_180RFR, y_train_180RFR, y_test_180RFR = train_test_split(X_180, y_180, test_size=0.3, random_state=42)

In [None]:
# Fit the model to the training data (the target is the 0/1 binary_output column)
model_180RFR.fit(X_train_180RFR, y_train_180RFR)

In [None]:
# Continuous regressor predictions for the test split; converted to classes in the next cell
y_pred_180RFR= model_180RFR.predict(X_test_180RFR)
y_pred_180RFR

In [None]:
# Convert predictions to binary by applying a threshold:
# predictions strictly above 0.5 become class 1, everything else class 0
threshold_180RFR = 0.5
y_pred_binary_180RFR = (y_pred_180RFR > threshold_180RFR).astype(int)


#  RandomForestRegressor Scores - Using RF Scores over 6 month period
print(classification_report(y_test_180RFR, y_pred_binary_180RFR))

1 YEAR

In [None]:
# Binary target: 1 when the customer's LTV is at or above the 75th percentile of
# joint_data['LTV'] (the highest-value customers over the lifetime of transactions), else 0.
# The cut-off is computed from the data rather than pasted in from describe() output,
# so it stays correct when the underlying data changes.
ltv_cutoff_365 = joint_data['LTV'].quantile(0.75)
rf_365['binary_output'] = (rf_365['LTV'] >= ltv_cutoff_365).astype(int)
rf_365['binary_output'].value_counts()

In [None]:
# Inputs and target for the 1-year model, which predicts whether a customer is
# high-value (binary_output == 1)
feature_columns_365 = [
    'age', 'Customer_Jointime_in_days', 'Engagement_Score', 'transaction_id',
    'Avg_Transaction_Time', 'campaign_id',
    'R_Score', 'F_Score', 'Gender_Categorical',
]
X_365 = rf_365[feature_columns_365]
y_365 = rf_365['binary_output']

In [None]:
# reserve 30% for testing; random_state is fixed so the split is reproducible
X_train_365, X_test_365, y_train_365, y_test_365 = train_test_split(X_365,y_365, test_size=0.3, random_state=42)

In [None]:
# Column groups for the preprocessing step: numeric columns are scaled,
# categorical columns are one-hot encoded
categorical_columns_365 = ['Gender_Categorical']
numeric_columns_365 = [
    'age',
    'Customer_Jointime_in_days',
    'Engagement_Score',
    'transaction_id',
    'Avg_Transaction_Time',
    'campaign_id',
    'R_Score',
    'F_Score',
]

In [None]:
# Pre-processing: min-max scale the numeric columns and one-hot encode the categorical
# ones (categories unseen during fit are ignored at transform time)
numeric_transformer_365 = ('num', MinMaxScaler(), numeric_columns_365)
categorical_transformer_365 = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns_365)
preprocessor_365 = ColumnTransformer(
    transformers=[numeric_transformer_365, categorical_transformer_365]
)

In [None]:
# Three candidate classifiers (all with default settings) to compare with cross validation
knn_365 = KNeighborsClassifier()
logreg_365 = LogisticRegression()
nb_365 = GaussianNB()

In [None]:
# Baseline comparison: mean 5-fold F1 of each candidate model on the training split.
# NOTE(review): the models are scored on the raw feature frame here; preprocessor_365
# (scaling / one-hot encoding) is not part of these fits.
cv_settings_365 = {'scoring': 'f1', 'cv': 5}
knn_scores_365 = cross_val_score(knn_365, X_train_365, y_train_365, **cv_settings_365)
logreg_scores_365 = cross_val_score(logreg_365, X_train_365, y_train_365, **cv_settings_365)
nb_scores_365 = cross_val_score(nb_365, X_train_365, y_train_365, **cv_settings_365)
print(f"knn_scores_365: {np.mean(knn_scores_365)}")
print(f"logreg_scores_365: {np.mean(logreg_scores_365)}")
print(f"nb_scores_365: {np.mean(nb_scores_365)}")

In [None]:
# Pipeline to tune: preprocessing followed by logistic regression
pipeline_steps_365 = [
    ('preprocessor', preprocessor_365),
    ('classifier', LogisticRegression(max_iter=1000, solver='lbfgs')),
]
pipeline_365 = Pipeline(steps=pipeline_steps_365)

In [None]:
# Hyperparameter grid for the logistic-regression step of pipeline_365.
# It is split into sub-grids because not every solver supports every penalty: a single
# cross-product grid spends most of its fits on combinations that raise and score NaN.
#   - newton-cg / lbfgs / sag / saga support 'l2' and no penalty
#   - liblinear supports 'l1' and 'l2'
#   - saga also supports 'l1'
# 'elasticnet' is omitted: it needs an l1_ratio, which this search never supplied.
# No penalty is spelled None (the 'none' string is rejected by recent scikit-learn);
# C is pinned to 1 in that sub-grid because it is ignored when there is no penalty.
# Every sub-grid defines the same four keys, so best_params_ always contains them.
shared_grid_365 = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'classifier__max_iter': [100, 200, 300],  # Maximum number of solver iterations
}
param_grid_365 = [
    {**shared_grid_365,
     'classifier__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
     'classifier__penalty': ['l2']},
    {**shared_grid_365,
     'classifier__solver': ['liblinear'],
     'classifier__penalty': ['l1', 'l2']},
    {**shared_grid_365,
     'classifier__solver': ['saga'],
     'classifier__penalty': ['l1']},
    {'classifier__C': [1],
     'classifier__max_iter': [100, 200, 300],
     'classifier__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
     'classifier__penalty': [None]},
]

In [None]:
# 5-fold grid search over param_grid_365, selecting the combination with the best F1
grid_search_365 = GridSearchCV(pipeline_365, param_grid_365, cv=5, verbose=1, scoring='f1')
grid_search_365.fit(X_train_365,y_train_365)

In [None]:
# Logistic regression configured with the best hyperparameters found by the grid search
best_params_365 = grid_search_365.best_params_
final_classifier_365 = LogisticRegression(
    C=best_params_365['classifier__C'],
    solver=best_params_365['classifier__solver'],
    penalty=best_params_365['classifier__penalty'],
    max_iter=best_params_365['classifier__max_iter'],
)

In [None]:
# Final 1-year pipeline: same preprocessing, tuned classifier
final_steps_365 = [
    ('preprocessor', preprocessor_365),
    ('classifier', final_classifier_365),
]
final_pipeline_365 = Pipeline(steps=final_steps_365)

In [None]:
# Fit the final pipeline on the 1-year training split
final_pipeline_365.fit(X_train_365,y_train_365)

In [None]:
# Class predictions and class probabilities for the held-out test split
pred_365 = final_pipeline_365.predict(X_test_365)
probs_365 = final_pipeline_365.predict_proba(X_test_365)

In [None]:
# Logistic Regression - Using RF Scores over 1 year period (after building pipeline and using GridSearch)
# Precision / recall / F1 per class on the held-out test split
report_365 = classification_report(y_test_365, pred_365)
print(report_365)

RandomForestRegressor - 1 YEAR RF

In [None]:
# Create a RandomForestRegressor object (100 trees, fixed seed for reproducibility)
model_365RFR = RandomForestRegressor(n_estimators=100, random_state=42)
# Splitting the data into training and testing sets
# (same inputs, test_size and random_state as the logistic-regression split)
X_train_365RFR, X_test_365RFR, y_train_365RFR, y_test_365RFR = train_test_split(X_365, y_365, test_size=0.3, random_state=42)

In [None]:
# Fit the model to the training data (the target is the 0/1 binary_output column)
model_365RFR.fit(X_train_365RFR, y_train_365RFR)

In [None]:
# Continuous regressor predictions for the test split; converted to classes in the next cell
y_pred_365RFR= model_365RFR.predict(X_test_365RFR)
y_pred_365RFR

In [None]:
# Convert predictions to binary by applying a threshold:
# predictions strictly above 0.5 become class 1, everything else class 0
threshold_365RFR = 0.5
y_pred_binary_365RFR = (y_pred_365RFR > threshold_365RFR).astype(int)
#  RandomForestRegressor Scores - Using RF Scores over 1 year period
print(classification_report(y_test_365RFR, y_pred_binary_365RFR))

## SCORES

3 MONTHS

In [None]:
# Mean cross-validated F1 scores for KNN, LogReg, and NB - Using RF Scores over 3 month period
# (before building pipeline and using GridSearch)
# Logreg chosen due to having highest score
print(f"knn_scores_90: {np.mean(knn_scores_90)}")
print(f"logreg_scores_90: {np.mean(logreg_scores_90)}")
print(f"nb_scores_90: {np.mean(nb_scores_90)}")

In [None]:
# Logistic Regression - Using RF Scores over 3 month period (after building pipeline and using GridSearch)
report_90 = classification_report(y_test_90, pred_90)
print(report_90)


In [None]:
# Extract the fitted preprocessor and classifier from the final 3-month pipeline
preprocessor_90 = final_pipeline_90.named_steps['preprocessor']
classifier_90 = final_pipeline_90.named_steps['classifier']

# Feature names after preprocessing: the scaled numeric columns first, then the one-hot
# encoded categorical columns (the order the transformers were declared in)
feature_names_90 = np.hstack([
    preprocessor_90.transformers_[0][1].get_feature_names_out(),
    preprocessor_90.transformers_[1][1].get_feature_names_out()
])

# Pair each feature name with its fitted coefficient.
# The coefficients must come from classifier_90, the estimator extracted above;
# a bare `classifier` name is never defined in this notebook.
coefficients_90 = pd.DataFrame({
    'Feature': feature_names_90,
    'Coefficient': classifier_90.coef_[0]
})

print(coefficients_90)

In [None]:
#  RandomForestRegressor Scores - Using RF Scores over 3 month period
#  (regressor output thresholded at 0.5, compared with the true test labels)
print(classification_report(y_test_90RFR, y_pred_binary_90RFR))

6 MONTHS

In [None]:
# Mean cross-validated F1 scores for KNN, LogReg, and NB - Using RF Scores over 6 month period
# (before building pipeline and using GridSearch)
# Logreg chosen due to having highest score
print(f"knn_scores_180: {np.mean(knn_scores_180)}")
print(f"logreg_scores_180: {np.mean(logreg_scores_180)}")
print(f"nb_scores_180: {np.mean(nb_scores_180)}")

In [None]:
# Logistic Regression - Using RF Scores over 6 month period (after building pipeline and using GridSearch)
report_180 = classification_report(y_test_180, pred_180)
print(report_180)

In [None]:
# Extract the fitted preprocessor and classifier from the final 6-month pipeline
preprocessor_180 = final_pipeline_180.named_steps['preprocessor']
classifier_180 = final_pipeline_180.named_steps['classifier']

# Feature names after preprocessing: the scaled numeric columns first, then the one-hot
# encoded categorical columns (the order the transformers were declared in)
feature_names_180 = np.hstack([
    preprocessor_180.transformers_[0][1].get_feature_names_out(),
    preprocessor_180.transformers_[1][1].get_feature_names_out()
])

# Pair each feature name with its fitted coefficient.
# The coefficients must come from classifier_180, the estimator extracted above;
# a bare `classifier` name is never defined in this notebook.
coefficients_180 = pd.DataFrame({
    'Feature': feature_names_180,
    'Coefficient': classifier_180.coef_[0]
})

print(coefficients_180)

In [None]:
#  RandomForestRegressor Scores - Using RF Scores over 6 month period
#  (regressor output thresholded at 0.5, compared with the true test labels)
print(classification_report(y_test_180RFR, y_pred_binary_180RFR))

1 YEAR

In [None]:
# Mean cross-validated F1 scores for KNN, LogReg, and NB - Using RF Scores over 1 year period
# (before building pipeline and using GridSearch)
# Logreg chosen due to having highest score
print(f"knn_scores_365: {np.mean(knn_scores_365)}")
print(f"logreg_scores_365: {np.mean(logreg_scores_365)}")
print(f"nb_scores_365: {np.mean(nb_scores_365)}")

In [None]:
# Logistic Regression - Using RF Scores over 1 year period (after building pipeline and using GridSearch)
report_365 = classification_report(y_test_365, pred_365)
print(report_365)

In [None]:
# Preview the 1-year RF table, including the merged features and binary_output
rf_365.head()

In [None]:
# Extract the fitted preprocessor and classifier from the final 1-year pipeline
preprocessor_365 = final_pipeline_365.named_steps['preprocessor']
classifier_365 = final_pipeline_365.named_steps['classifier']

# Feature names after preprocessing: the scaled numeric columns first, then the one-hot
# encoded categorical columns (the order the transformers were declared in)
feature_names_365 = np.hstack([
    preprocessor_365.transformers_[0][1].get_feature_names_out(),
    preprocessor_365.transformers_[1][1].get_feature_names_out()
])

# Pair each feature name with its fitted coefficient.
# The coefficients must come from classifier_365, the estimator extracted above;
# a bare `classifier` name is never defined in this notebook.
coefficients_365 = pd.DataFrame({
    'Feature': feature_names_365,
    'Coefficient': classifier_365.coef_[0]
})

print(coefficients_365)

In [None]:
#  RandomForestRegressor Scores - Using RF Scores over 1 year period
#  (regressor output thresholded at 0.5, compared with the true test labels)
print(classification_report(y_test_365RFR, y_pred_binary_365RFR))