In [821]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, PrecisionRecallDisplay, precision_score, recall_score, roc_auc_score, RocCurveDisplay, roc_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression
from datetime import timedelta
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV #GridSearch is for hyperparameter tuning
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, PrecisionRecallDisplay, RocCurveDisplay
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

In [822]:
# read all tables
customers = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/customers_final.csv')
engagement = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/engagements_final.csv')
marketing = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/marketing_final.csv')
transactions = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/transactions_final.csv')

MERGE DATA

In [823]:
# step 1: marketing data at a customer level
marketing_agg = marketing[marketing['response']=='Yes'].groupby('customer_id')['campaign_id'].count().to_frame()
# step 2: aggregate transaction data at a customer level
transactions_agg = transactions.groupby('customer_id').aggregate({'transaction_id':'count','transaction_amount':'sum'})

In [824]:
# step 3: set customers and engagement index as customer_id
customers.set_index('customer_id', inplace=True)
engagement.set_index('customer_id', inplace=True)
# step 4: join all tables
joint_data = customers.join(engagement).join(transactions_agg).join(marketing_agg)
joint_data.head()

Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,number_of_site_visits,number_of_emails_opened,number_of_clicks,transaction_id,transaction_amount,campaign_id
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,10,15,1,6,3509.48,1.0
2,2021-09-08,2023-10-25,,Male,Hillville,285,49,51,9,6081.32,2.0
3,2021-06-01,2022-11-27,,,North Latoyatown,192,73,25,6,1454.87,1.0
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,110,30,17,20,7874.68,1.0
5,2022-01-24,2023-06-02,,Male,East Matthewfort,161,2,7,24,15524.55,


## DATA CLEANING & FEATURE ENGINEERING

LTV Calculation

In [825]:
# Calculation of LTV
joint_data.groupby('customer_id')['transaction_amount'].sum()
joint_data['LTV'] = joint_data.groupby('customer_id')['transaction_amount'].sum()
joint_data.head()


Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,number_of_site_visits,number_of_emails_opened,number_of_clicks,transaction_id,transaction_amount,campaign_id,LTV
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,10,15,1,6,3509.48,1.0,3509.48
2,2021-09-08,2023-10-25,,Male,Hillville,285,49,51,9,6081.32,2.0,6081.32
3,2021-06-01,2022-11-27,,,North Latoyatown,192,73,25,6,1454.87,1.0,1454.87
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,110,30,17,20,7874.68,1.0,7874.68
5,2022-01-24,2023-06-02,,Male,East Matthewfort,161,2,7,24,15524.55,,15524.55


In [826]:
joint_data['LTV'].describe()
# note: 75th percentile will be used as binary output for all the models

count    10000.000000
mean      7737.562981
std       4628.799469
min         23.810000
25%       3839.915000
50%       7498.890000
75%      11275.797500
max      24298.220000
Name: LTV, dtype: float64

In [827]:
# 10% of age entries are blank 
null_counts = joint_data['age'].isnull().sum()
null_counts

1009

In [828]:
# Create KNNImputer instance - this is to clean the age data since it will be used in modelling
imputer = KNNImputer(n_neighbors=2)

In [829]:
# Apply the imputer to the age column
joint_data[['age']] = imputer.fit_transform(joint_data[['age']])

In [830]:
joint_data['campaign_id'].value_counts()

campaign_id
1.0    4011
2.0    2563
3.0     923
4.0     168
Name: count, dtype: int64

In [831]:
# over 20% of customers were not sent any marketing campaigns
campaign_null_counts = joint_data['campaign_id'].isnull().sum()
campaign_null_counts

2335

In [832]:
# KNN used to make up for the null values
imputer = KNNImputer(n_neighbors=5)
joint_data[['campaign_id']] = imputer.fit_transform(joint_data[['campaign_id']])

In [833]:
# Convert transaction_date to datetime
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date'])

In [834]:
# Customer Join Time (How long they have been a member)
joint_data['Customer_Join_Time'] = pd.to_datetime(joint_data['last_purchase_date']) - pd.to_datetime(joint_data['join_date'])
joint_data['Customer_Join_Time']

customer_id
1       118 days
2       777 days
3       544 days
4       243 days
5       494 days
          ...   
9996    240 days
9997    200 days
9998    135 days
9999     66 days
10000    98 days
Name: Customer_Join_Time, Length: 10000, dtype: timedelta64[ns]

In [835]:
# Avg transaction amount
joint_data['Avg_Transaction_Amount'] = (joint_data['LTV'])/(joint_data['transaction_id'])
joint_data.head()

Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,number_of_site_visits,number_of_emails_opened,number_of_clicks,transaction_id,transaction_amount,campaign_id,LTV,Customer_Join_Time,Avg_Transaction_Amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,10,15,1,6,3509.48,1.0,3509.48,118 days,584.913333
2,2021-09-08,2023-10-25,43.467467,Male,Hillville,285,49,51,9,6081.32,2.0,6081.32,777 days,675.702222
3,2021-06-01,2022-11-27,43.467467,,North Latoyatown,192,73,25,6,1454.87,1.0,1454.87,544 days,242.478333
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,110,30,17,20,7874.68,1.0,7874.68,243 days,393.734
5,2022-01-24,2023-06-02,43.467467,Male,East Matthewfort,161,2,7,24,15524.55,1.640965,15524.55,494 days,646.85625


In [836]:
# Tier Avg Spend
joint_data['Avg_Spending_Tier'] = pd.qcut(joint_data['Avg_Transaction_Amount'], 3, labels=['low', 'medium', 'high'])

In [837]:
# PREP Avg Spend for CATEGORICAL COLUMN 
def Avg_Spend_Categorical(x):
    if x == 'low':
        return 1
    elif x == 'medium':
        return 2
    elif x == 'high':
        return 3

In [838]:
joint_data['Avg_Spend_Categorical'] = joint_data['Avg_Spending_Tier'].apply(Avg_Spend_Categorical)
joint_data.head()

Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,number_of_site_visits,number_of_emails_opened,number_of_clicks,transaction_id,transaction_amount,campaign_id,LTV,Customer_Join_Time,Avg_Transaction_Amount,Avg_Spending_Tier,Avg_Spend_Categorical
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,10,15,1,6,3509.48,1.0,3509.48,118 days,584.913333,medium,2
2,2021-09-08,2023-10-25,43.467467,Male,Hillville,285,49,51,9,6081.32,2.0,6081.32,777 days,675.702222,high,3
3,2021-06-01,2022-11-27,43.467467,,North Latoyatown,192,73,25,6,1454.87,1.0,1454.87,544 days,242.478333,low,1
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,110,30,17,20,7874.68,1.0,7874.68,243 days,393.734,low,1
5,2022-01-24,2023-06-02,43.467467,Male,East Matthewfort,161,2,7,24,15524.55,1.640965,15524.55,494 days,646.85625,medium,2


In [839]:
# Most Recent Purchase (in days)
joint_data['most_recent_purchase_date'] = pd.to_datetime(joint_data['last_purchase_date'].max()) - pd.to_datetime(joint_data['last_purchase_date'])

In [840]:
# convert to an integer number of days for most recent purchase
joint_data['most_recent_purchase_in_days'] = joint_data['most_recent_purchase_date'].dt.days

In [841]:
# convert to an integer number of days for customer age
joint_data['Customer_Jointime_in_days']=joint_data['Customer_Join_Time'].dt.days

In [842]:
# Gender for CATEGORICAL COLUMN 
def Gender_Categorical(x):
    if x == 'Male':
        return 1
    elif x == 'Female':
        return 2
    else:
        return 0

In [843]:
joint_data['Gender_Categorical'] = joint_data['gender'].apply(Gender_Categorical)
joint_data.head()

Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,number_of_site_visits,number_of_emails_opened,number_of_clicks,transaction_id,transaction_amount,campaign_id,LTV,Customer_Join_Time,Avg_Transaction_Amount,Avg_Spending_Tier,Avg_Spend_Categorical,most_recent_purchase_date,most_recent_purchase_in_days,Customer_Jointime_in_days,Gender_Categorical
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,10,15,1,6,3509.48,1.0,3509.48,118 days,584.913333,medium,2,75 days,75,118,2
2,2021-09-08,2023-10-25,43.467467,Male,Hillville,285,49,51,9,6081.32,2.0,6081.32,777 days,675.702222,high,3,219 days,219,777,1
3,2021-06-01,2022-11-27,43.467467,,North Latoyatown,192,73,25,6,1454.87,1.0,1454.87,544 days,242.478333,low,1,551 days,551,544,0
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,110,30,17,20,7874.68,1.0,7874.68,243 days,393.734,low,1,638 days,638,243,1
5,2022-01-24,2023-06-02,43.467467,Male,East Matthewfort,161,2,7,24,15524.55,1.640965,15524.55,494 days,646.85625,medium,2,364 days,364,494,1


In [844]:
# Scoring Frequency and Monetary: Higher values are better
joint_data['SiteVisit_Score'] = pd.qcut(joint_data['number_of_site_visits'], 4, labels=[1, 2, 3, 4])
joint_data['EmailOpen_Score'] = pd.qcut(joint_data['number_of_emails_opened'], 4, labels=[1, 2, 3, 4])
joint_data['Click_Score'] = pd.qcut(joint_data['number_of_clicks'], 4, labels=[1, 2, 3, 4])

# Combine scores to a single score
joint_data['Engagement_Score'] = joint_data['SiteVisit_Score'].astype(int) + joint_data['EmailOpen_Score'].astype(int) + joint_data['Click_Score'].astype(int)
joint_data.head()

Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,number_of_site_visits,number_of_emails_opened,number_of_clicks,transaction_id,transaction_amount,...,Avg_Spending_Tier,Avg_Spend_Categorical,most_recent_purchase_date,most_recent_purchase_in_days,Customer_Jointime_in_days,Gender_Categorical,SiteVisit_Score,EmailOpen_Score,Click_Score,Engagement_Score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,10,15,1,6,3509.48,...,medium,2,75 days,75,118,2,1,2,1,4
2,2021-09-08,2023-10-25,43.467467,Male,Hillville,285,49,51,9,6081.32,...,high,3,219 days,219,777,1,4,3,4,11
3,2021-06-01,2022-11-27,43.467467,,North Latoyatown,192,73,25,6,1454.87,...,low,1,551 days,551,544,0,4,4,3,11
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,110,30,17,20,7874.68,...,low,1,638 days,638,243,1,3,3,3,9
5,2022-01-24,2023-06-02,43.467467,Male,East Matthewfort,161,2,7,24,15524.55,...,medium,2,364 days,364,494,1,4,1,2,7


In [845]:
joint_data['Engagement_Score'].describe()

count    10000.00000
mean         7.44240
std          2.89733
min          3.00000
25%          5.00000
50%          8.00000
75%         10.00000
max         12.00000
Name: Engagement_Score, dtype: float64

In [846]:
# PREP Engagement for CATEGORICAL COLUMN 
# 25th percentile = 1, 50th percentile = 2, 75th percentile = 3
def Engage_Categorical(x):
    if x <=5:
        return 1
    elif x <=8:
        return 2
    elif x >= 10:
        return 3
    else:
        return 0

In [847]:
joint_data['Engage_Categorical'] = joint_data['Engagement_Score'].apply(Engage_Categorical)
joint_data.head()

Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,number_of_site_visits,number_of_emails_opened,number_of_clicks,transaction_id,transaction_amount,...,Avg_Spend_Categorical,most_recent_purchase_date,most_recent_purchase_in_days,Customer_Jointime_in_days,Gender_Categorical,SiteVisit_Score,EmailOpen_Score,Click_Score,Engagement_Score,Engage_Categorical
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,10,15,1,6,3509.48,...,2,75 days,75,118,2,1,2,1,4,1
2,2021-09-08,2023-10-25,43.467467,Male,Hillville,285,49,51,9,6081.32,...,3,219 days,219,777,1,4,3,4,11,3
3,2021-06-01,2022-11-27,43.467467,,North Latoyatown,192,73,25,6,1454.87,...,1,551 days,551,544,0,4,4,3,11,3
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,110,30,17,20,7874.68,...,1,638 days,638,243,1,3,3,3,9,0
5,2022-01-24,2023-06-02,43.467467,Male,East Matthewfort,161,2,7,24,15524.55,...,2,364 days,364,494,1,4,1,2,7,2


In [848]:
# Avg Transaction Time = how often do they make a transaction (in days)
joint_data['Avg_Transaction_Time'] = joint_data['Customer_Jointime_in_days']/joint_data['transaction_id']
joint_data.head()

Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,number_of_site_visits,number_of_emails_opened,number_of_clicks,transaction_id,transaction_amount,...,most_recent_purchase_date,most_recent_purchase_in_days,Customer_Jointime_in_days,Gender_Categorical,SiteVisit_Score,EmailOpen_Score,Click_Score,Engagement_Score,Engage_Categorical,Avg_Transaction_Time
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,10,15,1,6,3509.48,...,75 days,75,118,2,1,2,1,4,1,19.666667
2,2021-09-08,2023-10-25,43.467467,Male,Hillville,285,49,51,9,6081.32,...,219 days,219,777,1,4,3,4,11,3,86.333333
3,2021-06-01,2022-11-27,43.467467,,North Latoyatown,192,73,25,6,1454.87,...,551 days,551,544,0,4,4,3,11,3,90.666667
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,110,30,17,20,7874.68,...,638 days,638,243,1,3,3,3,9,0,12.15
5,2022-01-24,2023-06-02,43.467467,Male,East Matthewfort,161,2,7,24,15524.55,...,364 days,364,494,1,4,1,2,7,2,20.583333


In [849]:
joint_data['location'].value_counts()

location
Lake James           15
North Christopher    12
Port John            11
West David           11
New John             10
                     ..
Adamstown             1
Lake Deborahburgh     1
New Joanna            1
New Lynnburgh         1
Hollytown             1
Name: count, Length: 7695, dtype: int64

In [850]:
# Create the LabelEncoder instance
label_encoder = LabelEncoder()

In [851]:
# Fit and transform the data
joint_data['location_encoded'] = label_encoder.fit_transform(joint_data['location'])

In [852]:
# Convert the encoded integers to floats
joint_data['location_encoded'] = joint_data['location_encoded'].astype(float)
# it was found that this hurt when inserted as an input variable for all of the models as the accuracy scores for '1' were
# well under 0.7

## RF (1 MONTH, 3 MONTHS, 6 MONTHS, 12 MONTHS)

RF PREP

In [853]:
# Convert the 'date' column to datetime type
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date'])

In [854]:
# set reference date
last_date = transactions['transaction_date'].max()

In [855]:
def calculate_rf(data, end_date, days_label):
    rf = data.groupby('customer_id').agg(
        recency = ('transaction_date', lambda x: (end_date - x.max()).days),
        frequency = ('transaction_id', 'count'),
        monetary = ('transaction_amount', 'sum')
    ).rename(columns={
        'recency': f'Recency_{days_label}',
        'frequency': f'Frequency_{days_label}',
        'monetary': f'Monetary_{days_label}'
    })
    return rf

1 MONTH

In [856]:
# Define the time periods
days_30 = last_date - timedelta(days=30)
last_30_days = transactions[(transactions['transaction_date'] > days_30) & (transactions['transaction_date'] <= last_date)]

In [857]:
rf_30 = calculate_rf(last_30_days, last_date, '30')
rf_30
# Conclusion: not enough data to conduct any modeling for rf on last 30 days

Unnamed: 0_level_0,Recency_30,Frequency_30,Monetary_30
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7,13,3,1018.16
48,19,1,439.69
59,24,8,4848.62
66,8,20,10795.72
90,1,19,11212.55
...,...,...,...
9869,28,1,408.71
9873,1,20,8405.83
9875,28,1,114.28
9913,16,7,4337.71


3 MONTHS

In [858]:
days_90 = last_date - timedelta(days=90)
last_90_days = transactions[(transactions['transaction_date'] > days_90) & (transactions['transaction_date'] <= last_date)]
rf_90 = calculate_rf(last_90_days, last_date, '90')
rf_90

Unnamed: 0_level_0,Recency_90,Frequency_90,Monetary_90
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80,1,146.86
7,13,3,1018.16
21,42,1,148.29
37,56,4,2136.82
38,65,2,299.14
...,...,...,...
9956,74,1,318.59
9973,50,6,4729.27
9976,49,2,456.78
9980,76,1,1380.85


In [859]:
# rf Score Creation for 3 months
# Scoring Recency: Lower recency is better
rf_90['R_Score'] = pd.qcut(rf_90['Recency_90'], 4, labels=[4, 3, 2, 1])

# Scoring Frequency and Monetary: Higher values are better
rf_90['F_Score'] = pd.qcut(rf_90['Frequency_90'], 4, labels=[1, 2, 3, 4])
rf_90['M_Score'] = pd.qcut(rf_90['Monetary_90'], 4, labels=[1, 2, 3, 4])

# Combine scores to a single score
rf_90['rf_Score'] = rf_90['R_Score'].astype(int) + rf_90['F_Score'].astype(int)
rf_90.head()

Unnamed: 0_level_0,Recency_90,Frequency_90,Monetary_90,R_Score,F_Score,M_Score,rf_Score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,80,1,146.86,1,1,1,2
7,13,3,1018.16,4,2,1,6
21,42,1,148.29,3,1,1,4
37,56,4,2136.82,2,2,2,4
38,65,2,299.14,2,1,1,3


In [860]:
rf_90 = pd.merge(rf_90, joint_data[['LTV','age', 'Customer_Join_Time', 'Avg_Spend_Categorical', 'Customer_Jointime_in_days',
                                      'Engage_Categorical', 'Gender_Categorical','Avg_Transaction_Time', 'campaign_id']], on='customer_id', how='left')
rf_90.head()

Unnamed: 0_level_0,Recency_90,Frequency_90,Monetary_90,R_Score,F_Score,M_Score,rf_Score,LTV,age,Customer_Join_Time,Avg_Spend_Categorical,Customer_Jointime_in_days,Engage_Categorical,Gender_Categorical,Avg_Transaction_Time,campaign_id
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,80,1,146.86,1,1,1,2,3509.48,56.0,118 days,2,118,1,2,19.666667,1.0
7,13,3,1018.16,4,2,1,6,1339.82,43.467467,170 days,1,170,2,2,34.0,1.0
21,42,1,148.29,3,1,1,4,6547.69,20.0,406 days,3,406,3,1,40.6,3.0
37,56,4,2136.82,2,2,2,4,7184.76,54.0,310 days,2,310,2,2,25.833333,1.640965
38,65,2,299.14,2,1,1,3,7710.39,43.467467,397 days,2,397,3,2,30.538462,1.0


6 MONTHS

In [861]:
days_180 = last_date - timedelta(days=180)
last_180_days = transactions[(transactions['transaction_date'] > days_180) & (transactions['transaction_date'] <= last_date)]
rf_180 = calculate_rf(last_180_days, last_date, '180')
rf_180.head()

Unnamed: 0_level_0,Recency_180,Frequency_180,Monetary_180
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80,6,3509.48
7,13,5,1339.82
12,150,4,1587.62
15,162,2,1304.38
17,121,4,1314.81


In [862]:
# rf Score Creation for 6 months
# Scoring Recency: Lower recency is better
rf_180['R_Score'] = pd.qcut(rf_180['Recency_180'], 4, labels=[4, 3, 2, 1])

# Scoring Frequency and Monetary: Higher values are better
rf_180['F_Score'] = pd.qcut(rf_180['Frequency_180'], 4, labels=[1, 2, 3, 4])
rf_180['M_Score'] = pd.qcut(rf_180['Monetary_180'], 4, labels=[1, 2, 3, 4])

# Combine scores to a single score
rf_180['rf_Score'] = rf_180['R_Score'].astype(int) + rf_180['F_Score'].astype(int)
rf_180.head()

Unnamed: 0_level_0,Recency_180,Frequency_180,Monetary_180,R_Score,F_Score,M_Score,rf_Score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,80,6,3509.48,3,2,3,5
7,13,5,1339.82,4,2,2,6
12,150,4,1587.62,1,2,2,3
15,162,2,1304.38,1,1,2,2
17,121,4,1314.81,2,2,2,4


In [863]:
rf_180 = pd.merge(rf_180, joint_data[['age', 'LTV', 'Customer_Join_Time', 'Avg_Spend_Categorical', 'Customer_Jointime_in_days',
                                      'Engage_Categorical', 'Gender_Categorical','Avg_Transaction_Time', 'campaign_id']], on='customer_id', how='left')
rf_180.head()

Unnamed: 0_level_0,Recency_180,Frequency_180,Monetary_180,R_Score,F_Score,M_Score,rf_Score,age,LTV,Customer_Join_Time,Avg_Spend_Categorical,Customer_Jointime_in_days,Engage_Categorical,Gender_Categorical,Avg_Transaction_Time,campaign_id
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,80,6,3509.48,3,2,3,5,56.0,3509.48,118 days,2,118,1,2,19.666667,1.0
7,13,5,1339.82,4,2,2,6,43.467467,1339.82,170 days,1,170,2,2,34.0,1.0
12,150,4,1587.62,1,2,2,3,23.0,9244.97,1059 days,3,1059,3,1,88.25,1.0
15,162,2,1304.38,1,1,2,2,58.0,10997.99,644 days,2,644,3,2,37.882353,1.0
17,121,4,1314.81,2,2,2,4,40.0,2964.38,66 days,2,66,1,2,13.2,2.0


1 YEAR

In [864]:
days_365 = last_date - timedelta(days=365)
last_365_days = transactions[(transactions['transaction_date'] > days_365) & (transactions['transaction_date'] <= last_date)]
rf_365 = calculate_rf(last_365_days, last_date, '365')
rf_365.head()

Unnamed: 0_level_0,Recency_365,Frequency_365,Monetary_365
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80,6,3509.48
7,13,5,1339.82
8,305,7,4693.03
9,306,1,252.74
12,150,5,3221.32


In [865]:
# rf Score Creation for 1 year
# Scoring Recency: Lower recency is better
rf_365['R_Score'] = pd.qcut(rf_365['Recency_365'], 4, labels=[4, 3, 2, 1])

# Scoring Frequency and Monetary: Higher values are better
rf_365['F_Score'] = pd.qcut(rf_365['Frequency_365'], 4, labels=[1, 2, 3, 4])
rf_365['M_Score'] = pd.qcut(rf_365['Monetary_365'], 4, labels=[1, 2, 3, 4])

# Combine scores to a single score
rf_365['rf_Score'] = rf_365['R_Score'].astype(int) + rf_365['F_Score'].astype(int)
rf_365.head()

Unnamed: 0_level_0,Recency_365,Frequency_365,Monetary_365,R_Score,F_Score,M_Score,rf_Score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,80,6,3509.48,3,2,2,5
7,13,5,1339.82,4,2,1,6
8,305,7,4693.03,1,2,3,3
9,306,1,252.74,1,1,1,2
12,150,5,3221.32,3,2,2,5


In [866]:
rf_365 = pd.merge(rf_365, joint_data[['age', 'LTV', 'Customer_Join_Time', 'Avg_Spend_Categorical', 'Customer_Jointime_in_days',
                                      'Engage_Categorical', 'Gender_Categorical','Avg_Transaction_Time', 'campaign_id' ]], on='customer_id', how='left')
rf_365.head()

Unnamed: 0_level_0,Recency_365,Frequency_365,Monetary_365,R_Score,F_Score,M_Score,rf_Score,age,LTV,Customer_Join_Time,Avg_Spend_Categorical,Customer_Jointime_in_days,Engage_Categorical,Gender_Categorical,Avg_Transaction_Time,campaign_id
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,80,6,3509.48,3,2,2,5,56.0,3509.48,118 days,2,118,1,2,19.666667,1.0
7,13,5,1339.82,4,2,1,6,43.467467,1339.82,170 days,1,170,2,2,34.0,1.0
8,305,7,4693.03,1,2,3,3,68.0,11710.62,181 days,3,181,2,1,12.066667,2.0
9,306,1,252.74,1,1,1,2,68.0,10382.96,568 days,1,568,2,2,25.818182,1.0
12,150,5,3221.32,3,2,2,5,23.0,9244.97,1059 days,3,1059,3,1,88.25,1.0


## MODEL BUILDING AND EVALUATION

3 MONTHS RF - LOGISTICAL REGRESSION (THIS WAS CHOSEN AS THE MDOEL OF CHOICE FOR RF)

In [867]:
joint_data['LTV'].describe()

count    10000.000000
mean      7737.562981
std       4628.799469
min         23.810000
25%       3839.915000
50%       7498.890000
75%      11275.797500
max      24298.220000
Name: LTV, dtype: float64

In [868]:
# 75th percentile or better for the joint_data['LTV'] is used for binary output because we are trying to predict the highest value customers
# and this is reflective of the highest $$$ value customers over the lifetime of transactions
rf_90['binary_output'] = rf_90['LTV'].apply(lambda x: 1 if x>=11275.797500 else 0)
rf_90['binary_output'].value_counts()

binary_output
0    1120
1     477
Name: count, dtype: int64

In [869]:
# This model will predict whether there will be a large amount of high value customers 
X_90 = rf_90[['age','Customer_Jointime_in_days', 'Engage_Categorical', 'Gender_Categorical', 
              'Avg_Transaction_Time', 'campaign_id', 'Recency_90','Frequency_90' ]]
y_90 = rf_90['binary_output']

In [870]:
# reserve 30% for testing
X_train_90, X_test_90, y_train_90, y_test_90 = train_test_split(X_90,y_90, test_size=0.3, random_state=42)

In [871]:
# build our pipeline that includes these transformations
numeric_columns_90 = ['age','Customer_Jointime_in_days', 'Avg_Transaction_Time', 'campaign_id',
                      'Recency_90','Frequency_90' ]
categorical_columns_90 = ['Engage_Categorical', 'Gender_Categorical']

In [872]:
# create a pre-processing pipeline which includes the steps of Scaling numeric variables and encoding categoricals
preprocessor_90 = ColumnTransformer(
    transformers=[
        ('num',MinMaxScaler(), numeric_columns_90),
        ('cat',OneHotEncoder(handle_unknown='ignore'),categorical_columns_90)
    ]
)

In [873]:
# test 3 models with cross validation to see which ones work best for this data
knn_90 = KNeighborsClassifier()
logreg_90 = LogisticRegression()
nb_90 = GaussianNB()

In [874]:
# cross validation to determine in general which model works best for the given problem
knn_scores_90 = cross_val_score(knn_90, X_train_90, y_train_90, scoring='f1', cv=5)
logreg_scores_90 = cross_val_score(logreg_90, X_train_90, y_train_90, scoring='f1', cv=5)
nb_scores_90 = cross_val_score(nb_90, X_train_90, y_train_90, scoring='f1', cv=5)
print(f"knn_scores_90: {np.mean(knn_scores_90)}")
print(f"logreg_scores_90: {np.mean(logreg_scores_90)}")
print(f"nb_scores_90: {np.mean(nb_scores_90)}")

# KNN ruled out because it has lowest score Logreg will be put through the pipeline

knn_scores_90: 0.5048956722884775
logreg_scores_90: 0.670379761997604
nb_scores_90: 0.5040129057806172


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [875]:
# build our pipeline
pipeline_90 = Pipeline(steps=[
    ('preprocessor', preprocessor_90),
    ('classifier', LogisticRegression(max_iter=1000, solver='lbfgs'))])

In [876]:
# use GRID SEARCH to find the best combination of hyperparameters for our problem
param_grid_90 = {
  'classifier__C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # Algorithms to use in the optimization problem
    'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Norm used in the penalization
    'classifier__max_iter': [100, 200, 300]  # Maximum number of iterations taken for the solvers to converge
}

In [877]:
grid_search_90 = GridSearchCV(pipeline_90, param_grid_90, cv=5, verbose=1, scoring='f1')
grid_search_90.fit(X_train_90,y_train_90)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


975 fits failed out of a total of 1500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estim

In [878]:
final_classifier_90 = LogisticRegression(
    C=grid_search_90.best_params_['classifier__C'],
    solver=grid_search_90.best_params_['classifier__solver'],
    penalty=grid_search_90.best_params_['classifier__penalty'],
    max_iter=grid_search_90.best_params_['classifier__max_iter']
)

In [879]:
final_pipeline_90 = Pipeline(steps=
                          [
                              ('preprocessor',preprocessor_90),
                              ('classifier',final_classifier_90)
                          ])

In [880]:
final_pipeline_90.fit(X_train_90,y_train_90)

In [881]:
pred_90 = final_pipeline_90.predict(X_test_90)
probs_90 = final_pipeline_90.predict_proba(X_test_90)

In [882]:
# F1 Scores for KNN, LogReg, and NB - Using RF Scores over 6 month period (before building pipeline and using GridSearch)
# Logreg chosen due to having highest score
report_90 = classification_report(y_test_90, pred_90)
print(report_90)

              precision    recall  f1-score   support

           0       0.89      0.89      0.89       331
           1       0.75      0.76      0.76       149

    accuracy                           0.85       480
   macro avg       0.82      0.82      0.82       480
weighted avg       0.85      0.85      0.85       480



RandomForestRegressor - 3 MONTH RF

In [883]:
# Create a RandomForestRegressor object
model_90RFR = RandomForestRegressor(n_estimators=100, random_state=42)

# Splitting the data into training and testing sets
X_train_90RFR, X_test_90RFR, y_train_90RFR, y_test_90RFR = train_test_split(X_90, y_90, test_size=0.3, random_state=42)

In [884]:
# Fit the model to the training data
model_90RFR.fit(X_train_90RFR, y_train_90RFR)

In [885]:
y_pred_90RFR= model_90RFR.predict(X_test_90RFR)
y_pred_90RFR

array([0.01, 0.03, 0.  , 0.  , 0.62, 0.02, 0.59, 0.05, 0.  , 0.63, 0.06,
       0.  , 0.  , 0.03, 0.52, 0.55, 0.02, 0.  , 0.14, 0.09, 0.  , 0.03,
       0.  , 0.64, 0.06, 0.01, 0.02, 0.77, 0.01, 0.31, 0.01, 0.01, 0.08,
       0.47, 0.  , 0.  , 0.31, 0.16, 0.  , 0.01, 0.95, 0.66, 0.01, 0.  ,
       0.77, 0.  , 0.01, 0.89, 0.05, 0.31, 0.  , 0.04, 0.01, 0.08, 0.42,
       0.26, 0.56, 0.03, 0.14, 0.05, 0.57, 0.  , 0.  , 0.17, 0.74, 0.79,
       0.  , 0.07, 0.  , 0.85, 0.54, 0.27, 0.27, 0.24, 0.11, 0.84, 0.87,
       0.09, 0.54, 0.72, 0.83, 0.11, 0.02, 0.76, 0.  , 0.  , 0.01, 0.04,
       0.07, 0.08, 0.01, 0.  , 0.09, 0.45, 0.01, 0.71, 0.01, 0.  , 0.69,
       0.76, 0.02, 0.74, 0.83, 0.  , 0.  , 0.61, 0.47, 0.04, 0.01, 0.  ,
       0.6 , 0.62, 0.57, 0.  , 0.01, 0.9 , 0.92, 0.  , 0.65, 0.46, 0.09,
       0.03, 0.66, 0.36, 0.02, 0.79, 0.93, 0.12, 0.93, 0.  , 0.  , 0.82,
       0.  , 0.18, 0.  , 0.06, 0.04, 0.  , 0.14, 0.  , 0.  , 0.58, 0.  ,
       0.83, 0.12, 0.56, 0.74, 0.4 , 0.88, 0.02, 0.

In [886]:
# Convert predictions to binary by applying a threshold
threshold_90RFR = 0.5
y_pred_binary_90RFR = (y_pred_90RFR > threshold_90RFR).astype(int)


#  RandomForestRegressor Scores - Using RF Scores over 3 month period
print(classification_report(y_test_90RFR, y_pred_binary_90RFR))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88       331
           1       0.73      0.70      0.72       149

    accuracy                           0.83       480
   macro avg       0.80      0.79      0.80       480
weighted avg       0.83      0.83      0.83       480



6 MONTHS

In [887]:
# 75th percentile or better for the joint_data['LTV'] is used for binary output because we are trying to predict the highest value customers
# and this is reflective of the highest $$$ value customers over the lifetime of transactions
rf_180['binary_output'] = rf_180['LTV'].apply(lambda x: 1 if x>=11275.797500 else 0)
rf_180['binary_output'].value_counts()

binary_output
0    2089
1     812
Name: count, dtype: int64

In [888]:
# This model will predict whether there will be a large amount of high value customers 
X_180 = rf_180[['age','Customer_Jointime_in_days', 'Engage_Categorical', 'Gender_Categorical', 'Avg_Transaction_Time', 'campaign_id',
                'Frequency_180', 'Recency_180']]
y_180 = rf_180['binary_output']

In [889]:
# reserve 30% for testing
X_train_180, X_test_180, y_train_180, y_test_180 = train_test_split(X_180,y_180, test_size=0.3, random_state=42)

In [890]:
# build our pipeline that includes these transformations
numeric_columns_180 = ['Frequency_180', 'Recency_180','age','Customer_Jointime_in_days', 'Avg_Transaction_Time', 'campaign_id']
categorical_columns_180 = ['Engage_Categorical', 'Gender_Categorical']

In [891]:
# create a pre-processing pipeline which includes the steps of Scaling numeric variables and encoding categoricals
preprocessor_180 = ColumnTransformer(
    transformers=[
        ('num',MinMaxScaler(), numeric_columns_180),
        ('cat',OneHotEncoder(handle_unknown='ignore'),categorical_columns_180)
    ]
)

In [892]:
# test 3 models with cross validation to see which ones work best for this data
knn_180 = KNeighborsClassifier()
logreg_180 = LogisticRegression()
nb_180 = GaussianNB()

In [893]:
# cross validation to determine in general which model works best for the given problem
knn_scores_180 = cross_val_score(knn_180, X_train_180, y_train_180, scoring='f1', cv=5)
logreg_scores_180 = cross_val_score(logreg_180, X_train_180, y_train_180, scoring='f1', cv=5)
nb_scores_180 = cross_val_score(nb_180, X_train_180, y_train_180, scoring='f1', cv=5)
print(f"knn_scores_180: {np.mean(knn_scores_180)}")
print(f"logreg_scores_180: {np.mean(logreg_scores_180)}")
print(f"nb_scores_180: {np.mean(nb_scores_180)}")

knn_scores_180: 0.4817200213028068
logreg_scores_180: 0.7020856644400768
nb_scores_180: 0.49006567089682324


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [894]:
# build our pipeline
pipeline_180 = Pipeline(steps=[
    ('preprocessor', preprocessor_180),
    ('classifier', LogisticRegression(max_iter=1000, solver='lbfgs'))])

In [895]:
# use GRID SEARCH to find the best combination of hyperparameters for our problem
param_grid_180 = {
  'classifier__C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # Algorithms to use in the optimization problem
    'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Norm used in the penalization
    'classifier__max_iter': [100, 200, 300]  # Maximum number of iterations taken for the solvers to converge
}

In [896]:
grid_search_180 = GridSearchCV(pipeline_180, param_grid_180, cv=5, verbose=1, scoring='f1')
grid_search_180.fit(X_train_180,y_train_180)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


975 fits failed out of a total of 1500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estim

In [897]:
final_classifier_180 = LogisticRegression(
    C=grid_search_180.best_params_['classifier__C'],
    solver=grid_search_180.best_params_['classifier__solver'],
    penalty=grid_search_180.best_params_['classifier__penalty'],
    max_iter=grid_search_180.best_params_['classifier__max_iter']
)

In [898]:
final_pipeline_180 = Pipeline(steps=
                          [
                              ('preprocessor',preprocessor_180),
                              ('classifier',final_classifier_180)
                          ])

In [899]:
final_pipeline_180.fit(X_train_180,y_train_180)

In [900]:
pred_180 = final_pipeline_180.predict(X_test_180)
probs_180 = final_pipeline_180.predict_proba(X_test_180)

In [901]:
# F1 Scores for KNN, LogReg, and NB - Using RF Scores over 6 month period (before building pipeline and using GridSearch)
# Logreg chosen due to having highest score
report_180 = classification_report(y_test_180, pred_180)
print(report_180)

              precision    recall  f1-score   support

           0       0.87      0.91      0.89       615
           1       0.75      0.68      0.71       256

    accuracy                           0.84       871
   macro avg       0.81      0.79      0.80       871
weighted avg       0.84      0.84      0.84       871



RandomForestRegressor - 6 MONTH RFM

In [902]:
# Create a RandomForestRegressor object
model_180RFR = RandomForestRegressor(n_estimators=100, random_state=42)

# Splitting the data into training and testing sets
X_train_180RFR, X_test_180RFR, y_train_180RFR, y_test_180RFR = train_test_split(X_180, y_180, test_size=0.3, random_state=42)

In [903]:
# Fit the model to the training data
model_180RFR.fit(X_train_180RFR, y_train_180RFR)

In [904]:
y_pred_180RFR= model_180RFR.predict(X_test_180RFR)
y_pred_180RFR

array([0.  , 0.78, 0.02, 0.01, 0.57, 0.02, 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.01, 0.7 , 0.  , 0.  , 0.5 , 0.12, 0.63, 0.14, 0.03,
       0.01, 0.  , 0.  , 0.56, 0.03, 0.  , 0.  , 0.12, 0.  , 0.87, 0.  ,
       0.71, 0.14, 0.01, 0.05, 0.  , 0.2 , 0.03, 0.09, 0.19, 0.87, 0.  ,
       0.  , 0.25, 0.  , 0.  , 0.04, 0.  , 0.9 , 0.07, 0.  , 0.86, 0.59,
       0.  , 0.01, 0.9 , 0.14, 0.69, 0.  , 0.  , 0.21, 0.  , 0.54, 0.  ,
       0.48, 0.  , 0.66, 0.21, 0.  , 0.55, 0.04, 0.55, 0.19, 0.  , 0.  ,
       0.59, 0.81, 0.  , 0.24, 0.32, 0.  , 0.04, 0.01, 0.  , 0.03, 0.15,
       0.  , 0.08, 0.79, 0.  , 0.21, 0.  , 0.08, 0.  , 0.71, 0.  , 0.  ,
       0.96, 0.01, 0.68, 0.57, 0.38, 0.  , 0.25, 0.82, 0.02, 0.  , 0.  ,
       0.17, 0.61, 0.  , 0.75, 0.6 , 0.99, 0.02, 0.06, 0.67, 0.01, 0.  ,
       0.77, 0.9 , 0.75, 0.42, 0.51, 0.02, 0.45, 0.09, 0.21, 0.06, 0.05,
       0.01, 0.7 , 0.  , 0.  , 0.  , 0.92, 0.96, 0.05, 0.44, 0.  , 0.39,
       0.  , 0.32, 0.42, 0.29, 0.  , 0.24, 0.59, 0.

In [905]:
# Convert predictions to binary by applying a threshold
threshold_180RFR = 0.5
y_pred_binary_180RFR = (y_pred_180RFR > threshold_180RFR).astype(int)


#  RandomForestRegressor Scores - Using RF Scores over 6 month period
print(classification_report(y_test_180RFR, y_pred_binary_180RFR))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88       615
           1       0.73      0.68      0.71       256

    accuracy                           0.83       871
   macro avg       0.80      0.79      0.79       871
weighted avg       0.83      0.83      0.83       871



1 YEAR

In [906]:
# 75th percentile or better for the joint_data['LTV'] is used for binary output because we are trying to predict the highest value customers
# and this is reflective of the highest $$$ value customers over the lifetime of transactions
rf_365['binary_output'] = rf_365['LTV'].apply(lambda x: 1 if x>=11275.797500 else 0)
rf_365['binary_output'].value_counts()

binary_output
0    3721
1    1323
Name: count, dtype: int64

In [907]:
# This model will predict whether there will be a large amount of high value customers 
X_365 = rf_365[['Recency_365','Frequency_365','age','Customer_Jointime_in_days', 'Engage_Categorical', 
                 'Gender_Categorical', 'Avg_Transaction_Time', 'campaign_id']]
y_365 = rf_365['binary_output']

In [908]:
# reserve 30% for testing
X_train_365, X_test_365, y_train_365, y_test_365 = train_test_split(X_365,y_365, test_size=0.3, random_state=42)

In [909]:
# build our pipeline that includes these transformations
numeric_columns_365 = ['Recency_365','Frequency_365','age','Customer_Jointime_in_days', 'Avg_Transaction_Time', 'campaign_id']
categorical_columns_365 = ['Engage_Categorical', 'Gender_Categorical']

In [910]:
# create a pre-processing pipeline which includes the steps of Scaling numeric variables and encoding categoricals
preprocessor_365 = ColumnTransformer(
    transformers=[
        ('num',MinMaxScaler(), numeric_columns_365),
        ('cat',OneHotEncoder(handle_unknown='ignore'),categorical_columns_365)
    ]
)

In [911]:
# test 3 models with cross validation to see which ones work best for this data
knn_365 = KNeighborsClassifier()
logreg_365 = LogisticRegression()
nb_365 = GaussianNB()

In [912]:
# cross validation to determine in general which model works best for the given problem
knn_scores_365 = cross_val_score(knn_365, X_train_365, y_train_365, scoring='f1', cv=5)
logreg_scores_365 = cross_val_score(logreg_365, X_train_365, y_train_365, scoring='f1', cv=5)
nb_scores_365 = cross_val_score(nb_365, X_train_365, y_train_365, scoring='f1', cv=5)
print(f"knn_scores_365: {np.mean(knn_scores_365)}")
print(f"logreg_scores_365: {np.mean(logreg_scores_365)}")
print(f"nb_scores_365: {np.mean(nb_scores_365)}")

knn_scores_365: 0.4780396027700874
logreg_scores_365: 0.714646705690817
nb_scores_365: 0.5198162976597955


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [913]:
# build our pipeline
pipeline_365 = Pipeline(steps=[
    ('preprocessor', preprocessor_365),
    ('classifier', LogisticRegression(max_iter=1000, solver='lbfgs'))])

In [914]:
# use GRID SEARCH to find the best combination of hyperparameters for our problem
param_grid_365 = {
  'classifier__C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # Algorithms to use in the optimization problem
    'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Norm used in the penalization
    'classifier__max_iter': [100, 200, 300]  # Maximum number of iterations taken for the solvers to converge
}

In [915]:
grid_search_365 = GridSearchCV(pipeline_365, param_grid_365, cv=5, verbose=1, scoring='f1')
grid_search_365.fit(X_train_365,y_train_365)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


975 fits failed out of a total of 1500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estim

In [916]:
final_classifier_365 = LogisticRegression(
    C=grid_search_365.best_params_['classifier__C'],
    solver=grid_search_365.best_params_['classifier__solver'],
    penalty=grid_search_365.best_params_['classifier__penalty'],
    max_iter=grid_search_365.best_params_['classifier__max_iter']
)

In [917]:
final_pipeline_365 = Pipeline(steps=
                          [
                              ('preprocessor',preprocessor_365),
                              ('classifier',final_classifier_365)
                          ])

In [918]:
final_pipeline_365.fit(X_train_365,y_train_365)

In [919]:
pred_365 = final_pipeline_365.predict(X_test_365)
probs_365 = final_pipeline_365.predict_proba(X_test_365)

In [920]:
# F1 Scores for KNN, LogReg, and NB - Using RF Scores over 1 year period (before building pipeline and using GridSearch)
# Logreg chosen due to having highest score
report_365 = classification_report(y_test_365, pred_365)
print(report_365)

              precision    recall  f1-score   support

           0       0.90      0.91      0.90      1108
           1       0.74      0.71      0.73       406

    accuracy                           0.86      1514
   macro avg       0.82      0.81      0.81      1514
weighted avg       0.85      0.86      0.85      1514



RandomForestRegressor - 1 YEAR RFM

In [921]:
# Create a RandomForestRegressor object
model_365RFR = RandomForestRegressor(n_estimators=100, random_state=42)
# Splitting the data into training and testing sets
X_train_365RFR, X_test_365RFR, y_train_365RFR, y_test_365RFR = train_test_split(X_365, y_365, test_size=0.3, random_state=42)

In [922]:
# Fit the model to the training data
model_365RFR.fit(X_train_365RFR, y_train_365RFR)

In [923]:
y_pred_365RFR= model_365RFR.predict(X_test_365RFR)
y_pred_365RFR

array([0.94, 0.65, 0.  , ..., 0.  , 0.69, 0.  ])

In [924]:
# Convert predictions to binary by applying a threshold
threshold_365RFR = 0.5
y_pred_binary_365RFR = (y_pred_365RFR > threshold_365RFR).astype(int)
# Logistical Regression - Using RF Scores over 1 year period (after building pipeline and using GridSearch)
print(classification_report(y_test_365RFR, y_pred_binary_365RFR))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89      1108
           1       0.71      0.69      0.70       406

    accuracy                           0.84      1514
   macro avg       0.80      0.79      0.80      1514
weighted avg       0.84      0.84      0.84      1514



## SCORES

3 MONTHS

In [925]:
# F1 Scores for KNN, LogReg, and NB - Using RF Scores over 3 month period (before building pipeline and using GridSearch)
# Logreg chosen due to having highest score
print(f"knn_scores_90: {np.mean(knn_scores_90)}")
print(f"logreg_scores_90: {np.mean(logreg_scores_90)}")
print(f"nb_scores_90: {np.mean(nb_scores_90)}")

knn_scores_90: 0.5048956722884775
logreg_scores_90: 0.670379761997604
nb_scores_90: 0.5040129057806172


In [926]:
# Logistical Regression - Using RF Scores over 3 month period (after building pipeline and using GridSearch)
report_90 = classification_report(y_test_90, pred_90)
print(report_90)


              precision    recall  f1-score   support

           0       0.89      0.89      0.89       331
           1       0.75      0.76      0.76       149

    accuracy                           0.85       480
   macro avg       0.82      0.82      0.82       480
weighted avg       0.85      0.85      0.85       480



In [927]:
#  RandomForestRegressor Scores - Using RF Scores over 3 month period
print(classification_report(y_test_90RFR, y_pred_binary_90RFR))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88       331
           1       0.73      0.70      0.72       149

    accuracy                           0.83       480
   macro avg       0.80      0.79      0.80       480
weighted avg       0.83      0.83      0.83       480



6 MONTHS

In [928]:
# F1 Scores for KNN, LogReg, and NB - Using RF Scores over 6 month period (before building pipeline and using GridSearch)
# Logreg chosen due to having highest score
print(f"knn_scores_180: {np.mean(knn_scores_180)}")
print(f"logreg_scores_180: {np.mean(logreg_scores_180)}")
print(f"nb_scores_180: {np.mean(nb_scores_180)}")

knn_scores_180: 0.4817200213028068
logreg_scores_180: 0.7020856644400768
nb_scores_180: 0.49006567089682324


In [929]:
# Logistical Regression - Using RF Scores over 6 month period (after building pipeline and using GridSearch)
report_180 = classification_report(y_test_180, pred_180)
print(report_180)

              precision    recall  f1-score   support

           0       0.87      0.91      0.89       615
           1       0.75      0.68      0.71       256

    accuracy                           0.84       871
   macro avg       0.81      0.79      0.80       871
weighted avg       0.84      0.84      0.84       871



In [930]:
#  RandomForestRegressor Scores - Using RF Scores over 6 month period
print(classification_report(y_test_180RFR, y_pred_binary_180RFR))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88       615
           1       0.73      0.68      0.71       256

    accuracy                           0.83       871
   macro avg       0.80      0.79      0.79       871
weighted avg       0.83      0.83      0.83       871



1 YEAR

In [931]:
# F1 Scores for KNN, LogReg, and NB - Using RF Scores over 1 year period (before building pipeline and using GridSearch)
# Logreg chosen due to having highest score
print(f"knn_scores_365: {np.mean(knn_scores_365)}")
print(f"logreg_scores_365: {np.mean(logreg_scores_365)}")
print(f"nb_scores_365: {np.mean(nb_scores_365)}")

knn_scores_365: 0.4780396027700874
logreg_scores_365: 0.714646705690817
nb_scores_365: 0.5198162976597955


In [932]:
# Logistical Regression - Using RF Scores over 1 year period (after building pipeline and using GridSearch)
report_365 = classification_report(y_test_365, pred_365)
print(report_365)

              precision    recall  f1-score   support

           0       0.90      0.91      0.90      1108
           1       0.74      0.71      0.73       406

    accuracy                           0.86      1514
   macro avg       0.82      0.81      0.81      1514
weighted avg       0.85      0.86      0.85      1514



In [933]:
#  RandomForestRegressor Scores - Using RF Scores over 1 year period
print(classification_report(y_test_365RFR, y_pred_binary_365RFR))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89      1108
           1       0.71      0.69      0.70       406

    accuracy                           0.84      1514
   macro avg       0.80      0.79      0.80      1514
weighted avg       0.84      0.84      0.84      1514

