In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, PrecisionRecallDisplay, precision_score, recall_score, roc_auc_score, RocCurveDisplay, roc_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the four raw course tables straight from the GitHub repository.
# Customers table
customers = pd.read_csv(
    'https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/customers_final.csv'
)
# Engagement table
engagement = pd.read_csv(
    'https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/engagements_final.csv'
)
# Marketing table
marketing = pd.read_csv(
    'https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/marketing_final.csv'
)
# Transactions table
transactions = pd.read_csv(
    'https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/transactions_final.csv'
)

In [3]:
# Example 1: build a customer-level table holding the number of marketing
# campaigns each customer responded 'Yes' to.
# Step 1: keep only the 'Yes' responses, then count campaign_id per customer.
# Selecting with a list ([['campaign_id']]) yields a one-column DataFrame
# indexed by customer_id.
responded_yes = marketing['response'] == 'Yes'
marketing_agg = (
    marketing.loc[responded_yes]
    .groupby('customer_id')[['campaign_id']]
    .count()
)

In [4]:
# Step 2: roll transactions up to one row per customer.
#   transaction_id     -> number of transactions (count)
#   transaction_amount -> total amount spent (sum)
# The output columns keep the source column names.
transactions_agg = transactions.groupby('customer_id').agg(
    transaction_id=('transaction_id', 'count'),
    transaction_amount=('transaction_amount', 'sum'),
)

In [5]:
# Step 3: index customers and engagement by customer_id so they can be joined
# on it in the next step.
# The guard plus rebinding (instead of inplace=True) keeps this cell safe to
# re-run: once customer_id has been moved into the index, calling
# set_index('customer_id') a second time raises KeyError.
if customers.index.name != 'customer_id':
    customers = customers.set_index('customer_id')
if engagement.index.name != 'customer_id':
    engagement = engagement.set_index('customer_id')

In [6]:
# Step 4: left-join every customer-level table onto customers (all share the
# customer_id index). Customers missing from a joined table get NaN in that
# table's columns.
joint_data = (
    customers
    .join(engagement)
    .join(transactions_agg)
    .join(marketing_agg)
)
joint_data

Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,number_of_site_visits,number_of_emails_opened,number_of_clicks,transaction_id,transaction_amount,campaign_id
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,10,15,1,6,3509.48,1.0
2,2021-09-08,2023-10-25,,Male,Hillville,285,49,51,9,6081.32,2.0
3,2021-06-01,2022-11-27,,,North Latoyatown,192,73,25,6,1454.87,1.0
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,110,30,17,20,7874.68,1.0
5,2022-01-24,2023-06-02,,Male,East Matthewfort,161,2,7,24,15524.55,
...,...,...,...,...,...,...,...,...,...,...,...
9996,2022-12-16,2023-08-13,42.0,Female,Johnstonborough,119,47,16,10,5498.20,
9997,2022-07-09,2023-01-25,26.0,Male,Jessicamouth,3,33,14,12,5848.30,1.0
9998,2023-09-17,2024-01-30,39.0,Male,New John,53,17,5,3,3503.13,1.0
9999,2022-05-10,2022-07-15,31.0,Female,Andrewland,23,5,4,12,6721.86,


In [8]:
# Lifetime value (LTV) preview: total transaction_amount per customer_id.
joint_data.groupby(level='customer_id')['transaction_amount'].sum()

customer_id
1         3509.48
2         6081.32
3         1454.87
4         7874.68
5        15524.55
           ...   
9996      5498.20
9997      5848.30
9998      3503.13
9999      6721.86
10000     5775.83
Name: transaction_amount, Length: 10000, dtype: float64

In [18]:
# Store lifetime value (total transaction_amount per customer) as a column;
# the assignment aligns on the customer_id index.
joint_data['LTV'] = (
    joint_data
    .groupby(level='customer_id')['transaction_amount']
    .sum()
)
joint_data

Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,number_of_site_visits,number_of_emails_opened,number_of_clicks,transaction_id,transaction_amount,campaign_id,LTV,Customer_Age
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,10,15,1,6,3509.48,1.0,3509.48,118 days
2,2021-09-08,2023-10-25,,Male,Hillville,285,49,51,9,6081.32,2.0,6081.32,777 days
3,2021-06-01,2022-11-27,,,North Latoyatown,192,73,25,6,1454.87,1.0,1454.87,544 days
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,110,30,17,20,7874.68,1.0,7874.68,243 days
5,2022-01-24,2023-06-02,,Male,East Matthewfort,161,2,7,24,15524.55,,15524.55,494 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,2022-12-16,2023-08-13,42.0,Female,Johnstonborough,119,47,16,10,5498.20,,5498.20,240 days
9997,2022-07-09,2023-01-25,26.0,Male,Jessicamouth,3,33,14,12,5848.30,1.0,5848.30,200 days
9998,2023-09-17,2024-01-30,39.0,Male,New John,53,17,5,3,3503.13,1.0,3503.13,135 days
9999,2022-05-10,2022-07-15,31.0,Female,Andrewland,23,5,4,12,6721.86,,6721.86,66 days


In [16]:
# Customer age: elapsed time between join_date and last_purchase_date,
# stored as a timedelta.
joint_data['Customer_Age'] = (
    pd.to_datetime(joint_data['last_purchase_date'])
    .sub(pd.to_datetime(joint_data['join_date']))
)
joint_data['Customer_Age']

customer_id
1       118 days
2       777 days
3       544 days
4       243 days
5       494 days
          ...   
9996    240 days
9997    200 days
9998    135 days
9999     66 days
10000    98 days
Name: Customer_Age, Length: 10000, dtype: timedelta64[ns]

In [26]:
# Average transaction amount: lifetime value divided by the number of
# transactions (the transaction_id column holds the per-customer count).
joint_data['Avg_Transaction_Amount'] = joint_data['LTV'].div(joint_data['transaction_id'])
joint_data


Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,number_of_site_visits,number_of_emails_opened,number_of_clicks,transaction_id,transaction_amount,campaign_id,LTV,Customer_Age,Avg_Transaction_Amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,10,15,1,6,3509.48,1.0,3509.48,118 days,584.913333
2,2021-09-08,2023-10-25,,Male,Hillville,285,49,51,9,6081.32,2.0,6081.32,777 days,675.702222
3,2021-06-01,2022-11-27,,,North Latoyatown,192,73,25,6,1454.87,1.0,1454.87,544 days,242.478333
4,2022-01-01,2022-09-01,29.0,Male,Grossstad,110,30,17,20,7874.68,1.0,7874.68,243 days,393.734000
5,2022-01-24,2023-06-02,,Male,East Matthewfort,161,2,7,24,15524.55,,15524.55,494 days,646.856250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,2022-12-16,2023-08-13,42.0,Female,Johnstonborough,119,47,16,10,5498.20,,5498.20,240 days,549.820000
9997,2022-07-09,2023-01-25,26.0,Male,Jessicamouth,3,33,14,12,5848.30,1.0,5848.30,200 days,487.358333
9998,2023-09-17,2024-01-30,39.0,Male,New John,53,17,5,3,3503.13,1.0,3503.13,135 days,1167.710000
9999,2022-05-10,2022-07-15,31.0,Female,Andrewland,23,5,4,12,6721.86,,6721.86,66 days,560.155000


In [33]:
# Recency (the R in RFM): time between the latest purchase date seen in the
# data and each customer's own last purchase date.
joint_data['most_recent_purchase_date'] = pd.to_datetime(joint_data['last_purchase_date'])

# Reference point: the most recent purchase date across all customers.
now = joint_data['most_recent_purchase_date'].max()

# One row per customer with their latest purchase date.
recency_df = (
    joint_data
    .groupby('customer_id')['most_recent_purchase_date']
    .max()
    .reset_index()
    .rename(columns={'most_recent_purchase_date': 'last_purchase_date'})
)
recency_df['Recency'] = now - recency_df['last_purchase_date']
recency_df

Unnamed: 0,customer_id,last_purchase_date,Recency
0,1,2024-03-17,75 days
1,2,2023-10-25,219 days
2,3,2022-11-27,551 days
3,4,2022-09-01,638 days
4,5,2023-06-02,364 days
...,...,...,...
9995,9996,2023-08-13,292 days
9996,9997,2023-01-25,492 days
9997,9998,2024-01-30,122 days
9998,9999,2022-07-15,686 days


In [40]:
# Frequency (the F in RFM): number of transactions per customer.
frequency_df = (
    transactions
    .groupby('customer_id')['transaction_id']
    .count()
    .rename('Frequency')
    .reset_index()
)
frequency_df

Unnamed: 0,customer_id,Frequency
0,1,6
1,2,9
2,3,6
3,4,20
4,5,24
...,...,...
9995,9996,10
9996,9997,12
9997,9998,3
9998,9999,12


In [42]:
# Monetary (the M in RFM): total transaction amount per customer.
monetary_df = (
    transactions
    .groupby('customer_id')['transaction_amount']
    .sum()
    .rename('Monetary')
    .reset_index()
)
monetary_df

Unnamed: 0,customer_id,Monetary
0,1,3509.48
1,2,6081.32
2,3,1454.87
3,4,7874.68
4,5,15524.55
...,...,...
9995,9996,5498.20
9996,9997,5848.30
9997,9998,3503.13
9998,9999,6721.86
