In [1]:
# import relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from mlxtend.frequent_patterns import apriori, association_rules
from imblearn.over_sampling import SMOTE

In [9]:
# load the dataframes
spend_revenue = pd.read_csv(r"/Users/adityamxr/Desktop/finding-marketing-insights/data-analysis/spend_revenue.csv")
sales_full_merged = pd.read_csv(r"/Users/adityamxr/Desktop/finding-marketing-insights/data-analysis/sales_full_merged.csv")
rfm = pd.read_csv(r"/Users/adityamxr/Desktop/finding-marketing-insights/models/rfm.csv")
customer_features = pd.read_csv(r"/Users/adityamxr/Desktop/finding-marketing-insights/models/customer_features.csv")

In [3]:
# transaction_date is in str format - convert to datetime
sales_full_merged['Transaction_Date'] = pd.to_datetime(sales_full_merged['Transaction_Date'])

In [4]:
# sort by CustomerID and Transaction_Date to calculate the difference between transactions
sales_full_merged = sales_full_merged.sort_values(by=['CustomerID', 'Transaction_Date'])

# calculate the difference in days between consecutive transactions for each customer
sales_full_merged['Days_Between_Purchases'] = sales_full_merged.groupby('CustomerID')['Transaction_Date'].diff().dt.days

# filter out the first purchase (will have NaN for Days_Between_Purchases)
repeat_customers = sales_full_merged.dropna(subset=['Days_Between_Purchases'])

In [5]:
# calculate the average days between purchases for each repeat customer
avg_days_between_purchases = repeat_customers.groupby('CustomerID')['Days_Between_Purchases'].mean().reset_index()

# rename the column for clarity
avg_days_between_purchases.columns = ['CustomerID', 'Avg_Days_Between_Purchases']

In [6]:
# define a function to categorize the average days into the four categories
def categorize_days(days):
    if days <= 30:
        return '0-30 days'
    else:
        return '30+ days'

# apply the function to categorize each customer
avg_days_between_purchases['Next_Purchase_Category'] = avg_days_between_purchases['Avg_Days_Between_Purchases'].apply(categorize_days)

In [7]:
# verify
avg_days_between_purchases.head()

Unnamed: 0,CustomerID,Avg_Days_Between_Purchases,Next_Purchase_Category
0,12346,0.0,0-30 days
1,12347,3.779661,0-30 days
2,12348,5.409091,0-30 days
3,12350,0.0,0-30 days
4,12356,0.0,0-30 days


In [10]:
# merge customer_features df from earlier with avg_days_between_purchases df

classification_data = pd.merge(avg_days_between_purchases, customer_features, on='CustomerID')

In [11]:
# verify

classification_data.head()

Unnamed: 0,CustomerID,Avg_Days_Between_Purchases,Next_Purchase_Category,Recency,Frequency,Quantity,Delivery_Charges,Discount_pct,Total_Revenue,CLV_Category
0,12346,0.0,0-30 days,107,1,3,75.0,30.0,171.725887,Low
1,12347,3.779661,0-30 days,59,31,342,11.085667,24.666667,10632.890268,High
2,12348,5.409091,0-30 days,73,8,209,8.571739,24.782609,1341.274742,Low
3,12350,0.0,0-30 days,17,11,21,7.522353,30.0,1080.966849,Low
4,12356,0.0,0-30 days,107,13,56,17.708056,30.0,1648.281319,Low


In [12]:
# define X and y
X = classification_data[['Recency', 'Frequency', 'Total_Revenue']]  # Features
y = classification_data['Next_Purchase_Category']  # Target

In [13]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
# initialize rf model
clf2 = RandomForestClassifier(random_state=42)
clf2.fit(X_train, y_train)

# evaluate the model
y_pred = clf2.predict(X_test)

# get accuracy & classification report
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9834
              precision    recall  f1-score   support

   0-30 days       0.99      1.00      0.99       415
    30+ days       0.00      0.00      0.00         6

    accuracy                           0.98       421
   macro avg       0.49      0.50      0.50       421
weighted avg       0.97      0.98      0.98       421



In [15]:
# class imbalance might exist, check distribution:
print(y.value_counts())

Next_Purchase_Category
0-30 days    1385
30+ days       17
Name: count, dtype: int64


In [16]:
# use SMOTE to oversample the minority classes
smote = SMOTE(random_state=42, k_neighbors=2)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# convert the resampled y values to series to check the distribution
y_resampled_series = pd.Series(y_resampled)

# print distribution
print("Resampled class distribution:")
print(y_resampled_series.value_counts())

Resampled class distribution:
Next_Purchase_Category
0-30 days    970
30+ days     970
Name: count, dtype: int64


In [17]:
# train the Random Forest model on the resampled data
clf_smote = RandomForestClassifier(random_state=42, class_weight='balanced')
clf_smote.fit(X_resampled, y_resampled)

# make predictions on the test data
y_pred_smote = clf_smote.predict(X_test)

In [18]:
# get accuracy of the model
accuracy_smote = accuracy_score(y_test, y_pred_smote)
print(f"Accuracy: {accuracy_smote:.4f}")

# print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_smote))

# print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_smote))

Accuracy: 0.9715
Classification Report:
              precision    recall  f1-score   support

   0-30 days       0.99      0.98      0.99       415
    30+ days       0.20      0.33      0.25         6

    accuracy                           0.97       421
   macro avg       0.60      0.66      0.62       421
weighted avg       0.98      0.97      0.97       421

Confusion Matrix:
[[407   8]
 [  4   2]]


**Model performs poorly for minority class 30+ days (62% accuracy) despite SMOTE resampling and using balanced class weights. This does makes sense since there are so few instances of customers in the 30+ categories. Model performs very well on 0-30 day repeat customers, with an accuracy of 97%.**