In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import pickle
import datetime as dt

# Data loading and preprocessing
# NOTE(review): absolute, machine-specific path -- adjust it when running elsewhere.
DATA_PATH = "C:/Users/Administrator/Desktop/Personal_NAING/Parami_Naing/Advance Machine Learning/Midterm/Final_Midterm/Online Retail.csv"
data = pd.read_csv(DATA_PATH)

# Drop rows without a CustomerID: they cannot be attributed to any customer.
data = data.dropna(subset=['CustomerID'])

# Remove non-positive quantities, since a retail purchase cannot have a
# negative quantity. The explicit .copy() makes `data` an independent frame,
# so the column assignments below do not raise SettingWithCopyWarning.
data = data[data['Quantity'] > 0].copy()

# Amount spent on each row; summed per customer later to get the Monetary value.
data['TotalPrice'] = data['Quantity'] * data['UnitPrice']

# Parse the invoice timestamps so that date arithmetic (recency) works on them.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])

# Reference date against which recency is measured.
latest_date = dt.datetime(2011, 12, 10)

# Build one row per customer with the three RFM metrics:
#   Recency   - whole days between latest_date and the customer's last invoice
#   Frequency - number of non-null InvoiceNo entries for the customer
#   Monetary  - sum of TotalPrice over the customer's rows
RFM_data = data.groupby('CustomerID').agg(
    Recency=('InvoiceDate', lambda dates: (latest_date - dates.max()).days),
    Frequency=('InvoiceNo', 'count'),
    Monetary=('TotalPrice', 'sum'),
)

# Feature matrix: the three RFM metrics, one row per customer.
X = RFM_data[['Recency', 'Frequency', 'Monetary']]

# Standardize the features. The scaled frame keeps both the column names and
# the CustomerID index, so it stays label-aligned with `y` below.
scaler = StandardScaler()
RFM_data_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

# Example segmentation: customers whose last purchase was more than 100 days
# before the reference date are labelled low-value, everyone else high-value.
# NOTE(review): the label is a deterministic function of Recency, which is
# also one of the input features, so the classifier only has to rediscover
# the 100-day threshold. Consider a target that is not derived from a feature.
y = RFM_data['Recency'].apply(lambda x: 'Low-value Customer' if x > 100 else 'High-value Customer')

# Train the Random Forest model on the scaled features (fixed seed for reproducibility).
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(RFM_data_scaled, y)

# Persist the fitted classifier and the fitted scaler for deployment.
# Each object is pickled to its own file in the current working directory.
for artifact_path, artifact in (('rf_model.pkl', rf_model), ('scaler.pkl', scaler)):
    with open(artifact_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)
    
# Check model accuracy
# Scoring the forest on the very rows it was trained on always looks near
# perfect, so predictions are taken out-of-fold instead: cross_val_predict
# clones rf_model, fits each clone on 4/5 of the data and predicts the
# remaining 1/5. The already-fitted rf_model itself is left untouched.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict

y_pred = cross_val_predict(rf_model, RFM_data_scaled, y, cv=5)
# Accuracy is a per-sample average, so pooling the out-of-fold predictions
# gives a valid overall estimate.
accuracy = accuracy_score(y, y_pred)
print(f'Model accuracy: {accuracy:.2f}')


Model accuracy: 1.00


# RFM Analysis
We will use RFM (Recency, Frequency, Monetary) analysis. It is commonly used in the marketing sector to analyse and rank customers according to their purchasing behavior, which makes it well suited to the purpose of this project.
Recency shows how recently a customer made a purchase.
Frequency shows how often a customer makes a purchase.
Monetary shows the total amount of money spent by the customer.

# Analysis
We selected the customers who had made a purchase as of 2011-12-09. This resulted in 4339 customers. On average, customers last shopped about 3 months ago, made around 92 purchases and spent about 2054. Some customers shop and spend a lot more than others: half of the customers last shopped about 50 days ago or less, made around 41 purchases and spent about 674. We should focus on the big spenders, as they could be really valuable to the business.
