In [None]:
#!pip install lifetimes
import pandas as pd
import numpy as np
from lifetimes import GammaGammaFitter
from lifetimes import BetaGeoFitter
import datetime as dt
from lifetimes.plotting import plot_probability_alive_matrix
from lifetimes.plotting import plot_frequency_recency_matrix
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime

: 

In [None]:
# Load the raw dataset from a CSV file located in the working directory.
df = pd.read_csv("globbing.csv")

: 

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

: 

In [None]:
# Summary statistics of the frame (pandas `describe` defaults).
df.describe()

: 

In [None]:
# Concise structural summary of the frame (pandas `info`).
df.info()

: 

In [None]:
# Remove fully duplicated rows (the first occurrence is kept), then count the
# duplicates that remain. Because the count runs AFTER the drop, this cell is a
# sanity check that is expected to display 0, not a report of how many
# duplicates the raw file contained.
df = df.drop_duplicates(keep="first")
df.duplicated(keep="first").sum()

: 

In [None]:
# Number of missing values in each column.
df.isna().sum()

: 

In [None]:
# Price per unit of weight: row-wise 'Product price' divided by 'Product weight'.
# NOTE(review): rows with a zero weight would produce inf here — confirm none exist.
df['Unit Price'] = df['Product price'].div(df['Product weight'])

: 

In [None]:
# Convert the 'Date' column to pandas datetime so date arithmetic
# (max - min, grouping by date) works in the cells below.
df["Date"] = pd.to_datetime(df["Date"])

: 

In [None]:
# Preview the frame again after the cleaning / feature steps above.
df.head()

: 

## **Data Descriptive Visualizations**

In [None]:
# Correlation heatmap of the numeric columns.

# Restrict to numeric dtypes and compute their pairwise correlation matrix.
num_cols = df.select_dtypes(include='number')
corr_matrix = num_cols.corr()

# The colour range is pinned to [-1, 1] so the scale covers the full
# correlation range regardless of the values actually present.
fig = px.imshow(
    corr_matrix.values,
    x=num_cols.columns,
    y=num_cols.columns,
    zmin=-1,
    zmax=1,
    color_continuous_scale='RdBu',
).update_layout(title='Correlation Heatmap')
fig.show()

: 

In [None]:
# One horizontal box plot per numeric column, to spot outliers visually.
numerical_cols = ['Product weight', 'Product price']

for col in numerical_cols:
    fig = px.box(data_frame=df, x=col)
    fig.show()

: 

In [None]:
# Total 'Product price' per distinct 'Date' value, drawn as a line.
# NOTE(review): if 'Date' carries a time-of-day component the groups are per
# timestamp rather than per calendar day — confirm against the data.
daily_sales = df.groupby('Date')['Product price'].sum()

fig = go.Figure(
    data=[go.Scatter(x=daily_sales.index, y=daily_sales.values, mode='lines')]
)
fig.update_layout(
    title='Daily Sales',
    xaxis_title='Date',
    yaxis_title='Total sales',
)
fig.show()

: 

In [None]:
# Scatter of product weight against product price; marker size also encodes weight.
fig = px.scatter(
    df,
    x="Product weight",
    y="Product price",
    size="Product weight",
    labels={"Product weight": "Weight (kg)", "Product price": "Price (AMD)"},
    title="Product Weight vs Price",
)
fig.show()

: 

In [None]:
# Distribution of purchase prices, one box per gender.
fig = px.box(
    data_frame=df,
    x='Gender',
    y='Product price',
    title='Product price by gender',
)
fig.show()

: 

In [None]:
# Number of non-null 'InvoiceId' entries per gender, as a two-column frame.
sales_by_gender = (
    df.groupby('Gender')['InvoiceId']
    .count()
    .reset_index()
)

# After the count, the 'InvoiceId' column holds the number of sales, hence the relabel.
fig = px.bar(
    sales_by_gender,
    x="Gender",
    y="InvoiceId",
    labels={"Gender": "Gender", "InvoiceId": "Number of Sales"},
    title="Number of Sales by Gender",
)
fig.show()


: 

In [None]:
# Gender split over distinct customers rather than over transactions.

# Keep the first row seen for every customer so each one is counted once.
unique_customers = df.drop_duplicates(subset='Customer', keep='first')

# Number of distinct customers per gender value.
gender_counts = unique_customers['Gender'].value_counts()

fig = px.pie(
    names=gender_counts.index,
    values=gender_counts.values,
    title='Gender Distribution of Unique Customers',
)
fig.show()


: 

In [None]:
# Total 'Product price' per (Branch/Locker, Gender) pair, pivoted so that every
# gender value becomes its own column and every branch/locker its own row.
branch_gender_sales = (
    df.groupby(['Branch/Locker', 'Gender'])['Product price']
    .sum()
    .unstack(level=1)
)

# One bar series per gender code. Both an 'F' and an 'M' column must exist
# after the pivot, otherwise the lookups below raise KeyError.
trace1 = go.Bar(
    name='Female',
    x=branch_gender_sales.index,
    y=branch_gender_sales['F'],
)
trace2 = go.Bar(
    name='Male',
    x=branch_gender_sales.index,
    y=branch_gender_sales['M'],
)
data = [trace1, trace2]

# barmode='stack' piles the two gender series on top of each other per branch.
layout = go.Layout(
    title='Total Sales by Branch/Locker and Gender',
    xaxis=dict(title='Branch/Locker'),
    yaxis=dict(title='Total Sales'),
    barmode='stack',
)
fig = go.Figure(data=data, layout=layout)
fig.show()


: 

# **RFM**

In [None]:
#grouping data of each customer and creating aggregated columns
df_data_group=df.groupby('Customer').agg({'Date': lambda date: (date.max() - date.min()).days,
                                        'InvoiceId': lambda num: len(num),
                                        'Product weight': lambda quant: quant.sum(),
                                        'Product price': lambda price: price.sum()})

: 

In [None]:
# Preview the per-customer aggregates (columns still carry the source column names).
df_data_group.head()

: 

In [None]:
# Rename the aggregated columns positionally, in the order they were aggregated:
# Date -> num_days, InvoiceId -> num_transactions,
# Product weight -> num_units, Product price -> spent_money.
df_data_group.columns=['num_days','num_transactions','num_units','spent_money']
df_data_group.head()


: 

 CLTV = ((Average Order Value x Purchase Frequency)/Churn Rate) x Profit margin.

 Customer Value = Average Order Value * Purchase Frequency


In [None]:
# Average order value per customer: total spend divided by number of transactions.
df_data_group['avg_order_value'] = df_data_group['spent_money'].div(
    df_data_group['num_transactions']
)

: 

In [None]:
# Purchase frequency: total number of transactions divided by the number of
# customers (a single scalar shared by all customers).
purchase_frequency = df_data_group['num_transactions'].sum() / len(df_data_group)

: 

In [None]:
# Repeat rate: share of customers that have more than one transaction.
repeat_rate = (
    len(df_data_group[df_data_group['num_transactions'] > 1])
    / len(df_data_group)
)

: 

In [None]:
# Churn rate, approximated as the complement of the repeat rate, i.e. the share
# of customers that do NOT have more than one transaction.
# NOTE(review): if every customer has more than one transaction this is 0 and
# the CLV computation, which divides by churn_rate, yields infinite values —
# confirm that cannot happen on this data.
churn_rate=1-repeat_rate

: 

In [None]:
# Display the three scalar rates computed above as a tuple.
purchase_frequency,repeat_rate,churn_rate


: 

In [None]:
# Profit per customer under an assumed 5% margin on total spend.
# The 0.05 factor is an arbitrary assumption, not derived from the data.
df_data_group['profit_margin'] = df_data_group['spent_money'].mul(0.05)

: 

In [None]:
# Customer value: (average order value * purchase frequency) / churn rate.
# purchase_frequency and churn_rate are dataset-wide scalars, so only the
# average order value varies from customer to customer.
df_data_group['CLV'] = (
    df_data_group['avg_order_value']
    .mul(purchase_frequency)
    .div(churn_rate)
)

: 

In [None]:
# Distribution of CLV: histogram with a kernel density estimate overlaid.
# `sns.distplot` is deprecated in seaborn; `histplot(kde=True, stat="density")`
# is the replacement for a histogram with a KDE drawn on a density scale.
sns.histplot(df_data_group["CLV"], kde=True, stat="density")
# plt.savefig("CLV.png")
plt.show()

: 

In [None]:
# Box plot of CLV showing the quartiles (Q1, median, Q3) and outliers.
# The Series is passed explicitly as `x`: a bare positional argument is
# interpreted differently across seaborn releases (older ones take it as `x`,
# newer ones as `data`), which can change the orientation or emit warnings.
sns.boxplot(x=df_data_group["CLV"])
plt.show()

: 

In [None]:
# Skewness of the CLV distribution (pandas `Series.skew`).
df_data_group["CLV"].skew()

: 

In [None]:
# Customer lifetime value: customer value (CLV) multiplied by the customer's
# assumed profit margin.
df_data_group['CLTV'] = df_data_group['CLV'].mul(df_data_group['profit_margin'])

: 

In [None]:
# Preview the per-customer frame with the derived value columns added.
df_data_group.head()

: 

In [None]:
# Reference date used to measure customer age (T).
# It is anchored to the last date present in the data instead of the wall-clock
# `datetime.today()`: with the wall clock, T changed on every run (results were
# not reproducible) and grew beyond the observation window covered by the
# dataset, whereas the BG/NBD inputs expect T to run from a customer's first
# purchase to the end of the period under study.
today_date = df["Date"].max()

# One row per customer with the raw inputs for the BG/NBD and Gamma-Gamma models:
#   Date (1st aggregate) -> days between first and last purchase
#   Date (2nd aggregate) -> days between first purchase and the reference date
#   InvoiceId            -> number of distinct invoices
#   Product price        -> total amount spent
cltv_df = df.groupby('Customer').agg({'Date': [lambda date: (date.max() - date.min()).days,
                                                     lambda date: (today_date - date.min()).days],
                                           'InvoiceId':      lambda num: num.nunique(),
                                           'Product price':   lambda TotalPrice: TotalPrice.sum()})


: 

In [None]:
# Replace the two-level aggregate header with flat names, in aggregation order.
cltv_df.columns = ['recency', 'T', 'frequency', 'monetary']

# Monetary value becomes the average spend per invoice.
cltv_df["monetary"] = cltv_df["monetary"] / cltv_df["frequency"]

# Keep customers with a positive average spend and more than one invoice
# (frequency > 1 already implies frequency > 0).
has_positive_spend = cltv_df["monetary"] > 0
has_repeat_invoices = cltv_df["frequency"] > 1
cltv_df = cltv_df[has_positive_spend & has_repeat_invoices]

# recency and T were computed in days; convert both to weeks.
cltv_df["recency"] = cltv_df["recency"] / 7
cltv_df["T"] = cltv_df["T"] / 7

: 

In [None]:
# Preview the model-input frame (recency, T, frequency, monetary).
cltv_df.head()


: 

In [None]:
# Quintile scores per customer, indexed like cltv_df.
#   recency_score   -> 5 for the lowest recency values, 1 for the highest
#   frequency_score -> 1..5 on the rank of frequency (rank(method="first")
#                      breaks ties so the five bins always have unique edges)
#   monetary_score  -> 1..5 on monetary
# NOTE(review): recency and monetary are binned on raw values; heavily tied
# values can make pd.qcut raise on duplicate bin edges.
# NOTE(review): 'recency' here is the span between a customer's first and last
# purchase (in weeks), not the time since the last purchase — confirm that the
# reversed 5..1 labelling is intended for that definition.
rfm = pd.DataFrame({
    "recency_score": pd.qcut(cltv_df['recency'], 5, labels=[5, 4, 3, 2, 1]),
    "frequency_score": pd.qcut(
        cltv_df["frequency"].rank(method="first"), 5, labels=[1, 2, 3, 4, 5]
    ),
    "monetary_score": pd.qcut(cltv_df["monetary"], 5, labels=[1, 2, 3, 4, 5]),
})

# Two-character code: recency digit followed by frequency digit
# (the monetary score is not part of the code).
recency_digit = rfm["recency_score"].astype(str)
frequency_digit = rfm["frequency_score"].astype(str)
rfm["RFM_SCORE"] = recency_digit + frequency_digit

: 

In [None]:
# Regex -> segment name. Each pattern matches the two-character RFM_SCORE,
# whose first digit is the recency score and second digit the frequency score.
# Together the patterns cover all 25 possible score combinations.
seg_map = {
    r'[1-2][1-2]': 'HIBERNATING',
    r'[1-2][3-4]': 'AT RISK',
    r'[1-2]5': 'CANT LOSE',
    r'3[1-2]': 'ABOUT TO SLEEP',
    r'33': 'NEED ATTENTION',
    r'[3-4][4-5]': 'LOYAL CUSTOMER',
    r'41': 'PROMISING',
    r'51': 'NEW CUSTOMERS',
    r'[4-5][2-3]': 'POTENTIAL LOYALIST',
    r'5[4-5]': 'CHAMPIONS',
}

# Translate every score code into its segment label via regex replacement.
rfm['segment'] = rfm['RFM_SCORE'].replace(to_replace=seg_map, regex=True)
rfm.head(10)

: 

In [None]:
# Number of customers per segment, drawn as a treemap. After the count, every
# remaining column holds the per-segment count; 'RFM_SCORE' is used as the
# tile size.
df_treemap = rfm.groupby('segment').count().reset_index()
fig = px.treemap(
    df_treemap,
    path=['segment'],
    values='RFM_SCORE',
)
fig.show()

: 

# **MODELING**

In [None]:
# Fit the BG/NBD model (no regularisation: penalizer_coef=0.0) on the
# per-customer frequency, recency and T columns.
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(
    frequency=cltv_df['frequency'],
    recency=cltv_df['recency'],
    T=cltv_df['T'],
)

: 

In [None]:
# Frequency/recency matrix plot of the fitted model (lifetimes plotting helper),
# drawn on a 12x8 inch figure.
fig = plt.figure(figsize=(12,8))
p =plot_frequency_recency_matrix(bgf)

: 

In [None]:
# Probability-alive matrix plot of the fitted model (lifetimes plotting helper),
# drawn on a 12x8 inch figure.
fig = plt.figure(figsize=(12,8))
plot_probability_alive_matrix(bgf)

: 

In [None]:
# Model predictions per customer for the next 1 and 4 time units. recency and T
# are expressed in weeks, so these are the 1-week and ~1-month horizons.
cltv_df["bgf_1week_purchase_pred"] = bgf.predict(1,cltv_df['frequency'],cltv_df['recency'],cltv_df['T'])
cltv_df["bgf_1month_purchase_pred"] = bgf.predict(4,cltv_df['frequency'],cltv_df['recency'],cltv_df['T'])

# A notebook cell renders only its last expression, so the 1-week ranking was
# previously computed and silently discarded. Render it explicitly with
# `display` (available as a builtin in IPython/Jupyter); the 1-month ranking
# stays the cell's regular output.
display(cltv_df.sort_values("bgf_1week_purchase_pred", ascending=False))
cltv_df.sort_values("bgf_1month_purchase_pred", ascending=False)

: 

In [None]:
# Fit the Gamma-Gamma model (penalizer_coef=0.1) on the per-customer frequency
# and average monetary value.
ggf = GammaGammaFitter(penalizer_coef=0.1)
ggf.fit(
    frequency=cltv_df['frequency'],
    monetary_value=cltv_df['monetary'],
)

: 

In [None]:
# Conditional expected average profit per customer from the fitted Gamma-Gamma
# model, stored as a new column; the sorted view below is only displayed,
# not assigned back.
cltv_df["gg_average_profit_pred"] = ggf.conditional_expected_average_profit(cltv_df['frequency'],cltv_df['monetary'])
cltv_df.sort_values("gg_average_profit_pred", ascending=False)

: 

In [None]:
# 1-month CLTV prediction combining the BG/NBD and Gamma-Gamma models.
# The result is indexed by customer; reset_index turns that index into a
# 'Customer' column so it can be joined back onto cltv_df.
cltv = ggf.customer_lifetime_value(
    bgf,
    cltv_df['frequency'],
    cltv_df['recency'],
    cltv_df['T'],
    cltv_df['monetary'],
    time=1,  # 1 month
    freq="W",  # frequency of T
    discount_rate=0.01,
).reset_index()

# Left join keeps every customer of cltv_df and adds the predicted 'clv' column.
cltv = cltv_df.merge(cltv, on="Customer", how="left")

# Five customers with the highest predicted value.
cltv.sort_values(by="clv", ascending=False).head(5)

: 

In [None]:
# 12-month CLTV prediction combining the BG/NBD and Gamma-Gamma models.
# Same computation as the 1-month cell; only the horizon differs (time=12).
cltv_12 = ggf.customer_lifetime_value(
    bgf,
    cltv_df['frequency'],
    cltv_df['recency'],
    cltv_df['T'],
    cltv_df['monetary'],
    time=12,  # 12 months
    freq="W",  # T is expressed in weeks
    discount_rate=0.01,
).reset_index()

# Left join keeps every customer of cltv_df and adds the predicted 'clv' column.
cltv_12 = cltv_df.merge(cltv_12, on="Customer", how="left")

# Five customers with the highest predicted value.
cltv_12.sort_values(by="clv", ascending=False).head(5)

: 