## **Libraries**

In [None]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns
# Apply seaborn's default theme to the figures created below
sns.set()

## **Data**

In [None]:
# Read the raw purchase records from the data directory one level above the notebook
purchase_csv_path = os.path.join("..", "data", "purchase_data.csv")
df_purchase = pd.read_csv(purchase_csv_path)

## **Exploration**

In [None]:
# Preview the first five rows of the raw purchase data
df_purchase.head(n = 5)

In [None]:
# Count the missing values in every column
df_purchase.isna().sum()

## **Segmentation**

### **Model**

In [None]:
# Load pickled objects in order to segment the purchase data.
# Each file is opened in a `with` block so its handle is closed as soon as the object is loaded.
# NOTE: pickle.load can execute arbitrary code - only load pickle files that come from a trusted source.
with open(os.path.join("utils", "kmeans_pca.pickle"), "rb") as pickle_file:
    kmeans_pca = pickle.load(pickle_file)
with open(os.path.join("utils", "pca.pickle"), "rb") as pickle_file:
    pca = pickle.load(pickle_file)
with open(os.path.join("utils", "scaler.pickle"), "rb") as pickle_file:
    scaler = pickle.load(pickle_file)

### **Standardization**

In [None]:
# Scale the customer descriptor columns of the purchase data with the loaded scaler,
# in the same way as was done for the segmentation data
segmentation_columns = ["Sex", "Marital status", "Age", "Education", "Income", "Occupation", "Settlement"]
features = df_purchase[segmentation_columns]
df_segm_std = scaler.transform(features)

### **PCA**

In [None]:
# Project every standardized row of the purchase data onto the three principal components
df_segm_pca = pca.transform(df_segm_std)

### **K-Means (PCA)**

In [None]:
# Assign each row of the purchase data to one of the four segments based on its principal components
pca_segments = kmeans_pca.predict(df_segm_pca)

In [None]:
# Work on a deep copy so that the original purchase data stays untouched
df_predictors = df_purchase.copy(deep = True)

In [None]:
# Store the predicted segment of every row in a new "Segment" column of the predictors data
df_predictors["Segment"] = pca_segments

## **Analysis**

### **Customers**

In [None]:
# Preview the first five rows of the segmented purchase data
df_predictors.head(n = 5)

In [None]:
# Count the rows recorded for every customer: each row is one purchase occasion (store visit)
temp1 = (
    df_predictors[["ID", "Incidence"]]
    .groupby("ID")
    .count()
    .rename(columns = {"Incidence": "Visits"})
)
temp1.head()

In [None]:
# Sum the incidence flag per customer to obtain the number of purchases
temp2 = (
    df_predictors[["ID", "Incidence"]]
    .groupby("ID")
    .sum()
    .rename(columns = {"Incidence": "Purchases"})
)

# Put visits and purchases side by side, matched on the customer ID index
temp3 = temp1.join(temp2)
temp3.head()

In [None]:
# Share of visits that ended in a purchase, per customer
temp3["Average Purchases"] = temp3["Purchases"].div(temp3["Visits"])
temp3.head()

In [None]:
# Take the mean of the row-level segment labels per customer to get one segment value per ID
temp4 = df_predictors[["ID", "Segment"]].groupby("ID").mean()

# Attach the segment to the per-customer visit and purchase statistics
df_description = temp3.join(temp4)
df_description.head()

### **Segments**

In [None]:
# Count the customers in each segment and divide by the total number of customers
segment_counts = df_description[["Purchases", "Segment"]].groupby("Segment").count()
segment_proportion = (segment_counts / df_description.shape[0]).rename(columns = {"Purchases": "Proportions"})
segment_proportion.head()

In [None]:
# Draw the share of customers in each segment as a pie chart
pie_labels = ["Standard", "Career-Focused", "Fewer-Opportunities", "Well-Off"]
pie_colors = ["b", "g", "r", "orange"]

plt.figure(figsize = (9, 6))
plt.pie(
    segment_proportion["Proportions"],
    labels = pie_labels,
    autopct = "%1.1f%%",
    colors = pie_colors,
    textprops = {"fontsize": 12},
)
plt.show()

### **Purchases**

In [None]:
# Average every per-customer statistic within each segment to describe typical behaviour
segments_mean = df_description.groupby("Segment").mean()
segments_mean

In [None]:
# Standard deviation within each segment, used to judge how homogeneous each segment is
segments_std = df_description.groupby("Segment").std()

In [None]:
# Plot the average number of store visits for each of the four segments using a bar chart
plt.figure(figsize = (9, 6))
sns.set_style("white")
plt.bar(x = (0, 1, 2, 3),
        tick_label = ("Standard", "Career-Focused", "Fewer-Opportunities", "Well-Off"),
        height = segments_mean["Visits"],
        yerr = segments_std["Visits"], # Display the standard deviation as a straight line
        color = ("b", "g", "r", "orange"))
plt.xlabel("Segment", fontsize = 12)
plt.ylabel("Store Visits", fontsize = 12)
# Render the figure explicitly, like the other plotting cells, so the cell does not echo the Text repr of the label
plt.show()

In [None]:
# Plot the average number of purchases by segments to understand how often each group buys the product
plt.figure(figsize = (9, 6))
sns.set_style("white")
plt.bar(x = (0, 1, 2, 3),
        tick_label = ("Standard", "Career-Focused", "Fewer-Opportunities", "Well-Off"),
        height = segments_mean["Purchases"],
        yerr = segments_std["Purchases"], # Display the standard deviation as a straight line
        color = ("b", "g", "r", "orange"))
plt.xlabel("Segment", fontsize = 12)
plt.ylabel("Purchase Incidences", fontsize = 12)
# Render the figure explicitly, like the other plotting cells, so the cell does not echo the Text repr of the label
plt.show()

### **Brand**

In [None]:
# Keep only the rows where a purchase was actually made (incidence flag equal to 1)
purchase_made = df_predictors["Incidence"].eq(1)
df_incidence = df_predictors.loc[purchase_made]

In [None]:
# Create dummies for each of the five brands
brand_dummies = pd.get_dummies(df_incidence["Brand"], prefix = "Brand", prefix_sep = "_")

# Carry over the segment and the customer ID of every purchase (matched on the row index)
brand_dummies["Segment"] = df_incidence["Segment"]
brand_dummies["ID"] = df_incidence["ID"]

# Preview only the first rows instead of dumping the whole frame
brand_dummies.head()

In [None]:
# Average the brand dummies per customer, which gives each customer's share of purchases per brand
temp = brand_dummies.groupby("ID").mean()

# Average those per-customer shares within every segment to get the mean brand choice
mean_choice = temp.groupby("Segment").mean()

In [None]:
# Plot the mean brand choice by segment
segment_labels = ["Standard", "Career-Focused", "Fewer-Opportunities", "Well-Off"]
sns.heatmap(mean_choice,
            vmin = 0,
            vmax = 1,
            cmap = "PuBu",
            annot = True)
plt.xticks(rotation = 90, fontsize = 12)
# seaborn draws heatmap row i between y = i and y = i + 1, so the labels go at the cell centres (i + 0.5);
# ticks at 0, 1, 2, 3 would put every label on the edge between two rows
plt.yticks(np.arange(len(segment_labels)) + 0.5,
           segment_labels,
           rotation = 0,
           fontsize = 12)
plt.show()

### **Revenue**

In [None]:
# Compute the revenue for the first brand
# Take an explicit copy of the filtered rows so that adding a column does not write into a slice of
# df_predictors (which triggers pandas' SettingWithCopyWarning)
temp = df_predictors[df_predictors["Brand"] == 1].copy()
temp["Revenue Brand 1"] = temp["Price_1"] * temp["Quantity"] # Create a new column by multiplying price and quantity for each row

# Sum the revenue per segment; this starts the table with one row per segment and columns "Segment" and "Revenue Brand 1"
segment_revenue = temp[["Segment", "Revenue Brand 1"]].groupby(["Segment"], as_index = False).sum()

In [None]:
# Compute the revenue for the second brand by using the same formula
# Take an explicit copy of the filtered rows so that adding a column does not write into a slice of
# df_predictors (which triggers pandas' SettingWithCopyWarning)
temp = df_predictors[df_predictors["Brand"] == 2].copy()
temp["Revenue Brand 2"] = temp["Price_2"] * temp["Quantity"]

# Sum the revenue per segment and look it up by segment id rather than by row position, so a segment
# without any purchase of this brand gets 0 instead of shifting the other segments' values
brand_revenue = temp.groupby("Segment")["Revenue Brand 2"].sum()
segment_revenue["Revenue Brand 2"] = segment_revenue["Segment"].map(brand_revenue).fillna(0)

In [None]:
# Compute the revenue for the third brand by using the same formula
# Take an explicit copy of the filtered rows so that adding a column does not write into a slice of
# df_predictors (which triggers pandas' SettingWithCopyWarning)
temp = df_predictors[df_predictors["Brand"] == 3].copy()
temp["Revenue Brand 3"] = temp["Price_3"] * temp["Quantity"]

# Sum the revenue per segment and look it up by segment id rather than by row position, so a segment
# without any purchase of this brand gets 0 instead of shifting the other segments' values
brand_revenue = temp.groupby("Segment")["Revenue Brand 3"].sum()
segment_revenue["Revenue Brand 3"] = segment_revenue["Segment"].map(brand_revenue).fillna(0)

In [None]:
# Compute the revenue for the fourth brand by using the same formula
# Take an explicit copy of the filtered rows so that adding a column does not write into a slice of
# df_predictors (which triggers pandas' SettingWithCopyWarning)
temp = df_predictors[df_predictors["Brand"] == 4].copy()
temp["Revenue Brand 4"] = temp["Price_4"] * temp["Quantity"]

# Sum the revenue per segment and look it up by segment id rather than by row position, so a segment
# without any purchase of this brand gets 0 instead of shifting the other segments' values
brand_revenue = temp.groupby("Segment")["Revenue Brand 4"].sum()
segment_revenue["Revenue Brand 4"] = segment_revenue["Segment"].map(brand_revenue).fillna(0)

In [None]:
# Compute the revenue for the fifth brand by using the same formula
# Take an explicit copy of the filtered rows so that adding a column does not write into a slice of
# df_predictors (which triggers pandas' SettingWithCopyWarning)
temp = df_predictors[df_predictors["Brand"] == 5].copy()
temp["Revenue Brand 5"] = temp["Price_5"] * temp["Quantity"]

# Sum the revenue per segment and look it up by segment id rather than by row position, so a segment
# without any purchase of this brand gets 0 instead of shifting the other segments' values
brand_revenue = temp.groupby("Segment")["Revenue Brand 5"].sum()
segment_revenue["Revenue Brand 5"] = segment_revenue["Segment"].map(brand_revenue).fillna(0)

In [None]:
# Add up the per-brand revenue columns, one after another, to get the total revenue of each segment
brand_revenue_columns = ["Revenue Brand 1",
                         "Revenue Brand 2",
                         "Revenue Brand 3",
                         "Revenue Brand 4",
                         "Revenue Brand 5"]
total_revenue = segment_revenue[brand_revenue_columns[0]]
for revenue_column in brand_revenue_columns[1:]:
    total_revenue = total_revenue + segment_revenue[revenue_column]
segment_revenue["Total Revenue"] = total_revenue
segment_revenue

In [None]:
# Modify the table to see the size of the segment compared to the revenue they bring
# Look the proportion up by segment id: segment_proportion is indexed by segment, while the rows of
# segment_revenue carry a plain row index that matches the segment ids only by coincidence
segment_revenue["Proportions"] = segment_revenue["Segment"].map(segment_proportion["Proportions"])

# Replace the numeric segment ids with readable names and use them as the row labels
segment_revenue["Segment"] = segment_revenue["Segment"].map({0:"Standard",
                                                            1:"Career-Focused",
                                                            2:"Fewer-Opportunities",
                                                            3:"Well-Off"})
segment_revenue = segment_revenue.set_index(["Segment"])
segment_revenue