## **Libraries**

In [None]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns
sns.set()

from scipy.cluster.hierarchy import dendrogram, linkage # For hierarchical clustering
from sklearn.cluster import KMeans                      # For K-means clustering
from sklearn.decomposition import PCA                   # For principal component analysis
from sklearn.preprocessing import StandardScaler        # For standardizing features

## **Data**

In [None]:
# Read the customer data set; the first CSV column becomes the row index
data_path = os.path.join("..", "data", "customer_data.csv")
df_segmentation = pd.read_csv(data_path, index_col = 0)

## **Exploration**

In [None]:
# Preview the first rows to confirm the file was parsed into the expected columns
df_segmentation.head()

In [None]:
# Descriptive statistics for each column, used to compare the scales of the features
df_segmentation.describe()

In [None]:
# Pairwise correlation between all features, drawn as an annotated heatmap
corr_matrix = df_segmentation.corr()
sns.heatmap(corr_matrix, annot = True, cmap = "RdBu", vmin = -1, vmax = 1) # Colour scale pinned to [-1, 1]
plt.xticks(rotation = 90, fontsize = 12) # Vertical feature names on the x axis
plt.yticks(rotation = 0, fontsize = 12)  # Horizontal feature names on the y axis
plt.show()

In [None]:
# Plot income against age
# Columns are selected by name rather than by position so the plot always matches its axis labels
plt.figure(figsize = (9, 6))
sns.set_style("white")
plt.scatter(df_segmentation["Age"], df_segmentation["Income"])
plt.xlabel("Age", fontsize = 12)
plt.ylabel("Income", fontsize = 12)
plt.show()

## **Standardization**

In [None]:
# Put every feature on the same scale so that all of them carry equal weight
# The fitted scaler is kept because it is pickled at the end of the notebook
scaler = StandardScaler().fit(df_segmentation)
df_segm_std = scaler.transform(df_segmentation)

## **HCA**

In [None]:
# Perform hierarchical clustering on the standardized data
# method = "ward" selects Ward's linkage; the returned linkage matrix feeds the dendrogram below
hca = linkage(df_segm_std, method = "ward")

In [None]:
# Draw the dendrogram for the linkage matrix computed above
plt.figure(figsize = (9, 6))
sns.set_style("white")
dendrogram(hca,
           truncate_mode = "level", # Truncate the dendrogram for better readability
           p = 5,                   # With "level" truncation, no more than p levels of the tree are drawn
           show_leaf_counts = False,
           no_labels = True,        # Omit the label of every single observation
           leaf_font_size = 12)
plt.xlabel("Observations", fontsize = 12)
plt.ylabel("Distance", fontsize = 12)
plt.show()

## **K-Means**

In [None]:
# Fit K-means for every candidate number of clusters and record the inertia of each fit
wcss = []                               # Within Cluster Sum of Squares (WCSS), one value per cluster count
for k in range(1, 11):                  # The algorithm considers 1 to 10 clusters
    kmeans = KMeans(n_clusters = k,
                    init = "k-means++", # k-means++ seeding of the initial centroids
                    random_state = 42)  # Set a random state for reproducibility
    wcss.append(kmeans.fit(df_segm_std).inertia_)

In [None]:
# Plot the WCSS for the different number of clusters
# The x values are derived from wcss so they always match its length;
# the first entry is taken to be the one-cluster fit, as produced by the loop that fills wcss
cluster_counts = range(1, len(wcss) + 1)
plt.figure(figsize = (9, 6))
sns.set_style("white")
plt.plot(cluster_counts, wcss, marker = "o", linestyle = "--")
plt.xlabel("Number of Clusters", fontsize = 12)
plt.ylabel("WCSS", fontsize = 12)
plt.show()

In [None]:
# Refit K-means with four clusters (presumably the count read off the WCSS plot above)
kmeans = KMeans(
    n_clusters = 4,
    init = "k-means++",
    random_state = 42, # Same seed as in the WCSS loop, for reproducibility
)
kmeans.fit(df_segm_std)

### **Results**

In [None]:
# Copy the original features and append the cluster assigned to each observation;
# assign() returns a new frame, so df_segmentation itself is left untouched
df_kmeans = df_segmentation.assign(**{"Segment K-Means": kmeans.labels_})

In [None]:
# Calculate mean values for the clusters
# numeric_only = True keeps this cell re-runnable after the text "Labels" column
# has been added to df_kmeans further down (averaging text columns fails on recent pandas)
df_analysis = df_kmeans.groupby(["Segment K-Means"]).mean(numeric_only = True)
df_analysis

In [None]:
# Size of each cluster (count of "Sex" values per segment) and its share of all observations
kmeans_segment_sizes = df_kmeans.groupby(["Segment K-Means"])["Sex"].count()
df_analysis["N Obs"] = kmeans_segment_sizes
df_analysis["Prop Obs"] = df_analysis["N Obs"] / df_analysis["N Obs"].sum()
df_analysis

In [None]:
# Show the summary table with descriptive names instead of cluster numbers
# The renamed table is only displayed here; df_analysis itself keeps its numeric index
kmeans_segment_names = {0:"well-off",
                        1:"fewer-opportunities",
                        2:"standard",
                        3:"career-focused"}
df_analysis.rename(index = kmeans_segment_names)

In [None]:
# Translate each cluster number into its descriptive segment name
kmeans_segment_names = {0:"well-off",
                        1:"fewer-opportunities",
                        2:"standard",
                        3:"career-focused"}
df_kmeans["Labels"] = df_kmeans["Segment K-Means"].map(kmeans_segment_names)

In [None]:
# Plot the results from the algorithm with different color for each cluster
x_axis = df_kmeans["Age"]
y_axis = df_kmeans["Income"]
plt.figure(figsize = (9, 6))
sns.set_style("white")
# x and y are passed by keyword: recent seaborn releases reject them as positional arguments
sns.scatterplot(x=x_axis, y=y_axis, hue=df_kmeans["Labels"], palette=["g", "r", "c", "m"])
# Anchor the legend outside the axes (to the right) so it does not cover any points
legend = plt.legend(title="Labels", loc="upper left", bbox_to_anchor=(1, 1), fontsize = 12)
plt.xlabel("Age", fontsize = 12)
plt.ylabel("Income", fontsize = 12)
plt.show()

## **PCA**

In [None]:
# Employ PCA to find a subset of components that explain the variance in the data
# No n_components is given here, so the variance explained by every component can be inspected first
pca = PCA()
pca.fit(df_segm_std)

In [None]:
# Plot the cumulative variance explained by total number of components
cumulative_variance = pca.explained_variance_ratio_.cumsum()
# The x values are derived from the fitted model instead of hard-coding the number of features
component_counts = range(1, len(cumulative_variance) + 1)
plt.figure(figsize = (9, 6))
sns.set_style("white")
plt.plot(component_counts, cumulative_variance, marker = "o", linestyle = "--")
plt.xlabel("Number of Components", fontsize = 12)
plt.ylabel("Cumulative Explained Variance", fontsize = 12)
plt.show()

In [None]:
# Refit PCA keeping only three components
# This rebinds `pca`, replacing the model fitted earlier without a component limit
pca = PCA(n_components = 3)
pca.fit(df_segm_std)

### **Results**

In [None]:
# Loadings table: one row per principal component, one column per original feature
# Each value is the coefficient applied to that feature when computing the component
component_names = ["Component 1", "Component 2", "Component 3"]
df_pca = pd.DataFrame(pca.components_,
                      index = component_names,
                      columns = df_segmentation.columns.values)
df_pca

In [None]:
# Plot principal components against original features
heatmap = sns.heatmap(df_pca,
                      vmin = -1,
                      vmax = 1,
                      cmap = "RdBu",
                      annot = True)
# Enlarge the tick labels on the colour bar
cbar = heatmap.collections[0].colorbar
cbar.ax.tick_params(labelsize = 12)
plt.xticks(fontsize = 12)
# Only restyle the y tick labels: seaborn already places one tick at the centre of each row,
# labelled from the df_pca index, whereas explicit positions 0, 1, 2 would sit on the row edges
plt.yticks(rotation = 0, fontsize = 12)
plt.show()

In [None]:
# Store principal components scores: the standardized data projected onto the fitted components
# The bare expression on the last line displays the resulting array
scores_pca = pca.transform(df_segm_std)
scores_pca

## **K-Means (PCA)**

In [None]:
# Repeat the WCSS search, this time clustering the principal component scores
wcss = []                                   # Reuses (and overwrites) the wcss list from the earlier search
for k in range(1, 11):                      # The algorithm considers 1 to 10 clusters
    kmeans_pca = KMeans(n_clusters = k,
                        init = "k-means++", # k-means++ seeding of the initial centroids
                        random_state = 42)  # Set a random state for reproducibility
    wcss.append(kmeans_pca.fit(scores_pca).inertia_)

In [None]:
# Plot the WCSS for the K-means PCA model
# The x values are derived from wcss so they always match its length;
# the first entry is taken to be the one-cluster fit, as produced by the loop that fills wcss
cluster_counts = range(1, len(wcss) + 1)
plt.figure(figsize = (9, 6))
plt.plot(cluster_counts, wcss, marker = "o", linestyle = "--")
plt.xlabel("Number of Clusters", fontsize = 12)
plt.ylabel("WCSS", fontsize = 12)
plt.show()

In [None]:
# Refit K-means on the component scores with four clusters
# (presumably the count read off the WCSS plot above)
kmeans_pca = KMeans(
    n_clusters = 4,
    init = "k-means++",
    random_state = 42, # Same seed as in the WCSS loop, for reproducibility
)
kmeans_pca.fit(scores_pca)

### **Results**

In [None]:
# Create a new data frame with the original features and add the PCA scores
# The score columns are named when their frame is built: writing into `columns.values`
# afterwards edits the Index buffer in place, which can leave label lookups stale
# or fail outright when that array is read-only
component_names = ["Component 1", "Component 2", "Component 3"]
df_scores = pd.DataFrame(scores_pca, columns = component_names)
# reset_index(drop = True) gives both frames the same 0..n-1 index so concat aligns them row by row
df_pca_kmeans = pd.concat([df_segmentation.reset_index(drop = True), df_scores], axis = 1)

In [None]:
# Add a column that contains the PCA K-means clustering labels
# labels_ has one entry per row of scores_pca, the array the model was fitted on
df_pca_kmeans["Segment K-Means PCA"] = kmeans_pca.labels_
df_pca_kmeans.head()

In [None]:
# Calculate the means of all the features by segments
# numeric_only = True keeps this cell re-runnable after the text "Legend" column
# has been added to df_pca_kmeans further down (averaging text columns fails on recent pandas)
df_pca_freq = df_pca_kmeans.groupby(["Segment K-Means PCA"]).mean(numeric_only = True)
df_pca_freq

In [None]:
# Calculate the size of each cluster and its proportion to the entire data set
pca_segment_names = {0:"standard", 1:"career-focused", 2:"fewer-opportunities", 3:"well-off"}
# Rename the counts with the same mapping as the table so the two always align by label;
# otherwise re-running this cell would pair number-indexed counts with the already
# renamed index and fill "N Obs" with NaN
pca_segment_sizes = (df_pca_kmeans.groupby(["Segment K-Means PCA"])["Sex"]
                                  .count()
                                  .rename(pca_segment_names))
df_pca_freq = df_pca_freq.rename(index = pca_segment_names)
df_pca_freq["N Obs"] = pca_segment_sizes
df_pca_freq["Prop Obs"] = df_pca_freq["N Obs"] / df_pca_freq["N Obs"].sum()
df_pca_freq

In [None]:
# Translate each PCA K-means cluster number into its descriptive segment name
pca_segment_names = {0:"standard",
                     1:"career-focused",
                     2:"fewer-opportunities",
                     3:"well-off"}
df_pca_kmeans["Legend"] = df_pca_kmeans["Segment K-Means PCA"].map(pca_segment_names)
df_pca_kmeans

In [None]:
# Plot clusters by first and second components (Component 2 on x, Component 1 on y)
x_axis = df_pca_kmeans["Component 2"]
y_axis = df_pca_kmeans["Component 1"]
plt.figure(figsize = (9, 6))
# x and y are passed by keyword: recent seaborn releases reject them as positional arguments
sns.scatterplot(x = x_axis, y = y_axis, hue = df_pca_kmeans["Legend"], palette = ["g", "r", "c", "m"])
# Anchor the legend outside the axes (to the right) so it does not cover any points
plt.legend(title = "Labels", loc = "upper left", bbox_to_anchor = (1, 1), fontsize = 12)
plt.xlabel("Component 2", fontsize = 12)
plt.ylabel("Component 1", fontsize = 12)
plt.show()

In [None]:
# Plot clusters by first and third components (Component 3 on x, Component 1 on y)
x_axis = df_pca_kmeans["Component 3"]
y_axis = df_pca_kmeans["Component 1"]
plt.figure(figsize = (9, 6))
# x and y are passed by keyword: recent seaborn releases reject them as positional arguments
sns.scatterplot(x = x_axis, y = y_axis, hue = df_pca_kmeans["Legend"], palette = ["g", "r", "c", "m"])
# Anchor the legend outside the axes (to the right) so it does not cover any points
plt.legend(title = "Labels", loc = "upper left", bbox_to_anchor = (1, 1), fontsize = 12)
plt.xlabel("Component 3", fontsize = 12)
plt.ylabel("Component 1", fontsize = 12)
plt.show()

In [None]:
# Plot clusters by third and second components (Component 3 on x, Component 2 on y)
x_axis = df_pca_kmeans["Component 3"]
y_axis = df_pca_kmeans["Component 2"]
plt.figure(figsize = (9, 6))
# x and y are passed by keyword: recent seaborn releases reject them as positional arguments
sns.scatterplot(x = x_axis, y = y_axis, hue = df_pca_kmeans["Legend"], palette = ["g", "r", "c", "m"])
# Anchor the legend outside the axes (to the right) so it does not cover any points
plt.legend(title = "Labels", loc = "upper left", bbox_to_anchor = (1, 1), fontsize = 12)
plt.xlabel("Component 3", fontsize = 12)
plt.ylabel("Component 2", fontsize = 12)
plt.show()

## **Saving**

In [None]:
# Save the items needed for analytics and export them as pickle objects
# Each file is opened in a `with` block so its handle is flushed and closed right after the dump
# The "utils" directory must already exist; open() does not create missing directories
artifacts = {"kmeans_pca.pickle": kmeans_pca,
             "pca.pickle": pca,
             "scaler.pickle": scaler}
for filename, artifact in artifacts.items():
    with open(os.path.join("utils", filename), "wb") as file:
        pickle.dump(artifact, file)