In [None]:
import pandas as pd
import numpy as np


# Preprocessing

### Load data

There are no null-values or na values in the data. All values have dtype `int64`.

In [None]:
# Load the customer dataset
data = pd.read_csv("customer_data_large.csv")

# Check the dimensions of the dataset
dataset_dimensions = data.shape

print("The dataset has {} rows and {} columns.".format(dataset_dimensions[0], dataset_dimensions[1]))

# `isnull` is an alias of `isna` in pandas, so the two counts below are always equal.
print(f'The data has {data.isnull().sum().sum()} null values')
print(f'The data has {data.isna().sum().sum()} na values')

# Column dtypes and non-null counts, followed by summary statistics
data.info()
print(data.describe())

### Visualize data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def show_numerical_figures(df):
    """Show one 30-bin histogram per column of ``df``, each in its own figure."""
    for column_name in df.columns.tolist():
        _, axis = plt.subplots(figsize=(8, 4))
        sns.histplot(df[column_name], bins=30, ax=axis)
        axis.set_title(f'Histogram of {column_name}')
        axis.set_xlabel(column_name)
        axis.set_ylabel('Frequency')
        plt.show()


show_numerical_figures(data)

### Aggregate new features

For this unsupervised problem, there are a number of features one can extract by combining existing features.

In [None]:
# Total amount spent across all product categories (every `Mnt*` column).
# `NumWebVisitsMonth` is a visit count, not an amount, so it does not belong
# in this sum; `MntFishMeatProdcts` is a spend column and does.
spend_columns = ['MntWines', 'MntFruits', 'MntSweetProducts', 'MntGoldProds', 'MntFishMeatProdcts']
data['TotalSpend'] = data[spend_columns].sum(axis=1)

# Adding 1 to the denominator to avoid division by zero for customers without purchases
data['AvgSpendPerPurchase'] = data['TotalSpend'] / (data['NumWebPurchases'] + data['NumStorePurchases'] + 1)

# Purchases made through either channel
data['TotalPurchases'] = data[['NumWebPurchases', 'NumStorePurchases']].sum(axis=1)

data['WebToStorePurchaseRatio'] = data['NumWebPurchases'] / (data['NumStorePurchases'] + 1)  # Adding 1 to avoid division by zero

data.describe()


### Check pairwise correlation

# TODO - write about correlations

In [None]:
matrix = data.corr(method='pearson')

# Pairs are reported when the correlation is strong in either direction,
# i.e. when its absolute value exceeds the threshold.
threshold = 0.7

# Each unordered pair is visited once: a column is only compared with the
# columns that come after it, which also skips the trivial self-correlation.
feature_names = matrix.columns.tolist()
correlated_pairs = []
for position, col in enumerate(feature_names):
    for current_col in feature_names[position + 1:]:
        relevance = float(matrix.loc[current_col, col])
        # NaN correlations (e.g. constant columns) compare False and are skipped
        if abs(relevance) > threshold:
            correlated_pairs.append((relevance, col, current_col))


if not correlated_pairs:
    print(f'No variables with significance above {threshold}')
else:
    # Strongest relationships first; the sort is stable, so ties keep discovery order
    correlated_pairs.sort(key=lambda pair: abs(pair[0]), reverse=True)
    for relevance, col, current_col in correlated_pairs:
        print(f'{col} and {current_col} have correlation {relevance}')

### Outliers

To analyze outliers, we start by taking a look at box plots of features with potential outliers as well as identifying the number of instances falling out of range. These include the 'amount'-features, and the 'purchases'- and 'visits'-features.

In [None]:
def IQR_bounds(dataframe, column, lower_quantile=0.25, upper_quantile=0.75, should_print=True):
    """Return the ``(lower, upper)`` outlier fences for one column.

    The fences lie 1.5 times the inter-quantile range outside the requested
    lower and upper quantiles. When ``should_print`` is true, the number of
    values falling below / above the fences is printed as well.
    """
    values = dataframe[column]
    q_low, q_high = values.quantile([lower_quantile, upper_quantile]).tolist()

    spread = q_high - q_low
    fences = (q_low - 1.5 * spread, q_high + 1.5 * spread)

    if should_print:
        n_low = (values < fences[0]).sum()
        n_high = (values > fences[1]).sum()
        print(f'{column} has {n_low} low outliers (below {fences[0]}) and {n_high} high outliers (above {fences[1]})')

    return fences

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Amount, purchase and visit features that are inspected for outliers.
# 'Num_AcceptedCmp' and the engineered features ('TotalSpend',
# 'AvgSpendPerPurchase', 'TotalPurchases', 'WebToStorePurchaseRatio') can be
# appended to this list to inspect them as well.
possible_outlier_columns = [
    'MntWines',
    'MntFruits',
    'MntSweetProducts',
    'MntGoldProds',
    'NumWebPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
    'MntFishMeatProdcts',
]

for feature in possible_outlier_columns:
    # Print the IQR fences and outlier counts, then show the matching box plot
    IQR_bounds(data, feature, lower_quantile=0.25, upper_quantile=0.75)
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.boxplot(x=data[feature], ax=ax)
    ax.set_title(f'Boxplot of {feature}')
    plt.show()

From this we see that none of the features have any low outliers. We also see that
- `MntWines`: Has 35 potential outliers. These may be important from a business perspective, as they can be part of a customer segment (e.g. high-expenditure customers). As the majority of the instances are significantly lower than these high outliers, they may however skew the standardization and make the clustering poorer. We will try both removing and keeping them, and see which gives the better clustering.

- `MntFruits`: All possible outliers are evenly spread from Q3 to max. The max is however more than twice the Q3 limit, and including them will significantly skew the standardization. We will try to both include all, and to remove all instances above 2\*Q3. By removing the instances above 2\*Q3, we will still keep some of these potentially important values and not skew the standardization too much.

-  `MntSweetProducts`: The majority of possible outliers increase gradually from Q3, whereas two customers have spent significantly more (more than 25%). The gradual increase from Q3 suggests that these may be an important customer segment, whereas the two lone outliers will be removed. 

- `MntGoldProds`: As with `MntSweetProducts`, the outliers increase gradually and intensively from Q3 to 250 while four are significantly higher. These four will be removed. 

- `NumWebPurchases`: Only 11 outliers. Most of them are close to the Q3, whereas four are significantly higher. We will try to both remove all (as there are few outliers) and to only remove the highest four. 

- `NumStorePurchases`: No outliers. 

- `NumWebVisitsMonth`: 148 potential high outliers. The values do however not vary greatly, and these high web visits may be of importance. Due to their possible importance and their significant number, they will all be kept.

- `MntFishMeatProdcts`: The majority of possible outliers increase gradually from Q3, whereas a few customers have spent significantly more. The significant number of outliers close to Q3, and their potential value, leads us to keep them, while the highest outliers will be removed.

In [None]:
def remove_outliers(df):
    """Drop rows with extreme values and return the reduced DataFrame.

    For each rule, rows whose value in the given column exceeds the limit are
    removed and the number of removed rows is printed. The rules are applied
    one after another, so every count refers to the rows still left after the
    previous rules. The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named in the rules below.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the rows flagged by any rule.
    """
    # The fruit fence is computed once on the incoming frame, before any rows are dropped.
    # NOTE(review): the notebook text describes the fruit cut-off as 2*Q3, while
    # this uses twice the upper IQR fence (Q3 + 1.5*IQR) - confirm which is intended.
    _, bound_fruit = IQR_bounds(df, 'MntFruits', should_print=False)

    # (column, upper limit, label used in the progress message), applied in this order.
    # For web purchases only the values above 20 are removed.
    rules = [
        ('MntWines', 1400, 'wine'),
        ('MntFruits', bound_fruit * 2, 'fruit'),
        ('MntSweetProducts', 200, 'sweets'),
        ('MntGoldProds', 250, 'gold'),
        ('NumWebPurchases', 20, 'web purchase'),
        ('MntFishMeatProdcts', 1250, 'fish_meat'),
    ]

    for column, limit, label in rules:
        outlier_index = df[df[column] > limit].index
        print(f'Removing {len(outlier_index)} {label} outliers')
        df = df.drop(outlier_index)

    return df


In [None]:
data = remove_outliers(data)

### Split data in testing and training

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train, test = train_test_split(data, test_size=0.25, random_state=42)


### Scale data
In the dataset the scale of the features varies greatly; from 0-3 in `Education` to 0-1727 in `MntFishMeatProdcts`. In order to use k-means without bias towards the higher-scale features, we apply standardization through `sklearn.preprocessing.StandardScaler`.

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split only; the test split is then
# transformed with those already-fitted parameters.
scaler = StandardScaler()

train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

# Clustering

*In order to explore the result of the different preprocessing techniques effectively, we have created a `Preprocessor` class and extracted the above methods to it. For the remaining part of the notebook, we utilize this class for preprocessing.*

### Clustering evaluation metrics
**Silhouette score:** The cohesion and separation of clusters. Ranges from -1 to 1, with higher values indicating better clustering.

**Davies-Bouldin score:** The ratio of within-cluster distances to between-cluster distances. A lower value is better.

**Inertia:** The sum of squared distances between samples and their cluster means. A lower value indicates better clustering. 

**Distortion:** The average of squared distances between samples and their cluster means. A lower value indicates better clustering.  

In [None]:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from preprocessor import Preprocessor
from sklearn.model_selection import KFold

prep = Preprocessor()

### Applying K-Means 

In [None]:
def _mean_or_nan(values):
    '''Mean of ``values``, or NaN (without NumPy's empty-slice warning) when empty.'''
    return np.mean(values) if len(values) > 0 else np.nan


def _plot_elbow_curve(K, values, metric_name):
    '''Plot one metric against the candidate values of k.'''
    plt.plot(K, values, 'bx-')
    plt.xlabel('Values of K')
    plt.ylabel(metric_name)
    plt.title(f'The Elbow Method using {metric_name}')
    plt.show()


def _plot_train_test_scores(K, train_scores, test_scores, metric_name):
    '''Plot a score for the training folds (blue) and the held-out folds (red).'''
    plt.figure(figsize=(14, 5))
    plt.subplot(1, 2, 1)
    sns.lineplot(x=K, y=train_scores, marker='o', sort=False, color='blue', label='train')
    sns.lineplot(x=K, y=test_scores, marker='o', sort=False, color='red', label='test')
    plt.xlabel('Values of K')
    plt.ylabel(metric_name)
    plt.title(f'The Elbow Method using {metric_name}')
    plt.legend()
    plt.show()


def elbow_method(df):
    '''Plot clustering metrics for k = 1..14 to help pick the number of clusters.

    For every k, a K-Means model is fitted on each of five shuffled
    cross-validation folds of ``df``. Distortion, inertia, silhouette score
    and Davies-Bouldin score are averaged over the folds and plotted against
    k. Silhouette and Davies-Bouldin are shown for both the training part and
    the held-out part of the folds.

    ``df`` must be a pandas DataFrame (rows are selected with ``.iloc``).
    Nothing is returned; the function only draws figures.
    '''
    db_scores = []
    db_scores_test = []
    silhouettes = []
    silhouettes_test = []
    distortions = []
    inertias = []
    K = range(1, 15)

    # With shuffle=True and an integer seed every call to split() yields the
    # same folds, so one splitter can be shared by all values of k.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for k in K:
        distortion = []
        inertia = []
        silhouette = []
        silhouette_test = []
        db = []
        db_test = []

        for train_index, test_index in kf.split(df):
            fold_train = df.iloc[train_index]
            fold_test = df.iloc[test_index]

            # Fit the model and make predictions
            model = KMeans(n_clusters=k, random_state=42).fit(fold_train)
            labels = model.predict(fold_train)
            test_labels = model.predict(fold_test)

            # NOTE(review): distortion is the mean (unsquared) Euclidean distance
            # to the nearest centroid, taken over ALL rows of df rather than the
            # training fold only - confirm this is the intended definition.
            nearest_distance = np.min(cdist(df, model.cluster_centers_, 'euclidean'), axis=1)
            distortion.append(nearest_distance.sum() / df.shape[0])
            inertia.append(model.inertia_)

            # Silhouette and Davies-Bouldin are only defined for more than one cluster
            if len(np.unique(labels)) > 1:
                silhouette.append(silhouette_score(fold_train, labels))
                db.append(davies_bouldin_score(fold_train, labels))

            if len(np.unique(test_labels)) > 1:
                silhouette_test.append(silhouette_score(fold_test, test_labels))
                db_test.append(davies_bouldin_score(fold_test, test_labels))

        # Append mean of all folds; NaN where no fold produced a score (e.g. k = 1)
        distortions.append(_mean_or_nan(distortion))
        inertias.append(_mean_or_nan(inertia))
        silhouettes_test.append(_mean_or_nan(silhouette_test))
        db_scores_test.append(_mean_or_nan(db_test))
        silhouettes.append(_mean_or_nan(silhouette))
        db_scores.append(_mean_or_nan(db))

    _plot_elbow_curve(K, distortions, 'Distortion')
    _plot_elbow_curve(K, inertias, 'Inertia')
    _plot_train_test_scores(K, silhouettes, silhouettes_test, 'Silhouette score')
    _plot_train_test_scores(K, db_scores, db_scores_test, 'DB score')

In [None]:
# NOTE(review): `Preprocessor.get_data` lives in preprocessor.py and is not
# visible here; presumably `test_size=-1` returns the full preprocessed
# dataset without a held-out split - confirm against that module.
elbow_data, _ = prep.get_data(test_size=-1)


elbow_method(elbow_data)
elbow_data.describe()

The elbow method using inertia, silhouette score and DB score all suggest that the optimal value is `k=6`. The elbow method using distortion suggests that the optimal value is `k=5`.

None of the plots suggests strong

# TODO

### Characterize the clusters

In [None]:
optimal_clusters = 5  # Assuming 5 is optimal from the elbow method
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
clusters = kmeans.fit_predict(train_scaled)

# Only the columns the scaler was fitted on. A later cell adds a 'Cluster'
# column to `train`; excluding it here keeps this cell re-runnable, because the
# column count then still matches `train_scaled`.
feature_columns = train.columns.drop('Cluster', errors='ignore')

# Map the scaled training rows back to the original feature units
train = pd.DataFrame(scaler.inverse_transform(train_scaled), index=train.index, columns=feature_columns)

# Cluster centres expressed in the original feature units
centroids = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=feature_columns)

In [None]:
# Cluster centroids
print(centroids)

In [None]:
# Attach each training row's cluster label. `kmeans` was fitted on
# `train_scaled`, whose rows are in the same order as the rows of `train`.
train['Cluster'] = kmeans.labels_

# Calculate mean for all numerical columns for each cluster
cluster_profiles = train.groupby('Cluster').mean()

# Display cluster profiles
print(cluster_profiles)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Example: Visualizing the distribution of a feature like 'TotalSpend' across clusters
plt.figure(figsize=(10, 6))
sns.boxplot(x='Cluster', y='TotalSpend', data=train)
plt.title('Total Spend by Cluster')
plt.show()

# PCA

In [None]:
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt


# PCA is run on the whole of `data` (no train/test split), so a separate
# scaler is fitted on all rows.
all_data_scaler = StandardScaler()
data_scaled = all_data_scaler.fit_transform(data)

#### One principal component

PCA with `n_components=1` gives an explained variance ratio of ≈0.364

In [None]:
# Project the standardized data onto its first principal component
pca_1 = PCA(n_components=1)
pca_1_components = pca_1.fit_transform(data_scaled)

# Share of the total variance captured by that component
print(pca_1.explained_variance_ratio_)

In [None]:
pca_1_df = pd.DataFrame(data=pca_1_components, columns=['pca1'])

# Component value of every sample, plotted against its row position
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(pca_1_df.index, pca_1_df['pca1'], alpha=0.6)
ax.set_title('PCA: One Principal Component')
ax.set_xlabel('Index')
ax.set_ylabel('pca1')
ax.grid(True)
plt.show()

# Distribution of the component values, with a kernel density estimate on top
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(pca_1_df['pca1'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of pca1')
ax.set_xlabel('pca1')
ax.set_ylabel('Frequency')
plt.show()


#### Two principal components

PCA with `n_components=2` gives an explained variance ratio of ≈0.49

In [None]:

# Project the standardized data onto its first two principal components
pca_2 = PCA(n_components=2)
pca_2_components = pca_2.fit_transform(data_scaled)

# Combined share of the total variance captured by both components
print(pca_2.explained_variance_ratio_.sum())

In [None]:
pca_2_df = pd.DataFrame(data=pca_2_components, columns=['pca1', 'pca2'])

# No `palette` here: without a `hue` variable seaborn ignores it and emits a warning.
plt.figure(figsize=(10, 8))
sns.scatterplot(x='pca1', y='pca2', data=pca_2_df, s=100, alpha=0.6)
plt.title('PCA: Two Principal Components')
plt.xlabel('pca1')
plt.ylabel('pca2')
plt.show()

Neither one nor two principal components are enough to reveal clear clusters in the dataset.

We further try to apply PCA with a higher number of `n_components` and train a k-means model using the resulting principal components. We then compare the clustering we get from this model to the clusterings from kmeans on the original features.