In [None]:
import pandas as pd
import numpy as np


# Preprocessing

### 1. Load data

The data contains no null or NA values. All columns have dtype `int64`.

In [None]:
# Load the customer dataset
data = pd.read_csv("customer_data_large.csv")

# Check the dimensions of the dataset (rows, columns)
dataset_dimensions = data.shape

print("The dataset has {} rows and {} columns.".format(dataset_dimensions[0], dataset_dimensions[1]))

# Total count of missing cells over the whole frame.
# isnull() is an alias of isna(), so both lines report the same number.
print(f'The data has {data.isnull().sum().sum()} null values')
print(f'The data has {data.isna().sum().sum()} na values')

# Column dtypes and non-null counts, followed by summary statistics
data.info()
data.describe()

### 2. Visualize data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def show_numerical_figures(df):
    """Draw one 30-bin histogram per numeric column of ``df``.

    Every numeric column gets its own 8x4 inch figure, titled with the
    column name, which is shown before the next column is processed.
    """
    for col in df.select_dtypes(include=np.number).columns:
        _, ax = plt.subplots(figsize=(8, 4))
        sns.histplot(df[col], bins=30, ax=ax)
        ax.set_title(f'Histogram of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        plt.show()


# Plot the distribution of every numeric column of the loaded data
show_numerical_figures(data)

### 3. Aggregate new features

For this unsupervised problem, a number of new features can be extracted by combining existing features.

In [None]:
# Total amount spent: the sum over the spending columns only.
# 'NumWebVisitsMonth' is a visit count, not an amount, so it must not be part of this sum.
# NOTE(review): assumes every spending column is prefixed with 'Mnt' - confirm against the dataset schema.
spend_columns = [col for col in data.columns if col.startswith('Mnt')]
data['TotalSpend'] = data[spend_columns].sum(axis=1)

# Adding 1 to the purchase count to avoid division by zero for customers without purchases
data['AvgSpendPerPurchase'] = data['TotalSpend'] / (data['NumWebPurchases'] + data['NumStorePurchases'] + 1)

data['TotalPurchases'] = data[['NumWebPurchases', 'NumStorePurchases']].sum(axis=1)

data['WebToStorePurchaseRatio'] = data['NumWebPurchases'] / (data['NumStorePurchases'] + 1)  # Adding 1 to avoid division by zero

# Summary statistics, now including the derived features
data.describe()


### 4. Check pairwise correlation

# TODO

In [None]:
# Pearson correlation between every pair of numeric columns
matrix = data.select_dtypes(include=np.number).corr(method='pearson')

threshold = 0.5

# Maps a correlation value to the list of column pairs that have it
relevance_dict = {}

# Walk the upper triangle only, so each unordered pair is checked exactly once
# and a column is never compared with itself.
columns = matrix.columns.tolist()
for position, col in enumerate(columns):
    for current_col in columns[position + 1:]:
        relevance = matrix.loc[col, current_col]
        # A strong negative correlation is as informative as a strong positive one,
        # so the magnitude is compared against the threshold. NaN never passes.
        if abs(relevance) > threshold:
            relevance_dict.setdefault(relevance, []).append((col, current_col))

if not relevance_dict:
    print(f'No variables with significance above {threshold}')
else:
    # Strongest relationships first, regardless of sign
    for relevance in sorted(relevance_dict, key=abs, reverse=True):
        for first, second in relevance_dict[relevance]:
            print(f'{first} and {second} has relevance {relevance}')

### 5. Split data into testing and training

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
train, test = train_test_split(data, test_size=0.25, random_state=42)

# Sanity check: the two parts must account for every row of the data
assert len(train) + len(test) == len(data)
print(f'Training rows: {len(train)}, test rows: {len(test)}')

### 6. Scale data
In the dataset, the scale of the features varies greatly, from 0-3 in `Education` to 0-1727 in `MntFishMeatProdcts`. In order to use k-means and PCA without bias towards the higher-scale features, we apply standardization through `sklearn.preprocessing.StandardScaler`.

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so nothing is learned from the test data.
scaler = StandardScaler().fit(train)

train_scaled = scaler.transform(train)
test_scaled = scaler.transform(test)

# Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt

### 1. K-Means on all features
We start by applying k-means on all features.

In [None]:
def elbow_method(df, k_values=range(2, 15), random_state=42):
    '''Plot clustering quality scores for a range of cluster counts.

    For every k in ``k_values`` a k-means model is fitted on ``df`` and three
    scores are recorded: the inertia, the silhouette score and the
    Davies-Bouldin score. Each score is then plotted against k in its own
    figure, to support a reasonable guess for the number of clusters.

    Parameters
    ----------
    df : data accepted by ``KMeans.fit``
        The (scaled) samples to cluster.
    k_values : iterable of int, default ``range(2, 15)``
        Cluster counts to evaluate. The silhouette and Davies-Bouldin scores
        are only defined for at least 2 clusters.
    random_state : int, default 42
        Seed passed to ``KMeans`` so repeated runs give the same curves.

    Returns
    -------
    None
        The function only draws figures.
    '''
    k_values = list(k_values)

    # One curve per score; the key doubles as the y-axis label
    scores = {"Inertias": [], "Silhouettes": [], "DB_scores": []}

    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=random_state).fit(df)
        scores["Inertias"].append(kmeans.inertia_)
        scores["Silhouettes"].append(silhouette_score(df, kmeans.labels_))
        scores["DB_scores"].append(davies_bouldin_score(df, kmeans.labels_))

    for label, values in scores.items():
        plt.figure()
        plt.plot(k_values, values)
        plt.xlabel("Number of clusters")
        plt.ylabel(label)
        plt.show()


In [None]:
# Evaluate candidate cluster counts on the scaled training data
elbow_method(train_scaled)

# TODO

### Characterize the clusters

In [None]:
optimal_clusters = 8  # Cluster count assumed optimal from the elbow method plots
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
clusters = kmeans.fit_predict(train_scaled)

# Names of the scaled features. The 'Cluster' label column, which a later cell adds to
# `train`, is excluded so that re-running this cell does not fail on a column-count mismatch.
feature_columns = train.columns.drop('Cluster', errors='ignore')

# Map the scaled training data back to the original feature units
train = pd.DataFrame(scaler.inverse_transform(train_scaled), index=train.index, columns=feature_columns)

# Same for the cluster centres, so they can be read in the original units
centroids = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=feature_columns)

In [None]:
# Cluster centroids, mapped back to the original (unscaled) feature units
print(centroids)

In [None]:
# Tag every training row with the cluster label k-means assigned to it.
# This mutates `train` in place; the cells below rely on the new 'Cluster' column.
train['Cluster'] = kmeans.labels_

# Mean of every column within each cluster
cluster_profiles = train.groupby('Cluster').mean()

# Display cluster profiles
print(cluster_profiles)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Compare how 'TotalSpend' is distributed within each cluster
_, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train, x='Cluster', y='TotalSpend', ax=ax)
ax.set_title('Total Spend by Cluster')
plt.show()