In [81]:
import itertools

import numpy as np
import pandas as pd
import skfda

# Data
# Path is relative to the notebook's working directory.
TRAIN_CSV_PATH = "../../data/Subway_USA_Preprocessed/subway_usa_processed_train.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH, index_col="store")
# The raw value matrix is handed to FDataGrid unchanged (rows are indexed by `store`).
fdata = skfda.FDataGrid(train_df.values)

# Grid
n_clusters_values = [5, 6, 7]
fuzzifier_values = [1.1, 1.2, 1.3]
max_iter_values = [50, 100, 200]
tol_values = [1e-3, 1e-4, 1e-5]

# Initial set up
best_score = float('inf')
best_params = None
best_fcm = None

# Tuning: exhaustive search over the Cartesian product of the four grids.
# itertools.product varies the right-most grid fastest, i.e. the same visiting
# order as four nested for-loops over the lists in this order.
#
# NOTE(review): the selection criterion is the model's raw `inertia_`. That
# quantity generally shrinks as n_clusters grows, so "lowest inertia" tends to
# pick the largest n_clusters in the grid -- confirm this is intended, or use
# an elbow / silhouette-style criterion instead.
for n_clusters, fuzzifier, max_iter, tol in itertools.product(
        n_clusters_values, fuzzifier_values, max_iter_values, tol_values):
    # Fixed random_state keeps every fit reproducible across runs.
    fcm = skfda.ml.clustering.FuzzyCMeans(n_clusters=n_clusters,
                                          fuzzifier=fuzzifier,
                                          max_iter=max_iter,
                                          tol=tol,
                                          random_state=42)
    fcm.fit(fdata)

    # Score of this combination (lower is better).
    score = fcm.inertia_

    # Strict "<" keeps the first combination encountered when scores tie.
    if score < best_score:
        best_score = score
        best_params = {'n_clusters': n_clusters, 'fuzzifier': fuzzifier, 'max_iter': max_iter, 'tol': tol}
        # Keep the fitted estimator itself: re-fitting with identical
        # parameters and seed would only repeat the same computation.
        best_fcm = fcm

    # Print the score for the current combination of hyperparameters
    print(f"n_clusters={n_clusters}, fuzzifier={fuzzifier}, max_iter={max_iter}, tol={tol}: {score}")

# A NaN score never compares below the running best, so an all-NaN search would
# leave nothing selected; fail here with a clear message instead of later.
if best_fcm is None:
    raise RuntimeError("Grid search found no configuration with a valid inertia score.")

print(f"Best parameters: {best_params}")


n_clusters=5, fuzzifier=1.1, max_iter=50, tol=0.001: 8693.24607228539
n_clusters=5, fuzzifier=1.1, max_iter=50, tol=0.0001: 8693.210469316378
n_clusters=5, fuzzifier=1.1, max_iter=50, tol=1e-05: 8693.210469316378
n_clusters=5, fuzzifier=1.1, max_iter=100, tol=0.001: 8693.24607228539
n_clusters=5, fuzzifier=1.1, max_iter=100, tol=0.0001: 8693.208920769182
n_clusters=5, fuzzifier=1.1, max_iter=100, tol=1e-05: 8693.208340008763
n_clusters=5, fuzzifier=1.1, max_iter=200, tol=0.001: 8693.24607228539
n_clusters=5, fuzzifier=1.1, max_iter=200, tol=0.0001: 8693.208920769182
n_clusters=5, fuzzifier=1.1, max_iter=200, tol=1e-05: 8693.208340008763
n_clusters=5, fuzzifier=1.2, max_iter=50, tol=0.001: 8272.850672827346
n_clusters=5, fuzzifier=1.2, max_iter=50, tol=0.0001: 8272.816326706581
n_clusters=5, fuzzifier=1.2, max_iter=50, tol=1e-05: 8272.816326706581
n_clusters=5, fuzzifier=1.2, max_iter=100, tol=0.001: 8272.850672827346
n_clusters=5, fuzzifier=1.2, max_iter=100, tol=0.0001: 8272.785870116

In [84]:
# import sys
# import os
# cwd = os.getcwd()
# sys.path.append("../src/")
# from subway_usa_build_model import *

In [83]:

def _print_label_shares(title, labels, n_clusters):
    """Print `title`, then the percentage of `labels` equal to each cluster id in range(n_clusters)."""
    labels = np.asarray(labels)
    print(title)
    for cluster in range(n_clusters):
        percentage = np.sum(labels == cluster) / len(labels) * 100
        print(f"Cluster {cluster} Percentage: {percentage:.2f}%")


def print_cluster_percentages(n_clusters, test_df, fcm):
    """Print the share of samples assigned to each cluster, for train and test data.

    The "train data" section is computed from the labels stored on the fitted
    model (``fcm.labels_``); the "test data" section is computed from
    ``fcm.predict`` applied to ``test_df``.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to report; cluster ids ``0 .. n_clusters - 1`` are
        printed. Labels outside that range are not counted.
    test_df : pandas.DataFrame
        The testing dataset; its ``.values`` are wrapped in an ``FDataGrid``.
    fcm : FuzzyCMeans object
        The fitted FuzzyCMeans model (must expose ``labels_`` and ``predict``).

    Returns
    -------
    None
    """
    # Convert the data to an FDataGrid object (local name avoids shadowing the
    # notebook-level `fdata` that holds the training data).
    test_fdata = skfda.FDataGrid(test_df.values)

    # Labels assigned during fit vs. labels predicted for the supplied frame.
    cluster_membership_train = fcm.labels_
    cluster_membership_test = fcm.predict(test_fdata)

    _print_label_shares("train data", cluster_membership_train, n_clusters)
    _print_label_shares("test data", cluster_membership_test, n_clusters)

# Report the cluster-size distribution of the selected model.
# NOTE(review): `train_df` is passed as the `test_df` argument here, so the
# "test data" section re-scores the training rows through `predict()` rather
# than a held-out set -- swap in the real test frame once it is loaded.
print_cluster_percentages(best_params['n_clusters'], train_df, best_fcm)

train data
Cluster 0 Percentage: 14.18%
Cluster 1 Percentage: 13.82%
Cluster 2 Percentage: 20.33%
Cluster 3 Percentage: 18.05%
Cluster 4 Percentage: 15.64%
Cluster 5 Percentage: 1.29%
Cluster 6 Percentage: 16.69%
test data
Cluster 0 Percentage: 14.19%
Cluster 1 Percentage: 13.82%
Cluster 2 Percentage: 20.33%
Cluster 3 Percentage: 18.05%
Cluster 4 Percentage: 15.64%
Cluster 5 Percentage: 1.29%
Cluster 6 Percentage: 16.68%
