In [22]:
import pandas as pd
import numpy as np
import itertools
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

In [23]:
# Location of the Frogs MFCCs CSV, given relative to the notebook's working directory.
PATH = "./data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv"

In [24]:
# metric functions
# --------------------------------------------

def total_sum_of_squares_df(df, centroid = None):
    """Calculates and returns the total sum of squares (TSS) of a DataFrame.

    Arguments:
      df - DataFrame whose rows are the data points
      centroid - point the squared deviations are measured from;
                 defaults to the centroid of ``df`` itself

    Returns the sum of squared differences between every value of ``df`` and
    the centroid coordinate of its column.
    """
    if centroid is None:
        centroid = find_centroid_df(df)

    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
    # supported way to get the frame's values as a 2-D array.
    # The centroid is handed over as a plain array so it is indexed by
    # position rather than by column label.
    return total_sum_of_squares(df.to_numpy(), np.asarray(centroid))

def total_sum_of_squares(data, centroid):
    """Calculates and returns the total sum of squares (TSS) of a matrix.

    Arguments:
      data - Iterable<Iterable> of numbers, one inner iterable per data point
      centroid - Array with one coordinate per column of ``data``

    Returns the sum over all rows and columns of
    (value - centroid[column]) ** 2 as a float; 0.0 when ``data`` is empty.

    Raises ValueError when the rows are ragged or their length differs from
    the length of the centroid.
    """
    # np.asarray reads a pandas Series by position, so a centroid indexed by
    # column labels (e.g. the result of DataFrame.mean()) is handled without
    # depending on integer keys being treated as positions.
    center = np.asarray(centroid, dtype=float)

    if not isinstance(data, np.ndarray):
        data = list(data)  # materialise generators so numpy sees the rows
    matrix = np.asarray(data, dtype=float)

    if matrix.size == 0:
        return 0.0

    if center.ndim != 1 or matrix.ndim != 2 or matrix.shape[1] != center.shape[0]:
        raise ValueError(
            "data must be a 2-D matrix whose row length matches the centroid"
        )

    # Broadcasting subtracts the centroid from every row in one step.
    return float(np.sum((matrix - center) ** 2))


def find_centroid_df(df):
    """ Returns the centroid of a DataFrame, i.e. the mean of each column """
    column_means = df.mean()
    return column_means

# clustering functions
# --------------------------

def get_cluster_indexes(cluster_assignments):
    """Groups row positions by the cluster each row was assigned to.

    Arguments:
      cluster_assignments - Iterable of cluster labels, one per row, in row order

    Returns a dict that maps each cluster label to the list of 0-based
    positions (in ascending order) at which that label occurs.
    """
    cluster_slices = {}

    # Iterate the argument itself. The previous body iterated a global called
    # `assignments`, so it only worked when the calling scope happened to have
    # defined that global and ignored whatever was actually passed in.
    for index, assignment in enumerate(cluster_assignments):
        cluster_slices.setdefault(assignment, []).append(index)

    return cluster_slices

def get_cluster_data(df, assignments):
    """Splits a DataFrame into one sub-frame per cluster.

    Returns a dict that maps each cluster label to the rows of ``df``
    (selected by position) belonging to that cluster.
    """
    positions_by_cluster = get_cluster_indexes(assignments)

    frames = {}
    for label, positions in positions_by_cluster.items():
        frames[label] = df.iloc[positions]

    return frames

def get_clusters(df, assignments):
    """Returns a list of tuples with (<cluster>, <cluster_centroid>, <cluster_data_points>)"""
    clusters = []

    for label, points in get_cluster_data(df, assignments).items():
        clusters.append((label, find_centroid_df(points), points))

    return clusters

# model runners
# ------------------------------------------

def run_gaussian_mixture(model, data):
    """Fits the mixture model on ``data`` and returns its prediction for the same data."""
    model.fit(data)
    labels = model.predict(data)
    return labels

def run_kmeans(model, data):
    """Fits the k-means model on ``data`` and returns its prediction for the same data."""
    model.fit(data)
    labels = model.predict(data)
    return labels

def run_hclustering(model, data):
    """Runs the model's combined fit-and-predict step on ``data`` and returns its result."""
    labels = model.fit_predict(data)
    return labels

# data functions 
# -------------------------------------------

def clean_data(data, feature_columns=slice(2, 22)):
    """Cleans the raw dataset and keeps only the selected columns.

    Steps, in order:
      1. strip surrounding whitespace from every string value
      2. treat "?" as a missing value
      3. drop every row that contains a missing value
      4. keep only the columns selected by ``feature_columns``

    Arguments:
      data - DataFrame to clean
      feature_columns - positional column selector handed to ``iloc``
                        (a slice or a list of ints); defaults to positions
                        2 up to, but not including, 22

    Returns a new DataFrame; ``data`` itself is left unmodified.
    """
    def strip_strings(column):
        # Strip value by value: the .str accessor would turn every non-string
        # entry of a mixed-type column into NaN, and dropna() would then
        # silently discard those rows.
        if pd.api.types.is_string_dtype(column.dtype):
            return column.map(
                lambda value: value.strip() if isinstance(value, str) else value
            )
        return column

    cleaned = data.apply(strip_strings) \
                  .replace("?", np.nan) \
                  .dropna()

    return cleaned.iloc[:, feature_columns]

def prepare_data(data):
    """Placeholder preparation step: hands the given data back unchanged."""
    prepared = data
    return prepared

def read_data(path):
    """Loads the CSV at ``path`` and runs it through clean_data and prepare_data."""
    raw = pd.read_csv(path)
    return prepare_data(clean_data(raw))

In [25]:
# Load, clean and prepare the dataset once; the cells below reuse `df`.
df = read_data(PATH)

In [27]:
# Total sum of squares of the whole dataset around its own centroid;
# it is the denominator of the twss/tss ratio printed for every clustering.
tss = total_sum_of_squares_df(df)
print("tss for data = %s" % tss)

tss for data = 3349.31828079


In [31]:
# Fixed seed so the stochastic models (KMeans, GaussianMixture) produce the
# same clusters on every run. AgglomerativeClustering takes no seed.
RANDOM_STATE = 0

# Each entry is (display name, factory building the model for k clusters,
# runner that fits the model and returns the cluster assignments).
models = [
    ("KMeans", lambda k: KMeans(n_clusters=k, random_state=RANDOM_STATE), run_kmeans),
    ("H-Clustering", lambda k: AgglomerativeClustering(n_clusters=k), run_hclustering),
    ("Gaussian Mixture", lambda k: GaussianMixture(n_components=k, reg_covar=0.001, random_state=RANDOM_STATE), run_gaussian_mixture)
]

In [32]:
# For every model, cluster the data into k = 1..10 clusters and report how much
# of the dataset's total sum of squares remains inside the clusters (twss / tss).
for model_name, create_model, run_model in models:
    print("-------------------------------")
    print(model_name)
    print("-------------------------------")
    print("")
    for k in range(1,11):
        print("Calculating %s clusters..." % k)
        print("")
        # A fresh model is built for each k and run on the full dataset.
        model = create_model(k)
        assignments = run_model(model, df)
        clusters = get_clusters(df, assignments)

        # twss: total within-cluster sum of squares, each cluster measured
        # around its own centroid.
        twss = 0
        for cluster, centroid, cluster_slice in clusters:
            cluster_tss = total_sum_of_squares_df(cluster_slice, centroid)
            print("cluster %s | tss = %s | size = %s" % (cluster, cluster_tss, len(cluster_slice)))
            twss += cluster_tss

        # tss is the dataset-wide value computed in an earlier cell.
        print("twss/tss = %s/%s = %s" % (twss, tss, twss / tss))
        print("")

-------------------------------
KMeans
-------------------------------

Calculating 1 clusters...

cluster 0 | tss = 3349.31828079 | size = 7195
twss/tss = 3349.31828079/3349.31828079 = 1.0

Calculating 2 clusters...

cluster 1 | tss = 1676.637478 | size = 3612
cluster 0 | tss = 425.598015616 | size = 3583
twss/tss = 2102.23549361/3349.31828079 = 0.627660711038

Calculating 3 clusters...

cluster 1 | tss = 378.943084823 | size = 1062
cluster 0 | tss = 883.984105137 | size = 2551
cluster 2 | tss = 419.431735816 | size = 3582
twss/tss = 1682.35892578/3349.31828079 = 0.502298911222

Calculating 4 clusters...

cluster 0 | tss = 371.716560818 | size = 1046
cluster 2 | tss = 506.337217334 | size = 1964
cluster 3 | tss = 150.890188737 | size = 608
cluster 1 | tss = 416.522259974 | size = 3577
twss/tss = 1445.46622686/3349.31828079 = 0.431570279586

Calculating 5 clusters...

cluster 2 | tss = 367.70256794 | size = 1040
cluster 0 | tss = 281.078991057 | size = 1580
cluster 3 | tss = 138.559556

cluster 4 | tss = 46.9319590226 | size = 753
twss/tss = 1608.01327183/3349.31828079 = 0.480101661599

Calculating 7 clusters...

cluster 1 | tss = 344.981596995 | size = 989
cluster 3 | tss = 777.263876341 | size = 1482
cluster 4 | tss = 21.2511156404 | size = 376
cluster 6 | tss = 105.289740224 | size = 1698
cluster 5 | tss = 55.7261499582 | size = 905
cluster 0 | tss = 46.9319590226 | size = 753
cluster 2 | tss = 64.267488959 | size = 992
twss/tss = 1415.71192714/3349.31828079 = 0.422686591256

Calculating 8 clusters...

cluster 2 | tss = 213.945614822 | size = 592
cluster 6 | tss = 450.732672179 | size = 797
cluster 7 | tss = 111.95412255 | size = 512
cluster 4 | tss = 16.2846448098 | size = 361
cluster 0 | tss = 195.105505651 | size = 733
cluster 1 | tss = 160.088921931 | size = 2348
cluster 5 | tss = 66.877382696 | size = 954
cluster 3 | tss = 40.0278223554 | size = 898
twss/tss = 1255.01668699/3349.31828079 = 0.374708099314

Calculating 9 clusters...

cluster 7 | tss = 193.589192