In [1]:
# Here are some libraries we will need:

import pandas as pd
import numpy as np
import itertools
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

In [2]:
# Location and field delimiter of the CSV file read by read_data().
PATH = "./data/Facebook_metrics/dataset_Facebook.csv"
SEPARATOR = ";"

# Columns that prepare_data() hands to pd.get_dummies for one-hot encoding.
DUMMY_COLUMNS = ["Type"]

In [3]:
# metric functions
# --------------------------------------------
def get_cluster_indexes(cluster_assignments):
    """ Groups row positions by the cluster label assigned to each row

    Arguments:
      cluster_assignments - Iterable - one cluster label per row, in row order

    Returns:
      dict mapping each label to the list of positions (0-based, ascending)
      at which that label occurs. Labels appear in order of first occurrence.
      An empty input yields an empty dict.
    """
    cluster_slices = {}

    # Iterate the argument itself; the previous version read a global named
    # `assignments`, which only worked when the caller happened to define it.
    for index, assignment in enumerate(cluster_assignments):
        cluster_slices.setdefault(assignment, []).append(index)

    return cluster_slices

def get_cluster_slices(df, assignments):
    """ Splits a DataFrame into one sub-frame per cluster label

    Arguments:
      df - DataFrame - the rows that were clustered
      assignments - Iterable - one cluster label per row of df

    Returns:
      dict mapping each label to the rows of df (selected by position
      with .iloc) that carry that label
    """
    slices_by_label = {}

    for label, positions in get_cluster_indexes(assignments).items():
        slices_by_label[label] = df.iloc[positions]

    return slices_by_label

def total_sum_of_squares_df(df, centroid = None):
    """ Calculates and returns the total sum of squares (TSS) of a DataFrame

    Arguments:
      df - DataFrame - rows are observations, columns are features
      centroid - Array-like or None - point the squared distances are
                 measured from; when None the centroid of df is used

    Returns:
      The sum over all cells of (value - centroid[column]) ** 2
    """
    if centroid is None:
        centroid = find_centroid_df(df)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # in every pandas version and yields the same 2-D array.
    return total_sum_of_squares(df.values, centroid)

def total_sum_of_squares(data, centroid):
    """ Calculates and returns the total sum of squares (TSS) of the given matrix

    Arguments:
      data - Iterable - This is a matrix (array of arrays); every row must
             have as many values as centroid
      centroid - Array-like - one coordinate per column of data; a pandas
                 Series is accepted and is read by position, not by label

    Returns:
      float - the sum over all cells of (value - centroid[column]) ** 2,
      or 0.0 when data has no rows
    """
    # Materialise generic iterables first: np.asarray cannot consume a
    # generator directly.
    rows = data if isinstance(data, np.ndarray) else list(data)
    matrix = np.asarray(rows, dtype=float)

    if matrix.size == 0:
        return 0.0

    # Converting to a plain array avoids integer-key lookups on a
    # label-indexed Series, which pandas has deprecated.
    center = np.asarray(centroid, dtype=float)

    # Broadcasting subtracts the centroid from every row in one step.
    return float(np.sum((matrix - center) ** 2))

def find_centroid_df(df):
    """ Returns the centroid of a DataFrame: the mean of every column

    The result is what df.mean() produces, i.e. one value per column.
    """
    centroid = df.mean(axis=0)
    return centroid

def run_gaussian_mixture(model, data):
    """ Fits the mixture model on data, then returns its predictions for data """
    model.fit(data)
    labels = model.predict(data)
    return labels

def run_kmeans(model, data):
    """ Fits the k-means model on data, then returns its predictions for data """
    model.fit(data)
    labels = model.predict(data)
    return labels

def run_hclustering(model, data):
    """ Returns the result of the model's single fit_predict call on data """
    labels = model.fit_predict(data)
    return labels

# data functions 
# -------------------------------------------

def clean_data(data):
    """ Normalises text cells and drops incomplete rows

    Steps, in order:
      1. strip surrounding whitespace from every string value
      2. replace the placeholder "?" with a missing value
      3. drop every row that has at least one missing value

    Arguments:
      data - DataFrame - the raw dataset

    Returns:
      A new DataFrame; the input is not modified.
    """
    def strip_column(column):
        # Columns with the dedicated pandas string dtype hold only text
        # (or missing values), so the vectorised accessor is safe there.
        if isinstance(column.dtype, pd.StringDtype):
            return column.str.strip()
        # Object columns may mix strings with other values. `.str.strip()`
        # would turn every non-string into NaN and the dropna() below would
        # then silently discard those rows, so strip strings one by one.
        if column.dtype == object:
            return column.map(
                lambda value: value.strip() if isinstance(value, str) else value
            )
        return column

    data = data.apply(strip_column) \
               .replace(["?"], [None]) \
               .dropna()

    return data

def prepare_data(data):
    """ One-hot encodes the columns listed in DUMMY_COLUMNS via pd.get_dummies """
    encoded = pd.get_dummies(data, columns=DUMMY_COLUMNS)
    return encoded

def read_data(path):
    """ Loads the CSV at path and returns it cleaned and one-hot encoded

    The file is parsed with SEPARATOR as delimiter and its first line as
    header, then passed through clean_data and prepare_data in that order.
    """
    raw = pd.read_csv(path, sep=SEPARATOR, header=0)
    cleaned = clean_data(raw)
    return prepare_data(cleaned)

In [4]:
# Read, clean and one-hot encode the dataset.
#
# Tiny hand-made frame that can be swapped in for the real data when
# checking the metric functions by hand:
# df = pd.DataFrame({
#    "A": [1,2,2,1],
#    "B": [0,0,0,1]
# })

df = read_data(PATH)

In [5]:
# List the column names of the prepared frame, including the dummy columns.
print(df.columns)

Index(['Page total likes', 'Category', 'Post Month', 'Post Weekday',
       'Post Hour', 'Paid', 'Lifetime Post Total Reach',
       'Lifetime Post Total Impressions', 'Lifetime Engaged Users',
       'Lifetime Post Consumers', 'Lifetime Post Consumptions',
       'Lifetime Post Impressions by people who have liked your Page',
       'Lifetime Post reach by people who like your Page',
       'Lifetime People who have liked your Page and engaged with your post',
       'comment', 'like', 'share', 'Total Interactions', 'Type_Link',
       'Type_Photo', 'Type_Status', 'Type_Video'],
      dtype='object')


In [6]:
# Preview the first rows of the prepared frame.
df.head()

Unnamed: 0,Page total likes,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,...,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions,Type_Link,Type_Photo,Type_Status,Type_Video
0,139441,2,12,4,3,0.0,2752,5091,178,109,...,1640,119,4,79.0,17.0,100,0,1,0,0
1,139441,2,12,3,10,0.0,10460,19057,1457,1361,...,6112,1108,5,130.0,29.0,164,0,0,1,0
2,139441,3,12,3,3,0.0,2413,4373,177,113,...,1503,132,0,66.0,14.0,80,0,1,0,0
3,139441,2,12,2,10,1.0,50128,87991,2211,790,...,32048,1386,58,1572.0,147.0,1777,0,1,0,0
4,139441,2,12,2,3,0.0,7244,13594,671,410,...,3200,396,19,325.0,49.0,393,0,1,0,0


In [7]:
# Total sum of squares of the whole dataset around its own centroid.
# The clustering loop divides each within-cluster total by this value.
tts = total_sum_of_squares_df(df, centroid=None)
print("tts for data = %s" % (tts,))

tts for data = 5.14204328974e+12


In [8]:
# Seed for the estimators that start from a random initialisation.
# Without it GaussianMixture and KMeans give different cluster assignments
# on every run, so the printed results could not be reproduced.
RANDOM_STATE = 0

# Each entry is (display name, factory taking the number of clusters k,
# runner taking (model, data) and returning one cluster label per row).
models = [
    (
        "Gaussian Mixture",
        lambda k: GaussianMixture(n_components=k, reg_covar=0.001, random_state=RANDOM_STATE),
        run_gaussian_mixture,
    ),
    (
        "KMeans",
        lambda k: KMeans(n_clusters=k, random_state=RANDOM_STATE),
        run_kmeans,
    ),
    # AgglomerativeClustering takes no random_state parameter.
    (
        "H-Clustering",
        lambda k: AgglomerativeClustering(n_clusters=k),
        run_hclustering,
    ),
]

In [10]:
# For every model and every k from 1 to 10: cluster the data, then report
# each cluster's sum of squares around its own centroid and the ratio of
# their total (ttws) to the sum of squares of the whole dataset (tts).
for model_name, create_model, run_model in models:
    print("-------------------------------")
    print(model_name)
    print("-------------------------------")
    print("")
    for k in range(1,11):
        print("Calculating %s clusters..." % k)
        print("")
        # A fresh model is built for each k and fitted on the full frame.
        model = create_model(k)
        assignments = run_model(model, df)

        # One (label, centroid, rows) tuple per cluster found.
        clusters = [
            (cluster, find_centroid_df(cluster_slice), cluster_slice) 
            for cluster, cluster_slice 
            in get_cluster_slices(df, assignments).items()
        ]

        # Running total of the per-cluster sums of squares.
        ttws = 0

        for cluster, centroid, cluster_slice in clusters:
            cluster_tts = total_sum_of_squares_df(cluster_slice, centroid)
            print("cluster %s | tts = %s | size = %s" % (cluster, cluster_tts, len(cluster_slice)))
            ttws += cluster_tts

        # With a single cluster the ratio is 1.0, as the output for k = 1 shows.
        print("ttws/tts = %s/%s = %s" % (ttws, tts, ttws / tts))
        print("")

-------------------------------
Gaussian Mixture
-------------------------------

Calculating 1 clusters...

cluster 0 | tts = 5.14204328974e+12 | size = 495
ttws/tts = 5.14204328974e+12/5.14204328974e+12 = 1.0

Calculating 2 clusters...

cluster 0 | tts = 1.9613809649e+12 | size = 493
cluster 1 | tts = 204331351308.0 | size = 2
ttws/tts = 2.1657123162e+12/5.14204328974e+12 = 0.421177379141

Calculating 3 clusters...

cluster 1 | tts = 149723615348.0 | size = 376
cluster 0 | tts = 1.17348930384e+12 | size = 117
cluster 2 | tts = 204331351308.0 | size = 2
ttws/tts = 1.52754427049e+12/5.14204328974e+12 = 0.297069508058

Calculating 4 clusters...

cluster 0 | tts = 136808856235.0 | size = 368
cluster 3 | tts = 534247589838.0 | size = 120
cluster 2 | tts = 388641769413.0 | size = 6
cluster 1 | tts = 0.0 | size = 1
ttws/tts = 1.05969821549e+12/5.14204328974e+12 = 0.206085043586

Calculating 5 clusters...

cluster 0 | tts = 128757475660.0 | size = 365
cluster 4 | tts = 280369644965.0 | size 

cluster 3 | tts = 15952762235.1 | size = 231
cluster 7 | tts = 19558111361.3 | size = 88
cluster 0 | tts = 91968220687.2 | size = 53
cluster 1 | tts = 65942760935.0 | size = 10
cluster 2 | tts = 37055750258.8 | size = 108
cluster 4 | tts = 0.0 | size = 1
cluster 6 | tts = 5280858658.67 | size = 3
cluster 5 | tts = 0.0 | size = 1
ttws/tts = 235758464136.0/5.14204328974e+12 = 0.0458491791787

Calculating 9 clusters...

cluster 1 | tts = 15952762235.1 | size = 231
cluster 7 | tts = 19558111361.3 | size = 88
cluster 8 | tts = 24675835427.2 | size = 37
cluster 3 | tts = 21379736032.6 | size = 16
cluster 0 | tts = 65942760935.0 | size = 10
cluster 2 | tts = 37055750258.8 | size = 108
cluster 4 | tts = 0.0 | size = 1
cluster 6 | tts = 5280858658.67 | size = 3
cluster 5 | tts = 0.0 | size = 1
ttws/tts = 189845814909.0/5.14204328974e+12 = 0.0369203066197

Calculating 10 clusters...

cluster 1 | tts = 15952762235.1 | size = 231
cluster 7 | tts = 19558111361.3 | size = 88
cluster 8 | tts = 246758