In [None]:
# Print the module-level docstring (`__doc__`).
# NOTE(review): presumably inherited from the script this notebook was adapted from — confirm it is still wanted.
print(__doc__)

# Code source adapted from: Jaques Grobler
# License: BSD 3 clause

import random
import matplotlib.pyplot as plt
import numpy as np
import pandas
import os
import csv
import time
from cmath import sqrt
from tqdm import tqdm
from sklearn.datasets import load_digits
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import normalize

import pprint

# Shared pretty-printer (4-space indent); later cells call pp.pprint to dump the lines of each group.
pp = pprint.PrettyPrinter(indent=4)

In [None]:
def graph_add_scatter(x, y, c='black'):
    """Add a scatter series to the current pyplot figure.

    Args:
        x: Horizontal coordinates of the points.
        y: Vertical coordinates of the points.
        c: Color forwarded to ``plt.scatter`` (``'black'`` by default).
    """
    plt.scatter(x, y, color= c)

def graph_add_line(x, y, c='black'):
    """Add a line series (line width 3) to the current pyplot figure.

    Args:
        x: Horizontal coordinates of the line's points.
        y: Vertical coordinates of the line's points.
        c: Color forwarded to ``plt.plot`` (``'black'`` by default).
    """
    plt.plot(x, y, color=c, linewidth=3)

def plot(name=""):
    """Finish the current pyplot figure: optionally save it, then show and close it.

    Args:
        name: Target passed to ``plt.savefig``. The figure is saved only when
            this differs from the empty string (the default).
    """
    save_requested = name != ""

    # Called with no arguments, so no tick values are supplied here.
    plt.xticks()
    plt.yticks()

    if save_requested:
        plt.savefig(name)

    plt.show()
    plt.close()

def distance(v1, v2):
    """Return the Euclidean distance between two vectors.

    Components are paired by position over ``len(v1)`` entries, so ``v2`` must
    hold at least as many components as ``v1``.

    Args:
        v1: First indexable sequence of numbers.
        v2: Second indexable sequence of numbers.

    Returns:
        The real part of the square root of the summed squared differences.
    """
    deltas = (v1[k] - v2[k] for k in range(len(v1)))
    squared_sum = sum(delta * delta for delta in deltas)
    # `sqrt` here is cmath's complex square root; only the real part is returned.
    return sqrt(squared_sum).real

def GetClosest(v, i):
    """Return the index of the vector in ``v`` that is nearest to ``v[i]``.

    Args:
        v: Indexable collection of vectors accepted by ``distance``.
        i: Index of the reference vector; it is never returned.

    Returns:
        The index ``j != i`` with the smallest ``distance(v[j], v[i])``.
        The earliest index wins on ties. Returns ``-1`` when ``v`` holds no
        vector other than ``v[i]``.
    """
    closest = -1
    best_distance = None
    for j in range(len(v)):
        if j == i:
            continue
        # Each candidate distance is computed once; the best one so far is
        # remembered instead of being recomputed on every comparison.
        candidate = distance(v[j], v[i])
        if closest == -1 or candidate < best_distance:
            closest = j
            best_distance = candidate
    return closest

def GetGroup(labels, group, offset=0):
    """Return the positions in ``labels`` whose value equals ``group``.

    Args:
        labels: Indexable sequence of labels.
        group: Label value to look for.
        offset: Amount added to every matching position (default ``0``).

    Returns:
        A list of ``position + offset`` for each matching entry, in ascending order.
    """
    return [pos + offset for pos in range(len(labels)) if labels[pos] == group]

class DataReader():
    """In-memory view of a text file, one entry per line.

    ``self.lines`` holds one two-item list ``[text, is_outlier]`` per line read:
    ``text`` is the line's position in the file, a space, and the last
    ``|``-separated field of the line; ``is_outlier`` starts as ``False`` and is
    switched on by ``SetOutliers``.
    """

    def __init__(self, file_path="health-dataset/health.txt"):
        """Read every line of ``file_path`` into ``self.lines``.

        Args:
            file_path: Path of the ``|``-separated text file to load.
        """
        self.lines = []
        # The context manager closes the handle even if reading fails.
        with open(file_path) as f:
            for i, line in enumerate(f):
                s = line.split("|")
                self.lines.append([str(i) + " " + s[-1], False])

    def GetLine(self, line):
        """Return the stored text at position ``line``.

        Returns:
            The stored text, or ``"<line> Is Not a Valid Line"`` when ``line``
            is negative or past the last stored line.
        """
        # Either bound being violated makes the index invalid, hence `or`.
        if line < 0 or line >= len(self.lines):
            return str(line) + " Is Not a Valid Line"
        return self.lines[line][0]

    def GetLineGroup(self, lines):
        """Return the texts at positions ``lines``, skipping entries flagged as outliers."""
        return [self.lines[i][0] for i in lines if not self.lines[i][1]]

    def DeleteLines(self, d):
        """Print and delete the entries whose positions are listed in ``d``.

        Args:
            d: Iterable of positions in ``self.lines`` to remove.
        """
        # Delete from the highest position down so earlier deletions do not
        # shift the positions still to be removed; duplicates are collapsed.
        for i in sorted(set(d), reverse=True):
            print(self.lines[i])
            del self.lines[i]

    def DeleteOutliers(self):
        """Print and delete every entry currently flagged as an outlier."""
        # Walk backwards so deleting an entry never shifts one not yet visited.
        for i in range(len(self.lines)-1,-1,-1):
            if self.lines[i][1]:
                print(self.lines[i])
                del self.lines[i]

    def SetOutliers(self, o):
        """Flag the entries at the positions in ``o`` as outliers."""
        for i in o:
            self.lines[i][1] = True

# Load the text lines from the default path ("health-dataset/health.txt"); later cells look lines up by row index.
data_reader = DataReader()

# Read Dataset

In [None]:
# Load the feature matrix from CSV and pass it through sklearn's `normalize` (default settings).
dataset = normalize(pandas.read_csv('health-dataset/word2vec.csv').values)

# Rows 0..9999 are used for training, rows 10000..13226 for validation.
data_train, data_validation = dataset[:10000], dataset[10000:13227]

# K-Means Algorithm

In [None]:
# Sweep the number of clusters and record, for each value, the KMeans cost
# (inertia of the fit on data_train) plus the silhouette and Davies-Bouldin
# scores of the predicted labels on data_validation.
kmeans_costs = []
kmeans_clusters = []
kmeans_silhouette = []
kmeans_davies = []

# 10..90 in steps of 10, then 100..2000 in steps of 100, handled by one loop
# instead of two copies of the same body.
cluster_counts = list(range(10, 100, 10)) + list(range(100, 2001, 100))

for n in cluster_counts:
    print("Training KMeans for " + str(n) + " clusters")
    kmeans = KMeans(n_clusters=n)
    kmeans.fit(data_train)

    labels = kmeans.predict(data_validation)
    s = silhouette_score(data_validation,labels)
    d = davies_bouldin_score(data_validation,labels)
    print("\t\t Cost: " + str(kmeans.inertia_))
    print("\t\t Silhouette Score: " + str(s))
    print("\t\t Davies Bouldin Score: " + str(d))

    kmeans_costs.append(kmeans.inertia_)
    kmeans_clusters.append(n)
    kmeans_silhouette.append(s)
    kmeans_davies.append(d)

In [None]:
# One figure per recorded metric, each plotted against the number of clusters
# and saved under the given file name.
for metric_values, file_name in (
    (kmeans_costs, "cost_nclusters_10_2000"),
    (kmeans_silhouette, "silhouette_nclusters_10_2000"),
    (kmeans_davies, "davies_nclusters_10_2000"),
):
    graph_add_line(kmeans_clusters, metric_values)
    graph_add_scatter(kmeans_clusters, metric_values, c='blue')
    plot(file_name)

## Getting Clusters

Here we get the tweets using kmeans with 100 and 1000 clusters to compare

In [None]:
# Fit a 100-cluster KMeans model on the training rows; the next cell evaluates it on data_validation.
kmeans_100 = KMeans(n_clusters=100)
kmeans_100.fit(data_train)

In [None]:
# Score the 100-cluster model on the validation rows.
labels = kmeans_100.predict(data_validation)
s100 = silhouette_score(data_validation, labels)
d100 = davies_bouldin_score(data_validation, labels)

print("Cost: " + str(kmeans_100.inertia_))
print("Silhouette Score for 100 Clusters is: " + str(s100))
print("Davies Bouldin Score for 100 Clusters is: " + str(d100))

# For every cluster: its nearest other cluster (by center distance) and the
# validation lines assigned to it. The 10000 offset maps validation positions
# back to dataset rows, since data_validation starts at row 10000.
centers_100 = kmeans_100.cluster_centers_
for group_id in range(100):
    print("\n\nGroup " + str(group_id))
    nearest_group = GetClosest(centers_100, group_id)
    print("Closest Group: " + str(nearest_group) + "\n")
    member_rows = GetGroup(labels, group_id, offset=10000)
    pp.pprint(data_reader.GetLineGroup(member_rows))

In [None]:
# Fit a 1000-cluster KMeans model on the training rows; the next cell evaluates it on data_validation.
kmeans_1000 = KMeans(n_clusters=1000)
kmeans_1000.fit(data_train)

In [None]:
# Score the 1000-cluster model on the validation rows.
labels = kmeans_1000.predict(data_validation)
s1000 = silhouette_score(data_validation, labels)
print("Silhouette Score for 1000 Clusters is: " + str(s1000))
d1000 = davies_bouldin_score(data_validation, labels)
print("Davies Bouldin Score for 1000 Clusters is: " + str(d1000))

centers_1000 = kmeans_1000.cluster_centers_

# NOTE(review): standalone lookup for cluster 58 — looks like a leftover spot check; confirm it is still needed.
print(GetClosest(centers_1000, 58))

# For every cluster: its nearest other cluster and the validation lines assigned
# to it (offset 10000 because data_validation starts at dataset row 10000).
for group_id in range(1000):
    print("\n\nGroup " + str(group_id))
    nearest_group = GetClosest(centers_1000, group_id)
    print("Closest Group: " + str(nearest_group) + "\n")
    member_rows = GetGroup(labels, group_id, offset=10000)
    pp.pprint(data_reader.GetLineGroup(member_rows))

## Affinity Propagation

In [None]:
from sklearn.cluster import AffinityPropagation

# Fit AffinityPropagation with its default parameters on the training rows.
print("Training Affinity")
affinity = AffinityPropagation()
affinity.fit(data_train)

In [None]:
# Score the affinity-propagation model on the validation rows.
labels = affinity.predict(data_validation)
affinity_silhouette = silhouette_score(data_validation, labels)
print("Silhouette Score: " + str(affinity_silhouette))
affinity_davies = davies_bouldin_score(data_validation, labels)
print("Davies Bouldin Score: " + str(affinity_davies))

# Number of groups found, then each group's nearest other group and its
# validation lines (offset 10000 because data_validation starts at row 10000).
n_affinity_groups = len(affinity.cluster_centers_indices_)
print(n_affinity_groups)
for group_id in range(n_affinity_groups):
    print("\nGroup " + str(group_id))
    nearest_group = GetClosest(affinity.cluster_centers_, group_id)
    print("Closest Group: " + str(nearest_group) + "\n")
    member_rows = GetGroup(labels, group_id, offset=10000)
    pp.pprint(data_reader.GetLineGroup(member_rows))

## PCA Analysis

Here we take the best algorithm and number of clusters from the previous experiments and run it again with different numbers of features, using PCA to reduce the dimensionality

In [None]:
from sklearn.decomposition import PCA

In [None]:
n_features = data_train.shape[1]

# Full PCA, used only to read the cumulative explained-variance curve.
pca = PCA()

pca.fit(dataset)
variance = pca.explained_variance_ratio_.cumsum()

# Get Dimensionality with variance of 0.95:
# fn is the index of the first cumulative value above 0.95, so fn+1 components are needed.
fn = 0
for i in range(len(variance)):
    if variance[i] > 0.95:
        fn = i
        break

print("Found feature subset of size " + str(fn+1) + " with variance of " + str(variance[fn]))

# Keep exactly the fn+1 components found above, matching the size reported in the messages.
pca = PCA(n_components=fn+1)
data_train_pca = pca.fit_transform(data_train)

# Fit Kmeans algorithm for 0.95 variance
print("Training KMeans with " + str(fn+1) + " features")
kmeans_pca = KMeans(n_clusters=1000)
kmeans_pca.fit(data_train_pca)

# Project the validation rows with the PCA already fitted on data_train.
# Refitting here would place them in a different space from the one KMeans was trained in.
data_val_pca = pca.transform(data_validation)

labels = kmeans_pca.predict(data_val_pca)

s = silhouette_score(data_val_pca,labels)
d = davies_bouldin_score(data_val_pca,labels)

# Report the variance kept by all retained components, not only the first one.
print("\t\t Data Variance: " + str(pca.explained_variance_ratio_.cumsum()[-1]))
print("\t\t Silhouette Score: " + str(s))
print("\t\t Davies Bouldin Score: " + str(d))

In [None]:
# Dump the validation lines assigned to each of the 1000 clusters
# (offset 10000 because data_validation starts at dataset row 10000).
for group_id in range(1000):
    print("\n\nGroup " + str(group_id) + "\n")
    member_rows = GetGroup(labels, group_id, offset=10000)
    pp.pprint(data_reader.GetLineGroup(member_rows))

In [None]:
n_features = data_train.shape[1]

# Metrics collected per number of PCA components.
kmeans_costs, kmeans_feature_number = [], []
kmeans_silhouette, kmeans_davies = [], []

# Try 1, 11, 21, ... components: fit PCA on the training rows, cluster the
# projected training data into 1000 groups, and score on the projected validation rows.
for fn in range(1, n_features, 10):
    pca = PCA(n_components=fn)
    pca.fit(data_train)

    print("Training KMeans for 1000 clusters with " + str(fn) + " features")
    data_train_pca = pca.transform(data_train)
    kmeans_pca = KMeans(n_clusters=1000)
    kmeans_pca.fit(data_train_pca)

    data_validation_pca = pca.transform(data_validation)
    labels = kmeans_pca.predict(data_validation_pca)
    s = silhouette_score(data_validation_pca, labels)
    d = davies_bouldin_score(data_validation_pca, labels)

    # Last cumulative value = total explained-variance ratio of the kept components.
    kept_variance = pca.explained_variance_ratio_.cumsum()[-1]
    for prefix, value in (
        ("\t\t Data Variance: ", kept_variance),
        ("\t\t Cost: ", kmeans_pca.inertia_),
        ("\t\t Silhouette Score: ", s),
        ("\t\t Davies Bouldin Score: ", d),
    ):
        print(prefix + str(value))

    kmeans_costs.append(kmeans_pca.inertia_)
    kmeans_feature_number.append(fn)
    kmeans_silhouette.append(s)
    kmeans_davies.append(d)

# One figure per metric, plotted against the number of components.
for metric_values, file_name in (
    (kmeans_costs, "cost_features_10_2000"),
    (kmeans_silhouette, "silhouette_features_10_2000"),
    (kmeans_davies, "davies_features_10_2000"),
):
    graph_add_line(kmeans_feature_number, metric_values)
    graph_add_scatter(kmeans_feature_number, metric_values, c='blue')
    plot(file_name)

# DBSCAN - Detecting Outliers

Since the results of the previous experiments were subpar, we try to detect outliers and cluster again using DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

Here, we try different values for the "eps" parameter, which is the maximum distance between two samples for them to be considered neighbours.

We then plot graphs to see the results

In [None]:
# Per-eps results: noise-point count, cluster count, silhouette score and the
# size of the largest cluster.
n_eps = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
outliers, clusters = [], []
sil, biggest_cluster = [], []

for eps in n_eps:
    print("DBSCAN for eps = " + str(eps))
    db = DBSCAN(eps=eps)
    db.fit(dataset)
    labels = db.labels_

    sil.append(silhouette_score(dataset, labels))

    # Label -1 is treated as "outlier" throughout this notebook.
    noise_lines = data_reader.GetLineGroup(GetGroup(labels, -1))
    outliers.append(len(noise_lines))

    # Distinct labels, not counting the -1 label when it is present.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    clusters.append(n_clusters)

    cluster_sizes = [
        len(data_reader.GetLineGroup(GetGroup(labels, cluster_id)))
        for cluster_id in range(n_clusters)
    ]
    biggest_cluster.append(max(cluster_sizes))


In [None]:
# One figure per DBSCAN metric, each plotted against eps and preceded by a
# separator line and a heading. Figures are shown but not saved (plot() gets no name).
for heading, series in (
    ("eps x number of clusters found", clusters),
    ("eps x silhouette score", sil),
    ("eps x number of outliers", outliers),
    ("eps x biggest cluster", biggest_cluster),
):
    print("------------------------------------------")
    print(heading)
    graph_add_line(n_eps, series)
    graph_add_scatter(n_eps, series, c='blue')
    plot()

With the results above, we can see that between eps = 0.5 and eps = 0.8 the number of outliers diminishes and the size of the biggest cluster increases. We think that all the outliers are being grouped together. We can check that below

In [None]:
# Get outliers: lines labelled -1 by DBSCAN with eps=0.5
db = DBSCAN(eps=0.5)
db.fit(dataset)

labels = db.labels_

outliers = data_reader.GetLineGroup(GetGroup(labels,-1))

# Get Biggest cluster: largest group found by DBSCAN with eps=0.8
db = DBSCAN(eps=0.8)
db.fit(dataset)

labels = db.labels_

# Count the clusters of THIS fit; a value left over from another cell would
# describe a different eps and could skip or over-run the real cluster ids.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

group_lengths = [len(data_reader.GetLineGroup(GetGroup(labels,i))) for i in range(n_clusters)]
biggest_c = data_reader.GetLineGroup(GetGroup(labels,group_lengths.index(max(group_lengths))))

# Compare if biggest cluster has outliers.
# A set makes each membership test constant-time instead of a scan of the whole list.
outlier_lookup = set(outliers)
common = sum(1 for line in biggest_c if line in outlier_lookup)

print("number of outliers = " + str(len(outliers)))
print("Number of elements in common = " + str(common))

With this result, we can see that most of the outliers end up inside a single new group. This suggests that there are a lot of outliers in our dataset

In [None]:
# Refit DBSCAN with eps=0.5 on the full dataset; the next cell reads db.labels_ to find the rows labelled -1.
db = DBSCAN(eps=0.5)
db.fit(dataset)

Next, we remove all the outliers and fit DBSCAN again for the final results

In [None]:
labels = db.labels_

# Row positions labelled -1; computed once and reused for flagging and deletion.
outlier_rows = [row for row, label in enumerate(labels) if label == -1]

print("------------------------------------------------------")
print("OUTLIERS\n")
outliers = data_reader.GetLineGroup(GetGroup(labels, -1))
print("There are " + str(len(outliers)) + " outliers in the dataset\n")
pp.pprint(outliers)

data_reader.SetOutliers(outlier_rows)

# Drop the same rows from the feature matrix and from the text lines so the
# two stay aligned by position.
new_data = np.delete(dataset, outlier_rows, axis=0)
data_reader.DeleteOutliers()

In [None]:
# Cluster the outlier-free data with DBSCAN's default parameters.
db = DBSCAN()
db.fit(new_data)
print("------------------------------------------------------")
print("SCORES\n")
labels = db.labels_

# Distinct labels, not counting the -1 label when it is present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

# Report the number of clusters actually found rather than a fixed figure.
s = silhouette_score(new_data,labels)
print("Silhouette Score for " + str(n_clusters_) + " Clusters is: " + str(s))
d = davies_bouldin_score(new_data,labels)
print("Davies Bouldin Score for " + str(n_clusters_) + " Clusters is: " + str(d))

print("There are " + str(n_clusters_) + " different clusters")

print(len(db.components_))

# db.components_ holds individual core samples, not one vector per cluster, so
# indexing it by cluster id does not compare clusters. Build one centroid per
# cluster (mean of its members) and look for the nearest centroid instead.
centroids = [new_data[labels == k].mean(axis=0) for k in range(n_clusters_)]

for i in range(n_clusters_):
    print("\n\nGroup " + str(i))
    print("Closest Group: " + str(GetClosest(centroids,i)) + "\n")
    pp.pprint(data_reader.GetLineGroup(GetGroup(labels,i,offset=0)))