# Module 5 - Clustering!

## Topic 1

### To start, we are going to step through a standard example.  We will first create 4 sets of random numbers in clusters and then cluster them (which should obviously give great results)

In [None]:
# import statements
from sklearn.datasets import make_blobs
import numpy as np
import matplotlib.pyplot as plt
# Generate 200 two-feature samples scattered around 4 centres; the fixed
# random_state makes the blobs identical on every run.
data = make_blobs(
    n_samples=200,
    n_features=2,
    centers=4,
    cluster_std=1.6,
    random_state=50,
)
# `data` is a pair of arrays: data[0] holds the data points and data[1] holds
# the cluster each point was generated for. Only the points are clustered.
points = data[0]
# Quick sanity check: the first five rows, the array shape, and its type.
print(points[:5], points.shape, type(points), sep='\n')

### Let's see the data on a scatter plot first to get an idea of what we are looking at

In [None]:
# Slice the two coordinate columns once so both charts share them.
x_coords = data[0][:, 0]  # first feature column of the data points
y_coords = data[0][:, 1]  # second feature column of the data points

# Chart 1: the raw points with no cluster information.
plt.scatter(x_coords, y_coords)
plt.xlim(-15, 15)
plt.ylim(-15, 15)
plt.show()

# Chart 2: the same points, coloured by the cluster label produced when the
# random blobs were created (data[1]).
plt.scatter(x_coords, y_coords, c=data[1], cmap='viridis')
plt.xlim(-15, 15)
plt.ylim(-15, 15)
plt.show()

### Now to build a k-means model to assign these to clusters

In [None]:
# import KMeans
from sklearn.cluster import KMeans

In [None]:
# Create the k-means model. random_state pins the centroid initialisation so the
# cluster numbering is the same on every run, and n_init is spelled out because
# its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=50)
print("data type:\n",type(kmeans),end='\n\n')
# Fit exactly once and keep the labels from that same fit. Fitting a second
# time (fit followed by fit_predict) re-runs the algorithm, so the centers
# printed from the first fit would not be guaranteed to match the labels.
y_km = kmeans.fit_predict(points)
# print location of clusters learned by kmeans object
print("centers:\n",kmeans.cluster_centers_, end='\n\n')
# show the cluster assigned to each point, plus the type/shape of that result
print("assigned cluster:\n",y_km,end='\n\n')
print("cluster prediction variable type:\n",type(y_km),end="\n\n")
print("cluster prediction variable shape:\n",y_km.shape)

In [None]:
# Draw every k-means cluster in its own colour. `y_km == label` is a boolean
# mask that picks out the rows of `points` assigned to that cluster.
cluster_colors = ['red', 'green', 'blue', 'cyan']
for label, color in enumerate(cluster_colors):
    in_cluster = y_km == label
    plt.scatter(points[in_cluster, 0], points[in_cluster, 1], s=100, c=color)
# Overlay the cluster centers the model discovered, one black marker each.
for center in kmeans.cluster_centers_:
    plt.scatter(center[0], center[1], color='black', s=100)

### Let's see how this same data looks with hierarchical clustering

In [None]:
# import hierarchical clustering libraries
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Draw the dendrogram on a wide canvas. Handing the setting to rc_context keeps
# the larger figure size local to this `with` block, so the rest of the
# notebook's plots are unaffected.
with plt.rc_context({'figure.figsize': (28, 10)}):
    # create dendrogram
    dendrogram = sch.dendrogram(sch.linkage(points, method='ward'))

# create clusters
# Ward linkage only works with Euclidean distance, which is already the
# default, so no distance argument is passed. The old `affinity` keyword was
# renamed to `metric` and is rejected by newer scikit-learn releases.
hc = AgglomerativeClustering(n_clusters=4, linkage='ward')
# save clusters for chart
y_hc = hc.fit_predict(points)

### Let's plot the points with their assigned clusters again for hierarchical

In [None]:
# One scatter call per hierarchical cluster: the boolean mask `y_hc == label`
# selects the points that were assigned to that cluster.
hc_colors = ['red', 'black', 'blue', 'cyan']
for label, color in enumerate(hc_colors):
    members = y_hc == label
    plt.scatter(points[members, 0], points[members, 1], s=100, c=color)

### Now that we've seen the pretty clustering with faked up data, let's give this a try with something real.  Last week we worked with the interests dataset and I want to see how it clusters

In [None]:
import pandas as pd
# Load the survey responses; the path is relative to the notebook's working directory.
responses_path = 'responses.csv'
interest_data = pd.read_csv(responses_path)

### For the first part, let's look specifically at the music data

In [None]:
# Select columns 1 through 17 by position (the upper bound of an iloc slice is
# exclusive) and convert them to a NumPy array.
# NOTE(review): presumably these are the music-preference columns - confirm
# against the CSV header.
# Staying in pandas would also work; NumPy is used because the code below was
# easier to write against an array.
music_columns = interest_data.iloc[:, 1:18]
music_type_interest = music_columns.to_numpy()

In [None]:
# Print the array to get a first look at the raw values before clustering.
print(music_type_interest)

### Now let's cluster it

In [None]:
# Note: this fails on purpose! The data has not been cleaned yet, and KMeans
# raises a ValueError when it cannot use the input it is given. The error is
# caught and printed (instead of left to crash the cell) so the message is
# still visible while "Run All" can continue to the cells that fix the data.
try:
    # create kmeans object
    kmeans = KMeans(n_clusters=4, n_init=10, random_state=50)
    # fit kmeans object to the uncleaned data and save the clusters for chart
    y_km = kmeans.fit_predict(music_type_interest)
    # print location of clusters learned by kmeans object
    print(kmeans.cluster_centers_)
except ValueError as err:
    print("KMeans could not be fitted:\n", err)

### What happened?  I made a mistake by not prepping my data properly.  It turns out that there are several missing values in these columns of data. 


### Recall from DAT201 that we have to decide how we handle missing values.  We could delete the rows missing data, fill with 1's or 0's, or use an average of the rest of the data points.

### First, let's see how many missing pieces of data we have

In [None]:
# Locate every missing value: each row of the result is a [row, column] index
# pair pointing at one NaN in the array.
nan_positions = np.argwhere(np.isnan(music_type_interest))
# Report the count explicitly, since "how many" is the question being asked.
print("number of missing values:", len(nan_positions))
# Keep the positions as the last expression so the notebook still displays them.
nan_positions

### Now, let's look at one of those rows

In [None]:
# Show the row at index 8 in full (indexing a 2-D array with a single integer
# returns that whole row).
print(music_type_interest[8])

### I've decided to replace all of the NaN values with the average for that column because this should have the least effect on my clustering

### First, we'll figure out the average for each of the columns

In [None]:
# Mean of every column, computed along axis 0 while ignoring NaN entries.
col_mean = np.nanmean(music_type_interest, axis=0)
print("column average ignoring NaN:\n",col_mean, end="\n\n")

# Build a boolean mask of the missing cells, then turn it into index arrays.
is_missing = np.isnan(music_type_interest)
# np.where on a 2-D mask gives a tuple of two arrays: the row index of each
# NaN first, and the matching column index second.
nans_to_replace = np.where(is_missing)
print("datatype of nans_to_replace:\n",type(nans_to_replace), end = "\n\n")
print("tuple of arrays for NaN's to replace:\n",nans_to_replace, end="\n\n")

In [None]:
# Overwrite each missing cell, in place, with the mean of the column it sits in.
# nan_cols holds the column index of every NaN, so col_mean[nan_cols] lines up
# one replacement value with each (row, column) position being assigned.
nan_rows, nan_cols = nans_to_replace
music_type_interest[nan_rows, nan_cols] = col_mean[nan_cols]

In [None]:
# Print the row at index 8 again to check the result of the replacement step.
print(music_type_interest[8])

### Now let's try our kmeans again now that the data is cleaned up

In [None]:
# Create the k-means model. random_state makes the result repeatable, and
# n_init is spelled out because its default differs between scikit-learn
# releases.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=50)
# Fit once and save the clusters from that same fit. Calling fit and then
# fit_predict would run the algorithm twice, so the centers printed from the
# first run would not be guaranteed to match the saved labels.
y_km = kmeans.fit_predict(music_type_interest)
# print location of clusters learned by kmeans object
print(kmeans.cluster_centers_)

### That's a lot of numbers.  We need to look at our best metric to see if these clusters are actually valid/worthwhile...Silhouette score!

In [None]:
from sklearn.metrics import silhouette_score
# Silhouette coefficient of the k-means labelling over all music columns.
score = silhouette_score(X=music_type_interest, labels=y_km)

In [None]:
# Display the silhouette score stored in `score`.
print(score)

### It doesn't look like there's very good clustering here.  However, to be sure, I'd like to run a range of cluster counts to see the highest silhouette coefficient

In [None]:
# Try every cluster count from 2 to 11 and record two quality measures for each:
#   sse               - the model's inertia_ (plotted below against the count)
#   silhouette_scores - the silhouette coefficient
sse = {}
silhouette_scores = {}
for n_clusters in range(2,12):
    # Seeded so the sweep gives the same numbers on every run; n_init is spelled
    # out because its default differs between scikit-learn releases.
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=50)
    preds = clusterer.fit_predict(music_type_interest)
    sse[n_clusters] = clusterer.inertia_
    score = silhouette_score(music_type_interest, preds)
    silhouette_scores[n_clusters] = score
    print("For n_clusters = {}, silhouette score is {}".format(n_clusters, score))
# The point of the sweep is the highest silhouette coefficient, so report it
# directly instead of leaving the reader to scan the lines above.
best_n_clusters = max(silhouette_scores, key=silhouette_scores.get)
print("Highest silhouette score is {} at n_clusters = {}".format(silhouette_scores[best_n_clusters], best_n_clusters))
print(sse)
# Plot SSE against the number of clusters.
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

### We'll be doing some visualization to see things more clearly next week.  However, I want to have a better result so I'm going to cheat a bit.  From the dataset, I'm going to grab the columns that seem the most likely to be different (this is an example of inserting my own bias...)

### Let's use the columns Folk, Reggae/Ska, and Latino.  I doubt that everybody has the same level of interest across these three.

In [None]:
# Keep only three columns of the music array, picked by position.
# NOTE(review): positions 2, 11 and 15 are presumably the Folk, Reggae/Ska and
# Latino columns - confirm against the CSV header.
selected_columns = [2, 11, 15]
folk_raggae_latino_interests = music_type_interest[:, selected_columns]
# First four rows of the full array, for comparison with the subset.
print(music_type_interest[:4])

In [None]:
# First four rows of the three-column subset.
print(folk_raggae_latino_interests[:4])

In [None]:
# Repeat the cluster-count sweep (2 to 11 clusters) on the three-column subset,
# recording for each count:
#   sse               - the model's inertia_ (plotted below against the count)
#   silhouette_scores - the silhouette coefficient
sse = {}
silhouette_scores = {}
for n_clusters in range(2,12):
    # Seeded so the sweep gives the same numbers on every run; n_init is spelled
    # out because its default differs between scikit-learn releases.
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=50)
    preds = clusterer.fit_predict(folk_raggae_latino_interests)
    sse[n_clusters] = clusterer.inertia_
    score = silhouette_score(folk_raggae_latino_interests, preds)
    silhouette_scores[n_clusters] = score
    print("For n_clusters = {}, silhouette score is {}".format(n_clusters, score))
# Report the best silhouette coefficient directly so it is easy to compare
# with other runs.
best_n_clusters = max(silhouette_scores, key=silhouette_scores.get)
print("Highest silhouette score is {} at n_clusters = {}".format(silhouette_scores[best_n_clusters], best_n_clusters))
# Plot SSE against the number of clusters.
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

### A bit better but still not significant.

### Let's try it with hierarchical clustering and see if we get a different result

In [None]:
# import hierarchical clustering libraries (again)
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Draw the dendrogram on a large square canvas. Handing the setting to
# rc_context keeps the figure size local to this `with` block.
with plt.rc_context({'figure.figsize': (28, 28)}):
    # create dendrogram
    dendrogram = sch.dendrogram(sch.linkage(folk_raggae_latino_interests, method='ward'))

# create clusters
# Ward linkage only works with Euclidean distance, which is already the
# default, so no distance argument is passed. The old `affinity` keyword was
# renamed to `metric` and is rejected by newer scikit-learn releases.
hc = AgglomerativeClustering(n_clusters=6, compute_full_tree=False, linkage='ward')
print(hc)
# save clusters for chart
y_hc = hc.fit_predict(folk_raggae_latino_interests)
# silhouette coefficient of the hierarchical labelling
score_hier = silhouette_score(folk_raggae_latino_interests, y_hc)

In [None]:
# Display the silhouette score of the hierarchical clustering stored in `score_hier`.
print(score_hier)