In [1]:
import numpy as np, pandas as pd, os,sys
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import colorConverter
import gmplot
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import pairwise_distances_argmin_min
from scipy.spatial.distance import cdist,pdist


import sklearn.datasets as datasets


In [2]:
OUTPUT_FOLDER = "output"

csv_file = "july-oct-batch3-train.csv" ## raw_input("Enter csv file to load:")
## USED TO CONVERT BACK FROM DECOMPOSED VARIABLES TO GEOCORDS
raw_map_df = pd.read_csv("output/july-oct-raw.csv") 
file_str = "%s/%s" %(OUTPUT_FOLDER, csv_file)

a = pd.read_csv(file_str) 
a.columns = ['index1', 'address','city', 'day','hour','type','latitude','longitude','parent_incident','state']
a.drop('index1',axis=1,inplace=True)

## MODELS
X = a[['address','city', 'day','hour','type','latitude','longitude','parent_incident','state']] ## MODEL 1

## DECOMPOSE VARIABLE DIMENSIONS
pca = PCA(n_components=2).fit(X)
pca_2d = pd.DataFrame(pca.transform(X))

# Add PCA cols to DF
X['pca1'] = pca_2d[0]
X['pca2'] = pca_2d[1]

pca_out = X[['pca1','pca2']]
pca_out.to_csv("output/july-oct-batch3-train-pca.csv")

In [3]:
def optimalK(data, nrefs=3, minClusters=1, maxClusters=15):
    """
    Calculates KMeans optimal K using Gap Statistic from Tibshirani, Walther, Hastie
    Params:
        data: ndarry of shape (n_samples, n_features)
        nrefs: number of sample reference datasets to create
        maxClusters: Maximum number of clusters to test for
    Returns: (gaps, optimalK)
    """
    gaps = np.zeros((len(range(minClusters, maxClusters)),))
    resultsdf = pd.DataFrame({'clusterCount':[], 'gap':[]})
    for gap_index, k in enumerate(range(minClusters, maxClusters)):

        # Holder for reference dispersion results
        refDisps = np.zeros(nrefs)

        # For n references, generate random sample and perform kmeans getting resulting dispersion of each loop
        for i in range(nrefs):
            
            # Create new random reference set
            randomReference = np.random.random_sample(size=data.shape)
            
            # Fit to it
            km = KMeans(k)
            km.fit(randomReference)
            
            refDisp = km.inertia_
            refDisps[i] = refDisp

        # Fit cluster to original data and create dispersion
        km = KMeans(k)
        km.fit(data)
        
        origDisp = km.inertia_

        # Calculate gap statistic
        gap = np.log(np.mean(refDisps)) - np.log(origDisp)

        # Assign this loop's gap statistic to gaps
        gaps[gap_index] = gap
        
        resultsdf = resultsdf.append({'clusterCount':k, 'gap':gap}, ignore_index=True)

    return (gaps.argmax() + 1, resultsdf)  # Plus 1 because index of 0 means 1 cluster is optimal, index 2 = 3 clusters are optimal
    

In [4]:
k, gapdf = optimalK(pca_2d, nrefs=5, minClusters=1, maxClusters=50)
print 'Optimal k is: ', k

Optimal k is:  3


In [None]:
plt.figure(figsize=(12,6))
k_clusters = plt.plot(gapdf.clusterCount, gapdf.gap, c='b', linewidth=1)
red_dot = plt.scatter(gapdf[gapdf.clusterCount == k].clusterCount, gapdf[gapdf.clusterCount == k].gap, s=30, c='r')
plt.grid(True)
plt.xlabel('Cluster Counts')
plt.ylabel('Gap')
plt.legend([red_dot],['Optimal Cluster'],loc='best')
plt.title('Gap Statistic Values by Clusters, k=1 to 50\nSelected k = %s'%(k))
plt.show()

In [None]:
km = KMeans(k)
km.fit(X)

df = pd.DataFrame(X, columns=['pca1','pca2'])
df['label'] = km.labels_

colors = plt.cm.Spectral(np.linspace(0, 1, len(df.label.unique())))

for color, label in zip(colors, df.label.unique()):
    
    tempdf = df[df.label == label]
    plt.scatter(tempdf.pca1, tempdf.pca2, c=color)
    
plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:, 1], c='r', s=500, alpha=0.7, )
plt.grid(True)
plt.show()

In [None]:
num, gapdf = optimalK(km.cluster_centers_, maxClusters=11)
plt.plot(gapdf.clusterCount, gapdf.gap, linewidth=3)
plt.scatter(gapdf[gapdf.clusterCount == num].clusterCount, gapdf[gapdf.clusterCount == num].gap, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()

In [None]:
km.cluster_centers_

In [None]:
subKm = KMeans(n_clusters=num)
subKm.fit(km.cluster_centers_)

df = pd.DataFrame(km.cluster_centers_, columns=[0,1])
df['label'] = subKm.labels_

colors = plt.cm.Spectral(np.linspace(0, 1, len(df.label.unique())))

for color, label in zip(colors, df.label.unique()):
    
    tempdf = df[df.label == label]
    plt.scatter(tempdf.x, tempdf.y, c=color, s=250)
    
plt.scatter(subKm.cluster_centers_[:,0], subKm.cluster_centers_[:, 1], c='r', s=500, alpha=0.7, )
plt.grid(True)
plt.show()