#### IMPORT STATEMENTS

In [1]:
import os,sys
import numpy as np, pandas as pd
from time import time
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import colorConverter
import gmplot
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import pairwise_distances_argmin_min
from scipy.spatial.distance import cdist,pdist


#### GET DATA

In [8]:
OUTPUT_FOLDER = "../data/output"
SOURCE_FOLDER = "../data/src"

## CREATE FOLDER IF OUTPUT FOLDER IS UNDEFINED
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

## IF NO 'data' FOLDER EXISTS, CRATE ONE...
if not os.path.exists(SOURCE_FOLDER):
    os.makedirs(SOURCE_FOLDER)
 
csv_file = "july-oct-train.csv" ## raw_input("Enter csv file to load:")
# csv_file = "july-oct-batch3-test.csv" ## raw_input("Enter csv file to load:")
## USED TO CONVERT BACK FROM DECOMPOSED VARIABLES TO GEOCORDS
raw_file = "%s/july-oct-raw.csv" %(SOURCE_FOLDER)
raw_map_df = pd.read_csv(raw_file) 

file_str = "%s/%s" %(SOURCE_FOLDER, csv_file)
a = pd.read_csv(file_str) 
a.columns = ['index1', 'address','city', 'day','hour','type','latitude','longitude','parent_incident','state']
a.drop('index1',axis=1,inplace=True)

X = a[['address','city', 'day','hour','type','latitude','longitude','parent_incident','state']]

len(X), X.columns
X = X.as_matrix()

### ELBOW TEST

In [9]:
def plot_variance(start_,end_,interval_=1):
    plt.subplots(figsize=(12,5))
    
#     X = X.as_matrix()
    k_list = np.arange(start_,end_+1,interval_)
    k_var = [KMeans(n_clusters=k).fit(X) for k in k_list]
    centroids = [c.cluster_centers_ for c in k_var]
    k_euclidian = [cdist(X, cent, 'euclidean') for cent in centroids]
    dist = [np.min(ke, axis=1) for ke in k_euclidian]

    ## Total Within cluster SS
    wcss = [sum(d**2) for d in dist]
    
    ## Total SS
    tss = sum(pdist(X)**2) / X.shape[0]

    ## Between cluster SS
    bss = tss - wcss    
    print bss 
    plt.title("KMeans Explained Variance")
    plt.plot(k_list,bss)
    points = plt.scatter(k_list,bss,c='r', s=25)
    plt.ylabel('Variance')
    plt.xlabel('K-value')
   
    plt.grid('on', which='major', axis='x' )
    plt.grid('on', which='major', axis='y' )
   
    plt.legend([points],['Variance'],loc='best')  
    plt.xticks(k_list)

start_k = int(raw_input("Enter starting K: "))
end_k = int(raw_input("Enter ending K: "))
int_k = int(raw_input("Enter Interval: "))

plot_variance(start_k, end_k,int_k)
plt.show()

Enter starting K: 2
Enter ending K: 3
Enter Interval: 2
[ 13285.11226305]


#### REVERT BACK TO ORIGINAL DATASET, GENERATE A MAP

In [15]:
CLUSTER_NUMBER = int(raw_input("Enter k: "))
def revert_to_map(X,a):
    '''
    This runs K-Means and translates created clusters
    to the original data set. It shows a scatter plot with lon/lat as x/y coords
    The params X is the dataset in matrix and  a as the loaded dataset prior to conversion as matrix
    '''
    
    km = KMeans(n_clusters=CLUSTER_NUMBER, init='k-means++').fit(X)
    km_pred = KMeans(n_clusters=CLUSTER_NUMBER, init='k-means++').fit_predict(X)
    labels = km.labels_
    clust_centers = km.cluster_centers_

    ## CREATE DF FOR NEW CENTROIDS
    a_centroids = pd.DataFrame([])
    gmap_centroids = pd.DataFrame([])

    ## CREATE A LABELED DF BASED ON ORIGINAL 'a' DF
    a_labeled = a
    a_labeled['label'] = labels

    for i in clust_centers:
        closest, y = pairwise_distances_argmin_min(i, X)
        a_centroids = a_centroids.append(a_labeled.ix[closest[0]])
        gmap_centroids = gmap_centroids.append(raw_map_df.ix[closest[0]])

    ## CREATE SCATTER PLOT POINTS AND LABELS
    gmap_centroids['label'] = a_centroids['label']
    points = plt.scatter(a_labeled['longitude'],  a_labeled['latitude'],marker='.', c=a_labeled['label'],s=3,alpha=0.5)
    clust = plt.scatter(a_centroids['longitude'],a_centroids['latitude'],c=a_centroids['label'],s=150, marker='o',edgecolors='w', alpha=0.8)

    ## PLOT INFO AND STYLE
    plt.title("K-Means k=%s With Centroids Mapped into Raw Dataset" %(CLUSTER_NUMBER))
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.legend([points, clust],['Points','Clusters'],loc='best')
    plt.show()

revert_to_map(X,a)

Enter k: 2




### GENERATE GOOGLE MAP

In [108]:
## GENERATE GOOGLE MAP IN SEPARATE HTML FILE
map_name = raw_input("Enter name of your map: ")
gmap = gmplot.GoogleMapPlotter(29.8, -95.4, 9.0)
gmap.scatter(raw_map_df['latitude'], raw_map_df['longitude'], '#3B0B39', alpha=0.4, size=60, marker=False)
gmap.heatmap(gmap_centroids['latitude'], gmap_centroids['longitude'],radius=(20))

## PATH IS OUTPUT FOLDER/FILENAME + SEQUENCE NUMBER.html
map_file = "%s/KMeans-k%s-%s-%s.html" %(OUTPUT_FOLDER,CLUSTER_NUMBER,map_name,CLUSTER_NUMBER)
gmap.draw(map_file)

Enter name of your map: test2


In [109]:
#### Create aKML FILE FOR FUSION MAP/ arcGIS
kml_name = raw_input("Enter name of your map: ")
map_file = "%s/KMeans-k%s-%s-%s.kml.csv" %(OUTPUT_FOLDER,CLUSTER_NUMBER,kml_name,CLUSTER_NUMBER)
kml = gmap_centroids[['latitude','longitude','label']]
kml.to_csv(map_file)

Enter name of your map: coords


#### Generate Multiple K-Means Graphs

In [12]:

## REDUCE DATA USING PCA
pca = PCA(n_components=2).fit(X)
pca_2d = pd.DataFrame(pca.transform(X))

# Add PCA cols to DF
a.loc[:,('pca1')] = pca_2d[0]
a.loc[:,('pca2')] = pca_2d[1]

ari_list = np.array([])
a_labeled = a
a_labeled['label'] = labels
a_centroids = pd.DataFrame([])

x_axis = np.arange(3,4,2)
for i in x_axis:
    ## RUN K-MEANS
    km = KMeans(n_clusters=i, init='k-means++').fit(pca_2d)
    km_pred = KMeans(n_clusters=i, init='k-means++').fit_predict(pca_2d)
    labels = km.labels_
    clust_centers = km.cluster_centers_
    
    for j in clust_centers:
        closest, y = pairwise_distances_argmin_min(j, pca_2d)
        a_centroids = a_centroids.append(a_labeled.ix[closest[0]])

    
    ## GET ARI 
    ari = metrics.adjusted_rand_score(km.labels_, km_pred) 
    ari_list = np.append(ari_list, ari)
    sil = metrics.silhouette_score(X, km_pred)
    
    print ari_list
    ## PLOT
    title_str = "K-Means, Clusters=%s, ARI=%.3f, Silhouette Coefficient=%.3f" %(i,ari,sil)
#     title_str = "K-Means, Clusters=%s, ARI=%.3f, Silhouette Coefficient=" %(i,ari)
    fig, (ax1,ax2) = plt.subplots(1,2, figsize=(16,6))
#     fig.set_size_inches(16, 6)
    points = ax1.scatter(pca_2d[1],pca_2d[0],c=labels,s=5, alpha=0.6)
    centroids = ax1.scatter(a_centroids['pca1'],a_centroids['pca2'], c=a_centroids['label'], marker='o',s=75, edgecolors='w')
    ax1.legend([points,centroids],['Points','Centroids'])
    ax1.set_title(title_str)
    ax1.set_ylabel("PCA 1")
    ax1.set_xlabel("PCA 2")

# for i in x_axis:
ari_score = ax2.scatter(x_axis,ari_list,c='r',s=40)
ax2.plot(x_axis,ari_list,c='b')
ax2.legend([ari_score],['Adjusted Rand Score'])
ax2.set_title("Adjusted Rand Index for K-Means")
ax2.set_ylabel("Adjusted Rand Score")
ax2.set_xlabel("Clusters")
plt.show()

NameError: name 'labels' is not defined

### END OF K-MEANS CODE

#### BENCHMARKING

In [13]:
def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))

In [14]:
n_digits = 10
sample_size = 1000

n_samples, n_features = X.shape
print(79 * '_')
print "Evaluate K-Means"
print("n_digits: %d, \t n_samples %d, \t n_features %d"
      % (n_digits, n_samples, n_features))
print(79 * '_')
print('% 9s' % 'init'
      '         time  inertia    homo   compl  v-meas     ARI AMI  silhouette')

bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
              name="k-means++", data=X)

bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
              name="random", data=X)
              
# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
pca = PCA(n_components=n_digits-1).fit(X)
bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits-1, n_init=1),
              name="PCA-based",
              data=X)
print(79 * '_')
print "Evaluation Complete!"

_______________________________________________________________________________
Evaluate K-Means
n_digits: 10, 	 n_samples 12645, 	 n_features 9
_______________________________________________________________________________
init         time  inertia    homo   compl  v-meas     ARI AMI  silhouette


NameError: global name 'labels' is not defined

### K-MEANS CODE END

##### REFERENCES:
Part of the code source comes from  scikitlearn website: http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html