#### IMPORT STATEMENTS

In [7]:
import numpy as np, pandas as pd, os,sys
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import colorConverter
import gmplot
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import pairwise_distances_argmin_min
from scipy.spatial.distance import cdist,pdist


#### GET DATA

In [8]:
OUTPUT_FOLDER = "output"

## Input file to cluster. Alternatives that have been used with this notebook:
##   july-oct-raw2-train.csv
##   july-oct-batch2-train.csv
##   batch2-train.csv
##   july-oct-batch3-test.csv
## (or prompt for one with raw_input("Enter csv file to load:"))
csv_file = "july-oct-batch4-train.csv"

## Raw records, used to convert back from the decomposed variables to geo-coordinates.
raw_map_df = pd.read_csv("output/july-oct-raw.csv")
file_str = OUTPUT_FOLDER + "/" + csv_file

## The nine data columns, declared once and reused for naming and selection.
feature_cols = ['address', 'city', 'day', 'hour', 'type',
                'latitude', 'longitude', 'parent_incident', 'state']

a = pd.read_csv(file_str)
## The first column is named 'index1' only so that it can be discarded right away
## (presumably a saved row index -- for a file without it, assign feature_cols alone).
a.columns = ['index1'] + feature_cols
a = a.drop('index1', axis=1)

## MODELS -- choose which columns are fed to PCA / K-Means
X = a[feature_cols]  ## MODEL 1
# X = a[['address', 'day','hour','type','parent_incident','latitude','longitude']] ## MODEL 2
# X = a[['address', 'day','hour','type','latitude','longitude']] ## MODEL 3

## DECOMPOSE VARIABLE DIMENSIONS: project X onto its first two principal components
pca = PCA(n_components=2)
pca.fit(X)
pca_2d = pd.DataFrame(pca.transform(X))

# Add PCA cols to DF
# X['pca1'] = pca_2d[0]
# X['pca2'] = pca_2d[1]


In [3]:
## Number of rows in the feature matrix X (the notebook echoes the value below).
len(X)

12645

#### REVERT BACK TO ORIGINAL DATASET, GENERATE A MAP

In [15]:
## Number of clusters for the K-Means run whose centroids are mapped below.
CLUSTER_NUMBER = 114

## Fit a single model and derive everything from it.  Fitting a second,
## independently initialised KMeans just to obtain predictions would double the
## cost and yield label ids that do not correspond to `labels` / `clust_centers`.
km = KMeans(n_clusters=CLUSTER_NUMBER, init='k-means++').fit(X)
km_pred = km.predict(X)             ## cluster id per row of X, from the fitted model
labels = km.labels_                 ## cluster id assigned to each row during fitting
clust_centers = km.cluster_centers_ ## one row per cluster, in X's feature space



In [16]:
## Tag every record with its cluster id.
## NOTE: a_labeled is an alias of `a` (not a copy), so `a` gains the 'label' column too.
a_labeled = a
a_labeled['label'] = labels

## For every cluster center, find the row position in X of the nearest real record.
## A single call with the full 2-D array of centers replaces the per-centroid loop:
## scikit-learn expects 2-D input here, and growing a DataFrame with repeated
## .append() calls is quadratic (and unavailable in newer pandas).
closest, closest_dist = pairwise_distances_argmin_min(clust_centers, X)

## `a` and `raw_map_df` both come straight from read_csv with the default integer
## index, so positional lookup (.iloc) selects the same rows the label-based
## lookup did.  .copy() makes the results independent frames that can be extended.
a_centroids = a_labeled.iloc[closest].copy()
gmap_centroids = raw_map_df.iloc[closest].copy()

## Match the labels for the gmap coords (row-for-row; both frames were selected
## with the same positions, so no index alignment is needed).
gmap_centroids['label'] = a_centroids['label'].values

## All records as small black dots, the records nearest to each centroid in red.
points = plt.scatter(a['longitude'], a['latitude'], c='k', s=3, alpha=0.5)
clust = plt.scatter(a_centroids['longitude'], a_centroids['latitude'],
                    c='r', s=150, marker='o', edgecolors='w', alpha=0.8)

plt.title("K-Means k=%s With Centroids Mapped into Raw Dataset" % (CLUSTER_NUMBER))
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.legend([points, clust], ['Points', 'Clusters'], loc='best')
plt.show()



#### GENERATE GOOGLE MAP

In [None]:
## Render the Google map into a standalone HTML file.

## Map center (latitude, longitude) and initial zoom level.
map_center_lat, map_center_lng, map_zoom = 29.8, -95.4, 9.0
gmap = gmplot.GoogleMapPlotter(map_center_lat, map_center_lng, map_zoom)

## Layer 1: every raw record, drawn without pin markers.
raw_lats = raw_map_df['latitude']
raw_lngs = raw_map_df['longitude']
gmap.scatter(raw_lats, raw_lngs, '#3B0B39', alpha=0.4, size=60, marker=False)

## Layer 2: heat map over the coordinates stored in gmap_centroids.
centroid_lats = gmap_centroids['latitude']
centroid_lngs = gmap_centroids['longitude']
gmap.heatmap(centroid_lats, centroid_lngs, radius=20)

## Output path: <OUTPUT_FOLDER>/KMEANS-<CLUSTER_NUMBER>.html
map_file = "%s/%s-%s.html" % (OUTPUT_FOLDER, "KMEANS", CLUSTER_NUMBER)
gmap.draw(map_file)

In [17]:
#### Create a KML-style CSV file for Fusion Map / ArcGIS
kml = gmap_centroids[['latitude', 'longitude', 'label']]

## Build the path from OUTPUT_FOLDER and CLUSTER_NUMBER so the file name always
## reflects the clustering that produced it (for k=114 this is
## "output/kmeans-114k-heatmap.kml.csv").
kml_file = "%s/kmeans-%sk-heatmap.kml.csv" % (OUTPUT_FOLDER, CLUSTER_NUMBER)
kml.to_csv(kml_file)

### ELBOW TEST

In [None]:
# %matplotlib 


def plot_variance(start_, end_, interval_=1, data=None):
    """Plot the between-cluster sum of squares of K-Means against K (elbow test).

    One KMeans model is fitted for every K in ``start_, start_ + interval_, ...``
    up to and including ``end_``.  For each model the within-cluster sum of
    squares (WCSS) is subtracted from the total sum of squares (TSS) of the data;
    the resulting between-cluster values are printed and drawn on a new figure.
    The figure is not shown -- call ``plt.show()`` afterwards.

    Args:
        start_: first K to evaluate.
        end_: last K to evaluate (inclusive).
        interval_: step between consecutive K values.
        data: 2-D numeric samples-by-features table to cluster.  When omitted,
            the module-level feature matrix ``X`` is used.

    Raises:
        ValueError: if the requested range contains no K value.
    """
    if data is None:
        data = X  # fall back to the notebook's global feature matrix
    samples = np.asarray(data, dtype=float)

    k_list = np.arange(start_, end_ + 1, interval_)
    if k_list.size == 0:
        raise ValueError("no K values between %s and %s with interval %s"
                         % (start_, end_, interval_))

    plt.subplots(figsize=(12, 5))

    models = [KMeans(n_clusters=k).fit(samples) for k in k_list]

    ## Total within-cluster SS: squared distance of each sample to its nearest
    ## centroid, summed per model.
    wcss = np.array([
        np.sum(np.min(cdist(samples, model.cluster_centers_, 'euclidean'), axis=1) ** 2)
        for model in models
    ])

    ## Total SS: squared distance of each sample to the global mean.  This equals
    ## sum(pdist(samples) ** 2) / n_samples, but needs O(n) memory instead of the
    ## O(n^2) pairwise-distance vector, and takes n from the clustered data itself.
    tss = np.sum((samples - samples.mean(axis=0)) ** 2)

    ## Between-cluster SS
    bss = tss - wcss
    print(bss)

    plt.title("KMeans Explained Variance")
    plt.plot(k_list, bss)
    points = plt.scatter(k_list, bss, c='r', s=25)
    plt.ylabel('Variance')
    plt.xlabel('K-value')

    plt.grid(True, which='major', axis='both')

    plt.legend([points], ['Variance'], loc='best')
    plt.xticks(k_list)

def _read_int(prompt):
    """Show *prompt* on the console and return the reply parsed as an int."""
    return int(raw_input(prompt))


## Ask for the K range interactively, then draw and display the elbow plot.
start_k = _read_int("Enter starting K: ")
end_k = _read_int("Enter ending K: ")
int_k = _read_int("Enter Interval: ")

plot_variance(start_k, end_k, interval_=int_k)
plt.show()

In [None]:
## Add a 'Variance' legend entry for `points` and display the figure.
## NOTE(review): `points` here is the module-level name -- the `points` created
## inside plot_variance() is local to that function -- so this refers to whichever
## scatter handle a previously executed cell bound.  Confirm this is intended.
plt.legend([points],['Variance'],loc="best")  
plt.show()

#### Generate Multiple K-Means Graphs

In [None]:
## Project the features onto two principal components; clustering and plotting
## in this cell both happen in that 2-D space.
pca = PCA(n_components=2).fit(X)
pca_2d = pd.DataFrame(pca.transform(X))

## K values to evaluate (currently the single value 114).
x_axis = np.arange(114, 115, 3)

ari_scores = []  ## one ARI per K, collected in a list and converted once at the end
for i in x_axis:
    ## RUN K-MEANS twice with independent initialisations: `km` is the model that
    ## gets plotted, `km_pred` the labels of a second run used only for the ARI
    ## (presumably as a run-to-run stability measure).
    km = KMeans(n_clusters=i, init='k-means++').fit(pca_2d)
    km_pred = KMeans(n_clusters=i, init='k-means++').fit_predict(pca_2d)
    labels = km.labels_
    clust_centers = km.cluster_centers_

    ## GET ARI (agreement between the two runs)
    ari = metrics.adjusted_rand_score(labels, km_pred)
    ari_scores.append(ari)

    ## Silhouette of the clustering that is actually plotted, evaluated on the
    ## same data the model was fitted on.
    sil = metrics.silhouette_score(pca_2d, labels)

    ## PLOT the clustered points and their centroids for this K
    title_str = "K-Means, Clusters=%s, ARI=%.3f, Silhouette Coefficient=%.3f" % (i, ari, sil)
    fig, (ax1) = plt.subplots(1)
    fig.set_size_inches(12, 4)
    points = ax1.scatter(pca_2d[1], pca_2d[0], c=labels, s=5, alpha=0.6)
    centroids = ax1.scatter(clust_centers[:, 1], clust_centers[:, 0],
                            marker='o', s=75, edgecolors='w')
    ax1.legend([points, centroids], ['Points', 'Centroids'])
    ax1.set_title(title_str)
    ax1.set_ylabel("PCA 1")
    ax1.set_xlabel("PCA 2")

ari_list = np.array(ari_scores)

## ARI summary across all K values.  Drawn once, after the loop: only then does
## ari_list hold one score per entry of x_axis, which scatter/plot require.
fig, (ax2) = plt.subplots(1)
fig.set_size_inches(16, 6)
ari_score = ax2.scatter(x_axis, ari_list, c='r', s=40)
ax2.plot(x_axis, ari_list, c='b')
ax2.legend([ari_score], ['Adjusted Rand Score'])
ax2.set_title("Adjusted Rand Index for K-Means")
ax2.set_ylabel("Adjusted Rand Score")
ax2.set_xlabel("Clusters")
plt.show()

### END OF K-MEANS CODE