In [1]:
import os, sys, time, random
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.neighbors import DistanceMetric
from scipy.spatial.distance import cdist


In [2]:
OUTPUT_FOLDER = "../data/output"
SOURCE_FOLDER = "../data/src"

## CREATE FOLDER IF OUTPUT FOLDER IS UNDEFINED
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

## IF NO 'data' FOLDER EXISTS, CRATE ONE...
if not os.path.exists(SOURCE_FOLDER):
    os.makedirs(SOURCE_FOLDER)
 
csv_file = "july-oct-train.csv" ## raw_input("Enter csv file to load:")
# csv_file = "july-oct-batch3-test.csv" ## raw_input("Enter csv file to load:")
## USED TO CONVERT BACK FROM DECOMPOSED VARIABLES TO GEOCORDS
raw_file = "%s/july-oct-raw.csv" %(SOURCE_FOLDER)
raw_map_df = pd.read_csv(raw_file) 

file_str = "%s/%s" %(SOURCE_FOLDER, csv_file)
a = pd.read_csv(file_str) 
a.columns = ['index1', 'address','city', 'day','hour','type','latitude','longitude','parent_incident','state']
a.drop('index1',axis=1,inplace=True)

X = a[['address','city', 'day','hour','type','latitude','longitude','parent_incident','state']]

len(X), X.columns
X = X.as_matrix()

In [28]:
cluster_nums1 = 3 #int(raw_input("Number of clusters: "))
cluster_nums2 = 114 #int(raw_input("Number of clusters: "))
batch_size = 100# int(raw_input("Batch size (default 100): "))

## REDUCE DATA USING PCA
pca = PCA(n_components=2).fit(X)
pca_2d = pd.DataFrame(pca.transform(X))

# Add PCA cols to DF
a.loc[:,('pca1')] = pca_2d[0]
a.loc[:,('pca2')] = pca_2d[1]

## k=3
mbk1 = MiniBatchKMeans(init='k-means++', n_clusters=cluster_nums1, batch_size=batch_size)
%timeit mbk1
mbk1.fit(X)
mbk_pred1 = mbk1.fit_predict(pca_2d)
mbk_means_cluster_centers1 = np.sort(mbk1.cluster_centers_, axis=0)
mbk_means_labels1 = pairwise_distances_argmin(pca_2d, mbk_means_cluster_centers1)


## K=114
mbk2 = MiniBatchKMeans(init='k-means++', n_clusters=cluster_nums2, batch_size=batch_size)
%timeit mbk2
mbk2.fit(X)
mbk_pred2 = mbk2.fit_predict(pca_2d)
mbk_means_cluster_centers2 = np.sort(mbk2.cluster_centers_, axis=0)
mbk_means_labels2 = pairwise_distances_argmin(pca_2d, mbk_means_cluster_centers2)



10000000 loops, best of 3: 138 ns per loop


  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_sampl

The slowest run took 14.03 times longer than the fastest. This could mean that an intermediate result is being cached.
10000000 loops, best of 3: 136 ns per loop


  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_sampl

### VISUALIZE USING PCA DATA

In [None]:
# create colors function
def random_color():
    rgbl=[255,0,0]
    random.shuffle(rgbl)
    return tuple(rgbl)

## K=3
colors = [random_color() for i in range(cluster_nums1)] ## create colors based on number of clusters
# print cc
fig,(ax1, ax2) = plt.subplots(1,2, figsize=(16,8))
for this_centroid, k, col in zip(mbk_means_cluster_centers1,
                                 range(cluster_nums1), colors):

    center_scatter = ax1.scatter(pca_2d[0], pca_2d[1],c=mbk_pred1, s=40, alpha=0.2)
#     ax1.plot(this_centroid[0], this_centroid[1], '.', mec='w',mew=2,ms=20,alpha=.7)
    
ax1.set_title("Minibatch K-Means, k=3, Batch size = 100")
ax1.set_xlabel("Longitude")
ax1.set_ylabel("Latitude")

## K=114
colors = [random_color() for i in range(cluster_nums2)] ## create colors based on number of clusters


for this_centroid, k, col in zip(mbk_means_cluster_centers2,
                                 range(cluster_nums2), colors):

    center_scatter = ax2.scatter(pca_2d[0], pca_2d[1],c=mbk_pred2, s=40, alpha=0.2)
#     ax2.plot(this_centroid[0], this_centroid[1], '.', mec='w',mew=2,ms=20,alpha=.7)
    
ax2.set_title("Minibatch K-Means, k=114, Batch size = 100")
ax2.set_xlabel("Longitude")
ax2.set_ylabel("Latitude")


plt.show()

### GOOGLE MAP

In [10]:
import random, gmplot

# %matplotlib
mbk_means_labels_unique = np.unique(mbk.labels_)

def random_color():
    rgbl=[255,0,0]
    random.shuffle(rgbl)
    return tuple(rgbl)

colors = [random_color() for i in range(cluster_nums)]
# print cc
fig,ax = plt.subplots(figsize=(16,8))
for this_centroid, k, col in zip(mbk_means_cluster_centers,
                                 range(cluster_nums), colors):
    mask = mbk.labels_ == k
    center_scatter = ax.scatter(X['longitude'], X['latitude'],c=mbk_pred, s=40, alpha=0.2)
    ax.plot(this_centroid[1], this_centroid[0], '.', mec='w',mew=2,ms=20,alpha=.7)
    
ax.set_ylabel("Latitude")
ax.set_xlabel("Longitude")
ax.set_title("MiniBatchKMeans")

# GENERATE GOOGLE MAP IN SEPARATE HTML FILE
gmap = gmplot.GoogleMapPlotter(29.8, -95.4, 9.0)
gmap.scatter(X['latitude'], X['longitude'], '#3B0B39', alpha=0.4, size=60, marker=False)
gmap.heatmap(mbk_means_cluster_centers[:,0], mbk_means_cluster_centers[:,1],radius=(10))

## PATH IS OUTPUT FOLDER/FILENAME + SEQUENCE NUMBER.html
map_file = "%s/%s-%s.html" %(OUTPUT_FOLDER,"minibatch-gmap",i)
gmap.draw(map_file)

# ax.set_autoscaley_on(False)
plt.show()

#### END MINIBATCH KMEANS CODE

##### REFERENCES:
Part of the code source comes from  scikitlearn website: http://scikit-learn.org/stable/auto_examples/cluster/plot_mini_batch_kmeans.html#sphx-glr-auto-examples-cluster-plot-mini-batch-kmeans-py