In [1]:
import matplotlib
matplotlib.use('Agg')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist, pdist
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from nltk.stem.wordnet import WordNetLemmatizer


def elbow_silhouette_kmeans(X, year=None):
    dists = []
    # s_scores = []
    for k in xrange(2, 50):
#         print year, k
        model = KMeans(n_clusters=k)
        model.fit(X)
        centroids = model.cluster_centers_
        labels = model.labels_
        dist = cdist(X, centroids)
        dists.append(dist.min(axis=1).sum())
        # s_scores.append(silhouette_score(X, labels, metric='euclidean'))
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax1.plot(xrange(2, 50), dists, 'b*-')
    ax1.grid(True)
    ax1.set_xlabel('Number of clusters')
    ax1.set_ylabel('sum of squares')
    ax1.set_title('Elbow for KMeans clustering {}'.format(year))
    # ax2 = fig.addsubplot(122)
    # ax2.plot(xrange(2, 20), s_scores)
    # ax2.grid(True)
    # ax2.set_title('Choose K based on Silhouette Score')
    # ax2.set_xlabel('Number of Cluster')
    # ax2.set_ylabel('Silhouette Score')
    # plt.show()
    plt.savefig('img/{}_kmeans.png'.format(year))


def kmeans_by_year(X, yrs, years):
    for yr in years:
        print yr
        X_yr = X[yrs == yr]
        elbow_silhouette_kmeans(X_yr, yr)
    # plt.show()
    print 'kmeans done!'


def get_lemmatized_word(line):
    lem = WordNetLemmatizer()
    line = line.lower().split()
    line = [lem.lemmatize(x.strip(string.punctuation).decode("utf8")) for x in line]
    return ' '.join(line)


def get_tfidf(content):
    content = [get_lemmatized_word(x)  for x in content]
    vec = TfidfVectorizer(stop_words='english')
    tfidf = vec.fit_transform(content)
    return tfidf
    


In [3]:
sfdata = pd.read_csv('data/sfpd_clean.csv')
dropLst = ['Descript', 'PdDistrict', 'Address', 'Year', 'Category', 'X', 'Unnamed: 0']
sfdata['nnX'] = -sfdata['X'] 
X = sfdata.drop(dropLst, axis=1).values
fea_names = sfdata.drop(dropLst, axis=1).columns
yrs = sfdata['Year'].values
years = sorted(sfdata['Year'].unique())


In [37]:
fea_names

Index([u'DayOfWeek', u'Y', u'CrimeCat', u'Hour', u'Month', u'Day', u'nnX'], dtype='object')

In [None]:
kmeans_by_year(X, yrs, years)

2003
2004


In [5]:
def build_nmf(X, yrs, years):
    Ws = []
    Hs = []
    for yr in years:
        print yr
        X_yr = X[yrs == yr]
        nmfModel = NMF()
        W = nmfModel.fit_transform(X_yr)
        H = nmfModel.components_
        Ws.append(W)
        Hs.append(H)
        print H
        plt.figure()
        sns.heatmap(H)
        plt.title('Heatmap of Lattent Feature - {}'.format(yr))
        plt.savefig('img/Heatmap of Lattent Feature - {}.png'.format(yr))
    return Ws, Hs



In [16]:
Ws, Hs = build_nmf(X, yrs, years)

2003
[[  1.33579571e+04   0.00000000e+00   8.69809020e-01   0.00000000e+00
    0.00000000e+00   0.00000000e+00   3.15955456e-01   1.58987495e+00]
 [  1.86455081e+03   0.00000000e+00   5.83506942e-02   0.00000000e+00
    0.00000000e+00   0.00000000e+00   2.16354313e-02   2.68718313e-03]
 [  5.15149622e+02   1.06670570e-04   7.95846139e-04   9.70572892e-05
    4.22732907e-04   2.60237086e-04   5.24857872e-04   0.00000000e+00]
 [  2.80667648e+01   1.73626340e+01   8.92020745e+01   1.51433869e+01
    6.40172558e+01   2.82361138e+01   4.29988131e+01   2.29438460e+02]
 [  2.45177533e+02   1.72392030e-03   1.32355611e-03   1.49016374e-03
    5.67656777e-03   2.71628596e-03   8.91520595e-04   7.01928124e-02]
 [  3.86440426e+03   8.96508305e-02   7.25765816e-01   6.66897589e-02
    3.28343774e-01   1.47834788e-01   3.07291431e-01   2.09022778e+00]
 [  3.04674176e+03   2.27986468e-02   3.53255283e-01   2.06814783e-02
    7.16403735e-02   3.78864502e-02   1.39232939e-01   1.18429074e+00]
 [  4.05

In [4]:
tfidf = get_tfidf(sfdata['Descript'].values)


In [17]:
n, m = tfidf.shape
print n, m

1056819 359


In [32]:
X.shape

(1056819, 7)

In [9]:
X_t = np.concatenate((X, tfidf))

ValueError: all the input arrays must have same number of dimensions

In [6]:
yr = years[0]
X_yr = X[yrs == yr]
nmfModel = NMF()
W = nmfModel.fit_transform(X_yr)
H = nmfModel.components_

In [47]:
index = np.argsort(W[:,0])[:-500:-1]
index2 = np.argsort(W[:,1])[:-500:-1]

In [50]:
plt.figure()
plt.scatter(X[index, 1], X[index, 6], color='r')
plt.scatter(X[index2, 1], X[index2,6], color='b')
plt.savefig('test.png')

In [51]:
print W[:10,:]

[[  3.94996536e+00   9.70055673e+00   2.63360370e+00   1.38754970e-01
    2.06847622e+00   2.65644908e-02   2.84818789e+00]
 [  3.22356993e+00   1.34277477e+01   6.25196561e+00   0.00000000e+00
    3.64168085e-01   8.20626731e-02   8.64419501e-01]
 [  2.95957951e+00   1.13987090e+01   8.14298794e+00   0.00000000e+00
    1.78925232e-01   2.11998294e-02   1.56264641e+00]
 [  3.24387370e+00   4.45298759e+00   0.00000000e+00   6.68621527e-01
    6.24092958e-01   1.76334115e-01   1.71682841e+00]
 [  4.24053684e+00   0.00000000e+00   5.27455145e-01   2.24306794e-01
    0.00000000e+00   1.54380614e-01   3.73029799e+00]
 [  3.80431309e+00   4.37415889e+00   1.23609507e-01   7.85341874e-01
    3.12286002e+00   1.87947120e-03   9.22743500e-01]
 [  3.97511153e+00   1.35167142e+01   1.68057984e+00   1.02281314e-02
    2.52936978e+00   1.11646283e-01   2.49779565e+00]
 [  3.91282050e+00   2.32680900e+00   2.03572196e+00   6.27474136e-01
    1.83426832e+00   1.03896701e-02   3.54099891e-01]
 [  3.50

In [None]:
kmeans_by_year(X_t, yrs, years)
Ws, Hs = build_nmf(X_t, yrs, years)