In [3]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
reference = pd.read_table('data/cases 20349.txt', header = None)
reference.columns = ['id', 'site', 'category', 'title']
reference.head()

Unnamed: 0,id,site,category,title
0,350946226735,0,20349,Connecticut Huskies Wordmark on BlackBerry Tor...
1,310801215722,0,20349,Der Delight Windows Bracket Case Battery Cover...
2,151052793175,0,20349,NFC Leather Housing Battery Flip Case Cover Sa...
3,141174411773,0,20349,FOR Sprint LG Optimus G LS970 HARD Protector S...
4,310647790715,0,20349,Skinit 505 Silhouettes Skin for LG Cosmos VN250


# approach 1: cluster by tdm

### 1A) read feature vectors into memory and reduce

In [5]:
tdm_data = pd.read_csv('features/cases/cases_tdm.csv')
tdm_data.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,490.1,491.1,492.1,493.1,494.1,495.1,496.1,497.1,498.1,499.1
0,0,-0.018615,0.044593,0.033754,0.017147,-0.013408,-0.008801,0.02652,-0.001875,0.003662,...,0.000354,2.7e-05,0.00015,-8.9e-05,-8.266202e-07,-0.000216,9.4e-05,0.000139,-0.00011,-4.2e-05
1,1,-0.020404,0.044956,0.034604,0.017118,-0.01158,-0.010824,0.028206,-0.007174,0.002715,...,0.0011,0.000579,-3.9e-05,0.000113,-0.0009858511,0.000142,-0.002124,0.000107,0.000526,-0.001291
2,2,0.067482,0.002221,-0.008682,-0.064167,-0.082925,0.04653,0.019452,-0.015765,-0.020444,...,-0.001167,-9e-05,-0.0005,0.000304,2.823215e-06,0.000739,-0.000327,-0.000484,0.000385,0.000147
3,3,-0.022178,0.046221,0.040738,0.021026,-0.003364,-0.019198,0.025294,0.004982,0.015839,...,-0.051811,-0.03541,-0.016383,-0.02295,-0.02903728,0.008659,0.004858,-0.00201,-0.020509,-0.027004
4,4,-0.019379,0.047132,0.032588,0.018126,-0.014113,-0.011335,0.0299,-0.003417,0.003236,...,0.010197,-0.00449,-0.000406,0.006426,0.01011754,0.007087,-0.005874,-0.007453,-0.009313,-0.01681


In [6]:
tdm_data_2 = tdm_data.drop(['Unnamed: 0', '0'], axis = 1)
tdm_data_2.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,490.1,491.1,492.1,493.1,494.1,495.1,496.1,497.1,498.1,499.1
0,0.044593,0.033754,0.017147,-0.013408,-0.008801,0.02652,-0.001875,0.003662,0.001454,0.021749,...,0.000354,2.7e-05,0.00015,-8.9e-05,-8.266202e-07,-0.000216,9.4e-05,0.000139,-0.00011,-4.2e-05
1,0.044956,0.034604,0.017118,-0.01158,-0.010824,0.028206,-0.007174,0.002715,-0.000914,0.025136,...,0.0011,0.000579,-3.9e-05,0.000113,-0.0009858511,0.000142,-0.002124,0.000107,0.000526,-0.001291
2,0.002221,-0.008682,-0.064167,-0.082925,0.04653,0.019452,-0.015765,-0.020444,-0.061294,-0.030251,...,-0.001167,-9e-05,-0.0005,0.000304,2.823215e-06,0.000739,-0.000327,-0.000484,0.000385,0.000147
3,0.046221,0.040738,0.021026,-0.003364,-0.019198,0.025294,0.004982,0.015839,0.012907,0.029395,...,-0.051811,-0.03541,-0.016383,-0.02295,-0.02903728,0.008659,0.004858,-0.00201,-0.020509,-0.027004
4,0.047132,0.032588,0.018126,-0.014113,-0.011335,0.0299,-0.003417,0.003236,0.001189,0.021974,...,0.010197,-0.00449,-0.000406,0.006426,0.01011754,0.007087,-0.005874,-0.007453,-0.009313,-0.01681


In [7]:
tdm_reducer = PCA(n_components = 500)

In [8]:
tdm_data_3 = tdm_reducer.fit_transform(tdm_data_2)

### 1B) cluster via kmeans

In [9]:
model = KMeans(n_clusters=10, init='k-means++', max_iter=100, n_init=1)
model.fit(tdm_data_3)

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=10, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [10]:
categories = model.predict(tdm_data_3)

### 1C) examine clusters

In [24]:
cluster_label = 5

cluster = np.where(categories == cluster_label)

In [25]:
for c in cluster:
    print reference['title'][c]

58      Bug's Life Hard Case Cover For Samsung Galaxy ...
81                                          Iphone 4 Case
159     2D Hard Cover Case for HTC Wildfire S G13 U.S....
197     For iPhone 5 5G 6th Combo Rugged Rubber Matte ...
208     HARRY POTTER Salazar Slytherin school case for...
267     Dual Color Skin Leather Case With Stand For LG...
288     HEAD CASE RED LONDON TELEPHONE BOX KIOSK BOOTH...
307     Wallet Flip Leather Case Cover For Apple iphon...
317     4 PIECE S-LINE TPU GEL SKIN CASE COVERS FOR IP...
383     ORANGE ROCKET HYBRID HARD PHONE CASE COVER W/ ...
417     HEAD CASE DESIGNS BEST OF PLACES SERIES 3 HARD...
468              Skinit Wolf Dragon Moon Skin for LG 500G
503             For Samsung A197 Case Cover LF Lime Green
516     For Samsung Galaxy Rush M830 Carbon Fiber Prot...
616     HEAD CASE NINJAS PROTECTIVE SNAP-ON HARD BACK ...
634     Blk/Blue HYBRID TPU Huawei Ascend M860 Facepla...
700     Noble Clear Transparent Back Thin Hard Case Co...
706     iPhone

# approach 2: cluster by neighbors vector

In [26]:
neighbor_data_1 = pd.read_csv('features/cases/cases_neighbors.csv')
neighbor_data_2 = neighbor_data_1.drop(['Unnamed: 0'], axis = 1)
neighbor_data_2.columns = ['nearest_neighbors']
neighbor_data_2.head()

Unnamed: 0,nearest_neighbors
0,0 2221 14118 15008 15577 18803 19926 29179 398...
1,1 9865 73633 48145 6777 13752 17987 4624 83515...
2,2 23056 44984 52322 88750 87882 85071 57264 39...
3,3 40921 43457 77458 47032 46350 47516 84636 14...
4,4 718 15027 54756 69955 70178 72202 72334 8027...


In [27]:
# remove primary key...

neighbor_data_2['clean'] = map(lambda x: x[1:], neighbor_data_2['nearest_neighbors'])
neighbor_data_2.head()

Unnamed: 0,nearest_neighbors,clean
0,0 2221 14118 15008 15577 18803 19926 29179 398...,2221 14118 15008 15577 18803 19926 29179 3981...
1,1 9865 73633 48145 6777 13752 17987 4624 83515...,9865 73633 48145 6777 13752 17987 4624 83515 ...
2,2 23056 44984 52322 88750 87882 85071 57264 39...,23056 44984 52322 88750 87882 85071 57264 396...
3,3 40921 43457 77458 47032 46350 47516 84636 14...,40921 43457 77458 47032 46350 47516 84636 140...
4,4 718 15027 54756 69955 70178 72202 72334 8027...,718 15027 54756 69955 70178 72202 72334 80270...


### 2B) create matrix of neighbors

In [28]:
neighbors_vec = CountVectorizer(max_features=1000)

In [29]:
neighbors_tdm_sparse = neighbors_vec.fit_transform(neighbor_data_2['clean'])
neighbors_tdm_dense = neighbors_tdm_sparse.toarray()
neighbors_tdm_dense_df = pd.DataFrame(neighbors_tdm_dense)
neighbors_tdm_dense_df.shape

(99999, 1000)

In [30]:
model = KMeans(n_clusters=10, init='k-means++', max_iter=100, n_init=1)
model.fit(neighbors_tdm_dense_df)

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=10, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [31]:
categories = model.predict(neighbors_tdm_dense_df)

In [32]:
cluster_label = 9

cluster = np.where(categories == cluster_label)

for c in cluster:
    print reference['title'][c]

335      pittsburgh penguins iPhone 4 4s 5 5s 5c case c...
1110     Finn and jake adventure time iPhone 4 4s 5 5s ...
2363     Lionel Messi Barcelona for iPhone 4 4s Case Co...
2596     cute iphone5 iphone 5 hard case cover miss pin...
2964     GIVENCHY VERY RARE BLACK PVC SHARKTOOTH FLORAL...
4003            Iphone 4 & 4 S Case Cover The Walking Dead
5548     Every Damn Just Do it NIKE Fresh Blue for iPho...
6166                     Michael  Kors Iphone 5 Case Cover
8863     harry potter hogwarts for iPhone 4 4s 5 5s 5c ...
9155     my little pony for iPhone 4 or 4s Plastic blac...
10919    Rhinestone hr.RLily TracFone/Net10 ZTE Merit 9...
14167    kingdom hearts for iPhone 4 4s 5 5s 5c Black C...
14999    avengers iron man mark vii for iPhone 4 4s 4G ...
19046    IPHONE 5 CASE/ PEARLS/ RHINESTONE/ BLING/ FASH...
19371      846-212red Nike Football iPhone 5 5s Case Cover
21714    Plain Yellow and Black Mustache Design on iPho...
23051    2x FC Barcelona iPhone 4 4s Case Cover David V.