In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
reference = pd.read_table('data/mp3 players 73839.txt', header = None)
reference.columns = ['id', 'site', 'category', 'title']
reference.head()

Unnamed: 0,id,site,category,title
0,301074677039,0,73839,Apple iPod nano 7th Generation Purple (16 GB) ...
1,191049351783,0,73839,APPLE IPOD TOUCH 16GB 4TH GEN WHITE MP3 PLAYER
2,251402664662,0,73839,Sport Sunglasses Headset Sun Glasses FOR IPHO...
3,370975588476,0,73839,DISNEY PARKS WHERE DREAMS COME TRUE MP3 PLAYER...
4,251398873497,0,73839,Mp3 Player Sunglasses 8gb Black w/ Bluetooth b...


# approach 1: cluster by tdm

### 1A) read feature vectors into memory and reduce

In [5]:
tdm_data = pd.read_csv('features/cases/cases_tdm.csv')
tdm_data.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,490.1,491.1,492.1,493.1,494.1,495.1,496.1,497.1,498.1,499.1
0,0,-0.018615,0.044593,0.033754,0.017147,-0.013408,-0.008801,0.02652,-0.001875,0.003662,...,0.000354,2.7e-05,0.00015,-8.9e-05,-8.266202e-07,-0.000216,9.4e-05,0.000139,-0.00011,-4.2e-05
1,1,-0.020404,0.044956,0.034604,0.017118,-0.01158,-0.010824,0.028206,-0.007174,0.002715,...,0.0011,0.000579,-3.9e-05,0.000113,-0.0009858511,0.000142,-0.002124,0.000107,0.000526,-0.001291
2,2,0.067482,0.002221,-0.008682,-0.064167,-0.082925,0.04653,0.019452,-0.015765,-0.020444,...,-0.001167,-9e-05,-0.0005,0.000304,2.823215e-06,0.000739,-0.000327,-0.000484,0.000385,0.000147
3,3,-0.022178,0.046221,0.040738,0.021026,-0.003364,-0.019198,0.025294,0.004982,0.015839,...,-0.051811,-0.03541,-0.016383,-0.02295,-0.02903728,0.008659,0.004858,-0.00201,-0.020509,-0.027004
4,4,-0.019379,0.047132,0.032588,0.018126,-0.014113,-0.011335,0.0299,-0.003417,0.003236,...,0.010197,-0.00449,-0.000406,0.006426,0.01011754,0.007087,-0.005874,-0.007453,-0.009313,-0.01681


In [6]:
tdm_data_2 = tdm_data.drop(['Unnamed: 0', '0'], axis = 1)
tdm_data_2.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,490.1,491.1,492.1,493.1,494.1,495.1,496.1,497.1,498.1,499.1
0,0.044593,0.033754,0.017147,-0.013408,-0.008801,0.02652,-0.001875,0.003662,0.001454,0.021749,...,0.000354,2.7e-05,0.00015,-8.9e-05,-8.266202e-07,-0.000216,9.4e-05,0.000139,-0.00011,-4.2e-05
1,0.044956,0.034604,0.017118,-0.01158,-0.010824,0.028206,-0.007174,0.002715,-0.000914,0.025136,...,0.0011,0.000579,-3.9e-05,0.000113,-0.0009858511,0.000142,-0.002124,0.000107,0.000526,-0.001291
2,0.002221,-0.008682,-0.064167,-0.082925,0.04653,0.019452,-0.015765,-0.020444,-0.061294,-0.030251,...,-0.001167,-9e-05,-0.0005,0.000304,2.823215e-06,0.000739,-0.000327,-0.000484,0.000385,0.000147
3,0.046221,0.040738,0.021026,-0.003364,-0.019198,0.025294,0.004982,0.015839,0.012907,0.029395,...,-0.051811,-0.03541,-0.016383,-0.02295,-0.02903728,0.008659,0.004858,-0.00201,-0.020509,-0.027004
4,0.047132,0.032588,0.018126,-0.014113,-0.011335,0.0299,-0.003417,0.003236,0.001189,0.021974,...,0.010197,-0.00449,-0.000406,0.006426,0.01011754,0.007087,-0.005874,-0.007453,-0.009313,-0.01681


In [7]:
tdm_reducer = PCA(n_components = 500)

In [8]:
tdm_data_3 = tdm_reducer.fit_transform(tdm_data_2)

### 1B) cluster via kmeans

In [9]:
model = KMeans(n_clusters=10, init='k-means++', max_iter=100, n_init=1)
model.fit(tdm_data_3)

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=10, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [10]:
categories = model.predict(tdm_data_3)

### 1C) examine clusters

In [24]:
cluster_label = 5

cluster = np.where(categories == cluster_label)

In [25]:
for c in cluster:
    print reference['title'][c]

58      Bug's Life Hard Case Cover For Samsung Galaxy ...
81                                          Iphone 4 Case
159     2D Hard Cover Case for HTC Wildfire S G13 U.S....
197     For iPhone 5 5G 6th Combo Rugged Rubber Matte ...
208     HARRY POTTER Salazar Slytherin school case for...
267     Dual Color Skin Leather Case With Stand For LG...
288     HEAD CASE RED LONDON TELEPHONE BOX KIOSK BOOTH...
307     Wallet Flip Leather Case Cover For Apple iphon...
317     4 PIECE S-LINE TPU GEL SKIN CASE COVERS FOR IP...
383     ORANGE ROCKET HYBRID HARD PHONE CASE COVER W/ ...
417     HEAD CASE DESIGNS BEST OF PLACES SERIES 3 HARD...
468              Skinit Wolf Dragon Moon Skin for LG 500G
503             For Samsung A197 Case Cover LF Lime Green
516     For Samsung Galaxy Rush M830 Carbon Fiber Prot...
616     HEAD CASE NINJAS PROTECTIVE SNAP-ON HARD BACK ...
634     Blk/Blue HYBRID TPU Huawei Ascend M860 Facepla...
700     Noble Clear Transparent Back Thin Hard Case Co...
706     iPhone

# approach 2: cluster by neighbors vector

In [3]:
neighbor_data_1 = pd.read_csv('features/mp3/mp3_neighbors.csv')
neighbor_data_2 = neighbor_data_1.drop(['Unnamed: 0'], axis = 1)
neighbor_data_2.columns = ['nearest_neighbors']
neighbor_data_2.head()

Unnamed: 0,nearest_neighbors
0,0 709 4392 7350 7917 7977 8383 6506 3802 1298 ...
1,1 525 924 1153 1225 1310 1415 1533 1624 1683 2146
2,2 2777 5042 6626 6482 9953 2076 4684 6069 5005...
3,3 8359 5185 9334 6459 9825 5073 8113 1137 7712...
4,4 349 2859 7883 5816 15 6017 8212 2889 8006 2118


In [4]:
# remove primary key...

neighbor_data_2['clean'] = map(lambda x: x[1:], neighbor_data_2['nearest_neighbors'])
neighbor_data_2.head()

Unnamed: 0,nearest_neighbors,clean
0,0 709 4392 7350 7917 7977 8383 6506 3802 1298 ...,709 4392 7350 7917 7977 8383 6506 3802 1298 1694
1,1 525 924 1153 1225 1310 1415 1533 1624 1683 2146,525 924 1153 1225 1310 1415 1533 1624 1683 2146
2,2 2777 5042 6626 6482 9953 2076 4684 6069 5005...,2777 5042 6626 6482 9953 2076 4684 6069 5005 ...
3,3 8359 5185 9334 6459 9825 5073 8113 1137 7712...,8359 5185 9334 6459 9825 5073 8113 1137 7712 ...
4,4 349 2859 7883 5816 15 6017 8212 2889 8006 2118,349 2859 7883 5816 15 6017 8212 2889 8006 2118


### 2B) create matrix of neighbors

In [5]:
neighbors_vec = CountVectorizer(max_features=1000)

In [6]:
neighbors_tdm_sparse = neighbors_vec.fit_transform(neighbor_data_2['clean'])
neighbors_tdm_dense = neighbors_tdm_sparse.toarray()
neighbors_tdm_dense_df = pd.DataFrame(neighbors_tdm_dense)
neighbors_tdm_dense_df.shape

(10001, 1000)

In [7]:
model = KMeans(n_clusters=10, init='k-means++', max_iter=100, n_init=1)
model.fit(neighbors_tdm_dense_df)

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=10, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [8]:
categories = model.predict(neighbors_tdm_dense_df)

  X = self._check_test_data(X)


In [11]:
cluster_label = 4

cluster = np.where(categories == cluster_label)

for c in cluster:
    print reference['title'][c]

0       Apple iPod nano 7th Generation Purple (16 GB) ...
2        Sport Sunglasses Headset Sun Glasses FOR IPHO...
3       DISNEY PARKS WHERE DREAMS COME TRUE MP3 PLAYER...
4       Mp3 Player Sunglasses 8gb Black w/ Bluetooth b...
6       Apple iPod Touch 32GB 5th Generation Pink - Ne...
7           Apple iPod touch 4th Generation Black (32 GB)
8       ELEMENT ELECTRONICS USB 1GB + MP3 & WMA Player...
9       Mini One Eye Monster Mummy Music MP3 Player TF...
10      Apple iPod touch 4th Generation 32GB White Bro...
11          Apple iPod touch 4th Generation White (32 GB)
12      KDQ2 Mini LCD Screen Metal Clip MP3 Music Play...
13      Apple iPod nano 5th Generation Pink 16 GB MC07...
14      Brand new ARCHOS Gmini402 20G mp4 Camcorder an...
15                Bluetooth + MP3 Player Sunglasses - 4GB
16         APPLE IPOD TOUCH 16GB 4TH GEN BLACK MP3 PLAYER
17      DISNEY MIX STICK HIGH SCHOOL MUSICAL ORANGE MI...
18      Apple iPod Touch 3rd Generation Black (32 GB) ...
19            