In [36]:
import json
from sklearn.cluster import MeanShift
import numpy as np
import pandas as pd

In [60]:
# Read the raw feature data from the JSON file, then display what was loaded.
with open('open_raw.json', 'r') as raw_file:
    features = json.load(raw_file)
features

[[[0.5959437451125904, -0.489986933951721],
  [0.17254193826120662, -0.17561631937375233],
  [0.1849698131092862, -0.8720092270327563],
  [0.9790076465248021, -0.18449510936020813],
  [0.41696823292018387, -0.13234652598594615],
  [0.3329456635967151, -0.22850811823947234],
  [0.46213558553005785, -0.09653629602278517],
  [0.32875251140755185, -0.09554912951225564],
  [0.9660245090152464, 0.16164238266051006],
  [0.43306817011788623, 0.11026521805794857],
  [0.29045287541532394, -0.049590947761376104],
  [0.9104666871976447, 0.37524121515796466],
  [0.3068064842552228, 0.2861307534294226],
  [0.22664424966280006, 0.015385946614197323],
  [0.44806795248829817, 0.22704659713857425],
  [0.18349430485579737, 1.3643840594392698],
  [0.1622959137816728, 1.8618216194518316],
  [0.20751520367682685, 2.1086903842836238]],
 [[0.8036141415705474, -0.7933849777428789],
  [0.2940058964710164, -0.36999581526497427],
  [0.27059843970206, -0.7229862030918848],
  [1.0364820689291567, -0.243759355393604

In [61]:
# Convert the nested lists loaded from JSON into a NumPy array.
raw_data = np.array(features)

## Data Preparation

In [62]:
# Stack the data: flatten every entry of raw_data into a single 1-D row, so the
# result has one row per entry (e.g. an entry of 18 pairs becomes 36 values).
stacked_data = np.array([np.hstack(entry) for entry in raw_data])
stacked_data

array([[ 0.59594375, -0.48998693,  0.17254194, ...,  1.86182162,
         0.2075152 ,  2.10869038],
       [ 0.80361414, -0.79338498,  0.2940059 , ...,  1.94253712,
         0.22973846,  2.24056673],
       [ 0.7943759 , -0.86170734,  0.31187888, ...,  1.93844365,
         0.23552792,  2.21738433],
       ...,
       [ 0.78408665, -0.8258821 ,  0.32075661, ...,  1.93478649,
         0.24340279,  2.23100233],
       [ 0.77770194, -0.81860901,  0.31470099, ...,  1.9337868 ,
         0.24045072,  2.20832837],
       [ 0.78068004, -0.8141762 ,  0.31928259, ...,  1.93062371,
         0.23945694,  2.21295154]])

In [63]:
# Wrap the stacked rows in a DataFrame and preview the first few of them.
df = pd.DataFrame(stacked_data)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,0.595944,-0.489987,0.172542,-0.175616,0.18497,-0.872009,0.979008,-0.184495,0.416968,-0.132347,...,0.226644,0.015386,0.448068,0.227047,0.183494,1.364384,0.162296,1.861822,0.207515,2.10869
1,0.803614,-0.793385,0.294006,-0.369996,0.270598,-0.722986,1.036482,-0.243759,0.479898,-0.130182,...,0.267789,0.050315,0.547335,0.630326,0.250226,1.594142,0.229235,1.942537,0.229738,2.240567
2,0.794376,-0.861707,0.311879,-0.497581,0.237136,-0.66273,1.021056,-0.235583,0.480574,-0.154336,...,0.250282,0.032009,0.599257,0.654009,0.238431,1.540816,0.224226,1.938444,0.235528,2.217384
3,0.802014,-0.880249,0.311729,-0.516838,0.235992,-0.652838,1.030025,-0.242886,0.504094,-0.155689,...,0.269549,0.074539,0.613487,0.648508,0.247726,1.569954,0.228399,1.946077,0.249906,2.199568
4,0.800798,-0.899004,0.311973,-0.541294,0.252512,-0.688686,1.018589,-0.247568,0.499745,-0.159913,...,0.266546,0.041808,0.617668,0.6569,0.249913,1.520891,0.231475,1.926616,0.250566,2.222555


### Compute the number of classes

In [64]:
# Fit mean shift on the flattened rows with a hard-coded bandwidth of 2.
# NOTE(review): the labels displayed for this fit put all but one row in
# cluster 0 - confirm that this bandwidth is the intended value.
clustering = MeanShift(bandwidth=2).fit(stacked_data)

In [65]:
# Cluster index assigned to each row of stacked_data by the fit.
clustering.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [66]:
# Distinct cluster indices that appear in the fitted labels.
labels = np.unique(clustering.labels_)
labels

array([0, 1])

In [67]:
# Number of classes = count of distinct cluster labels produced by the fit.
class_num = len(labels)
class_num

2

In [68]:
# Centroids of each class, taken from the fitted model; each centroid is a
# flattened row with the same layout as the rows of stacked_data.
centroids = clustering.cluster_centers_
centroids

array([[ 0.81138813, -0.86044378,  0.33487043, -0.63518395,  0.26789785,
        -0.88505474,  1.02309412, -0.25453753,  0.57553186, -0.18012997,
         0.35882115, -0.16268512,  0.64083983, -0.04182266,  0.40668816,
        -0.05332961,  0.94628779,  0.24955936,  0.57789895,  0.18822928,
         0.3866402 ,  0.17693844,  0.88116202,  0.52737048,  0.44693303,
         0.37456521,  0.29983506,  0.28653065,  0.58993586,  0.64659657,
         0.25820422,  1.53183854,  0.24862497,  1.91804665,  0.26265939,
         2.21663232],
       [ 0.90365176,  0.62839652,  0.28283352,  0.62870167,  0.23806491,
         0.85139104,  1.08124409,  0.05927965,  0.63163288,  0.49573663,
         0.40225544,  0.56742318,  0.79686439,  0.8346711 ,  0.4874614 ,
         0.95462633,  0.85659673,  0.03569911,  0.78362283,  0.99683557,
         0.465193  ,  1.15265144,  0.69080609,  0.18385913,  0.57731274,
         0.98419257,  0.30804457,  1.17986795,  0.58271386, -0.93010595,
         0.1019758 , -2.46239

In [69]:
# Tabular view of the centroids, one row per cluster.
cen_df = pd.DataFrame(centroids)
cen_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,0.811388,-0.860444,0.33487,-0.635184,0.267898,-0.885055,1.023094,-0.254538,0.575532,-0.18013,...,0.299835,0.286531,0.589936,0.646597,0.258204,1.531839,0.248625,1.918047,0.262659,2.216632
1,0.903652,0.628397,0.282834,0.628702,0.238065,0.851391,1.081244,0.05928,0.631633,0.495737,...,0.308045,1.179868,0.582714,-0.930106,0.101976,-2.462392,0.14716,2.932313,0.201128,2.645568


In [70]:
# Per-column summary across the centroid rows. With only two centroids here,
# the statistics simply describe the spread between those two rows.
cen_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,0.85752,-0.116024,0.308852,-0.003241,0.252981,-0.016832,1.052169,-0.097629,0.603582,0.157803,...,0.30394,0.733199,0.586325,-0.141755,0.18009,-0.465277,0.197892,2.42518,0.231894,2.4311
std,0.06524,1.052769,0.036796,0.893702,0.021095,1.227853,0.041118,0.221902,0.039669,0.47791,...,0.005805,0.631685,0.005107,1.114897,0.11047,2.824348,0.071747,0.717195,0.043509,0.303303
min,0.811388,-0.860444,0.282834,-0.635184,0.238065,-0.885055,1.023094,-0.254538,0.575532,-0.18013,...,0.299835,0.286531,0.582714,-0.930106,0.101976,-2.462392,0.14716,1.918047,0.201128,2.216632
25%,0.834454,-0.488234,0.295843,-0.319213,0.245523,-0.450943,1.037632,-0.176083,0.589557,-0.011163,...,0.301887,0.509865,0.584519,-0.53593,0.141033,-1.463835,0.172526,2.171613,0.216511,2.323866
50%,0.85752,-0.116024,0.308852,-0.003241,0.252981,-0.016832,1.052169,-0.097629,0.603582,0.157803,...,0.30394,0.733199,0.586325,-0.141755,0.18009,-0.465277,0.197892,2.42518,0.231894,2.4311
75%,0.880586,0.256186,0.321861,0.31273,0.26044,0.41728,1.066707,-0.019175,0.617608,0.32677,...,0.305992,0.956534,0.58813,0.252421,0.219147,0.533281,0.223259,2.678746,0.247276,2.538334
max,0.903652,0.628397,0.33487,0.628702,0.267898,0.851391,1.081244,0.05928,0.631633,0.495737,...,0.308045,1.179868,0.589936,0.646597,0.258204,1.531839,0.248625,2.932313,0.262659,2.645568


## Save centroids

In [71]:
# Unstack the data: regroup each flattened centroid into rows of two values,
# reversing the flattening applied before clustering.
unstack_data = np.array([np.reshape(center, (-1, 2)) for center in centroids])
unstack_data

array([[[ 0.81138813, -0.86044378],
        [ 0.33487043, -0.63518395],
        [ 0.26789785, -0.88505474],
        [ 1.02309412, -0.25453753],
        [ 0.57553186, -0.18012997],
        [ 0.35882115, -0.16268512],
        [ 0.64083983, -0.04182266],
        [ 0.40668816, -0.05332961],
        [ 0.94628779,  0.24955936],
        [ 0.57789895,  0.18822928],
        [ 0.3866402 ,  0.17693844],
        [ 0.88116202,  0.52737048],
        [ 0.44693303,  0.37456521],
        [ 0.29983506,  0.28653065],
        [ 0.58993586,  0.64659657],
        [ 0.25820422,  1.53183854],
        [ 0.24862497,  1.91804665],
        [ 0.26265939,  2.21663232]],

       [[ 0.90365176,  0.62839652],
        [ 0.28283352,  0.62870167],
        [ 0.23806491,  0.85139104],
        [ 1.08124409,  0.05927965],
        [ 0.63163288,  0.49573663],
        [ 0.40225544,  0.56742318],
        [ 0.79686439,  0.8346711 ],
        [ 0.4874614 ,  0.95462633],
        [ 0.85659673,  0.03569911],
        [ 0.78362283,  0.9

In [72]:
# Persist the regrouped centroids as nested lists (ndarrays themselves are not
# JSON-serializable, hence the tolist() conversion).
centroid_lists = unstack_data.tolist()
with open("open_new.json", "w") as out_file:
    json.dump(centroid_lists, out_file)