# K-means for sv_embeds

## Environment

In [4]:
%load_ext autoreload
%autoreload 2
%pylab
%matplotlib inline
    
import pandas as pd
import pickle
import numpy as np
import sys
import os
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


In [5]:
# Make the sv_system directory importable from this notebook. The membership
# check keeps the cell idempotent: re-running it no longer appends a duplicate
# entry to sys.path each time.
SV_SYSTEM_DIR = '/host/projects/sv_experiments/sv_system/'
if SV_SYSTEM_DIR not in sys.path:
    sys.path.append(SV_SYSTEM_DIR)

# Enumerate CUDA devices in PCI bus order and expose only device "3" to this
# process. These must be set before any CUDA-using library is initialised.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [6]:
def key2df(keys, delimeter="-"):
    """Build a per-key metadata DataFrame from a list of key strings.

    Parameters
    ----------
    keys : sequence of str
        Keys whose first `delimeter`-separated field is the speaker id.
    delimeter : str, default "-"
        Separator used to split each key.

    Returns
    -------
    pandas.DataFrame
        One row per key with columns:
        - ``key``: the original key string
        - ``spk``: the part of the key before the first delimiter
        - ``label``: integer group number of ``spk`` (from ``groupby().ngroup()``)
        - ``origin``: ``'voxc2'`` when ``spk`` starts with ``'id'``, else ``'voxc1'``

    NOTE(review): the ``origin`` rule depends on the speaker-id naming scheme of
    the corpora in use -- confirm it still matches the data being loaded.
    """
    def speaker_of(key):
        # Everything before the first delimiter identifies the speaker.
        return key.split(delimeter)[0]

    def corpus_of(spk):
        if spk.startswith('id'):
            return 'voxc2'
        return 'voxc1'

    frame = pd.DataFrame(keys, columns=['key'])
    frame['spk'] = frame['key'].map(speaker_of)
    frame['label'] = frame.groupby('spk').ngroup()
    frame['origin'] = frame['spk'].map(corpus_of)
    return frame

## Load embeds

In [7]:
import kaldi_io

# Location of the Kaldi ark archive holding one float vector per key.
embeds_ark = "/host/projects/sv_experiments/sv_system/models/voxc12_fbank64_vad/tdnn_xvector_softmax/fbank64_200f_800f_v00/embeds/lda_feats.ark"

# Collect keys and vectors in lock-step so index i of sv_keys always
# corresponds to index i of sv_embeds.
sv_keys, sv_embeds = [], []
for utt_key, embed in kaldi_io.read_vec_flt_ark(embeds_ark):
    sv_keys.append(utt_key)
    sv_embeds.append(embed)

# Convert the list of vectors into a single numpy array
# (2-D, one row per key, assuming all vectors share a length -- TODO confirm).
sv_embeds = np.array(sv_embeds)

## Run K-means

In [10]:
from sklearn.cluster import KMeans

# Cluster the embeddings into 40 groups (presumably one per speaker in the
# evaluated set -- TODO confirm). random_state fixes the initialisation seed.
# n_init is pinned to 10, the scikit-learn default before release 1.4 changed
# it to 'auto', so the clustering does not silently change with the installed
# library version.
kmeans = KMeans(n_clusters=40, n_init=10, random_state=0).fit(sv_embeds)

In [14]:
# Per-key metadata table (key, spk, label, origin), in the same order as sv_keys.
sv_keys_df = key2df(keys=sv_keys)

In [16]:
# Attach each key's k-means cluster assignment to its metadata row.
# This relies on kmeans.labels_ being in the same order as the rows of
# sv_keys_df: the model was fit on sv_embeds and the frame was built from
# sv_keys, and both lists were filled in the same loading loop.
sv_keys_df['cluster_label'] = kmeans.labels_

In [40]:
# For every speaker, count how many of its keys were assigned to each cluster.
# The result is a Series indexed by (spk, cluster_label) whose values are counts.
clusters_per_spk = sv_keys_df.groupby('spk', group_keys=False)['cluster_label']
spk2cluster = clusters_per_spk.value_counts()

In [49]:
# Show the per-speaker cluster counts: one row per (spk, cluster_label) pair.
# A speaker with more than one row had its keys split across several clusters.
spk2cluster

spk      cluster_label
id10270  21               158
id10271  23                73
id10272  11                50
id10273  13               240
id10274  25                54
id10275  30                73
         27                 1
id10276  6                185
id10277  28                67
id10278  26                94
         10                93
id10279  31                61
         25                 1
         36                 1
id10280  14                67
id10281  1                 84
id10282  29                84
id10283  17               157
         30                49
         7                 27
id10284  34                90
id10285  4                 93
id10286  35               149
id10287  15                48
id10288  20                48
id10289  38                87
id10290  31               137
id10291  39                76
id10292  18               167
         0                 98
id10293  9                100
         37                94
id10294  12      

In [29]:
# Print the speaker id of every (spk, cluster_label) row, so a speaker appears
# once per cluster it was assigned to.
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (index, value) pairs and is available in older releases as well.
for (spk, _cluster), _count in spk2cluster.items():
    print(spk)

id10270
id10271
id10272
id10273
id10274
id10275
id10275
id10276
id10277
id10278
id10278
id10279
id10279
id10279
id10280
id10281
id10282
id10283
id10283
id10283
id10284
id10285
id10286
id10287
id10288
id10289
id10290
id10291
id10292
id10292
id10293
id10293
id10294
id10295
id10296
id10297
id10298
id10299
id10300
id10300
id10300
id10301
id10302
id10302
id10303
id10304
id10304
id10304
id10305
id10306
id10307
id10308
id10309
id10309


In [48]:
# For speakers id10300 .. id10309, print the cluster counts on one line each.
# A single number means every key of that speaker fell into one cluster.
for spk_num in range(10300, 10310):
    id_str = 'id{}'.format(spk_num)
    cluster_counts = spk2cluster[id_str]
    print(*cluster_counts.values)
    

302 1 1
48
165 1
103
156 5 1
137
184
156
64
164 1
