In [1]:
import numpy as np

Для начала разберёмся с глаголом <i>торчать</i>:

In [2]:
# Load the array stored in torchat_context_vectors.npy
# (the next cell's output reports its shape as 2564 rows x 300 columns).
torchat_vectors = np.load("torchat_context_vectors.npy")

In [26]:
torchat_vectors.shape

(2564, 300)

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

Построим матрицу сходства по косинусной близости:

In [5]:
# Pairwise cosine similarity of every row of torchat_vectors against every other row,
# using scikit-learn's cosine_similarity imported in the previous cell.
# NOTE(review): a later cell defines its own function named cosine_similarity, which
# shadows this import; re-running this cell after that definition will call the
# two-vector SciPy-based helper instead of the scikit-learn function.
affinity_matrix = cosine_similarity(torchat_vectors, torchat_vectors)

Видны погрешности округления: на диагонали не везде стоят ровно единицы

Попробуем лучше средствами SciPy:

In [12]:
from scipy.spatial.distance import euclidean, cosine, pdist, squareform

In [13]:
def cosine_similarity(u, v):
    """Return the cosine similarity of vectors ``u`` and ``v``.

    Computed as one minus SciPy's cosine distance between the two vectors.
    """
    # NOTE: this name shadows sklearn.metrics.pairwise.cosine_similarity,
    # which is imported earlier in the notebook.
    distance = cosine(u, v)
    return 1 - distance

In [22]:
# Pairwise cosine similarity = 1 - cosine distance.
# SciPy's built-in 'cosine' metric is used instead of passing the Python-level
# cosine_similarity helper as the metric: a callable metric is invoked once per
# pair of rows, whereas the named metric is evaluated by SciPy's optimized code.
# squareform expands the condensed vector of pair values into a symmetric
# square matrix and leaves zeros on the main diagonal.
affinity_matrix = squareform(1 - pdist(torchat_vectors, metric='cosine'))

In [27]:
affinity_matrix.shape

(2564, 2564)

Так как функция squareform используется для построения матрицы расстояний, а не близости, на главную диагональ она ставит нули. Заменим нули на единицы:

In [28]:
# Put 1 on every main-diagonal entry: squareform left zeros there,
# but a vector's similarity to itself is 1.
for idx in range(affinity_matrix.shape[0]):
    affinity_matrix[idx, idx] = 1

In [29]:
affinity_matrix

array([[ 1.        ,  0.18362963,  0.19143231, ..., -0.23330173,
         0.50011109,  0.30826989],
       [ 0.18362963,  1.        ,  0.15421649, ...,  0.02815803,
         0.08029822,  0.05262811],
       [ 0.19143231,  0.15421649,  1.        , ...,  0.01757288,
         0.3454869 ,  0.31797122],
       ...,
       [-0.23330173,  0.02815803,  0.01757288, ...,  1.        ,
        -0.10239787, -0.03999091],
       [ 0.50011109,  0.08029822,  0.3454869 , ..., -0.10239787,
         1.        ,  0.48480015],
       [ 0.30826989,  0.05262811,  0.31797122, ..., -0.03999091,
         0.48480015,  1.        ]])

Теперь используем модный алгоритм AffinityPropagation, как в статье тех крутых челов с Диалога:

AffinityPropagation имеет сложность $O(T \cdot n^2)$, где $n$ — число объектов, а $T$ — число итераций, поэтому сначала попробуем на небольшом подмножестве датасета:

In [31]:
from time import time

In [32]:
from sklearn.cluster import AffinityPropagation

In [82]:
# Affinity propagation on a ready-made similarity matrix: with affinity='precomputed'
# the fit_predict calls below are given (a slice of) affinity_matrix rather than raw vectors.
# damping and max_iter are set explicitly here instead of using the defaults.
clusterizer1 = AffinityPropagation(affinity='precomputed', damping=0.99, max_iter=800)

In [53]:
# Affinity propagation with affinity='euclidean' and otherwise default settings;
# it is fitted on raw vectors (torchat_vectors[:100]) in the trial run below.
# NOTE: clusterizer2 is rebound later in the notebook with damping and max_iter set.
clusterizer2 = AffinityPropagation(affinity='euclidean')

In [36]:
affinity_matrix[:10,:10].shape

(10, 10)

In [70]:
%time
test_cluster_labels1 = clusterizer1.fit_predict(affinity_matrix[:100,:100])

Wall time: 0 ns


In [71]:
set(test_cluster_labels1)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}

In [54]:
%time
test_cluster_labels2 = clusterizer2.fit_predict(torchat_vectors[:100])

Wall time: 0 ns


In [55]:
test_cluster_labels2

array([ 7,  9,  7,  0,  9,  7,  9,  4, 12,  1,  7, 10,  7, 10,  4,  9,  2,
        7,  4, 12, 12,  3, 10,  7, 10,  7,  7,  4,  7,  9,  9, 10, 10,  9,
        4,  4,  9, 10,  5, 10, 10, 12,  6,  9,  4,  7,  7,  7,  7,  9, 10,
       10,  7,  7, 10, 10,  4,  8,  9, 10, 11, 10,  4,  4,  7,  7, 10, 12,
       10, 12, 10, 12,  4, 10, 10, 10,  7,  7, 12, 10,  7,  7,  7,  9,  9,
        7, 10,  7,  9,  7, 12, 12, 10,  9,  7, 12, 12,  4, 10, 13],
      dtype=int64)

In [83]:
%%time
# Cluster all contexts: fit on the full precomputed similarity matrix and
# keep one cluster label per row in torchat_labels.
torchat_labels = clusterizer1.fit_predict(affinity_matrix)

Wall time: 13 s


Посмотрим, сколько кластеров у нас получилось:

In [84]:
len(set(torchat_labels))

149

И сколько объектов в каждом кластере:

In [85]:
# Rebind torchat_labels to a plain Python list holding the same per-context labels.
# NOTE: this reuses the name for a different container type; re-running the cell is harmless.
torchat_labels = list(torchat_labels)

In [86]:
# Number of contexts per cluster label, counted in a single pass over the labels
# (calling list.count() for every element rescans the whole list each time).
# Counter keeps keys in first-seen order; dict() makes the cell display a plain dict.
from collections import Counter

dict(Counter(torchat_labels))

{25: 36,
 101: 35,
 94: 25,
 15: 6,
 61: 18,
 71: 30,
 55: 23,
 122: 22,
 19: 16,
 120: 35,
 104: 26,
 24: 38,
 79: 10,
 140: 35,
 148: 25,
 59: 7,
 8: 34,
 53: 6,
 23: 45,
 95: 26,
 130: 50,
 12: 39,
 14: 7,
 119: 23,
 75: 52,
 124: 14,
 0: 17,
 107: 21,
 88: 18,
 129: 10,
 115: 23,
 98: 14,
 85: 12,
 56: 10,
 35: 9,
 117: 40,
 146: 25,
 80: 32,
 52: 20,
 57: 30,
 112: 18,
 62: 18,
 1: 20,
 32: 20,
 116: 64,
 63: 44,
 2: 4,
 131: 10,
 97: 15,
 84: 33,
 7: 29,
 51: 13,
 9: 15,
 87: 18,
 3: 36,
 76: 17,
 65: 19,
 34: 31,
 132: 37,
 50: 30,
 36: 15,
 17: 10,
 123: 43,
 144: 11,
 143: 40,
 29: 14,
 78: 19,
 81: 5,
 68: 10,
 105: 17,
 66: 24,
 72: 18,
 102: 16,
 4: 9,
 16: 8,
 96: 10,
 89: 13,
 60: 11,
 28: 13,
 47: 4,
 5: 5,
 134: 37,
 93: 21,
 113: 27,
 141: 31,
 133: 11,
 6: 6,
 118: 10,
 10: 32,
 58: 8,
 37: 6,
 114: 11,
 137: 29,
 26: 16,
 46: 25,
 13: 14,
 45: 25,
 74: 16,
 38: 32,
 109: 11,
 136: 5,
 21: 11,
 103: 14,
 86: 17,
 11: 6,
 126: 7,
 77: 12,
 41: 15,
 90: 12,
 40: 17,
 43

Получилось много мелких кластеров, это не есть хорошо

In [87]:
# Group context positions by cluster label: one list of row indices per distinct label.
clusters = [
    [position for position, assigned in enumerate(torchat_labels) if assigned == label]
    for label in set(torchat_labels)
]

In [89]:
import json

In [90]:
# Read torchat.json (UTF-8). Later cells index it as torchat_json[index][1]
# to fetch the text stored for a given row index.
# NOTE(review): this assumes the JSON records are in the same order as the rows
# of torchat_vectors - confirm against the code that produced both files.
with open('torchat.json', 'r', encoding='utf-8') as inp:
    torchat_json = json.load(inp)

In [91]:
# Replace every row index in every cluster with element [1] of the matching torchat_json record.
torchat_contexts_affinity_clusters = [
    [torchat_json[member][1] for member in members]
    for members in clusters
]

In [100]:
# Print one line per cluster, largest first: rank, cluster size, and the cumulative
# share of all contexts covered by the clusters listed so far.
N = len(torchat_vectors)
covered = 0

ranked_clusters = sorted(torchat_contexts_affinity_clusters, key=len, reverse=True)
for rank, cluster in enumerate(ranked_clusters):
    covered += len(cluster)
    share = round(covered / N, 4)
    print(rank, len(cluster), f", {share} contexts covered")

0 64 , 0.025 contexts covered
1 52 , 0.0452 contexts covered
2 50 , 0.0647 contexts covered
3 45 , 0.0823 contexts covered
4 44 , 0.0995 contexts covered
5 43 , 0.1162 contexts covered
6 40 , 0.1318 contexts covered
7 40 , 0.1474 contexts covered
8 39 , 0.1626 contexts covered
9 38 , 0.1775 contexts covered
10 37 , 0.1919 contexts covered
11 37 , 0.2063 contexts covered
12 36 , 0.2204 contexts covered
13 36 , 0.2344 contexts covered
14 35 , 0.248 contexts covered
15 35 , 0.2617 contexts covered
16 35 , 0.2754 contexts covered
17 34 , 0.2886 contexts covered
18 33 , 0.3015 contexts covered
19 32 , 0.314 contexts covered
20 32 , 0.3264 contexts covered
21 32 , 0.3389 contexts covered
22 31 , 0.351 contexts covered
23 31 , 0.3631 contexts covered
24 30 , 0.3748 contexts covered
25 30 , 0.3865 contexts covered
26 30 , 0.3982 contexts covered
27 29 , 0.4095 contexts covered
28 29 , 0.4208 contexts covered
29 27 , 0.4314 contexts covered
30 26 , 0.4415 contexts covered
31 26 , 0.4516 context

Посмотрим на распределение размера кластера:

In [97]:
round(0.991, 2)

0.99

Первые 103 кластера покрывают около 90% контекстов

In [102]:
# Order the context clusters from largest to smallest before saving.
torchat_contexts_affinity_clusters = sorted(torchat_contexts_affinity_clusters, key=len, reverse=True)

# Save as indented JSON; ensure_ascii=False writes non-ASCII characters
# as they are instead of escaping them.
with open('torchat_affinity_clusters_cosine.json','w',encoding='utf-8') as outp:
    json.dump(torchat_contexts_affinity_clusters, outp, indent=4, ensure_ascii=False)

In [101]:
help(json.dump)

Help on function dump in module json:

dump(obj, fp, *, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, cls=None, indent=None, separators=None, default=None, sort_keys=False, **kw)
    Serialize ``obj`` as a JSON formatted stream to ``fp`` (a
    ``.write()``-supporting file-like object).
    
    If ``skipkeys`` is true then ``dict`` keys that are not basic types
    (``str``, ``int``, ``float``, ``bool``, ``None``) will be skipped
    instead of raising a ``TypeError``.
    
    If ``ensure_ascii`` is false, then the strings written to ``fp`` can
    contain non-ASCII characters if they appear in strings contained in
    ``obj``. Otherwise, all such characters are escaped in JSON strings.
    
    If ``check_circular`` is false, then the circular reference check
    for container types will be skipped and a circular reference will
    result in an ``OverflowError`` (or worse).
    
    If ``allow_nan`` is false, then it will be a ``ValueError`` to
    serializ

Резюме кластеризации:

1 кластер - целиком про волосы и части тела, 1 контекст я успел заметить про одежду (торчали рукава), но преобладают волосы

2 кластер - в основном про части тела

В общем, в 1-м кластере — контексты про волосы, во 2-м — в основном про части тела, в 3-м — в основном про животных. Может, это и хорошее разделение, но для данной задачи оно нерелевантно. Впредь будем использовать алгоритмы с задаваемым пользователем числом кластеров. Но напоследок ещё попробуем AffinityPropagation с евклидовой близостью:

In [109]:
# Rebind clusterizer2: Euclidean-affinity estimator, now with the same
# damping and max_iter values that were given to clusterizer1.
clusterizer2 = AffinityPropagation(affinity='euclidean', damping=0.99, max_iter=800)

In [110]:
# Fit on all raw vectors (no precomputed matrix is passed with affinity='euclidean')
# and keep one cluster label per row.
torchat_labels2 = clusterizer2.fit_predict(torchat_vectors)

In [111]:
# Group row indices by cluster label for the Euclidean run.
clusters2 = [
    [position for position, assigned in enumerate(torchat_labels2) if assigned == label]
    for label in set(torchat_labels2)
]

# Print one line per cluster, largest first: rank, cluster size, and the cumulative
# share of all contexts covered by the clusters listed so far.
N = len(torchat_vectors)
covered = 0

for rank, cluster in enumerate(sorted(clusters2, key=len, reverse=True)):
    covered += len(cluster)
    share = round(covered / N, 4)
    print(rank, len(cluster), f", {share} contexts covered")

0 261 , 0.1018 contexts covered
1 204 , 0.1814 contexts covered
2 195 , 0.2574 contexts covered
3 166 , 0.3222 contexts covered
4 159 , 0.3842 contexts covered
5 141 , 0.4392 contexts covered
6 135 , 0.4918 contexts covered
7 95 , 0.5289 contexts covered
8 95 , 0.5659 contexts covered
9 90 , 0.601 contexts covered
10 90 , 0.6361 contexts covered
11 85 , 0.6693 contexts covered
12 84 , 0.702 contexts covered
13 80 , 0.7332 contexts covered
14 78 , 0.7637 contexts covered
15 67 , 0.7898 contexts covered
16 60 , 0.8132 contexts covered
17 59 , 0.8362 contexts covered
18 57 , 0.8584 contexts covered
19 46 , 0.8764 contexts covered
20 41 , 0.8924 contexts covered
21 34 , 0.9056 contexts covered
22 27 , 0.9161 contexts covered
23 26 , 0.9263 contexts covered
24 21 , 0.9345 contexts covered
25 17 , 0.9411 contexts covered
26 17 , 0.9477 contexts covered
27 17 , 0.9544 contexts covered
28 15 , 0.9602 contexts covered
29 13 , 0.9653 contexts covered
30 11 , 0.9696 contexts covered
31 8 , 0.9727

Хмм-м-м, 62 кластера — уже получше, а первые 22 (индексы 0–21) покрывают около 90% контекстов

In [113]:
# Replace every row index in every Euclidean-run cluster with element [1]
# of the matching torchat_json record.
torchat_contexts_affinity_clusters2 = [[torchat_json[index][1] for index in cluster] for cluster in clusters2]

# Order the clusters from largest to smallest before saving.
torchat_contexts_affinity_clusters2 = sorted(torchat_contexts_affinity_clusters2, key=len, reverse=True)

# Save as indented JSON; ensure_ascii=False writes non-ASCII characters
# as they are instead of escaping them.
with open('torchat_affinity_clusters_euclidean.json','w',encoding='utf-8') as outp:
    json.dump(torchat_contexts_affinity_clusters2, outp, indent=4, ensure_ascii=False)

Тут вообще получается что-то неинформативное, пока оставим AffinityPropagation