### Initialization

In [1]:
import pandas
import time
import fasttext
import numpy
from importlib import reload

### Documents

In [2]:
import os

import psycopg2

# Open the PostgreSQL connection shared by every loader/DAO in this notebook.
# Settings are read from the standard libpq environment variables so that real
# credentials do not have to be committed with the notebook.
# NOTE(review): the fallbacks are the previous hardcoded local-development
# values, kept only so the notebook still runs unchanged; drop the password
# fallback once PGPASSWORD is set in every environment.
psycopg2_conn = psycopg2.connect(
    dbname=os.environ.get('PGDATABASE', 'public_contracts'),
    user=os.environ.get('PGUSER', 'postgres'),
    password=os.environ.get('PGPASSWORD', 'admin'),
    host=os.environ.get('PGHOST', 'localhost'),
    port=os.environ.get('PGPORT', '5432'),
)

In [3]:
from utils.document_processing import *

In [4]:
# Run the document/contract loading pipeline against the open connection.
# The recorded run issued a query over processed documents and then reported a
# "documents" preparation pass followed by a "contracts" preparation pass.
# NOTE(review): the three calls presumably have to run in this order — confirm
# against utils.document_processing.
loader = DatabaseContractsLoader(psycopg2_conn)
loader.load_documents()
loader.prepare_documents()
loader.prepare_contracts()
# NOTE(review): reads the loader's private `_contracts` mapping directly; with
# orient='index' each key of that mapping becomes a row label of the frame.
df_contracts = pandas.DataFrame.from_dict(loader._contracts, orient='index')
df_contracts

Running query: select * from document where processed=True
Preparing total 1008 documents
Progress: 0.0%
Progress: 10.0%
Progress: 20.0%
Progress: 30.0%
Progress: 40.0%
Progress: 50.0%
Progress: 60.0%
Progress: 70.0%
Progress: 80.0%
Progress: 90.0%
Progress: 100.0%
Preparing total 159 contracts
Progress: 0.0%
Progress: 10.0%
Progress: 19.0%
Progress: 29.0%
Progress: 38.0%
Progress: 48.0%
Progress: 57.0%
Progress: 67.0%
Progress: 76.0%
Progress: 85.0%
Progress: 95.0%


Unnamed: 0,docs,text
2,"[{'id': 1, 'text': '  SMLOUVA O PROVÁDĚNÍ...",\n<FILE id=1>\n\n \n\n \n\nSMLOUVA O PROVÁDĚNÍ...
3,"[{'id': 3, 'text': '  SMLOUVA © DÍLO č.j....",\n<FILE id=3>\n\n \n\n \n\nSMLOUVA © DÍLO\nč.j...
6,"[{'id': 26, 'text': '  SMLOUVA O PROVÁDĚN...",\n<FILE id=26>\n\n \n\n \n\nSMLOUVA O PROVÁDĚN...
10,"[{'id': 35, 'text': 'N Á V R H SMLOUVA O DÍ...",\n<FILE id=35>\nN Á V R H\n\nSMLOUVA O DÍLO\...
7,"[{'id': 27, 'text': '  SMLOUVA © DÍLO č.j...",\n<FILE id=27>\n\n \n\n \n\nSMLOUVA © DÍLO\nč....
...,...,...
842,"[{'id': 4124, 'text': 'EVROPSKÁ UNIE p »007-1...",\n<FILE id=4124>\nEVROPSKÁ UNIE p\n\n»007-12\n...
845,"[{'id': 4133, 'text': 'c o X bi EVROPSKÁ UNIE ...",\n<FILE id=4133>\nc o X\nbi EVROPSKÁ UNIE . 20...
397,"[{'id': 2129, 'text': ' P??loha ?. 2 - Cenov? ...",\n<FILE id=2129>\n\nP??loha ?. 2 - Cenov? rozk...
715,"[{'id': 3433, 'text': ' P1_Katalog kancel??sk?...",\n<FILE id=3433>\n\nP1_Katalog kancel??sk?ch p...


### Subject

In [6]:
from recommender.component.database.postgres import SubjectItemDAO

In [7]:
# Load the subject items through the DAO. The recorded run queried
# contract_id, item_desc and embedding from `subject_item` and displayed a
# frame with the columns contract_id, subject_items and embeddings.
sidao = SubjectItemDAO(psycopg2_conn)
df_contract_items = sidao.load()
df_contract_items

Running query: select contract_id, item_desc, embedding from subject_item with None
Result: 2284
Loading total 2284 items


Unnamed: 0,contract_id,subject_items,embeddings
0,64,[Podpora kvality v celoživotním a kvalifikační...,"[[0.00831495225429535, -0.0116907162591815, 0...."
1,23,[modernizace komunikací II. třídy v Jihočeském...,"[[0.000896796118468046, -0.0143153127282858, 0..."
2,773,"[Název položky:. Dodávání barev, laků a malířs...","[[-0.0124189304187894, -0.055815514177084, 0.0..."
3,798,[SAS 12Gbps HBA External Controller. Low Profi...,"[[-0.0630988776683807, -0.0175407528877258, 0...."
4,819,"[Silnice III/28620 Hrabačov - Křížlice, Silnic...","[[0.00564307533204556, -0.0115172788500786, 0...."
...,...,...,...
139,817,"[za tím účelem provede požadované činnosti, ne...","[[-0.0495850034058094, -0.0396976210176945, 0...."
140,836,[Nákup výrobní linky na zpracování plastů form...,"[[0.0157228671014309, -0.0530078709125519, 0.0..."
141,840,"[Nové biotechnologické produkty ÚEB AVČR, dodá...","[[-0.0254124347120523, -0.0627986714243889, 0...."
142,774,[EXTERNÍ POSKYTOVÁNÍ SLUŽEB KE ZPRACOVÁNÍ ŽÁDO...,"[[-0.0132491355761886, -0.0287381652742624, 0...."


### Distances

In [9]:
from recommender.component.similarity.vector_space import OptimalizedCosineDistanceVectorComputer

### Playground 

In [11]:
# Flatten the per-row embedding lists into one matrix, remembering for every
# matrix row which DataFrame row (index) and which position inside that row's
# list (i) the vector came from.
embeddings_col = 'embeddings'
vectors = []
vec_to_entity = []
for index, row in df_contract_items.iterrows():
    # Rows whose embeddings cell is not a list are skipped entirely.
    if not isinstance(row[embeddings_col], list):
        continue
    for i, e in enumerate(row[embeddings_col]):
        vectors.append(e)
        vec_to_entity.append((index, i))
nvectors = numpy.array(vectors, dtype=numpy.float32)
# `numpy.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used to get the same dtype
# without raising AttributeError on current NumPy.
nvec_to_entity = numpy.array(vec_to_entity, dtype=int)

In [15]:
# Build the project's cosine-distance computer over all flattened item vectors.
# NOTE(review): what the constructor precomputes is not visible from here.
comp = OptimalizedCosineDistanceVectorComputer(nvectors)

In [18]:
# Compare every vector against every other vector. In the recorded run this
# returned a tuple of two arrays: a float32 matrix and an integer matrix.
# NOTE(review): this calls a private method and passes None as the second
# argument; its meaning is not visible here — confirm against the class.
result = comp._compute_sorted_distances(nvectors, None)
result

(array([[-3.5762787e-07,  2.7961272e-01,  2.5870121e-01, ...,
          2.3125821e-01,  3.0466998e-01,  2.5329459e-01],
        [ 2.7961272e-01, -1.1920929e-07,  1.6242200e-01, ...,
          2.3645210e-01,  2.9991841e-01,  2.3494542e-01],
        [ 2.5870121e-01,  1.6242200e-01,  1.1920929e-07, ...,
          1.7744136e-01,  2.3329782e-01,  2.1104014e-01],
        ...,
        [ 2.3125821e-01,  2.3645210e-01,  1.7744136e-01, ...,
         -2.3841858e-07,  8.4989011e-02,  1.4149469e-01],
        [ 3.0466998e-01,  2.9991841e-01,  2.3329782e-01, ...,
          8.4989011e-02, -2.3841858e-07,  2.0204163e-01],
        [ 2.5329459e-01,  2.3494542e-01,  2.1104014e-01, ...,
          1.4149469e-01,  2.0204163e-01,  1.1920929e-07]], dtype=float32),
 array([[   0, 2198,  231, ..., 1994,  667, 2127],
        [   1,  160,    2, ...,  667, 1994, 2127],
        [   2, 1586, 1579, ..., 1994,  667, 2127],
        ...,
        [1031, 2240,  614, ...,  667, 1994, 2127],
        [2241, 2282,  615, ...,  

In [20]:
# Keep only the first element of the result tuple (the float32 matrix).
# NOTE(review): presumably the pairwise cosine-distance matrix; the recorded
# diagonal is ~0 up to float32 rounding, including tiny negative values.
distances = result[0]
distances

array([[-3.5762787e-07,  2.7961272e-01,  2.5870121e-01, ...,
         2.3125821e-01,  3.0466998e-01,  2.5329459e-01],
       [ 2.7961272e-01, -1.1920929e-07,  1.6242200e-01, ...,
         2.3645210e-01,  2.9991841e-01,  2.3494542e-01],
       [ 2.5870121e-01,  1.6242200e-01,  1.1920929e-07, ...,
         1.7744136e-01,  2.3329782e-01,  2.1104014e-01],
       ...,
       [ 2.3125821e-01,  2.3645210e-01,  1.7744136e-01, ...,
        -2.3841858e-07,  8.4989011e-02,  1.4149469e-01],
       [ 3.0466998e-01,  2.9991841e-01,  2.3329782e-01, ...,
         8.4989011e-02, -2.3841858e-07,  2.0204163e-01],
       [ 2.5329459e-01,  2.3494542e-01,  2.1104014e-01, ...,
         1.4149469e-01,  2.0204163e-01,  1.1920929e-07]], dtype=float32)

In [21]:
import hdbscan

In [24]:
# Cluster directly on the distance matrix: metric='precomputed' tells HDBSCAN
# that the input is already a square matrix of pairwise distances.
# NOTE(review): the float32 matrix is cast to float64 before fitting,
# presumably because hdbscan's precomputed path expects doubles — confirm.
clusterer = hdbscan.HDBSCAN(metric='precomputed')
clusterer.fit(distances.astype(numpy.double))
clusterer.labels_

array([-1, -1, -1, ..., 65, 57, 53], dtype=int64)

In [26]:
# Print the cluster label assigned to every vector, one label per line.
for label in clusterer.labels_:
    print(label)

-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
49
49
49
49
49
56
50
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
34
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
46
52
56
50
-1
-1
-1
-1
-1
-1
49
49
49
49
49
49
46
52
56
50
-1
-1
41
-1
41
-1
41
-1
-1
23
23
23
-1
20
33
31
30
-1
23
-1
-1
23
23
23
-1
20
33
31
30
-1
23
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
13
-1
-1
60
-1
63
-1
-1
-1
59
-1
26
-1
51
65
-1
-1
-1
-1
-1
-1
60
63
-1
59
-1
26
43
51
-1
-1
-1
-1
-1
-1
-1
65
57
53
60
65
57
53
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
48
47
-1
-1
48
47
-1
-1
48
47
-1
-1
48
47
-1
48
48
-1
47
-1
48
47
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
22
22
39
25
19
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
12
-1
-1
-1
-1
-1
41
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
29
-1
-1
-1
15
9
14
0
3
10
21
17
4
7
2
35
16
5
15
9
14
0
3
10
21
17

In [29]:
from collections import Counter

In [30]:
# Cluster sizes: how many vectors received each label.
# In HDBSCAN the label -1 marks points left unclustered as noise.
Counter(clusterer.labels_)

Counter({-1: 1635,
         49: 11,
         56: 13,
         50: 9,
         34: 6,
         46: 8,
         52: 8,
         41: 12,
         23: 8,
         20: 6,
         33: 8,
         31: 8,
         30: 8,
         13: 11,
         60: 17,
         63: 12,
         59: 7,
         26: 10,
         51: 8,
         65: 18,
         43: 6,
         57: 10,
         53: 10,
         48: 7,
         47: 6,
         22: 13,
         39: 6,
         25: 7,
         19: 6,
         12: 18,
         29: 6,
         15: 10,
         9: 8,
         14: 10,
         0: 10,
         3: 8,
         10: 10,
         21: 10,
         17: 10,
         4: 8,
         7: 8,
         2: 8,
         35: 10,
         16: 8,
         5: 10,
         28: 16,
         40: 15,
         37: 10,
         66: 13,
         64: 5,
         42: 11,
         54: 6,
         8: 12,
         36: 6,
         27: 7,
         55: 7,
         67: 7,
         61: 7,
         58: 7,
         62: 6,
         32: 9,
   

In [31]:
# `clusterer` was fitted with metric='precomputed', i.e. on a distance matrix
# rather than on the vectors, and hdbscan raises AttributeError when exemplars
# are requested in that mode. The error is expected, so it is caught and
# printed instead of aborting "Restart & Run All".
try:
    precomputed_exemplars = clusterer.exemplars_
except AttributeError as err:
    precomputed_exemplars = None
    print("AttributeError:", err)
# Displays the exemplars if a future hdbscan version provides them; shows
# nothing when the value is None.
precomputed_exemplars

AttributeError: Currently exemplars require the use of vector input datawith a suitable metric. This will likely change in the future, but for now no exemplars can be provided

In [35]:
from sklearn.preprocessing import Normalizer

In [37]:
# Scale every vector to unit L2 length before clustering on the vectors.
# NOTE(review): presumably done so that Euclidean distance (HDBSCAN's default
# metric) ranks pairs the same way cosine distance does — confirm the intent.
normalizer = Normalizer(norm='l2').fit(nvectors)
nnvectors = normalizer.transform(nvectors)

In [38]:
# Second clusterer with default settings; it is fitted on the vectors
# themselves rather than on a precomputed distance matrix.
clusterer2 = hdbscan.HDBSCAN()

In [39]:
# Fit on the L2-normalised vectors and display the label of every vector.
clusterer2.fit_predict(nnvectors)

array([-1, -1, -1, ..., 94, 80, 75], dtype=int64)

In [41]:
# Exemplars are available here because clusterer2 was fitted on vector input.
# The recorded value is a list of 2-D arrays (one array per cluster).
exemplars = clusterer2.exemplars_
exemplars

[array([[-0.01348634, -0.02848106,  0.04876509, ...,  0.0085598 ,
          0.1109511 ,  0.05401701],
        [-0.01348634, -0.02848106,  0.04876509, ...,  0.0085598 ,
          0.1109511 ,  0.05401701],
        [-0.01348634, -0.02848106,  0.04876509, ...,  0.0085598 ,
          0.1109511 ,  0.05401701],
        ...,
        [-0.01348634, -0.02848106,  0.04876509, ...,  0.0085598 ,
          0.1109511 ,  0.05401701],
        [-0.01348634, -0.02848106,  0.04876509, ...,  0.0085598 ,
          0.1109511 ,  0.05401701],
        [-0.01348634, -0.02848106,  0.04876509, ...,  0.0085598 ,
          0.1109511 ,  0.05401701]]),
 array([[-0.04368933, -0.03304733,  0.11858913, ..., -0.01179676,
          0.00902384, -0.03852155],
        [-0.1091104 , -0.09828337,  0.03246592, ..., -0.0635753 ,
          0.04705647,  0.02813848],
        [-0.0205087 , -0.06920846,  0.0547258 , ..., -0.07289302,
          0.01343928, -0.01207689],
        [-0.1091104 , -0.09828337,  0.03246592, ..., -0.0635753 ,
 

In [43]:
# Number of exemplar arrays, i.e. one entry per cluster found by clusterer2.
len(exemplars)

97

In [47]:
# How many exemplar points each entry of `exemplars` contains.
nexemplars = [len(cluster_exemplars) for cluster_exemplars in exemplars]
nexemplars

[10,
 6,
 8,
 8,
 6,
 10,
 5,
 6,
 6,
 8,
 8,
 8,
 11,
 6,
 6,
 8,
 5,
 6,
 8,
 5,
 10,
 5,
 6,
 6,
 6,
 10,
 5,
 6,
 6,
 6,
 5,
 5,
 5,
 5,
 6,
 15,
 6,
 6,
 6,
 7,
 6,
 5,
 10,
 5,
 6,
 5,
 10,
 6,
 6,
 10,
 5,
 8,
 5,
 5,
 5,
 6,
 5,
 5,
 17,
 8,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 9,
 8,
 8,
 5,
 5,
 5,
 5,
 10,
 6,
 5,
 5,
 9,
 5,
 5,
 6,
 5,
 5,
 5,
 5,
 5,
 6,
 5,
 6,
 8,
 7,
 5,
 10,
 5,
 6]

In [48]:
# Total number of exemplar points across all entries of `exemplars`.
sum(nexemplars)

633

In [49]:
# Number of rows in the distance matrix, for comparison with the exemplar total.
len(distances)

2284