### Initialization

In [623]:
import pandas
import time
import fasttext
import numpy
from importlib import reload

### Documents

In [238]:
import psycopg2
psycopg2_conn = psycopg2.connect(dbname='public_contracts', user='postgres', password='admin', host='localhost', port='5432')

In [3]:
from utils.document_processing import *

In [4]:
loader = DatabaseContractsLoader(psycopg2_conn)
loader.load_documents()
loader.prepare_documents()
loader.prepare_contracts()
df_contracts = pandas.DataFrame.from_dict(loader._contracts, orient='index')
df_contracts

Running query: select * from document where processed=True
Preparing total 1008 documents
Progress: 0.0%
Progress: 10.0%
Progress: 20.0%
Progress: 30.0%
Progress: 40.0%
Progress: 50.0%
Progress: 60.0%
Progress: 70.0%
Progress: 80.0%
Progress: 90.0%
Progress: 100.0%
Preparing total 159 contracts
Progress: 0.0%
Progress: 10.0%
Progress: 19.0%
Progress: 29.0%
Progress: 38.0%
Progress: 48.0%
Progress: 57.0%
Progress: 67.0%
Progress: 76.0%
Progress: 85.0%
Progress: 95.0%


Unnamed: 0,docs,text
2,"[{'id': 1, 'text': '  SMLOUVA O PROVÁDĚNÍ...",\n<FILE id=1>\n\n \n\n \n\nSMLOUVA O PROVÁDĚNÍ...
3,"[{'id': 3, 'text': '  SMLOUVA © DÍLO č.j....",\n<FILE id=3>\n\n \n\n \n\nSMLOUVA © DÍLO\nč.j...
6,"[{'id': 26, 'text': '  SMLOUVA O PROVÁDĚN...",\n<FILE id=26>\n\n \n\n \n\nSMLOUVA O PROVÁDĚN...
10,"[{'id': 35, 'text': 'N Á V R H SMLOUVA O DÍ...",\n<FILE id=35>\nN Á V R H\n\nSMLOUVA O DÍLO\...
7,"[{'id': 27, 'text': '  SMLOUVA © DÍLO č.j...",\n<FILE id=27>\n\n \n\n \n\nSMLOUVA © DÍLO\nč....
...,...,...
839,"[{'id': 4117, 'text': 'EVROPSKÁ UNIE 6 9 EVROP...",\n<FILE id=4117>\nEVROPSKÁ UNIE 6 9\nEVROPSKÝ ...
840,"[{'id': 4120, 'text': 'c o X bi EVROPSKÁ UNIE ...",\n<FILE id=4120>\nc o X\nbi EVROPSKÁ UNIE . 20...
841,"[{'id': 4121, 'text': '. a EVROPSKÁ UNIE . G0...",\n<FILE id=4121>\n.\n\na EVROPSKÁ UNIE .\nG0. ...
842,"[{'id': 4124, 'text': 'EVROPSKÁ UNIE p »007-1...",\n<FILE id=4124>\nEVROPSKÁ UNIE p\n\n»007-12\n...


### Subject

In [5]:
from utils.subject_extraction import *
from utils.context_extraction import *
from utils.subject_context_preprocessing import *
from utils.conllu_preprocessing import *

In [7]:
from udapi.core.document import Document
from ufal.udpipe import *
udpipe_model = Model.load("../model/udpipe/udpipe-ud-2.5-191206/czech-pdt-ud-2.5-191206.udpipe")

In [8]:
udp_pipeline = Pipeline(udpipe_model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")

In [9]:
import ufal.udpipe

class ComplexSubjectExtractor(SubjectExtractor):
    
    def __init__(self,
                 subj_context_extractor=None,
                 subj_context_preprocessor=None,
                 attributes_extractor=None,
                 text_annotator=None,
                 udapi_doc_creator=None,
                 conllu_attributes_preprocessor=None,
                 udapi_to_str_transformer=None,
                 items_tagger=None,
                 attribute_merger=None,
                 tag_cleaner=None,
                 concatenator=None):
        self._subj_context_extractor = subj_context_extractor if subj_context_extractor is not None else \
            AdvancedSubjectContextExtractor()
        self._subj_context_preprocessor = subj_context_preprocessor if subj_context_preprocessor is not None else \
            SubjectContextPreprocessor()
        self._attribures_extractor = attributes_extractor if attributes_extractor is not None else \
            SubjectContextPreprocessor(transformers=[AttributeExtractor(keep_text=False, keep_attributes=True)])
        self._text_annotator = text_annotator if text_annotator is not None else \
            TextAnnotator()
        self._udapi_doc_creator = udapi_doc_creator if udapi_doc_creator is not None else \
            UdapiFromConlluTransformer()
        self._conllu_attributes_preprocessor = conllu_attributes_preprocessor if conllu_attributes_preprocessor is not None else \
            ConlluSubjectContextPreprocessor()
        self._udapi_to_str_transformer = udapi_to_str_transformer if udapi_to_str_transformer is not None else \
            UdapiToStrTransformer()
        self._items_tagger = items_tagger if items_tagger is not None else \
            SubjectContextPreprocessor(transformers=[AttributeTagger(attr_tag='<ITEM>;<ITEM/>', keep_text=False)])
        self._attribute_merger = attribute_merger if attribute_merger is not None else \
            AttributeMerger()
        self._tag_cleaner = tag_cleaner if tag_cleaner is not None else \
            SubjectContextPreprocessor(transformers=[AttributeTagCleaner(attr_pattern=r'<[A-Z_]+>(.*)<[A-Z_]+/>')])
        self._concatenator = concatenator if concatenator is not None else \
            AttributeConcatenator()

    def extract(self, text):
        subj_context = self._subj_context_extractor.process(text, True)
        filtered_context = self._subj_context_preprocessor.process(subj_context, True)
        attributes = self._attribures_extractor.process(filtered_context, True)
        decomposition = self._text_annotator.process(filtered_context, True)
        udapi_doc = self._udapi_doc_creator.process(decomposition, True)
        filtered_doc = self._conllu_attributes_preprocessor.process(udapi_doc, True)
        filtered_doc_text = self._udapi_to_str_transformer.process(filtered_doc, True)
        attributes2 = self._items_tagger.process(filtered_doc_text, True)
        merged_attributes = self._attribute_merger.process(list(zip(attributes, attributes2)), True)
        subject_items = self._tag_cleaner.process(merged_attributes, True)
        subject = self._concatenator.process(subject_items)
        return subject

In [10]:
subj_context_transformers = [
    NumeralLinesFilter(too_many_numerals_ratio_threshold=0.5),
    TooShortLinesFilter(too_short_line_threshold=5),
    IrrelevantLinesFilter(keywords=['strana', 'stránka', 'e-mail'], max_line_length=75, lower=True),
    IrrelevantLinesFilter(keywords=['Tel:', 'Fax:', 'IČ:', 'IČO:', 'DIČ:'], max_line_length=75,
                          lower=False),
    IrrelevantLinesRegexFilter(patterns=[r'www', r'[\w\-\.]+\s*@\s*([\w\-]+\.)+[\w\-]{2,4}']),  # email
    IrrelevantLinesRegexFilter(patterns=[r'(\+\d{2,3}){0,1}(\s{0,1}\d{3}){3}']),  # phone
    RegexReplaceTransformer(pattern_to_transform=r',([\s]+[A-Z][a-z ])', result_pattern='.\g<1>'),  # . vs , correction
    RegexReplaceTransformer(pattern_to_transform=r'\n[ \t]*([^\d]{0,1}[\d]{1,2}[^\d])+[ \t]*',  # paragraph numbers
                            result_pattern='\n'),
    BlankLinesFilter(replacement='\n', top_n_frequency=200, top_n_var_threshold=5,
                     full_line_threshold=0.85, min_max_line_length=0),
    ReplaceMarksTransformer(marks_to_transform='„“', result_mark='"'),
    TooLongLinesTransformer(forbidden_delimiters='aábcčdďeéěfghiíjklmnňoópqrřsštťuúůvwxyýzž0123456789',
                            special_delimiters={'-': (r'[\s,\.](-)[\s]+[^(Kč)]', 1)},
                            too_long_line_treshold=200),
    RegexReplaceTransformer(pattern_to_transform=r'\([^\n()]*\)', result_pattern=''),  # bracket erasing
    RegexReplaceTransformer(pattern_to_transform=r'([^\n ])[ ]*\n', result_pattern='\g<1>.\n'),  # . filling
    RegexReplaceTransformer(pattern_to_transform=r'(([Nn]ázev|[Pp]opis)[^\n,.:"()]{5,})(\s[A-Z][^\n:]{10})',
                            result_pattern='\g<1>:\g<3>'),  # : filling
    ReplaceMarksTransformer(marks_to_transform=[':'], result_mark=':.'),
    ReplaceMarksTransformer(marks_to_transform=[';.', ',.'], result_mark='.'),
    ReplaceMarksTransformer(marks_to_transform=['..'], result_mark='.'),
    RegexReplaceTransformer(pattern_to_transform=r'[ ]*\.', result_pattern='.'),
]
subj_context_preprocessor = SubjectContextPreprocessor(transformers = subj_context_transformers)

attribute_transformers = [
    StructureItemEnumerationExtractor(),
    CharTupleItemEnumerationExtractor(),
    ItemColonExtractor(),
    HeaderItemEnumerationExtractor(),
    QuotedContractNameExtractor(),
    StructuredContractNameExtractor(),
    AttributeExtractor(keep_text=False, keep_attributes=True)
]
attributes_extractor = SubjectContextPreprocessor(transformers = attribute_transformers)

text_annotator = TextAnnotator(pipeline=udp_pipeline)

conllu_transformers = [
    UdapiWordOccurrencePartSentenceFilter(keywords=['cena', 'hodnota', 'DPH']),
    UdapiWordOccurrencePartSentenceFilter(keywords=['příloha', 'dále', 'jen']),
    NonSubjectPartSentenceFilter(),
    EmptyBundlesFilter(),
]
conllu_preprocessor = ConlluSubjectContextPreprocessor(transformers=conllu_transformers)

extractor = ComplexSubjectExtractor(subj_context_preprocessor=subj_context_preprocessor,
                                    attributes_extractor=attributes_extractor,
                                    text_annotator=text_annotator,
                                    conllu_attributes_preprocessor=conllu_preprocessor)

In [667]:
import recommender.component.database.postgres
reload(recommender.component.database.postgres)
from recommender.component.database.postgres import DBManager, SubjectItemManager, CPVItemManager, ContractItemManager

In [668]:
simngr = SubjectItemManager(psycopg2_conn)
df_contract_items = simngr.load()
df_contract_items

Running query: select contract_id, item_desc, embedding from subject_item
Loading total 2291 items
Progress: 0.0%
Progress: 10.0%
Progress: 20.0%
Progress: 30.0%
Progress: 40.0%
Progress: 50.0%
Progress: 60.0%
Progress: 70.0%
Progress: 80.0%
Progress: 90.0%
Progress: 100.0%


Unnamed: 0,contract_id,subject_items,embeddings
0,2,[krmení zvěře jádrovým a granulovým krmivem s ...,"[[0.00463546579703689, -0.079517237842083, 0.0..."
1,68,"[otovitel se zavazuje, že vyzve objednatele k ...","[[-0.00496369041502476, -0.0264993589371443, 0..."
2,23,[otovitel je povinen uchovávat veškerou dokume...,"[[-0.0100010493770242, -0.0357969366014004, 0...."
3,3,"[zhotovení díla, a to:. část č. 1. Oprava oplo...","[[-0.00123070902191103, -0.0510847456753254, 0..."
4,26,"[, VRSTVY PRO OBNOVU A OPRAVY Z ASF BETONU ACO...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...,...
154,839,"[Nové biotechnologické produkty ÚEB AVČR, reg....","[[-0.0160277131944895, -0.0321324989199638, 0...."
155,840,"[Nové biotechnologické produkty ÚEB AVČR, dodá...","[[-0.0254124347120523, -0.0627986714243889, 0...."
156,841,"[Nové biotechnologické produkty UEB AVČR, chem...","[[-0.0470611900091171, -0.0829258784651756, 0...."
157,842,"[Nové biotechnologické produkty ÚEB AVČR, chem...","[[-0.0254124347120523, -0.0627986714243889, 0...."


In [669]:
cpvimngr = CPVItemManager(psycopg2_conn)
df_contract_cpv_items = cpvimngr.load()
df_contract_cpv_items

Running query: select cntr.contract_id, cpv.name, cpv.embedding
                                from contract_cpv cntr join cpv_code cpv on cntr.cpv_id=cpv.id 
Loading total 186 items
Progress: 0.0%
Progress: 10.0%
Progress: 20.0%
Progress: 30.0%
Progress: 39.0%
Progress: 49.0%
Progress: 59.0%
Progress: 68.0%
Progress: 78.0%
Progress: 88.0%
Progress: 97.0%


Unnamed: 0,contract_id,cpv_items,embeddings
0,10,[Modernizace budov],"[[0.0765372440218925, -0.00218193605542183, 0...."
1,15,[Stavební práce ],"[[0.0609345808625221, -0.0535003654658794, 0.1..."
2,16,[Stavební práce ],"[[0.0609345808625221, -0.0535003654658794, 0.1..."
3,17,[Úklidové práce ],"[[0.0329727455973625, -0.0746283680200577, 0.1..."
4,22,"[Sklenářské, malířské a natěračské práce, Stav...","[[0.0486427582800388, -0.0805132985115051, 0.1..."
...,...,...,...
84,799,[Balík programů pro zabezpečení ],"[[0.0401324555277824, -0.0114254346117377, 0.0..."
85,800,[Balík programů pro kreslení a tvorbu obrázků ],"[[0.0246526505798101, -0.0477145910263062, 0.0..."
86,813,"[Práce na údržbě silnic , Stavební úpravy při ...","[[0.046866200864315, -0.0408554710447788, 0.07..."
87,839,"[Činidla pro elektroforézu, Laboratorní činidla]","[[0.000705273530911654, -0.110241234302521, 0...."


In [670]:
cimngr = ContractItemManager(psycopg2_conn)
df_contract_items_merged = cimngr.load()
df_contract_items_merged

Running query: select contract_id, item_desc, embedding from subject_item
Loading total 2291 items
Progress: 0.0%
Progress: 10.0%
Progress: 20.0%
Progress: 30.0%
Progress: 40.0%
Progress: 50.0%
Progress: 60.0%
Progress: 70.0%
Progress: 80.0%
Progress: 90.0%
Progress: 100.0%
Running query: select cntr.contract_id, cpv.name, cpv.embedding
                                from contract_cpv cntr join cpv_code cpv on cntr.cpv_id=cpv.id 
Loading total 186 items
Progress: 0.0%
Progress: 10.0%
Progress: 20.0%
Progress: 30.0%
Progress: 39.0%
Progress: 49.0%
Progress: 59.0%
Progress: 68.0%
Progress: 78.0%
Progress: 88.0%
Progress: 97.0%


Unnamed: 0,contract_id,embeddings,items
0,2,"[[0.00463546579703689, -0.079517237842083, 0.0...",[krmení zvěře jádrovým a granulovým krmivem s ...
1,68,"[[-0.00496369041502476, -0.0264993589371443, 0...","[otovitel se zavazuje, že vyzve objednatele k ..."
2,23,"[[-0.0100010493770242, -0.0357969366014004, 0....",[otovitel je povinen uchovávat veškerou dokume...
3,3,"[[-0.00123070902191103, -0.0510847456753254, 0...","[zhotovení díla, a to:. část č. 1. Oprava oplo..."
4,26,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[, VRSTVY PRO OBNOVU A OPRAVY Z ASF BETONU ACO..."
...,...,...,...
154,839,"[[-0.0160277131944895, -0.0321324989199638, 0....","[Nové biotechnologické produkty ÚEB AVČR, reg...."
155,840,"[[-0.0254124347120523, -0.0627986714243889, 0....","[Nové biotechnologické produkty ÚEB AVČR, dodá..."
156,841,"[[-0.0470611900091171, -0.0829258784651756, 0....","[Nové biotechnologické produkty UEB AVČR, chem..."
157,842,"[[-0.0254124347120523, -0.0627986714243889, 0....","[Nové biotechnologické produkty ÚEB AVČR, chem..."


### Locality

In [671]:
import recommender.component.database.postgres
reload(recommender.component.database.postgres)
from recommender.component.database.postgres import ContractLocalityManager

In [12]:
from utils.location import *

In [13]:
geocoder = Geocoder()

In [672]:
clmngr = ContractLocalityManager(psycopg2_conn)

In [673]:
df_contract_locality = clmngr.load()
df_contract_locality

Running query: select c.contract_id, e.address, e.latitude, e.longitude
                              from contract c join
                                submitter s on c.submitter_id=s.submitter_id join
                                entity e on s.entity_id=e.entity_id
Loading total 849 contracts
Progress: 0.0%
Progress: 10.0%
Progress: 20.0%
Progress: 30.0%
Progress: 40.0%
Progress: 50.0%
Progress: 60.0%
Progress: 70.0%
Progress: 80.0%
Progress: 90.0%
Progress: 99.0%


Unnamed: 0,contract_id,address,gps
0,1,"Horní Bečva,Horní Bečva,,,550,75657","(49.4316105349, 18.2886182473)"
1,2,"Lány,Lány,,Lesní,140,27061","(50.1179485696, 13.9435876649)"
2,3,"Lány,Lány,,Lesní,140,27061","(50.1179485696, 13.9435876649)"
3,4,"Horní Bečva,Horní Bečva,,,550,75657","(49.4316105349, 18.2886182473)"
4,5,"Horní Bečva,Horní Bečva,,,550,75657","(49.4316105349, 18.2886182473)"
...,...,...,...
844,845,"Praha,Lysolaje,Praha-Lysolaje,Rozvojová,263,16500","(50.1272006765, 14.3841478791)"
845,846,"Praha,Lysolaje,Praha-Lysolaje,Rozvojová,263,16500","(50.1272006765, 14.3841478791)"
846,847,"Praha,Lysolaje,Praha-Lysolaje,Rozvojová,263,16500","(50.1272006765, 14.3841478791)"
847,848,"Jedlá,Jedlá,,,15,58401","(49.7402817568, 15.2358324335)"


### Entity subject

In [685]:
import recommender.component.database.postgres
reload(recommender.component.database.postgres)
from recommender.component.database.postgres import EntitySubjectManager, ContractEntitySubjectManager

In [686]:
esmngr = EntitySubjectManager(psycopg2_conn)
df_entity_subject = esmngr.load()
df_entity_subject

Running query: select entity_id, description, embedding from entity_subject
Loading total 14708 records
Progress: 0.0%
Progress: 11.0%
Progress: 21.0%
Progress: 31.0%
Progress: 41.0%
Progress: 51.0%
Progress: 61.0%
Progress: 71.0%
Progress: 81.0%
Progress: 91.0%


Unnamed: 0,entity_items,entity_embeddings
14,[Silniční motorová doprava - nákladní provozov...,"[[-0.00698897382244468, -0.0479855164885521, 0..."
15,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.00947303231805563, -0.050309382379055, 0.0..."
17,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.00947303231805563, -0.050309382379055, 0.0..."
18,"[výroba, instalace, opravy elektrických strojů...","[[0.0267897713929415, -0.0520677752792835, 0.0..."
19,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.00947303231805563, -0.050309382379055, 0.0..."
...,...,...
481,"[výroba, instalace, opravy elektrických strojů...","[[0.0191271640360355, -0.0536193549633026, 0.0..."
482,"[Výroba a úprava kvasného lihu, konzumního lih...","[[0.00534356897696853, -0.0773240029811859, 0...."
483,"[Zemědělská výroba, Silniční motorová doprava ...","[[0.0101674236357212, -0.0249374806880951, 0.1..."
484,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.00947303231805563, -0.050309382379055, 0.0..."


In [554]:
def extend_items(items):
    extended_items = []
    for item in items:
        extended_items.extend(item.split('\n'))
    return extended_items

In [555]:
df_entity_subject['subject_items'] = df_entity_subject['subject_items'].apply(extend_items)

In [556]:
df_entity_subject['embeddings'] = df_entity_subject['subject_items'].apply(lambda desc: embedder.process(desc))
df_entity_subject

Unnamed: 0,subject_items,embeddings
14,[Silniční motorová doprava - nákladní provozov...,"[[-0.006988974, -0.047985516, 0.06216523, 0.01..."
15,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.009473032, -0.050309382, 0.05635079, 0.024..."
17,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.009473032, -0.050309382, 0.05635079, 0.024..."
18,"[výroba, instalace, opravy elektrických strojů...","[[0.026789771, -0.052067775, 0.07255103, 0.084..."
19,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.009473032, -0.050309382, 0.05635079, 0.024..."
...,...,...
481,"[výroba, instalace, opravy elektrických strojů...","[[0.019127164, -0.053619355, 0.06265986, 0.066..."
482,"[Výroba a úprava kvasného lihu, konzumního lih...","[[0.005343569, -0.077324, 0.05343414, 0.061003..."
483,"[Zemědělská výroba, Silniční motorová doprava ...","[[0.010167424, -0.02493748, 0.132525, 0.098278..."
484,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.009473032, -0.050309382, 0.05635079, 0.024..."


In [569]:
esmngr.save_to_db(df_entity_subject)

14
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
15
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
17
    0
    1
    2
    3
    4
    5
    6
    7
18
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
19
    0
    1
    2
    3
20
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
21
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
22
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    1

    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    217
    218
    219
    220
    221
    222
    223
    224
    225
    226
    227
    228
    229
    230
    231
    232
    233
    234
    235
    236
    237
    238
    239
    240
    241
    242
    243
    244
    245
    246
    247
    248
    249
    250
    251
41
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
42
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
43
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
44
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
45
    0
   

    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161


    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
105
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    12

    21
    22
    23
    24
    25
    26
    27
149
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
150
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
151
    0
    1
    2
    3
152
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
153
    0
    1
    2
    3
154
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
157
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
 

    10
    11
    12
    13
    14
    15
188
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
189
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
190
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
191
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
192
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
193
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13


    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
225
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
226
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
227
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
228
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
  

    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
272
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
   

    15
    16
    17
    18
    19
327
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
328
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
329
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
   

    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
372
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
373
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
374
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    

    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
409
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
410
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
411
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
  

    3
    4
    5
    6
    7
    8
    9
    10
    11
440
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
441
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
442
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
443
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
444
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
 

    12
    13
    14
    15
474
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
475
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
476
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
477
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
478
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
479
    0
    1
    2
    3
480
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
481
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
482
    0
    1
    2
    3
    4
    5
    6
    7
    8
    

In [687]:
cesmngr = ContractEntitySubjectManager(psycopg2_conn)
df_contract_entity_subject = cesmngr.load()
df_contract_entity_subject

Running query: select c.contract_id, es.description, es.embedding
                              from contract c join
                                submitter s on c.submitter_id=s.submitter_id join
                                entity e on s.entity_id=e.entity_id join
                                entity_subject es on e.entity_id=es.entity_id
Loading total 372 records
Progress: 0.0%
Progress: 11.0%
Progress: 21.0%
Progress: 31.0%
Progress: 41.0%
Progress: 52.0%
Progress: 62.0%
Progress: 72.0%
Progress: 82.0%
Progress: 92.0%


Unnamed: 0,contract_id,entity_items,entity_embeddings
0,7,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.00947303231805563, -0.050309382379055, 0.0..."
1,6,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.00947303231805563, -0.050309382379055, 0.0..."
2,3,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.00947303231805563, -0.050309382379055, 0.0..."
3,2,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.00947303231805563, -0.050309382379055, 0.0..."
4,70,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.00947303231805563, -0.050309382379055, 0.0..."
5,71,"[Hostinská činnost, Hostinská činnost, Hostins...","[[-0.0197652950882912, -0.0814318656921387, 0...."
6,72,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.00947303231805563, -0.050309382379055, 0.0..."
7,804,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.00947303231805563, -0.050309382379055, 0.0..."
8,805,[kontrolní testování profesionálních zařízení ...,"[[-0.00137067469768226, -0.0487685315310955, 0..."
9,806,"[Výroba, obchod a služby neuvedené v přílohách...","[[0.00947303231805563, -0.050309382379055, 0.0..."


### User profile 

In [29]:
class UserProfile:
    
    def __init__(self, user_id=None, address=None, gps=(None, None), interest_items=[], embeddings=[]):
        self.user_id = user_id
        self.address = address
        self.gps = gps
        self.interest_items = interest_items
        self.embeddings = embeddings

    def __repr__(self):
        return 'UP{}[{}]({})'.format(self.user_id, self.address,
                                     [(item, emb[:2]) for item, emb in zip(self.interest_items, self.embeddings)])
        
class UserProfileFactory:
    
    def __init__(self, embedder, geocoder):
        self._embedder = embedder
        self._geocoder = geocoder
        
    def create(self, address, items):
        print('Getting GPS for '+address)
        gps = self._geocoder.gps_for_address(address)
        print('Embedding total '+str(len(items))+' items')
        embeddings = self._embedder.process(items)
        profile = UserProfile(None, address, gps, items, embeddings)
        return profile

In [682]:
import recommender.component.database.postgres
reload(recommender.component.database.postgres)
from recommender.component.database.postgres import UserProfileManager

In [684]:
upmngr = UserProfileManager(psycopg2_conn)
df_user_profile = upmngr.load()
df_user_profile

Running query: select u.user_id, u.address, u.latitude, u.longitude, i.item_desc, i.embedding
                                from user_profile u join
                                interest_item i on u.user_id=i.user_id
Loading total 5 items
Progress: 0.0%
Progress: 20.0%
Progress: 40.0%
Progress: 60.0%
Progress: 80.0%


Unnamed: 0,user_id,address,gps,interest_items,embeddings
0,13,"K Vejrychovsku 1074, Jilemnice","(50.5999042303745, 15.5145712200147)",[automobily],"[[-0.0220996141433716, -0.109728731215, 0.1329..."
1,14,"Praha 1-Nové Město, nábřeží Ludvíka Svobody 12...","(50.0933520045633, 14.4343605745105)","[silniční komunikace, stavby]","[[-0.0160830970853567, -0.0245096459984779, 0...."
2,15,"Luční 1067, Jilemnice","(50.5995494322684, 15.5113817448695)",[výpočetní technika],"[[0.00984129682183266, -0.0624870434403419, 0...."
3,16,"Letenská 525/15; Praha 1, Malá Strana","(50.0891449249535, 14.4071893630789)",[politika],"[[0.0671171844005585, 0.00226269592531025, -0...."


### Embedding 

In [149]:
import fasttext
import time
import numpy

class Embedder:

    def process(self, data):
        if isinstance(data, list):
            return [self.embed(text) for text in data]
        if isinstance(data, str):
            return self.embed(data)
        return None
    
    
class RandomEmbedder(Embedder):
    
    def __init__(self, model=300):
        self._model = model
    
    def embed(self, token):
        return numpy.random.rand(self._model)
    
    

class FastTextEmbedder(Embedder):
    
    def __init__(self, model='../model/fasttext/wiki.cs/wiki.cs.bin'):
        if isinstance(model, str):
            print('Loading FastText model from: ' + model)
            start = time.time()
            model = fasttext.load_model(model)
            end = time.time()
            print('Model loaded in: ' + str(end-start) + ' sec')
        if not isinstance(model, fasttext.FastText._FastText):
            raise ValueError('model must be ' + fasttext.FastText._FastText + ' or a path to fasttext binary model')
        self._model = model
    
    def embed(self, token):
        embedding = self._model.get_sentence_vector(token)
        return embedding

In [150]:
%%time

model = fasttext.load_model('../model/fasttext/wiki.cs/wiki.cs.bin')

Wall time: 30.1 s





In [27]:
embedder = RandomEmbedder()

In [151]:
embedder = FastTextEmbedder(model)

### Similarity

In [705]:
from recommender.component.similarity.common import *
from recommender.component.similarity.geodesic import *
from recommender.component.similarity.vector_space import *

In [706]:
aisc = AggregatedItemSimilarityComputer(df_contract_items)

In [707]:
aisc.compute_most_similar(df_user_profile)

{13: [{'contract_id': 2, 'similarity': 0.3969723406723996}],
 14: [{'contract_id': 4, 'similarity': 0.5235745397972891}],
 15: [{'contract_id': 5, 'similarity': 0.8445491496390173}],
 16: [{'contract_id': 5, 'similarity': 0.2769113253504405}]}

### Playground 

In [12]:
%%time

df_contracts['subject'] = df_contracts['text'].apply(lambda text: extractor.extract(text))
df_contracts['subject_items'] = df_contracts['subject'].apply(lambda text: text.split('\n'))
df_contracts['embedding'] = df_contracts['subject_items'].apply(lambda items: embedder.process(items))

AdvancedSubjectContextExtractor: Elapsed time: 0.0227 seconds
SubjectContextPreprocessor: Elapsed time: 0.0120 seconds
SubjectContextPreprocessor: Elapsed time: 0.0036 seconds
TextAnnotator: Elapsed time: 0.1748 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0032 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0179 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AdvancedSubjectContextExtractor: Elapsed time: 0.0199 seconds
SubjectContextPreprocessor: Elapsed time: 0.0254 seconds
SubjectContextPreprocessor: Elapsed time: 0.0122 seconds
TextAnnotator: Elapsed time: 1.8739 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0430 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.1848 seconds
UdapiToStrTransformer: Elapsed time: 0.0002 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds

AdvancedSubjectContextExtractor: Elapsed time: 0.0246 seconds
SubjectContextPreprocessor: Elapsed time: 0.0014 seconds
SubjectContextPreprocessor: Elapsed time: 0.0022 seconds
TextAnnotator: Elapsed time: 0.1984 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0035 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0180 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AdvancedSubjectContextExtractor: Elapsed time: 0.0184 seconds
SubjectContextPreprocessor: Elapsed time: 0.0064 seconds
SubjectContextPreprocessor: Elapsed time: 0.0038 seconds
SubjectContextPreprocessor: Elapsed time: 0.0011 seconds
SubjectContextPreprocessor: Elapsed time: 0.0050 seconds
TextAnnotator: Elapsed time: 0.1602 seconds
TextAnnotator: Elapsed time: 0.7472 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0026 seconds
UdapiFromConl

ConlluSubjectContextPreprocessor: Elapsed time: 0.1403 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.1502 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AdvancedSubjectContextExtractor: Elapsed time: 0.7666 seconds
SubjectContextPreprocessor: Elapsed time: 0.0123 seconds
SubjectContextPreprocessor: Elapsed time: 0.0016 seconds
SubjectContextPreprocessor: Elapsed time: 0.0007 seconds
SubjectContextPreprocessor: Elapsed time: 0.0151 seconds
SubjectContextPreprocessor: Elapsed time: 0.0046 seconds
SubjectContextPreprocessor: Elapsed time: 0.0064 seconds
SubjectContextPreprocessor: Elapsed time: 0.00

AdvancedSubjectContextExtractor: Elapsed time: 0.0470 seconds
SubjectContextPreprocessor: Elapsed time: 0.0061 seconds
SubjectContextPreprocessor: Elapsed time: 0.0105 seconds
SubjectContextPreprocessor: Elapsed time: 0.0066 seconds
SubjectContextPreprocessor: Elapsed time: 0.0121 seconds
TextAnnotator: Elapsed time: 0.4070 seconds
TextAnnotator: Elapsed time: 1.8443 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0082 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0424 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0329 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.1659 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0003 seconds
SubjectContextPreprocessor: Elapsed time: 0.0001 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextP

AdvancedSubjectContextExtractor: Elapsed time: 23.6431 seconds
SubjectContextPreprocessor: Elapsed time: 0.0049 seconds
SubjectContextPreprocessor: Elapsed time: 0.0059 seconds
SubjectContextPreprocessor: Elapsed time: 0.0039 seconds
SubjectContextPreprocessor: Elapsed time: 0.0061 seconds
SubjectContextPreprocessor: Elapsed time: 0.0061 seconds
SubjectContextPreprocessor: Elapsed time: 0.0017 seconds
TextAnnotator: Elapsed time: 0.8370 seconds
TextAnnotator: Elapsed time: 0.8394 seconds
TextAnnotator: Elapsed time: 0.2348 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0231 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0189 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0035 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0723 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0739 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0240 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds

TextAnnotator: Elapsed time: 0.0678 seconds
TextAnnotator: Elapsed time: 0.0676 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0017 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0011 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0033 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0063 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AdvancedSubjectContextExtractor: Elapsed time: 1.0508 seconds
SubjectContextPreprocessor: Elapsed time: 0.0080 seconds
SubjectContextPreprocessor: Elapsed time: 0.0055 seconds
TextAnnotator: Elapsed time: 0.7923 seconds
UdapiFromConlluTransformer: 

AdvancedSubjectContextExtractor: Elapsed time: 0.1837 seconds
SubjectContextPreprocessor: Elapsed time: 0.0013 seconds
SubjectContextPreprocessor: Elapsed time: 0.0054 seconds
SubjectContextPreprocessor: Elapsed time: 0.0008 seconds
SubjectContextPreprocessor: Elapsed time: 0.0046 seconds
TextAnnotator: Elapsed time: 0.0644 seconds
TextAnnotator: Elapsed time: 0.9645 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0011 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0223 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0138 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0883 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0001 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextP

TextAnnotator: Elapsed time: 1.4432 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0410 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.1377 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0001 seconds
AdvancedSubjectContextExtractor: Elapsed time: 1.2606 seconds
SubjectContextPreprocessor: Elapsed time: 0.0008 seconds
SubjectContextPreprocessor: Elapsed time: 0.0021 seconds
SubjectContextPreprocessor: Elapsed time: 0.0015 seconds
SubjectContextPreprocessor: Elapsed time: 0.0009 seconds
SubjectContextPreprocessor: Elapsed time: 0.0035 seconds
SubjectContextPreprocessor: Elapsed time: 0.0009 seconds
TextAnnotator: Elapsed time: 0.0745 seconds
TextAnnotator: Elapsed time: 0.2612 seconds
TextAnnotator: Elapsed time: 0.1714 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0011 seconds
UdapiFromConlluTransformer: Ela

ConlluSubjectContextPreprocessor: Elapsed time: 0.1879 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.2940 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.3835 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0017 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0019 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0020 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0042 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0029 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0020 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransfo

TextAnnotator: Elapsed time: 1.1827 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0302 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0917 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AdvancedSubjectContextExtractor: Elapsed time: 0.0438 seconds
SubjectContextPreprocessor: Elapsed time: 0.0009 seconds
SubjectContextPreprocessor: Elapsed time: 0.0023 seconds
SubjectContextPreprocessor: Elapsed time: 0.0005 seconds
SubjectContextPreprocessor: Elapsed time: 0.0015 seconds
TextAnnotator: Elapsed time: 0.0271 seconds
TextAnnotator: Elapsed time: 0.2610 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0008 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0047 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0015 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0244 seconds
UdapiT

AdvancedSubjectContextExtractor: Elapsed time: 2.0772 seconds
SubjectContextPreprocessor: Elapsed time: 0.0305 seconds
SubjectContextPreprocessor: Elapsed time: 0.0072 seconds
SubjectContextPreprocessor: Elapsed time: 0.0285 seconds
SubjectContextPreprocessor: Elapsed time: 0.0015 seconds
TextAnnotator: Elapsed time: 3.7148 seconds
TextAnnotator: Elapsed time: 0.3659 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0870 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0093 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.3031 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0274 seconds
UdapiToStrTransformer: Elapsed time: 0.0002 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0001 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0002 seconds
SubjectContextP

TextAnnotator: Elapsed time: 0.2298 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0088 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0194 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AdvancedSubjectContextExtractor: Elapsed time: 0.0233 seconds
SubjectContextPreprocessor: Elapsed time: 0.0032 seconds
SubjectContextPreprocessor: Elapsed time: 0.0018 seconds
TextAnnotator: Elapsed time: 0.1855 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0069 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0276 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0001 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AdvancedSubjectContextExtractor: Elapsed time: 0.0303 seconds
SubjectCon

ConlluSubjectContextPreprocessor: Elapsed time: 0.0227 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0334 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.1400 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
UdapiToStrTransformer: Elapsed time: 0.0010 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seco

TextAnnotator: Elapsed time: 0.1713 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0029 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0219 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
SubjectContextPreprocessor: Elapsed time: 0.0001 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AdvancedSubjectContextExtractor: Elapsed time: 0.0085 seconds
SubjectContextPreprocessor: Elapsed time: 0.0029 seconds
SubjectContextPreprocessor: Elapsed time: 0.0011 seconds
TextAnnotator: Elapsed time: 0.1412 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0021 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0126 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AdvancedSubjectContextExtractor: Elapsed time: 0.0833 seconds
SubjectCon

ConlluSubjectContextPreprocessor: Elapsed time: 0.0346 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
UdapiToStrTransformer: Elapsed time: 0.0002 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
UdapiToStrTransformer: Elapsed time: 0.0002 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
SubjectContextPreprocessor: Elapsed time: 0.0001 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
Subject

TextAnnotator: Elapsed time: 0.3903 seconds
TextAnnotator: Elapsed time: 0.2855 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0075 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0056 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0314 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0335 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AdvancedSubjectContextExtractor: Elapsed time: 0.0132 seconds
SubjectContextPreprocessor: Elapsed time: 0.0028 seconds
SubjectContextPreprocessor: Elapsed time: 0.0012 seconds
TextAnnotator: Elapsed time: 0.2900 seconds
UdapiFromConlluTransformer: 

TextAnnotator: Elapsed time: 0.1632 seconds
TextAnnotator: Elapsed time: 0.1148 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0034 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0025 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0177 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0128 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AdvancedSubjectContextExtractor: Elapsed time: 0.0201 seconds
SubjectContextPreprocessor: Elapsed time: 0.0041 seconds
SubjectContextPreprocessor: Elapsed time: 0.0020 seconds
TextAnnotator: Elapsed time: 0.1729 seconds
UdapiFromConlluTransformer: 

UdapiFromConlluTransformer: Elapsed time: 0.0049 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0112 seconds
UdapiToStrTransformer: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AdvancedSubjectContextExtractor: Elapsed time: 0.0134 seconds
SubjectContextPreprocessor: Elapsed time: 0.0029 seconds
SubjectContextPreprocessor: Elapsed time: 0.0032 seconds
TextAnnotator: Elapsed time: 0.2991 seconds
UdapiFromConlluTransformer: Elapsed time: 0.0074 seconds
ConlluSubjectContextPreprocessor: Elapsed time: 0.0297 seconds
UdapiToStrTransformer: Elapsed time: 0.0001 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AttributeMerger: Elapsed time: 0.0000 seconds
SubjectContextPreprocessor: Elapsed time: 0.0000 seconds
AdvancedSubjectContextExtractor: Elapsed time: 0.0243 seconds
SubjectContextPreprocessor: Elapsed time: 0.0269 secon

In [691]:
upfac = UserProfileFactory(embedder, geocoder)
interests = [['automobily'], ['silniční komunikace', 'stavby'], ['výpočetní technika'], ['politika']]
addresses = ['K Vejrychovsku 1074, Jilemnice', 'Praha 1-Nové Město, nábřeží Ludvíka Svobody 1222/12', 'Luční 1067, Jilemnice','Letenská 525/15; Praha 1, Malá Strana']
ups = [upfac.create(addr, intr) for addr, intr in zip(addresses, interests)]
user_profile_items = [{'user_id':None, 'address': up.address, 'gps': up.gps, 'interest_items':up.interest_items, 'embeddings':up.embeddings} for up in ups]
df_user_profile = pandas.DataFrame(user_profile_items)
df_user_profile

Getting GPS for K Vejrychovsku 1074, Jilemnice
Embedding total 1 items
Getting GPS for Praha 1-Nové Město, nábřeží Ludvíka Svobody 1222/12
Embedding total 2 items
Getting GPS for Luční 1067, Jilemnice
Embedding total 1 items
Getting GPS for Letenská 525/15; Praha 1, Malá Strana
Embedding total 1 items


Unnamed: 0,user_id,address,gps,interest_items,embeddings
0,,"K Vejrychovsku 1074, Jilemnice","(50.5999042304, 15.51457122)",[automobily],"[[-0.022099614, -0.10972873, 0.13296181, 0.015..."
1,,"Praha 1-Nové Město, nábřeží Ludvíka Svobody 12...","(50.0933520046, 14.4343605745)","[silniční komunikace, stavby]","[[-0.016083097, -0.024509646, 0.019968826, 0.0..."
2,,"Luční 1067, Jilemnice","(50.5995494323, 15.5113817449)",[výpočetní technika],"[[0.009841297, -0.062487043, 0.04388702, 0.046..."
3,,"Letenská 525/15; Praha 1, Malá Strana","(50.089144925, 14.4071893631)",[politika],"[[0.067117184, 0.002262696, -0.0105991345, 0.0..."


In [697]:
interests = [['výpočetní technika', 'počítače'], ['elektrická energie', 'plyn'], ['školení logistiky', 'dopravník'], ['jablka', 'hrušky'], ['stavba školy'],['početní technika']]
addresses = ['V Jilmu 229 514 01 Jilemnice', 'Na Poříčí 24 Praha 1', 'Francouzská 6167/5 708 00 Ostrava-Poruba','Čtveřín 60 Pěnčín u Liberce', 'Masarykovo náměstí 82 514 01 Jilemnice', 'Lánovská 1658, 543 01 Vrchlabí']
cs = [upfac.create(addr, intr) for addr, intr in zip(addresses, interests)]
contract_items = [{'contract_id':i, 'address': c.address, 'gps': c.gps, 'items':c.interest_items, 'embeddings':c.embeddings} for i,c in enumerate(cs)]
df_contract_items = pandas.DataFrame(contract_items)
df_contract_items

Getting GPS for V Jilmu 229 514 01 Jilemnice
Embedding total 2 items
Getting GPS for Na Poříčí 24 Praha 1
Embedding total 2 items
Getting GPS for Francouzská 6167/5 708 00 Ostrava-Poruba
Embedding total 2 items
Getting GPS for Čtveřín 60 Pěnčín u Liberce
Embedding total 2 items
Getting GPS for Masarykovo náměstí 82 514 01 Jilemnice
Embedding total 1 items
Getting GPS for Lánovská 1658, 543 01 Vrchlabí
Embedding total 1 items


Unnamed: 0,contract_id,address,gps,items,embeddings
0,0,V Jilmu 229 514 01 Jilemnice,"(50.60629521, 15.509971345)","[výpočetní technika, počítače]","[[0.009841297, -0.062487043, 0.04388702, 0.046..."
1,1,Na Poříčí 24 Praha 1,"(50.0896730981, 14.4336646907)","[elektrická energie, plyn]","[[0.009689108, -0.06226731, 0.033098727, 0.016..."
2,2,Francouzská 6167/5 708 00 Ostrava-Poruba,"(49.8275383936, 18.1865474908)","[školení logistiky, dopravník]","[[0.072922386, -0.030289192, 0.08900586, 0.039..."
3,3,Čtveřín 60 Pěnčín u Liberce,"(50.5907967629, 15.102488997)","[jablka, hrušky]","[[0.042488422, -0.07491765, 0.04933689, 0.0934..."
4,4,Masarykovo náměstí 82 514 01 Jilemnice,"(50.6089454106, 15.5059879975)",[stavba školy],"[[0.07771003, -0.08573899, 0.091318145, 0.0090..."
5,5,"Lánovská 1658, 543 01 Vrchlabí","(50.6207364452, 15.6231454345)",[početní technika],"[[-0.010320026, -0.015345275, 0.056282263, -0...."


In [700]:
interests = [['obchod s elektronikou'], ['energetický poskytovatel'], ['školení logistiky'], ['pěstování plodin'], ['správa města'],['obchod s alektronikou']]
addresses = ['V Jilmu 229 514 01 Jilemnice', 'Na Poříčí 24 Praha 1', 'Francouzská 6167/5 708 00 Ostrava-Poruba','Čtveřín 60 Pěnčín u Liberce', 'Masarykovo náměstí 82 514 01 Jilemnice', 'Lánovská 1658, 543 01 Vrchlabí']
cs = [upfac.create(addr, intr) for addr, intr in zip(addresses, interests)]
contract_items = [{'contract_id':i, 'entity_items':c.interest_items, 'entity_embeddings':c.embeddings} for i,c in enumerate(cs)]
df_contract_items2 = pandas.DataFrame(contract_items)
df_contract_items2

Getting GPS for V Jilmu 229 514 01 Jilemnice
Embedding total 1 items
Getting GPS for Na Poříčí 24 Praha 1
Embedding total 1 items
Getting GPS for Francouzská 6167/5 708 00 Ostrava-Poruba
Embedding total 1 items
Getting GPS for Čtveřín 60 Pěnčín u Liberce
Embedding total 1 items
Getting GPS for Masarykovo náměstí 82 514 01 Jilemnice
Embedding total 1 items
Getting GPS for Lánovská 1658, 543 01 Vrchlabí
Embedding total 1 items


Unnamed: 0,contract_id,entity_items,entity_embeddings
0,0,[obchod s elektronikou],"[[-0.003762368, -0.07310595, 0.08018466, 0.119..."
1,1,[energetický poskytovatel],"[[0.012319274, -0.05742455, -0.009580512, 0.06..."
2,2,[školení logistiky],"[[0.072922386, -0.030289192, 0.08900586, 0.039..."
3,3,[pěstování plodin],"[[0.03276872, -0.11388534, 0.035737667, 0.1059..."
4,4,[správa města],"[[0.0006197457, -0.07027029, -0.005452789, 0.0..."
5,5,[obchod s alektronikou],"[[-0.00647911, -0.07109018, 0.08066723, 0.1239..."


In [702]:
df_contract_items = pandas.merge(df_contract_items, df_contract_items2, on='contract_id')
df_contract_items

Unnamed: 0,contract_id,address,gps,items,embeddings,entity_items,entity_embeddings
0,0,V Jilmu 229 514 01 Jilemnice,"(50.60629521, 15.509971345)","[výpočetní technika, počítače]","[[0.009841297, -0.062487043, 0.04388702, 0.046...",[obchod s elektronikou],"[[-0.003762368, -0.07310595, 0.08018466, 0.119..."
1,1,Na Poříčí 24 Praha 1,"(50.0896730981, 14.4336646907)","[elektrická energie, plyn]","[[0.009689108, -0.06226731, 0.033098727, 0.016...",[energetický poskytovatel],"[[0.012319274, -0.05742455, -0.009580512, 0.06..."
2,2,Francouzská 6167/5 708 00 Ostrava-Poruba,"(49.8275383936, 18.1865474908)","[školení logistiky, dopravník]","[[0.072922386, -0.030289192, 0.08900586, 0.039...",[školení logistiky],"[[0.072922386, -0.030289192, 0.08900586, 0.039..."
3,3,Čtveřín 60 Pěnčín u Liberce,"(50.5907967629, 15.102488997)","[jablka, hrušky]","[[0.042488422, -0.07491765, 0.04933689, 0.0934...",[pěstování plodin],"[[0.03276872, -0.11388534, 0.035737667, 0.1059..."
4,4,Masarykovo náměstí 82 514 01 Jilemnice,"(50.6089454106, 15.5059879975)",[stavba školy],"[[0.07771003, -0.08573899, 0.091318145, 0.0090...",[správa města],"[[0.0006197457, -0.07027029, -0.005452789, 0.0..."
5,5,"Lánovská 1658, 543 01 Vrchlabí","(50.6207364452, 15.6231454345)",[početní technika],"[[-0.010320026, -0.015345275, 0.056282263, -0....",[obchod s alektronikou],"[[-0.00647911, -0.07109018, 0.08066723, 0.1239..."


In [731]:
df_contract_items.to_pickle('df_contracts.pickle')

In [153]:
target = numpy.array(embedder.process(['plyn','hrušky']))
target

array([[-2.11298210e-03, -1.82957441e-01,  5.43357581e-02,
         3.68261561e-02, -2.87125707e-02,  5.07773012e-02,
        -1.76154040e-02,  1.68402586e-03,  1.31049259e-02,
        -7.74341226e-02, -3.82218882e-02, -8.23296327e-03,
        -3.83996256e-02,  6.80910796e-02, -2.72737276e-02,
        -2.83118756e-03,  4.68822978e-02,  9.90544632e-02,
        -7.25169852e-03, -9.92601272e-03, -7.76487887e-02,
         6.20452501e-02, -4.20110002e-02, -4.40651271e-03,
         3.66706029e-02,  4.29575257e-02,  3.24910507e-03,
        -7.27823302e-02,  9.13716033e-02,  9.50580314e-02,
         4.21908386e-02,  5.82099408e-02, -1.56948995e-02,
        -4.12731282e-02, -5.98687753e-02,  3.88855487e-02,
        -5.31117246e-02, -1.57179423e-02, -5.97531758e-02,
         1.01783291e-01, -5.41335568e-02, -3.94700356e-02,
        -2.85113584e-02,  7.49275014e-02, -8.19963869e-03,
         9.70901623e-02, -6.23237640e-02,  3.86645943e-02,
        -3.53591517e-02,  1.30062653e-02, -3.70422751e-0

In [729]:
csic = SimilarItemsComputer(df_contract_items, distance_computer=ItemDistanceComputer(df_contract_items))

In [730]:
result = csic.compute_most_similar(df_user_profile, 2)
result

{13: {'automobily': [{'contract_id': 2,
    'item': 'dopravník',
    'distance': 0.5445505853395554,
    'similarity': 0.45544941466044464},
   {'contract_id': 0,
    'item': 'počítače',
    'distance': 0.6848318412862568,
    'similarity': 0.31516815871374315}]},
 14: {'silniční komunikace': [{'contract_id': 2,
    'item': 'dopravník',
    'distance': 0.5253005365480455,
    'similarity': 0.47469946345195446},
   {'contract_id': 2,
    'item': 'školení logistiky',
    'distance': 0.6207491421313175,
    'similarity': 0.3792508578686825}],
  'stavby': [{'contract_id': 4,
    'item': 'stavba školy',
    'distance': 0.3729893913065885,
    'similarity': 0.6270106086934115},
   {'contract_id': 2,
    'item': 'školení logistiky',
    'distance': 0.7078051590536205,
    'similarity': 0.2921948409463795}]},
 15: {'výpočetní technika': [{'contract_id': 0,
    'item': 'výpočetní technika',
    'distance': 0.0,
    'similarity': 1.0},
   {'contract_id': 5,
    'item': 'početní technika',
    'd

In [710]:
df_user_profile

Unnamed: 0,user_id,address,gps,interest_items,embeddings
0,13,"K Vejrychovsku 1074, Jilemnice","(50.5999042303745, 15.5145712200147)",[automobily],"[[-0.0220996141433716, -0.109728731215, 0.1329..."
1,14,"Praha 1-Nové Město, nábřeží Ludvíka Svobody 12...","(50.0933520045633, 14.4343605745105)","[silniční komunikace, stavby]","[[-0.0160830970853567, -0.0245096459984779, 0...."
2,15,"Luční 1067, Jilemnice","(50.5995494322684, 15.5113817448695)",[výpočetní technika],"[[0.00984129682183266, -0.0624870434403419, 0...."
3,16,"Letenská 525/15; Praha 1, Malá Strana","(50.0891449249535, 14.4071893630789)",[politika],"[[0.0671171844005585, 0.00226269592531025, -0...."


In [50]:
class CPVManager(DBManager):
    
    def __init__(self, connection):
        super().__init__(connection)
        self._load_query = 'select * from contract_cpv'
        self._load_enum_query = 'select * from cpv_code'
        self._cpv_enum = None
        
    def loadCPVEnumFromDB(self, parts=10):
        
        print("Running query: " + self._load_enum_query)
        raw_data = self.runQuery(self._load_enum_query)
        
        cpv_codes = {}
        total_codes = len(raw_data)
        print("Loading total " + str(total_codes) + " codes")
        for i, cpv in enumerate(raw_data):
            if i % (int(total_codes / parts)+1) == 0:
                print("Progress: {}%".format(numpy.ceil(i * 100 / total_codes)))
            cpv_id = cpv[0]
            code = cpv[1]
            name = cpv[2]
            cpv_parent_id = cpv[3]
            embedding = cpv[4]
            cpv_codes[code] = {'id': cpv_id, 'name': name, 'cpv_parent_id': cpv_parent_id, 'embedding': embedding}
        self._cpv_enum = cpv_codes
        return pandas.DataFrame.from_dict(cpv_codes, orient='index')
       
    def loadFromDB(self, parts=10):
        print("Running query: " + self._load_query)
        raw_data = self.runQuery(self._load_query)
        
        contract_cpvs = {}
        total_recors = len(raw_data)
        print("Loading total " + str(total_recors) + " records")
        for i, item in enumerate(raw_data):
            if i % (int(total_recors / parts)+1) == 0:
                print("Progress: {}%".format(numpy.ceil(i * 100 / total_recors)))
            contract_id = item[0]
            cpv_id = item[1]
            contract_cpv = contract_cpvs.get(contract_id, {'cpv_codes':[]})
            contract_cpv['cpv_codes'].append(cpv_id)
            contract_cpvs[contract_id] = contract_cpv
        return pandas.DataFrame.from_dict(contract_cpvs, orient='index')

    
    def _truncateDB(self):
        self.runQuery('truncate table contract_cpv')

    def saveToDB(self, df_contracts):
        self._truncateDB()
        for index, row in df_contracts.iterrows():
            contract_id = index
            print(contract_id)
            cpv_codes = row['cpv_codes']

            for i, cpv_code in enumerate(cpv_codes):
                cpv = self._cpv_enum.get(cpv_code, None)
                if not cpv:
                    continue
                print('    '+cpv_code)
                cpv_id = cpv['id']
                cursor = self._connection.cursor()

                postgres_insert_query = """INSERT INTO contract_cpv (contract_id, cpv_id)
                                            VALUES (%s,%s)"""
                record_to_insert = (contract_id, cpv_id)
                cursor.execute(postgres_insert_query, record_to_insert)

                self._connection.commit()
                count = cursor.rowcount
                cursor.close()
                
cpvmngr = CPVManager(psycopg2_conn)
df_cpv_codes = cpvmngr.loadCPVEnumFromDB()
df_cpv_codes

Running query: select * from cpv_code
Loading total 9455 codes
Progress: 0.0%
Progress: 11.0%
Progress: 21.0%
Progress: 31.0%
Progress: 41.0%
Progress: 51.0%
Progress: 61.0%
Progress: 71.0%
Progress: 81.0%
Progress: 91.0%


Unnamed: 0,id,name,cpv_parent_id,embedding
03115110-4,17362,Bavlna,17361.0,"[-0.0275000333786011, -0.146244868636131, 0.07..."
03131100-9,17383,Kávové boby,17382.0,"[-0.0536825954914093, -0.131708815693855, 0.08..."
03211600-9,17408,Oves,17400.0,"[0.0767745599150658, -0.0503297559916973, -0.0..."
03221222-8,16546,Hrách cukrový,16544.0,"[-0.00412554573267698, -0.0486323796212673, 0...."
03222322-6,16589,Hrušky,16587.0,"[-0.0257897060364485, -0.124494545161724, 0.00..."
...,...,...,...,...
98513300-5,9714,Personál na dobu určitou v domácnostech,9711.0,"[-0.00468706572428346, -0.0627820566296577, 0...."
98513310-8,9715,Pomocnice v domácnosti,9714.0,"[0.0576155185699463, -0.0286662802100182, 0.13..."
98514000-9,9716,Služebnictvo,11100.0,"[-0.00296529196202755, -0.00993904192000628, 0..."
98900000-2,9717,Služby poskytované extrateritoriálními organiz...,11026.0,"[-0.0401757471263409, -0.0253001507371664, 0.0..."


In [42]:
df_cpv_codes['embedding'] = df_cpv_codes['name'].apply(lambda name: embedder.process(name))
df_cpv_codes

Unnamed: 0,id,name,cpv_parent_id,embedding
,0,Common Procurement Vocabulary,,"[0.051647455, 0.011115263, 0.074224494, 0.1089..."
03000000-1,17340,"Produkty zemědělství, hospodářské produkty, pr...",0.0,"[0.00970636, -0.08663636, 0.11477385, 0.090600..."
03100000-2,17341,Produkty zemědělství a zahradnictví,17340.0,"[0.037167333, -0.08562787, 0.11880125, 0.08423..."
03110000-5,17342,Produkty rostlinné výroby v zelinářství a zahr...,17341.0,"[0.016786795, -0.08355232, 0.114634, 0.0588657..."
03111000-2,17343,Semena,17342.0,"[-0.0688907, -0.15229264, 0.038263835, 0.09676..."
...,...,...,...,...
98513300-5,9714,Personál na dobu určitou v domácnostech,9711.0,"[-0.0046870657, -0.06278206, 0.05633483, 0.027..."
98513310-8,9715,Pomocnice v domácnosti,9714.0,"[0.05761552, -0.02866628, 0.1373671, 0.0232092..."
98514000-9,9716,Služebnictvo,11100.0,"[-0.002965292, -0.009939042, 0.20390521, 0.060..."
98900000-2,9717,Služby poskytované extrateritoriálními organiz...,11026.0,"[-0.040175747, -0.02530015, 0.08632081, 0.0791..."


In [49]:
for index, row in df_cpv_codes.iterrows():
    cpv_id = row['id']
    print(cpv_id)
    lembedding = row['embedding'].tolist()
    cursor = psycopg2_conn.cursor()

    postgres_update_query = """UPDATE cpv_code
                                SET embedding=%s
                                WHERE id=%s"""
    record_to_update = (lembedding,cpv_id)
    cursor.execute(postgres_update_query, record_to_update)

    psycopg2_conn.commit()
    count = cursor.rowcount
    cursor.close()

0
17340
17341
17342
17343
17344
17345
17346
17347
17348
17349
17350
17351
17352
17353
17354
17355
17356
17357
17358
17359
17360
17361
17362
17363
17364
17365
17366
17367
17368
17369
17370
17371
17372
17373
17374
17375
17376
17377
17378
17379
17380
17381
17382
17383
17384
17385
17386
17387
17388
17389
17390
17391
17392
17393
17394
17395
17396
17397
17398
17399
17400
17401
17402
17403
17404
17405
17406
17407
17408
17409
17410
17411
17412
17413
17414
17415
17416
17417
17418
17419
17420
17421
17422
17423
17424
17425
17426
17427
17428
17429
9091
16542
16543
16544
16545
16546
16547
16548
16549
16550
16551
16552
16553
16554
16555
16556
16557
16558
16559
16560
16561
16562
16563
16564
16565
16566
16567
16568
16569
16570
16571
16572
16573
16574
16575
16576
16577
16578
16579
16580
16581
16582
16583
16584
16585
16586
16587
16588
16589
16590
16591
16592
16593
16594
16595
16596
16597
16598
16599
16600
16601
16602
16603
16604
16605
16606
16607
16608
16609
16610
16611
16612
16613
16614
16615
16616
166

18432
18433
18434
18435
18436
18437
18438
18439
18440
18441
18442
18443
18444
18445
18446
18447
18448
18449
18450
18451
18452
18453
18454
18455
18456
18457
18458
18459
18460
15670
9097
15577
15578
15579
15580
15581
15582
15583
15584
15585
15586
15587
15588
15589
15590
15591
15592
15593
15594
15595
15596
15597
15598
15599
15600
15601
15602
15603
15604
15605
15606
15607
15608
15609
15610
15611
15612
15613
15614
15615
15616
15617
15618
15619
15620
15621
15622
15623
15624
15625
15626
15627
15628
15629
15630
15631
15632
15633
15634
15635
15636
15637
15638
15639
15640
15641
15642
15643
15644
15645
15646
15647
15648
15649
15650
15651
15652
15653
15654
15655
15656
15657
15658
15659
15660
15661
15662
15663
15664
15665
15666
15667
15668
15669
15939
15940
15941
15942
15943
15944
15945
15946
15947
15948
15949
15950
15951
15952
15953
15954
15955
15956
15957
15958
15959
15960
15961
15962
15963
15964
15965
15966
15967
15968
15969
15970
15971
15972
15973
15974
15975
15976
15977
15978
15979
15980
15981

16482
16483
16484
16485
16486
16487
16488
16489
16490
16491
16492
16493
16494
16495
16496
16497
16498
16499
16500
16501
16502
16503
16504
16505
16506
16507
16508
16509
16510
16511
16512
16513
16514
16515
16516
16517
16518
16519
16520
16521
16522
16523
16524
16525
16526
16527
16528
16529
16530
16531
16532
16533
16534
16535
16536
16537
16538
16539
16540
16541
14199
14200
9102
12594
12595
12596
12597
12598
12599
12600
12601
12602
12603
12604
12605
12606
12607
12608
12609
12610
12611
12612
12613
12614
12615
12616
12617
12618
12619
12620
12621
12622
12623
12624
12625
12626
12627
12628
12629
12630
12631
12632
12633
12634
12635
12636
12637
12638
12639
12640
12641
12642
12643
12644
12645
12646
12647
12648
12649
12650
12651
12652
12653
12654
12655
12656
12657
12658
12659
12660
12661
12662
12663
12664
12665
12666
12667
12668
12669
12670
12671
12672
12673
12674
12675
12676
12677
12678
14201
14275
14276
14277
14278
14279
14280
14281
14282
14283
14284
14285
14286
14287
14288
14289
14290
14291
14292

17263
17264
17265
17266
17267
17268
17269
17270
17271
17272
17273
17274
17275
17276
17277
17278
17279
17280
17281
17282
17283
17284
17285
17286
17287
17288
17289
17290
17291
17292
17293
17294
17295
17296
17297
17298
17299
17300
17301
17302
17303
17304
17305
17306
17307
17308
17309
17310
17311
17312
17313
17314
17315
17316
17317
17318
17319
17320
17321
17322
17323
17324
17325
17326
17327
17328
17329
17330
17331
17332
17333
17334
17335
17336
17337
17338
17339
12936
12937
9106
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343

11590
11591
11592
11593
11594
11595
11596
11597
11598
12941
13360
13361
13362
13363
13364
13365
13366
13367
13368
13369
13370
13371
13372
13373
13374
13375
13376
13377
13378
13379
13380
13381
13382
13383
13384
13385
13386
13387
13388
13389
13390
13391
13392
13393
13394
13395
13396
13397
13398
13399
13400
13401
13402
13403
13404
13405
13406
13407
13408
13409
13410
13411
13412
13413
13414
13415
13416
13417
13418
13419
13420
13421
13422
13423
13424
13425
13426
13427
13428
13429
13430
13431
13432
13433
13434
13435
13436
13437
13438
13439
13440
13441
13442
13443
14208
16123
16124
16125
16126
16127
16128
16129
16130
16131
16132
16133
16134
16135
16136
16137
16138
16139
16140
16141
16142
16143
16144
16145
16146
16147
16148
16149
16150
16151
16152
16153
16154
16155
16156
16157
16158
16159
16160
16161
16162
16163
16164
16165
16166
16167
16168
16169
16170
16171
16172
16173
16174
16175
16176
16177
16178
16179
16180
16181
16182
16183
16184
16185
16186
16187
16188
16189
16190
16191
16192
16193
1619

14856
14857
14858
14859
10470
10471
9113
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
10472
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
11695
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031


11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
9123
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10939
10940
9124
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003


In [711]:
dlc = DistanceLocalityComputer(df_contract_locality)

In [712]:
dlc.compute_nearest(df_user_profile, 2)

{13: {'K Vejrychovsku 1074, Jilemnice': [{'contract_id': 849,
    'address': 'Velké Hamry,Velké Hamry,,,212,46845',
    'distance': 20.185034},
   {'contract_id': 813,
    'address': 'Liberec,Liberec IV-Perštýn,Liberec (nečleněné město),U Jezu,642,46001',
    'distance': 37.30463}]},
 14: {'Praha 1-Nové Město, nábřeží Ludvíka Svobody 1222/12': [{'contract_id': 59,
    'address': 'Praha,Nové Město,Praha 1,Těšnov,65,11000',
    'distance': 0.1570575},
   {'contract_id': 63,
    'address': 'Praha,Nové Město,Praha 1,Těšnov,65,11000',
    'distance': 0.1570575}]},
 15: {'Luční 1067, Jilemnice': [{'contract_id': 849,
    'address': 'Velké Hamry,Velké Hamry,,,212,46845',
    'distance': 20.038195},
   {'contract_id': 813,
    'address': 'Liberec,Liberec IV-Perštýn,Liberec (nečleněné město),U Jezu,642,46001',
    'distance': 37.127357}]},
 16: {'Letenská 525/15; Praha 1, Malá Strana': [{'contract_id': 425,
    'address': 'Praha,Hradčany,Praha 6,Tychonova,221,16000',
    'distance': 0.7470972},

In [736]:
slc = SimilarLocalityComputer(df_contract_locality)

In [740]:
similar_addresses = slc.compute_most_similar(df_user_profile, 1000)
similar_addresses

{13: {'K Vejrychovsku 1074, Jilemnice': [{'contract_id': 849,
    'address': 'Velké Hamry,Velké Hamry,,,212,46845',
    'distance': 20.185034,
    'similarity': 0.32751503988577},
   {'contract_id': 813,
    'address': 'Liberec,Liberec IV-Perštýn,Liberec (nečleněné město),U Jezu,642,46001',
    'distance': 37.30463,
    'similarity': 0.27430547017284834},
   {'contract_id': 814,
    'address': 'Liberec,Liberec IV-Perštýn,Liberec (nečleněné město),U Jezu,642,46001',
    'distance': 37.30463,
    'similarity': 0.27430547017284834},
   {'contract_id': 815,
    'address': 'Liberec,Liberec IV-Perštýn,Liberec (nečleněné město),U Jezu,642,46001',
    'distance': 37.30463,
    'similarity': 0.27430547017284834},
   {'contract_id': 816,
    'address': 'Liberec,Liberec IV-Perštýn,Liberec (nečleněné město),U Jezu,642,46001',
    'distance': 37.30463,
    'similarity': 0.27430547017284834},
   {'contract_id': 817,
    'address': 'Liberec,Liberec IV-Perštýn,Liberec (nečleněné město),U Jezu,642,4600

In [742]:
similar_addresses_flat = []
for uid in similar_addresses:
    for address in list(similar_addresses[uid].values())[0]:
        address['user'] = uid
        similar_addresses_flat.append(address)
similar_addresses_flat

[{'contract_id': 849,
  'address': 'Velké Hamry,Velké Hamry,,,212,46845',
  'distance': 20.185034,
  'similarity': 0.32751503988577,
  'user': 13},
 {'contract_id': 813,
  'address': 'Liberec,Liberec IV-Perštýn,Liberec (nečleněné město),U Jezu,642,46001',
  'distance': 37.30463,
  'similarity': 0.27430547017284834,
  'user': 13},
 {'contract_id': 814,
  'address': 'Liberec,Liberec IV-Perštýn,Liberec (nečleněné město),U Jezu,642,46001',
  'distance': 37.30463,
  'similarity': 0.27430547017284834,
  'user': 13},
 {'contract_id': 815,
  'address': 'Liberec,Liberec IV-Perštýn,Liberec (nečleněné město),U Jezu,642,46001',
  'distance': 37.30463,
  'similarity': 0.27430547017284834,
  'user': 13},
 {'contract_id': 816,
  'address': 'Liberec,Liberec IV-Perštýn,Liberec (nečleněné město),U Jezu,642,46001',
  'distance': 37.30463,
  'similarity': 0.27430547017284834,
  'user': 13},
 {'contract_id': 817,
  'address': 'Liberec,Liberec IV-Perštýn,Liberec (nečleněné město),U Jezu,642,46001',
  'dista

In [758]:
df_agg = pandas.read_pickle('C:/x/diplomka/research/src/tests/df_tmp_1.pickle')
df_agg

Unnamed: 0,contract_id,address,distance,similarity,user
0,0,V Jilmu 229 514 01 Jilemnice,0.782209,1.0,13
1,4,Masarykovo náměstí 82 514 01 Jilemnice,1.175011,1.0,13
2,5,"Lánovská 1658, 543 01 Vrchlabí",8.027082,0.454498,13
3,3,Čtveřín 60 Pěnčín u Liberce,29.195602,0.293453,13
4,1,Na Poříčí 24 Praha 1,95.607552,0.218787,13
5,2,Francouzská 6167/5 708 00 Ostrava-Poruba,209.159821,0.18699,13
6,1,Na Poříčí 24 Praha 1,0.412481,1.0,14
7,3,Čtveřín 60 Pěnčín u Liberce,72.964249,0.232365,14
8,4,Masarykovo náměstí 82 514 01 Jilemnice,95.425804,0.218877,14
9,0,V Jilmu 229 514 01 Jilemnice,95.477974,0.218851,14


In [781]:
df_agg.columns

Index(['contract_id', 'address', 'distance', 'similarity', 'user'], dtype='object')

In [715]:
alsc = AggregatedLocalSimilarityComputer(df_contract_items)

In [734]:
result = alsc.compute_most_similar(df_user_profile)
result

{13: [{'contract_id': 0,
   'address': 'V Jilmu 229 514 01 Jilemnice',
   'distance': 0.7822094,
   'similarity': 1},
  {'contract_id': 4,
   'address': 'Masarykovo náměstí 82 514 01 Jilemnice',
   'distance': 1.1750114,
   'similarity': 1},
  {'contract_id': 5,
   'address': 'Lánovská 1658, 543 01 Vrchlabí',
   'distance': 8.027082,
   'similarity': 0.4544981107295607},
  {'contract_id': 3,
   'address': 'Čtveřín 60 Pěnčín u Liberce',
   'distance': 29.195602,
   'similarity': 0.2934533853628448},
  {'contract_id': 1,
   'address': 'Na Poříčí 24 Praha 1',
   'distance': 95.60755,
   'similarity': 0.21878693783291647},
  {'contract_id': 2,
   'address': 'Francouzská 6167/5 708 00 Ostrava-Poruba',
   'distance': 209.15982,
   'similarity': 0.18699038892697675}],
 14: [{'contract_id': 1,
   'address': 'Na Poříčí 24 Praha 1',
   'distance': 0.41248056,
   'similarity': 1},
  {'contract_id': 3,
   'address': 'Čtveřín 60 Pěnčín u Liberce',
   'distance': 72.96425,
   'similarity': 0.2323645

In [732]:
df_contract_items

Unnamed: 0,contract_id,address,gps,items,embeddings,entity_items,entity_embeddings
0,0,V Jilmu 229 514 01 Jilemnice,"(50.60629521, 15.509971345)","[výpočetní technika, počítače]","[[0.009841297, -0.062487043, 0.04388702, 0.046...",[obchod s elektronikou],"[[-0.003762368, -0.07310595, 0.08018466, 0.119..."
1,1,Na Poříčí 24 Praha 1,"(50.0896730981, 14.4336646907)","[elektrická energie, plyn]","[[0.009689108, -0.06226731, 0.033098727, 0.016...",[energetický poskytovatel],"[[0.012319274, -0.05742455, -0.009580512, 0.06..."
2,2,Francouzská 6167/5 708 00 Ostrava-Poruba,"(49.8275383936, 18.1865474908)","[školení logistiky, dopravník]","[[0.072922386, -0.030289192, 0.08900586, 0.039...",[školení logistiky],"[[0.072922386, -0.030289192, 0.08900586, 0.039..."
3,3,Čtveřín 60 Pěnčín u Liberce,"(50.5907967629, 15.102488997)","[jablka, hrušky]","[[0.042488422, -0.07491765, 0.04933689, 0.0934...",[pěstování plodin],"[[0.03276872, -0.11388534, 0.035737667, 0.1059..."
4,4,Masarykovo náměstí 82 514 01 Jilemnice,"(50.6089454106, 15.5059879975)",[stavba školy],"[[0.07771003, -0.08573899, 0.091318145, 0.0090...",[správa města],"[[0.0006197457, -0.07027029, -0.005452789, 0.0..."
5,5,"Lánovská 1658, 543 01 Vrchlabí","(50.6207364452, 15.6231454345)",[početní technika],"[[-0.010320026, -0.015345275, 0.056282263, -0....",[obchod s alektronikou],"[[-0.00647911, -0.07109018, 0.08066723, 0.1239..."


In [733]:
df_user_profile

Unnamed: 0,user_id,address,gps,interest_items,embeddings
0,13,"K Vejrychovsku 1074, Jilemnice","(50.5999042303745, 15.5145712200147)",[automobily],"[[-0.0220996141433716, -0.109728731215, 0.1329..."
1,14,"Praha 1-Nové Město, nábřeží Ludvíka Svobody 12...","(50.0933520045633, 14.4343605745105)","[silniční komunikace, stavby]","[[-0.0160830970853567, -0.0245096459984779, 0...."
2,15,"Luční 1067, Jilemnice","(50.5995494322684, 15.5113817448695)",[výpočetní technika],"[[0.00984129682183266, -0.0624870434403419, 0...."
3,16,"Letenská 525/15; Praha 1, Malá Strana","(50.0891449249535, 14.4071893630789)",[politika],"[[0.0671171844005585, 0.00226269592531025, -0...."


In [695]:
df_user_profile.to_pickle('df_user_profile.pkl')

In [717]:
csc = ComplexSimilarityComputer(df_contract_items)

In [718]:
result = csc.compute_most_similar(df_user_profile, 2)
result

{13: [{'contract_id': 0, 'similarity': 0.1518906716700348},
  {'contract_id': 4, 'similarity': 0.11942910550748677}],
 14: [{'contract_id': 1, 'similarity': 0.12205733630991444},
  {'contract_id': 4, 'similarity': 0.05729924142112256}],
 15: [{'contract_id': 0, 'similarity': 0.41974040056116074},
  {'contract_id': 5, 'similarity': 0.1897753325171959}],
 16: [{'contract_id': 1, 'similarity': 0.06861591318796488},
  {'contract_id': 5, 'similarity': 0.02970121673980206}]}

In [721]:
aisc = AggregatedItemSimilarityComputer(df_contract_items)

In [722]:
aisc.compute_most_similar(df_user_profile)

{13: [{'contract_id': 2, 'similarity': 0.3969723406723996}],
 14: [{'contract_id': 4, 'similarity': 0.5235745397972891}],
 15: [{'contract_id': 5, 'similarity': 0.8445491496390173}],
 16: [{'contract_id': 5, 'similarity': 0.2769113253504405}]}