In [1]:
import os
import sys

# Root of the sem-covid checkout inside the Jupyter container; it is both added
# to the import path and used as the working directory below.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

# Make the project package importable from this notebook.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Drop duplicate entries while keeping the original search order.
# (Going through set() would shuffle sys.path and make import resolution
# order arbitrary; dict.fromkeys keeps first-seen order.)
sys.path = list(dict.fromkeys(sys.path))

# Run the rest of the notebook from the project root.
os.chdir(PROJECT_ROOT)
from sem_covid import config
from sem_covid.services.store_registry import store_registry
import pandas as pd
from io import StringIO
import json
import numpy as np
import mlflow

In [59]:
# --- Column names of the unified dataset -----------------------------------
CONTENT_CLEANED_TOPIC_MODELING_COLUMN_NAME = 'content_cleaned_topic_modeling'
DOCUMENT_EMBEDDINGS_EURLEX_BERT_COLUMN_NAME = 'document_embeddings_eurlex_bert'
COUNTRY_COLUMN_NAME = 'country'  # defined once; it was previously assigned twice
DOC_SOURCE = 'doc_source'
PWDB_ACTORS = 'pwdb_actors'
EU_CELLAR_AUTHOR_LABELS = 'eu_cellar_author_labels'
DATE_COLUMN_NAME = 'date'
EMBEDDING_COLUMN_NAME = 'document_embeddings_use'
PWDB_COLUMN_NAMES = ['pwdb_category', 'pwdb_funding', 'pwdb_type_of_measure',
                     PWDB_ACTORS, 'pwdb_target_group_l1', 'pwdb_target_group_l2']

# PWDB columns plus the country column.
COLUMN_FILTERS = PWDB_COLUMN_NAMES + [COUNTRY_COLUMN_NAME]

# Hex colour palette (twelve entries).
COLORS = ['#001a33', '#cc0000', '#0072B2', '#996666', '#CC79A7', '#ff3333',
          '#00b300', '#ffff00', '#4d004d', '#99ccff', '#0073e6', '#b3b300']

# --- MLflow / MinIO locations ------------------------------------------------
# MLflow experiment whose runs are searched; also the first path segment of the
# model artifact inside BUCKET_NAME.
EXPERIMENT_ID = '120'
# Bucket from which the trained topic model is read.
BUCKET_NAME = 'mlflow'
# Bucket that receives the exported topic JSON.
RML_BUCKET_NAME = 'rdf-transformer'

In [3]:
# Pull the whole unified dataset index from Elasticsearch into a DataFrame.
es_store = store_registry.es_index_store()
unified_index_name = config.UNIFIED_DATASET_ELASTIC_SEARCH_INDEX_NAME
ds_unified = es_store.get_dataframe(index_name=unified_index_name)

100% (6360 of 6360) |####################| Elapsed Time: 0:00:16 Time:  0:00:16


In [4]:

# Fetch every run of the experiment. `experiment_ids` is documented as a list
# of ids, so the single id is wrapped explicitly instead of passing a bare string.
all_runs = mlflow.search_runs(experiment_ids=[EXPERIMENT_ID])
if all_runs.empty:
    raise ValueError(f"No MLflow runs found for experiment {EXPERIMENT_ID!r}")

# Cast the parameter to int so the sort below is numeric, then pick the run
# with the smallest `freq_topic_minus_1` value.
all_runs['params.freq_topic_minus_1'] = all_runs['params.freq_topic_minus_1'].astype('int')
all_runs = all_runs.sort_values(by='params.freq_topic_minus_1', ascending=True)
best_run = all_runs.iloc[0]

# Load the model artifact logged by that run.
# NOTE(review): `model.pkl` is presumably unpickled by the feature store -
# only load it from a trusted bucket.
topic_model = store_registry.minio_feature_store(BUCKET_NAME).get_features(
    features_name=f'{EXPERIMENT_ID}/{best_run.run_id}/artifacts/model/model.pkl')

# Apply the model to the cleaned documents using their precomputed embeddings.
# Only the second return value is kept; the first is bound to a throwaway name
# rather than `_`, which IPython uses for the last cell output.
document_embeddings = np.array(list(ds_unified[DOCUMENT_EMBEDDINGS_EURLEX_BERT_COLUMN_NAME]))
_predicted_topics, probabilities = topic_model.transform(
    documents=ds_unified[CONTENT_CLEANED_TOPIC_MODELING_COLUMN_NAME],
    embeddings=document_embeddings)


In [9]:
# Topic table of the loaded model. The cells below treat it as a mapping from
# an integer-like topic key to an iterable of (word, score) pairs.
topics = topic_model.get_topics()

In [75]:
# Flatten the topic table into one record per (topic, token) pair.
# The topic number used in the identifier is the model's topic key plus one.
topic_tokens_data = [
    {'topic_token_id': f"topic_{int(topic_key) + 1}_{token}", "token": token, "relevance": score}
    for topic_key, token_scores in topics.items()
    for token, score in token_scores
]

In [76]:
# Preview only the first records; the full list holds one entry per
# (topic, token) pair and would flood the notebook output.
topic_tokens_data[:10]

[{'topic_token_id': 'topic_0_union',
  'token': 'union',
  'relevance': 0.007745977373635717},
 {'topic_token_id': 'topic_0_member',
  'token': 'member',
  'relevance': 0.007416900213924813},
 {'topic_token_id': 'topic_0_article',
  'token': 'article',
  'relevance': 0.007263722642869789},
 {'topic_token_id': 'topic_0_inputm',
  'token': 'inputm',
  'relevance': 0.006374404227811023},
 {'topic_token_id': 'topic_0_state',
  'token': 'state',
  'relevance': 0.006350724109868425},
 {'topic_token_id': 'topic_0_regulation',
  'token': 'regulation',
  'relevance': 0.006323316195395947},
 {'topic_token_id': 'topic_0_typecu',
  'token': 'typecu',
  'relevance': 0.0059600918255623934},
 {'topic_token_id': 'topic_0_council',
  'token': 'council',
  'relevance': 0.005771854072494038},
 {'topic_token_id': 'topic_0_include',
  'token': 'include',
  'relevance': 0.005747568203542315},
 {'topic_token_id': 'topic_0_financial',
  'token': 'financial',
  'relevance': 0.005739515776992126},
 {'topic_toke

In [53]:
# One record per topic: its identifier, a name made of its tokens joined by
# underscores, and the identifiers of the token records that form it.
topics_data = []
for topic_key, token_scores in topics.items():
    topic_id = f"topic_{int(topic_key) + 1}"
    tokens = [token for token, _score in token_scores]
    topics_data.append({
        'topic_id': topic_id,
        'topic_name': '_'.join(tokens),
        "formed_by": [f"{topic_id}_{token}" for token in tokens],
    })

In [54]:
# Preview only the first few topic records instead of printing the whole list.
topics_data[:3]

[{'topic_id': 'topic_0',
  'topic_name': 'union_member_article_inputm_state_regulation_typecu_council_include_financial',
  'formed_by': ['topic_0_union',
   'topic_0_member',
   'topic_0_article',
   'topic_0_inputm',
   'topic_0_state',
   'topic_0_regulation',
   'topic_0_typecu',
   'topic_0_council',
   'topic_0_include',
   'topic_0_financial']},
 {'topic_id': 'topic_1',
  'topic_name': 'employee_payment_employer_scheme_work_company_worker_covid19_business_employ',
  'formed_by': ['topic_1_employee',
   'topic_1_payment',
   'topic_1_employer',
   'topic_1_scheme',
   'topic_1_work',
   'topic_1_company',
   'topic_1_worker',
   'topic_1_covid19',
   'topic_1_business',
   'topic_1_employ']},
 {'topic_id': 'topic_2',
  'topic_name': 'minister_community_ireland_people_department_service_health_mental_funding_say',
  'formed_by': ['topic_2_minister',
   'topic_2_community',
   'topic_2_ireland',
   'topic_2_people',
   'topic_2_department',
   'topic_2_service',
   'topic_2_health'

In [55]:
# One record per (measure, topic position): the dataset index label, a topic
# identifier built from the position inside the per-document vector, and the
# value stored at that position.
# NOTE(review): no cell shown in this notebook creates the
# 'topic_embeddings_eurlex_bert' column - presumably it already exists in the
# index or was added by a removed cell; confirm before a fresh Run All.
# NOTE(review): identifiers here are `topic_<position>` with no offset, while
# the cells building topics_data / topic_tokens_data use `topic_<key + 1>`;
# confirm both refer to the same topic numbering.
topic_assignments_data = [
    {'measure_index': measure_index, 'topic_index': f"topic_{topic_position}",
     'topic_relevance': relevance}
    for measure_index, topics_relevance in ds_unified['topic_embeddings_eurlex_bert'].items()
    for topic_position, relevance in enumerate(topics_relevance)
]

In [56]:
# Preview only the first records; the full list has one entry per
# (measure, topic) pair and is far too long to display.
topic_assignments_data[:10]

[{'measure_index': '1624',
  'topic_index': 'topic_0',
  'topic_relevance': 0.008678869},
 {'measure_index': '1624',
  'topic_index': 'topic_1',
  'topic_relevance': 0.0058465321},
 {'measure_index': '1624',
  'topic_index': 'topic_2',
  'topic_relevance': 0.0039080849},
 {'measure_index': '1624',
  'topic_index': 'topic_3',
  'topic_relevance': 0.0024686646},
 {'measure_index': '1624',
  'topic_index': 'topic_4',
  'topic_relevance': 0.001512011},
 {'measure_index': '1624',
  'topic_index': 'topic_5',
  'topic_relevance': 0.0051940721},
 {'measure_index': '1624',
  'topic_index': 'topic_6',
  'topic_relevance': 0.0044130651},
 {'measure_index': '1624',
  'topic_index': 'topic_7',
  'topic_relevance': 0.0023817167},
 {'measure_index': '1624',
  'topic_index': 'topic_8',
  'topic_relevance': 0.0018333167},
 {'measure_index': '1624',
  'topic_index': 'topic_9',
  'topic_relevance': 0.0076255043},
 {'measure_index': '1624',
  'topic_index': 'topic_10',
  'topic_relevance': 0.0011398385},


In [77]:
# Bundle the three record lists under the keys used in the exported JSON.
# NOTE(review): only the first two records of each list are kept - presumably
# a small sample on purpose; confirm the full data is not expected here.
first_two = slice(0, 2)
topic_data_mapping = {
    'topic_assignments_data': topic_assignments_data[first_two],
    'topics_data': topics_data[first_two],
    'topic_tokens_data': topic_tokens_data[first_two],
}

In [78]:
# Serialise the bundle to JSON and store it in the RML bucket; the value
# returned by put_object is shown as the cell output.
minio_store = store_registry.minio_object_store(minio_bucket=RML_BUCKET_NAME)
topics_payload = json.dumps(topic_data_mapping)
minio_store.put_object(object_name='fields/topics_data.json', content=topics_payload)

1065