# Imports

In [1]:
import sys

# Absolute location of the sem-covid project checkout inside the notebook container.
PROJECT_ROOT = '/home/jovyan/work/sem-covid/'

# Make the project package importable from this notebook.
sys.path.append(PROJECT_ROOT)
# Drop duplicate entries (e.g. from re-running this cell) while KEEPING the
# original search order. De-duplicating through a set would shuffle the entries
# and make import resolution priority arbitrary between kernel starts.
sys.path = list(dict.fromkeys(sys.path))

import os

# Switch the working directory to the project root.
os.chdir(PROJECT_ROOT)

import mlflow
import numpy as np
import pickle

from sem_covid.services.store_registry import store_registry
from sem_covid import config
from umap import UMAP
from bertopic import BERTopic

# Global variables

In [2]:
# --- Columns of the unified dataset consumed by the topic model ---
DOCUMENT_EMBEDDINGS_EURLEX_BERT_COLUMN_NAME = 'document_embeddings_eurlex_bert'
CONTENT_CLEANED_TOPIC_MODELING_COLUMN_NAME = 'content_cleaned_topic_modeling'

# --- Experiment tracking and artifact storage ---
# BUCKET_NAME selects the object-store bucket the pickled model is uploaded to;
# EXPERIMENT_ID is the MLflow experiment the training run is started under and
# is also the first segment of the uploaded model's object name.
BUCKET_NAME = 'mlflow'
EXPERIMENT_ID = '120'

# Labels logged as the 'model_number' run parameter, one per model variant.
MODEL_NR = [
    'model_1',
    'model_2',
    'model_3',
]

# --- UMAP settings passed to the UMAP model given to BERTopic ---
UMAP_MODEL_METRIC = 'cosine'
UMAP_MODEL_MIN_DIST = 0.0
UMAP_MODEL_N_COMPONENTS = 1
UMAP_MODEL_N_NEIGHBORS = 15
UMAP_MODEL_RANDOM_STATE = 30  # seed used to build the RandomState handed to UMAP

# Fetch the data

In [3]:
# Obtain the index-store client from the project's store registry; it is used
# below to pull the unified dataset (presumably an Elasticsearch-backed store,
# going by the `es_` prefix and the index-name config constant — confirm).
es_store = store_registry.es_index_store()

In [4]:
# Load the whole unified-dataset index into `ds_unified` (6360 rows in the
# recorded run). The index name comes from the project configuration.
ds_unified = es_store.get_dataframe(index_name=config.UNIFIED_DATASET_ELASTIC_SEARCH_INDEX_NAME)

100% (6360 of 6360) |####################| Elapsed Time: 0:00:15 Time:  0:00:15


In [5]:
# Preview the cleaned-text column; these strings are the documents passed to
# BERTopic's fit_transform further down.
ds_unified[CONTENT_CLEANED_TOPIC_MODELING_COLUMN_NAME]

_id
1624    commission staff work document accompany docum...
1625    regulation eu european parliament council   fe...
1626    communication commission european parliament c...
1627    european parliament resolution new multiannual...
1628    report commission european parliament council ...
                              ...                        
6355    statement national public health emergency tea...
6356    ministers mcconalogue heydon launch code good ...
6357    press release civil defence context covid-19 c...
6358    minister o’gorman launch lgbti+ youth ireland ...
6359    statement national public health emergency tea...
Name: content_cleaned_topic_modeling, Length: 6360, dtype: object

In [6]:
# Preview the stored per-document embedding vectors (one list of floats per
# row); they are stacked into an array and passed to fit_transform as the
# embeddings further down.
ds_unified[DOCUMENT_EMBEDDINGS_EURLEX_BERT_COLUMN_NAME]

_id
1624    [-0.151635021, 0.1485943347, 0.4740561545, -0....
1625    [-0.1007091999, 0.1460110992, 0.603488028, -0....
1626    [-0.1843583733, 0.1037103906, 0.325429529, -0....
1627    [-0.1732776463, -0.0359608233, 0.5692273974, -...
1628    [-0.0876726806, 0.0137052238, 0.4528407454, -0...
                              ...                        
6355    [-0.1895773709, 0.0278843828, 0.5813627839, -0...
6356    [-0.3856111765, 0.1667531282, 0.2791038156, -0...
6357    [-0.0512911826, 0.0993160307, 0.442119807, -0....
6358    [-0.3278441727, 0.1178098619, 0.5096987486, -0...
6359    [-0.0941892341, 0.0342305377, 0.5787585974, -0...
Name: document_embeddings_eurlex_bert, Length: 6360, dtype: object

# Train BERTopic (Model 1 - ds_pwdb, ds_eu_timeline, ds_ireland_timeline)

In [7]:
# TODO: not implemented yet — train BERTopic Model 1 on ds_pwdb, ds_eu_timeline and ds_ireland_timeline only.

# Train BERTopic (Model 2 - ds_eu_cellar)

In [8]:
# TODO: not implemented yet — train BERTopic Model 2 on ds_eu_cellar only.

# Train BERTopic (Model 3 - all 4 datasets considered)

In [9]:
# Train Model 3 on the full unified dataset inside a single MLflow run, record
# the run's settings, and upload the pickled model to the object store.
with mlflow.start_run(experiment_id=EXPERIMENT_ID):

    # UMAP model handed to BERTopic, seeded through a RandomState built from
    # the configured seed.
    umap_model = UMAP(
        random_state=np.random.RandomState(UMAP_MODEL_RANDOM_STATE),
        n_neighbors=UMAP_MODEL_N_NEIGHBORS,
        n_components=UMAP_MODEL_N_COMPONENTS,
        min_dist=UMAP_MODEL_MIN_DIST,
        metric=UMAP_MODEL_METRIC,
    )
    topic_model = BERTopic(
        nr_topics='auto',
        calculate_probabilities=True,
        umap_model=umap_model,
    )

    # Fit on the cleaned texts, supplying the stored embedding vectors stacked
    # into a single array instead of letting BERTopic embed the texts itself.
    training_texts = ds_unified[CONTENT_CLEANED_TOPIC_MODELING_COLUMN_NAME]
    training_embeddings = np.array(list(ds_unified[DOCUMENT_EMBEDDINGS_EURLEX_BERT_COLUMN_NAME]))
    topics, probabilities = topic_model.fit_transform(training_texts, training_embeddings)

    # Frequency reported for topic -1 (presumably BERTopic's outlier topic — confirm).
    freq_topic_minus_1 = topic_model.get_topic_freq(topic=-1)

    # Everything recorded on the MLflow run, in logging order.
    run_params = {
        'model_number': MODEL_NR[2],
        'umap_model_random_state': UMAP_MODEL_RANDOM_STATE,
        'umap_model_n_neighbors': UMAP_MODEL_N_NEIGHBORS,
        'umap_model_n_components': UMAP_MODEL_N_COMPONENTS,
        'umap_model_min_dist': UMAP_MODEL_MIN_DIST,
        'umap_model_metric': UMAP_MODEL_METRIC,
        'freq_topic_minus_1': freq_topic_minus_1,
        'total_nr_of_docs': len(ds_unified),
    }
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    # Store the pickled model under the run's artifact-style path in the bucket.
    run_id = mlflow.active_run().info.run_id
    model_store = store_registry.minio_object_store(BUCKET_NAME)
    model_object_name = f'{EXPERIMENT_ID}/{run_id}/artifacts/model/model.pkl'
    model_store.put_object(object_name=model_object_name, content=pickle.dumps(topic_model))