In [2]:
import os
import sys
from typing import List

# Make the project package importable and resolve relative paths against the project root.
# NOTE(review): this absolute path is specific to the environment the notebook was written in.
sys.path.append("/home/jovyan/work/sem-covid/")
# De-duplicate the search path while PRESERVING its order. Going through a set()
# reorders the entries arbitrarily, which can change which module wins on import.
sys.path = list(dict.fromkeys(sys.path))
os.chdir('/home/jovyan/work/sem-covid/')

import numpy as np
import pandas as pd
from langdetect import detect, detect_langs, DetectorFactory

from sem_covid import config
from sem_covid.adapters.abstract_store import IndexStoreABC
from sem_covid.services.model_registry import embedding_registry
from sem_covid.services.store_registry import StoreRegistry, store_registry

# Seed langdetect so that language detection gives the same answer on every run.
DetectorFactory.seed = 0


In [3]:
# Shared handles used by the pipeline defined below: the index store the datasets
# are read from / written to, and the sentence-embedding model used to encode
# each document's content.
es_store = store_registry.es_index_store()
emb_model = embedding_registry.sent2vec_universal_sent_encoding()

INFO:absl:Using /tmp/tfhub_modules to cache modules.


In [4]:
def make_target_group_l1_column(dataset: pd.DataFrame, target_groups: List[str] = None):
    """Add a ``pwdb_target_group_l1`` column listing the target groups flagged on each row.

    Every name in ``target_groups`` must be a column of ``dataset`` holding a 0/1
    indicator. For each row the new column contains the list of group names whose
    indicator equals 1, in the order given by ``target_groups`` (an empty list
    when none is set).

    The indicator columns themselves are left untouched, so calling the function
    again on the same frame produces the same result.

    :param dataset: frame to extend; modified in place.
    :param target_groups: indicator column names; defaults to ``TARGET_GROUPS_L1``.
    """
    group_columns = list(TARGET_GROUPS_L1 if target_groups is None else target_groups)
    # Element-wise comparison: anything that is not equal to 1 (0, NaN, text...) counts as "not set".
    flags = (dataset[group_columns] == 1).to_numpy()
    labels = [[name for name, is_set in zip(group_columns, row) if is_set] for row in flags]
    # An explicit object Series keeps every list as a single cell value.
    dataset["pwdb_target_group_l1"] = pd.Series(labels, index=dataset.index, dtype=object)


In [5]:
def make_topic_embeddings_column(dataset: pd.DataFrame, dimension: int = 50):
    """Add a ``topic_embeddings`` column filled with all-zero placeholder vectors.

    Each row receives its own list of ``dimension`` zeros. The lists are built one
    per row on purpose: multiplying a list of lists would store the very same list
    object in every row, so changing one row's vector would change all of them.

    :param dataset: frame to extend; modified in place.
    :param dimension: length of each placeholder vector (default 50).
    """
    placeholders = [[0] * dimension for _ in range(len(dataset))]
    # An explicit object Series keeps every vector as a single cell value.
    dataset["topic_embeddings"] = pd.Series(placeholders, index=dataset.index, dtype=object)

In [6]:
def create_new_column_with_defined_value(dataset: pd.DataFrame, column_name: str, value=None, empty_array=False):
    """Create (or overwrite) ``column_name`` on ``dataset`` with a constant filler.

    :param dataset: frame to extend; modified in place.
    :param column_name: name of the column to write.
    :param value: value assigned to the column when ``empty_array`` is false.
    :param empty_array: when true, ``value`` is ignored and every row gets an
        empty float numpy array instead.
    """
    if not empty_array:
        dataset[column_name] = value
        return
    placeholder = np.empty(0, dtype=float)
    dataset[column_name] = [placeholder] * len(dataset)

In [7]:
def replace_non_english_content(text, min_probability: float = 0.95):
    """Return ``text`` when it is confidently detected as English, otherwise ``None``.

    :param text: candidate text. ``None``, non-string values and blank strings
        yield ``None`` without running language detection.
    :param min_probability: the top-ranked language must be English with a
        probability strictly greater than this value (default 0.95).
    :return: the unchanged ``text``, or ``None`` when it is missing, blank,
        undetectable, or not confidently English.
    """
    # The detector needs real characters to work on; it raises on feature-less input.
    if not isinstance(text, str) or not text.strip():
        return None

    # Imported locally: the exception type is not among the notebook's top-level imports.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        candidates = detect_langs(text)
    except LangDetectException:
        # Raised when no language features can be extracted (e.g. digits or punctuation only).
        return None
    if not candidates:
        return None

    # Candidates are ranked by probability; only the best guess matters.
    best_guess = candidates[0]
    if best_guess.lang == "en" and best_guess.prob > min_probability:
        return text
    return None


In [8]:
# Indicator columns that make_target_group_l1_column reads by default.
TARGET_GROUPS_L1 = ["businesses", "workers", "citizens"]
# Columns that every source dataset must provide after transformation; together with
# SPECIFIC_DATASET_COLUMNS they form the column set of the unified dataset.
COMMON_DATASET_COLUMNS = ["title", "content", "date", "doc_source", "country", "pwdb_category",
                          "pwdb_target_group_l1", "pwdb_funding", "pwdb_type_of_measure",
                          "pwdb_actors", "document_embeddings", "topic_embeddings"]

# Source-specific columns. The transformer creates each of them filled with empty arrays on
# every dataset; the per-source *_replace_function is then expected to fill the ones that
# belong to its source, so the names here must match the names those functions write.
SPECIFIC_DATASET_COLUMNS = ["eu_cellar_subject_matter_labels", "eu_cellar_resource_type_labels",
                            "eu_cellar_directory_code_labels",
                            "eu_cellar_author_labels", "pwdb_target_group_l2", "ireland_keyword",
                            "ireland_department_data",
                            "ireland_campaign", "ireland_page_type", "eu_timeline_topic"]

# Names of individual unified-dataset columns referenced from the code below.
CONTENT_COLUMN_NAME = 'content'
TITLE_COLUMN_NAME = 'title'
DATE_COLUMN_NAME = 'date'
DOCUMENT_SOURCE_COLUMN_NAME = 'doc_source'
COUNTRY_COLUMN_NAME = 'country'
PWDB_ACTORS_COLUMN_NAME = "pwdb_actors"
DOCUMENT_EMBEDDINGS_COLUMN_NAME = "document_embeddings"

# Per-source settings. For each source:
#   *_CONTENT_COLUMNS        - text columns joined (space separated) into the "content" column
#   *_DOC_SOURCE             - value stored in the "doc_source" column
#   *_COUNTRY_NAME           - constant stored in the "country" column (when defined)
#   *_PWDB_ACTORS            - constant stored in the "pwdb_actors" column (when defined)
#   *_RENAME_COLUMNS_MAPPING - source column name -> unified column name

# PWDB constants
PWDB_CONTENT_COLUMNS = ['title', 'background_info_description', 'content_of_measure_description',
                        'use_of_measure_description', 'involvement_of_social_partners_description']
PWDB_DOC_SOURCE = 'ds_pwdb'
PWDB_RENAME_COLUMNS_MAPPING = {"category": "pwdb_category", "funding": "pwdb_funding",
                               "type_of_measure": "pwdb_type_of_measure", "actors": "pwdb_actors", "start_date": "date"}

# EU Cellar constants
EU_CELLAR_CONTENT_COLUMNS = ["title", "content"]
EU_CELLAR_DOC_SOURCE = 'ds_eu_cellar'
EU_CELLAR_COUNTRY_NAME = "European Union"
EU_CELLAR_PWDB_ACTORS = "EU (Council, EC, EP)"
EU_CELLAR_RENAME_COLUMNS_MAPPING = {"category": "pwdb_category", "funding": "pwdb_funding",
                                    "type_of_measure": "pwdb_type_of_measure"}

# EU timeline constants
EU_TIMELINE_CONTENT_COLUMNS = ["title", "abstract", "detail_content"]
EU_TIMELINE_DOC_SOURCE = 'ds_eu_timeline'
EU_TIMELINE_COUNTRY_NAME = "European Union"
EU_TIMELINE_PWDB_ACTORS = "EU (Council, EC, EP)"
EU_TIMELINE_RENAME_COLUMNS_MAPPING = {"category": "pwdb_category",
                                      "funding": "pwdb_funding",
                                      "type_of_measure": "pwdb_type_of_measure"}

# Ireland timeline constants
IRELAND_TIMELINE_CONTENT_COLUMNS = ["title", "content"]
IRELAND_TIMELINE_DOC_SOURCE = 'ds_ireland_timeline'
IRELAND_TIMELINE_COUNTRY_NAME = "Ireland"
IRELAND_TIMELINE_PWDB_ACTORS = "National government"
IRELAND_TIMELINE_RENAME_COLUMNS_MAPPING = {"category": "pwdb_category",
                                           "funding": "pwdb_funding",
                                           "type_of_measure": "pwdb_type_of_measure"}

In [9]:
def pwdb_replace_function(dataset: pd.DataFrame):
    """Expose the PWDB ``target_groups`` column as ``pwdb_target_group_l2`` (in place)."""
    target_groups = dataset["target_groups"]
    dataset["pwdb_target_group_l2"] = target_groups


def eu_timeline_replace_function(dataset: pd.DataFrame):
    """Expose the EU timeline ``topics`` column as ``eu_timeline_topic`` (in place)."""
    topics = dataset["topics"]
    dataset["eu_timeline_topic"] = topics


def eu_cellar_replace_function(dataset: pd.DataFrame):
    """Fill the EU Cellar specific columns of the unified schema (in place).

    Each source column is copied to its unified counterpart, with falsy values
    (``None``, empty list, empty string) replaced by an empty list.

    The mapping is spelled out explicitly because the unified name is not always
    ``"eu_cellar_" + source name``: the source column ``directory_codes_labels``
    must land in ``eu_cellar_directory_code_labels``, which is the name listed in
    ``SPECIFIC_DATASET_COLUMNS``. Writing it under any other name leaves the
    unified column with its empty placeholder values.
    """
    unified_column_by_source = {
        "subject_matter_labels": "eu_cellar_subject_matter_labels",
        "resource_type_labels": "eu_cellar_resource_type_labels",
        "directory_codes_labels": "eu_cellar_directory_code_labels",
        "author_labels": "eu_cellar_author_labels",
    }
    for source_column, unified_column in unified_column_by_source.items():
        dataset[unified_column] = dataset[source_column].apply(lambda x: x if x else [])


def ireland_timeline_replace_function(dataset: pd.DataFrame):
    """Fill the Ireland timeline specific columns of the unified schema (in place).

    Each source column is copied to its unified counterpart, with falsy values
    (``None``, empty list, empty string) replaced by an empty list.

    The mapping is spelled out explicitly because the unified name is not always
    ``"ireland_" + source name``: the source column ``campaigns_links`` must land
    in ``ireland_campaign``, which is the name listed in
    ``SPECIFIC_DATASET_COLUMNS``. Writing it under any other name leaves the
    unified column with its empty placeholder values.
    """
    unified_column_by_source = {
        "keyword": "ireland_keyword",
        "department_data": "ireland_department_data",
        "campaigns_links": "ireland_campaign",
        "page_type": "ireland_page_type",
    }
    for source_column, unified_column in unified_column_by_source.items():
        dataset[unified_column] = dataset[source_column].apply(lambda x: x if x else [])



In [10]:



from sem_covid.adapters.abstract_model import SentenceEmbeddingModelABC


class DefaultDatasetStructureTransformer:
    """Bring one source dataset into the column layout of the unified dataset.

    The frame passed in is modified in place. ``execute`` runs three steps in a
    fixed order: create the derived/placeholder columns, let the source-specific
    ``replace_function`` fill its own columns, then rename source columns to
    their unified names.
    """

    def __init__(self, dataset: pd.DataFrame,
                 emb_model: SentenceEmbeddingModelABC,
                 content_columns: List[str],
                 doc_source: str,
                 replace_function: callable,
                 rename_columns_mapping: dict,
                 country: str = None,
                 pwdb_actors: str = None
                 ):
        """
        :param dataset: source frame; modified in place by the transformation steps.
        :param emb_model: model whose ``encode`` is applied to the content values.
        :param content_columns: text columns joined into the content column.
        :param doc_source: value written to the document-source column.
        :param replace_function: callable taking the frame and filling the source-specific columns.
        :param rename_columns_mapping: source column name -> unified column name.
        :param country: optional constant for the country column (skipped when falsy).
        :param pwdb_actors: optional constant for the actors column (skipped when falsy).
        """
        self.dataset = dataset
        self.emb_model = emb_model
        self.content_columns = content_columns
        self.doc_source = doc_source
        self.replace_function = replace_function
        self.rename_columns_mapping = rename_columns_mapping
        self.country = country
        self.pwdb_actors = pwdb_actors

    def create_columns(self):
        """Add the content, constant, target-group, embedding and placeholder columns."""
        frame = self.dataset

        def join_text_fields(row):
            # Falsy fields (None, "") contribute an empty string to the joined text.
            return " ".join(field if field else "" for field in row)

        frame[CONTENT_COLUMN_NAME] = frame[self.content_columns].agg(join_text_fields, axis=1)
        create_new_column_with_defined_value(frame, DOCUMENT_SOURCE_COLUMN_NAME, self.doc_source)

        # Constant columns are only written when a value was supplied for this source.
        optional_constants = (
            (COUNTRY_COLUMN_NAME, self.country),
            (PWDB_ACTORS_COLUMN_NAME, self.pwdb_actors),
        )
        for column_name, constant in optional_constants:
            if constant:
                create_new_column_with_defined_value(frame, column_name, constant)

        make_target_group_l1_column(frame)
        frame[DOCUMENT_EMBEDDINGS_COLUMN_NAME] = self.emb_model.encode(frame.content.values)
        make_topic_embeddings_column(frame)

        # Every source gets every source-specific column, pre-filled with empty arrays.
        for specific_column in SPECIFIC_DATASET_COLUMNS:
            create_new_column_with_defined_value(frame, column_name=specific_column, empty_array=True)

    def replace_values(self):
        """Run the source-specific replace function on the frame."""
        self.replace_function(self.dataset)

    def rename_columns(self):
        """Rename source columns to their unified names, in place."""
        self.dataset.rename(columns=self.rename_columns_mapping, inplace=True)

    def execute(self) -> pd.DataFrame:
        """Run create -> replace -> rename and return the (same, mutated) frame."""
        for step in (self.create_columns, self.replace_values, self.rename_columns):
            step()
        return self.dataset


class UnifiedDatasetPipeline:
    """Build the unified dataset from the PWDB, EU Cellar, EU timeline and Ireland timeline sources.

    The steps are ``extract`` (read the four source frames from the index store),
    ``transform`` (align them to the unified column layout, keep only English
    documents that have content, title and date, and concatenate them) and
    ``load`` (write the result back to the index store).
    """

    def __init__(self, es_store: IndexStoreABC,
                 emb_model: SentenceEmbeddingModelABC
                 ):
        """
        :param es_store: index store used to read the source datasets and write the unified one.
        :param emb_model: sentence-embedding model handed to each dataset transformer.
        """
        self.unified_dataset = pd.DataFrame()
        self.es_store = es_store
        self.emb_model = emb_model
        self.pwdb_df = pd.DataFrame()
        self.eu_cellar_df = pd.DataFrame()
        self.eu_timeline_df = pd.DataFrame()
        self.ir_timeline_df = pd.DataFrame()

    def get_steps(self) -> list:
        """Return the pipeline steps in the order they have to be executed."""
        return [self.extract, self.transform, self.load]

    def extract(self):
        """Read the source frames; all but PWDB come from the ``_enriched`` indices."""
        self.pwdb_df = self.es_store.get_dataframe(index_name=config.PWDB_ELASTIC_SEARCH_INDEX_NAME)
        self.eu_cellar_df = self.es_store.get_dataframe(
            index_name=config.EU_CELLAR_ELASTIC_SEARCH_INDEX_NAME + "_enriched")
        self.eu_timeline_df = self.es_store.get_dataframe(
            index_name=config.EU_TIMELINE_ELASTIC_SEARCH_INDEX_NAME + "_enriched")
        self.ir_timeline_df = self.es_store.get_dataframe(
            index_name=config.IRELAND_TIMELINE_ELASTIC_SEARCH_INDEX_NAME + "_enriched")

    def transform(self):
        """Align every source to the unified layout and merge them into ``unified_dataset``."""
        self.pwdb_df = DefaultDatasetStructureTransformer(
            dataset=self.pwdb_df,
            emb_model=self.emb_model,
            content_columns=PWDB_CONTENT_COLUMNS,
            doc_source=PWDB_DOC_SOURCE,
            replace_function=pwdb_replace_function,
            rename_columns_mapping=PWDB_RENAME_COLUMNS_MAPPING
        ).execute()
        self.eu_cellar_df = DefaultDatasetStructureTransformer(
            dataset=self.eu_cellar_df,
            emb_model=self.emb_model,
            content_columns=EU_CELLAR_CONTENT_COLUMNS,
            doc_source=EU_CELLAR_DOC_SOURCE,
            replace_function=eu_cellar_replace_function,
            rename_columns_mapping=EU_CELLAR_RENAME_COLUMNS_MAPPING,
            country=EU_CELLAR_COUNTRY_NAME,
            pwdb_actors=EU_CELLAR_PWDB_ACTORS
        ).execute()
        self.eu_timeline_df = DefaultDatasetStructureTransformer(
            dataset=self.eu_timeline_df,
            emb_model=self.emb_model,
            content_columns=EU_TIMELINE_CONTENT_COLUMNS,
            doc_source=EU_TIMELINE_DOC_SOURCE,
            replace_function=eu_timeline_replace_function,
            rename_columns_mapping=EU_TIMELINE_RENAME_COLUMNS_MAPPING,
            country=EU_TIMELINE_COUNTRY_NAME,
            pwdb_actors=EU_TIMELINE_PWDB_ACTORS
        ).execute()
        self.ir_timeline_df = DefaultDatasetStructureTransformer(
            dataset=self.ir_timeline_df,
            emb_model=self.emb_model,
            content_columns=IRELAND_TIMELINE_CONTENT_COLUMNS,
            doc_source=IRELAND_TIMELINE_DOC_SOURCE,
            replace_function=ireland_timeline_replace_function,
            rename_columns_mapping=IRELAND_TIMELINE_RENAME_COLUMNS_MAPPING,
            country=IRELAND_TIMELINE_COUNTRY_NAME,
            pwdb_actors=IRELAND_TIMELINE_PWDB_ACTORS
        ).execute()

        unified_columns = COMMON_DATASET_COLUMNS + SPECIFIC_DATASET_COLUMNS
        source_frames = (self.pwdb_df, self.eu_cellar_df, self.eu_timeline_df, self.ir_timeline_df)
        # Work on copies so the per-source frames keep all of their rows and columns.
        data_frames = [self._keep_english_documents(source_frame[unified_columns].copy())
                       for source_frame in source_frames]

        # ignore_index: the enriched datasets use numeric indexing, so a fresh index is built.
        unified = pd.concat(data_frames, ignore_index=True)
        # Funding arrives as a "|"-separated string; non-string values are kept as they are.
        unified["pwdb_funding"] = unified["pwdb_funding"].apply(
            lambda x: x.split('|') if isinstance(x, str) else x)
        # Actors must always be a list; scalar values are wrapped.
        unified["pwdb_actors"] = unified["pwdb_actors"].apply(
            lambda x: x if isinstance(x, list) else [x])
        self.unified_dataset = unified

    @staticmethod
    def _keep_english_documents(data_frame: pd.DataFrame) -> pd.DataFrame:
        """Blank out missing/non-English content, then drop rows lacking content, title or date."""

        def english_or_none(text):
            # Content is a join of several fields, so "empty" can be any run of
            # whitespace; such text must never reach the language detector.
            if not isinstance(text, str) or not text.strip():
                return None
            return replace_non_english_content(text)

        data_frame[CONTENT_COLUMN_NAME] = data_frame[CONTENT_COLUMN_NAME].apply(english_or_none)
        return data_frame.dropna(subset=[CONTENT_COLUMN_NAME, TITLE_COLUMN_NAME, DATE_COLUMN_NAME], how="any")

    def load(self):
        """Write the unified dataset to its index in the index store."""
        self.es_store.put_dataframe(index_name=config.UNIFIED_DATASET_ELASTIC_SEARCH_INDEX_NAME,
                                    content=self.unified_dataset)



In [11]:
# Wire the pipeline to the index store and embedding model created earlier in the notebook.
unified_dataset_pipeline = UnifiedDatasetPipeline(es_store=es_store, emb_model=emb_model)

In [None]:
# Run the pipeline steps in order: read the source datasets, build the unified
# dataset, then write it back to the index store.
unified_dataset_pipeline.extract()
unified_dataset_pipeline.transform()
unified_dataset_pipeline.load()

N/A% (0 of 1381) |                       | Elapsed Time: 0:00:00 ETA:  --:--:--