In [2]:
import sys
import os

# Make the project package importable from the notebook.
# NOTE(review): absolute, machine-specific path; consider deriving it from a configurable root.
sys.path.append("/home/jovyan/work/sem-covid/")
# Remove duplicate entries while keeping their order: converting to a set would
# shuffle sys.path and therefore change which module wins when names collide.
sys.path = list(dict.fromkeys(sys.path))
# The working directory is switched to the project root before the project imports run.
os.chdir('/home/jovyan/work/sem-covid/')
from sem_covid import config
import numpy as np
from langdetect import detect, detect_langs, DetectorFactory
# Fixed seed so that language detection gives repeatable results between runs.
DetectorFactory.seed = 0
import pandas as pd
from sem_covid.services.model_registry import embedding_registry
from sem_covid.services.store_registry import StoreRegistry, store_registry


In [3]:
# Index store handle obtained from the project's store registry; used below to read the source indices.
es_store = store_registry.es_index_store()
# Embedding model handle; its encode() method is called by make_document_embeddings_column.
emb_model = embedding_registry.sent2vec_universal_sent_encoding()

In [6]:
def make_target_group_l1_column(dataset: pd.DataFrame):
    """Collapse the TARGET_GROUPS_L1 indicator columns into one list-valued column.

    Each indicator column is overwritten in place: a value equal to 1 becomes the
    column's own name and anything else becomes an empty string. The resulting names
    are then gathered per row into the new "pwdb_target_group_l1" column as a list.

    :param dataset: frame holding every column named in TARGET_GROUPS_L1; modified in place
    """
    for group_name in TARGET_GROUPS_L1:
        def label_when_flagged(flag):
            # Only the exact value 1 marks membership of the group.
            return group_name if flag == 1 else ""

        dataset[group_name] = dataset[group_name].apply(label_when_flagged)

    # Join the per-row labels with spaces; splitting afterwards discards the empty ones.
    joined_labels = dataset[TARGET_GROUPS_L1].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
    dataset["pwdb_target_group_l1"] = joined_labels.apply(lambda labels: labels.split())


In [7]:
def make_document_embeddings_column(dataset: pd.DataFrame):
    """Add a "document_embeddings" column computed from the "content" column.

    The values of "content" are handed to the module-level ``emb_model.encode`` and
    its result is stored as the new column.

    :param dataset: frame with a "content" column; modified in place
    """
    document_texts = dataset.content.values
    dataset["document_embeddings"] = emb_model.encode(document_texts)



In [8]:
def make_topic_embeddings_column(dataset: pd.DataFrame, embedding_size: int = 50):
    """Add a placeholder "topic_embeddings" column filled with zero vectors.

    Every row receives its own list of ``embedding_size`` zeros. The lists are built
    independently so that changing one row's vector later cannot silently change the
    others (``[[0] * n] * rows`` would repeat a single shared list object).

    :param dataset: frame to extend; modified in place
    :param embedding_size: length of each zero vector (defaults to the previous fixed size, 50)
    """
    dataset["topic_embeddings"] = [[0] * embedding_size for _ in range(len(dataset))]

In [9]:
def create_new_column_with_defined_value(dataset: pd.DataFrame, column_name: str, value=None, empty_array=False):
    """Create (or overwrite) a column holding the same value in every row.

    :param dataset: frame to extend; modified in place
    :param column_name: name of the column to write
    :param value: constant assigned to the column when ``empty_array`` is false
    :param empty_array: when true, ``value`` is ignored and every row receives an
        empty float numpy array instead
    """
    if not empty_array:
        dataset[column_name] = value
        return
    # A single zero-length array is repeated once per row.
    placeholder = np.empty(0, dtype=float)
    dataset[column_name] = [placeholder] * len(dataset)


In [10]:
# Indicator columns that make_target_group_l1_column collapses into "pwdb_target_group_l1".
TARGET_GROUPS_L1 = ["businesses", "workers", "citizens"]
# Columns that every source dataset must expose before the datasets are merged.
COMMON_DATASET_COLUMNS = ["title", "content", "date", "doc_source", "country", "pwdb_category",
                          "pwdb_target_group_l1", "pwdb_funding", "pwdb_type_of_measure",
                          "pwdb_actors", "document_embeddings", "topic_embeddings"]

# Source-specific columns. Each dataset first receives all of them as empty placeholders and
# then fills in only the ones it has data for, so the names here must match the names written
# by the per-dataset cells exactly.
SPECIFIC_DATASET_COLUMNS = ["eu_cellar_subject_matter_labels", "eu_cellar_resource_type_labels",
                            "eu_cellar_directory_code_labels",
                            "eu_cellar_author_labels", "pwdb_target_group_l2", "ireland_keyword",
                            "ireland_department_data",
                            "ireland_campaign", "ireland_page_type", "eu_timeline_topic"]

In [11]:
# Load the four source dataframes from the index store.
# PWDB is read from its base index; the other three are read from the "_enriched" variant of their index.
pwdb_df = es_store.get_dataframe(index_name=config.PWDB_ELASTIC_SEARCH_INDEX_NAME)
eu_cellar_df = es_store.get_dataframe(index_name=config.EU_CELLAR_ELASTIC_SEARCH_INDEX_NAME + "_enriched")
eu_timeline_df = es_store.get_dataframe(index_name=config.EU_TIMELINE_ELASTIC_SEARCH_INDEX_NAME + "_enriched")
ir_timeline_df = es_store.get_dataframe(index_name=config.IRELAND_TIMELINE_ELASTIC_SEARCH_INDEX_NAME + "_enriched")



100% (1381 of 1381) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (3175 of 3175) |####################| Elapsed Time: 0:00:02 Time:  0:00:02
100% (231 of 231) |######################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (1921 of 1921) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [12]:
# Inspect the columns available in the PWDB dataset before mapping them to the unified schema.
pwdb_df.columns



Index(['identifier', 'title', 'title_national_language', 'country',
       'start_date', 'end_date', 'date_type', 'type_of_measure',
       'status_of_regulation', 'category', 'subcategory', 'creation_date',
       'background_info_description', 'content_of_measure_description',
       'use_of_measure_description', 'actors', 'target_groups', 'funding',
       'involvement_of_social_partners_description',
       'social_partner_involvement_form', 'social_partner_role',
       'is_sector_specific', 'private_or_public_sector',
       'is_occupation_specific', 'sectors', 'occupations', 'sources',
       'businesses', 'citizens', 'workers'],
      dtype='object')

In [90]:
# Text columns of PWDB: "title" followed by every column whose name contains "description".
# NOTE(review): the same two statements are repeated in the "create common columns" cell for PWDB,
# and this cell's execution count is out of sequence — it looks like a leftover exploration cell.
CONTENT_COLUMNS = [col for col in pwdb_df.columns if "description" in col]

CONTENT_COLUMNS.insert(0, "title")

In [91]:
# Display the list of PWDB text columns computed in the previous cell.
CONTENT_COLUMNS

['title',
 'background_info_description',
 'content_of_measure_description',
 'use_of_measure_description',
 'involvement_of_social_partners_description']

In [13]:
# create common columns
# PWDB text is spread over several columns: the title plus every "*description*" column.
CONTENT_COLUMNS = ["title"] + [col for col in pwdb_df.columns if "description" in col]
# Concatenate the text columns row by row; missing/empty parts contribute an empty string.
pwdb_df["content"] = pwdb_df[CONTENT_COLUMNS].agg(
    lambda parts: " ".join(part if part else "" for part in parts),
    axis=1)
make_target_group_l1_column(pwdb_df)
make_document_embeddings_column(pwdb_df)
make_topic_embeddings_column(pwdb_df)

create_new_column_with_defined_value(pwdb_df, "doc_source", "ds_pwdb")

# create specific dataset columns
# Every source-specific column starts as an empty placeholder; the next cell fills the PWDB one.
for column in SPECIFIC_DATASET_COLUMNS:
    create_new_column_with_defined_value(pwdb_df, column, empty_array=True)



In [14]:
# Fill the PWDB-specific column: the level-2 target groups are taken unchanged from "target_groups".
pwdb_df["pwdb_target_group_l2"] = pwdb_df["target_groups"]



In [15]:
# Rename the PWDB columns to their unified-schema names (in place), including "start_date" -> "date".
pwdb_df.rename(
    columns={"category": "pwdb_category", "funding": "pwdb_funding", "type_of_measure": "pwdb_type_of_measure"
        , "actors": "pwdb_actors", "start_date": "date"}, inplace=True)

# Keep only the unified columns, in schema order, as the version that will be merged.
tmp_pwdb_df = pd.DataFrame(pwdb_df[COMMON_DATASET_COLUMNS + SPECIFIC_DATASET_COLUMNS])

In [16]:
# Guard: the PWDB frame prepared for merging must expose exactly the unified columns, in order.
if tmp_pwdb_df.columns.tolist() != COMMON_DATASET_COLUMNS + SPECIFIC_DATASET_COLUMNS:
    raise ValueError("Some columns are missing!! Not ready for merge")
print("ready for merge")



ready for merge


In [17]:
# Inspect the columns available in the EU Cellar dataset before mapping them to the unified schema.
eu_cellar_df.columns

Index(['work', 'title', 'cdm_types', 'cdm_type_labels', 'resource_types',
       'resource_type_labels', 'eurovoc_concepts', 'eurovoc_concept_labels',
       'subject_matters', 'subject_matter_labels', 'directory_codes',
       'directory_codes_labels', 'celex_numbers', 'legal_elis', 'id_documents',
       'same_as_uris', 'authors', 'author_labels', 'full_ojs', 'oj_sectors',
       'internal_comments', 'is_in_force', 'dates_document', 'dates_created',
       'legal_dates_entry_into_force', 'legal_dates_signature', 'manifs_pdf',
       'manifs_html', 'pdfs_to_download', 'htmls_to_download', 'dossiers',
       'related_works', 'work_sequences', 'eu_cellar_core',
       'eu_cellar_extended', 'metadata', 'content_path', 'content', 'language',
       'businesses', 'citizens', 'workers', 'category', 'subcategory',
       'type_of_measure', 'funding'],
      dtype='object')

In [18]:
# create common columns
# The unified "content" is the title followed by the original content.
# NOTE(review): "content" is both an input and the output here, so re-running this cell
# prepends the title a second time — run it once per freshly loaded dataframe.
CONTENT_COLUMNS = ["title", "content"]
eu_cellar_df["content"] = eu_cellar_df[CONTENT_COLUMNS].agg(lambda x: " ".join(item if item else "" for item in x),
                                                            axis=1)
# Constant columns that are the same for every EU Cellar document.
create_new_column_with_defined_value(eu_cellar_df, "doc_source", "ds_eu_cellar")
create_new_column_with_defined_value(eu_cellar_df, "country", "European Union")
create_new_column_with_defined_value(eu_cellar_df, "pwdb_actors", "EU (Council, EC, EP)")
make_target_group_l1_column(eu_cellar_df)
# The unified "date" is copied from "dates_document".
eu_cellar_df["date"] = eu_cellar_df["dates_document"]

# Embeddings are computed after "content" has been assembled, since they are derived from it.
make_document_embeddings_column(eu_cellar_df)
make_topic_embeddings_column(eu_cellar_df)
# create specific dataset columns
# Every source-specific column starts as an empty placeholder; a later cell fills the EU Cellar ones.

for column in SPECIFIC_DATASET_COLUMNS:
    create_new_column_with_defined_value(eu_cellar_df, column_name=column, empty_array=True)

# Display the placeholder columns to confirm they were created.
eu_cellar_df[SPECIFIC_DATASET_COLUMNS]



Unnamed: 0_level_0,eu_cellar_subject_matter_labels,eu_cellar_resource_type_labels,eu_cellar_directory_code_labels,eu_cellar_author_labels,pwdb_target_group_l2,ireland_keyword,ireland_department_data,ireland_campaign,ireland_page_type,eu_timeline_topic
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,[],[],[],[],[],[],[],[],[],[]
1,[],[],[],[],[],[],[],[],[],[]
2,[],[],[],[],[],[],[],[],[],[]
3,[],[],[],[],[],[],[],[],[],[]
4,[],[],[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...
3170,[],[],[],[],[],[],[],[],[],[]
3171,[],[],[],[],[],[],[],[],[],[]
3172,[],[],[],[],[],[],[],[],[],[]
3173,[],[],[],[],[],[],[],[],[],[]


In [19]:
#add values if the specific column exists in the current dataset
# Explicit source -> unified column mapping. The names cannot be derived by prefixing alone:
# the source column is "directory_codes_labels" while the unified schema declares
# "eu_cellar_directory_code_labels", so prefixing wrote to a column that was never merged
# and left the schema column empty.
eu_cellar_specific_columns = {"subject_matter_labels": "eu_cellar_subject_matter_labels",
                              "resource_type_labels": "eu_cellar_resource_type_labels",
                              "directory_codes_labels": "eu_cellar_directory_code_labels",
                              "author_labels": "eu_cellar_author_labels"}
for column_name, unified_column_name in eu_cellar_specific_columns.items():
    # Missing/empty values are normalised to an empty list.
    eu_cellar_df[unified_column_name] = eu_cellar_df[column_name].apply(lambda x: x if x else [])


In [20]:
# Rename the enrichment columns to their unified-schema names (in place).
eu_cellar_df.rename(columns={"category": "pwdb_category", "funding": "pwdb_funding",
                             "type_of_measure": "pwdb_type_of_measure"}, inplace=True)

# Keep only the unified columns, in schema order, as the version that will be merged.
tmp_eu_cellar_df = pd.DataFrame(eu_cellar_df[COMMON_DATASET_COLUMNS + SPECIFIC_DATASET_COLUMNS])


In [21]:
# Guard: the EU Cellar frame prepared for merging must expose exactly the unified columns, in order.
if tmp_eu_cellar_df.columns.tolist() != COMMON_DATASET_COLUMNS + SPECIFIC_DATASET_COLUMNS:
    raise ValueError("Some columns are missing!! Not ready for merge")
print("ready for merge")



ready for merge


In [22]:
# Inspect the columns available in the EU timeline dataset before mapping them to the unified schema.
eu_timeline_df.columns

Index(['month_name', 'date', 'title', 'abstract', 'presscorner_links',
       'all_links', 'detail_link', 'detail_type', 'detail_date',
       'detail_location', 'detail_content', 'detail_title',
       'for_more_information_links', 'detail_pdf_link', 'press_contacts',
       'topics', 'businesses', 'citizens', 'workers', 'category',
       'subcategory', 'type_of_measure', 'funding'],
      dtype='object')

In [23]:
# create columns
# The unified "content" is the title, the abstract and the detail content joined by spaces;
# missing/empty parts contribute an empty string.
CONTENT_COLUMNS = ["title", "abstract", "detail_content"]
eu_timeline_df["content"] = eu_timeline_df[CONTENT_COLUMNS].agg(lambda x: " ".join(item if item else "" for item in x),
                                                                axis=1)
# Constant columns that are the same for every EU timeline entry.
create_new_column_with_defined_value(eu_timeline_df, "doc_source", "ds_eu_timeline")
create_new_column_with_defined_value(eu_timeline_df, "country", "European Union")
create_new_column_with_defined_value(eu_timeline_df, "pwdb_actors", "EU (Council, EC, EP)")
make_target_group_l1_column(eu_timeline_df)

# Embeddings are computed after "content" has been assembled, since they are derived from it.
make_document_embeddings_column(eu_timeline_df)
make_topic_embeddings_column(eu_timeline_df)
# create specific dataset columns
# Every source-specific column starts as an empty placeholder; a later cell fills the EU timeline one.

for column in SPECIFIC_DATASET_COLUMNS:
    create_new_column_with_defined_value(eu_timeline_df, column_name=column, empty_array=True)

# Display the placeholder columns to confirm they were created.
eu_timeline_df[SPECIFIC_DATASET_COLUMNS]

Unnamed: 0_level_0,eu_cellar_subject_matter_labels,eu_cellar_resource_type_labels,eu_cellar_directory_code_labels,eu_cellar_author_labels,pwdb_target_group_l2,ireland_keyword,ireland_department_data,ireland_campaign,ireland_page_type,eu_timeline_topic
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,[],[],[],[],[],[],[],[],[],[]
1,[],[],[],[],[],[],[],[],[],[]
2,[],[],[],[],[],[],[],[],[],[]
3,[],[],[],[],[],[],[],[],[],[]
4,[],[],[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...
226,[],[],[],[],[],[],[],[],[],[]
227,[],[],[],[],[],[],[],[],[],[]
228,[],[],[],[],[],[],[],[],[],[]
229,[],[],[],[],[],[],[],[],[],[]


In [24]:
# Fill the EU-timeline-specific column: topics are copied unchanged from "topics".
# NOTE(review): unlike the EU Cellar and Ireland cells, empty values are not normalised to [] here.
eu_timeline_df["eu_timeline_topic"] = eu_timeline_df["topics"]

In [25]:
# Rename the enrichment columns to their unified-schema names (in place).
eu_timeline_df.rename(columns={"category": "pwdb_category",
                               "funding": "pwdb_funding",
                               "type_of_measure": "pwdb_type_of_measure"}, inplace=True)

# Keep only the unified columns, in schema order, as the version that will be merged.
tmp_eu_timeline_df = pd.DataFrame(eu_timeline_df[COMMON_DATASET_COLUMNS + SPECIFIC_DATASET_COLUMNS])


In [26]:
# Guard: the EU timeline frame prepared for merging must expose exactly the unified columns, in order.
if tmp_eu_timeline_df.columns.tolist() != COMMON_DATASET_COLUMNS + SPECIFIC_DATASET_COLUMNS:
    raise ValueError("Some columns are missing!! Not ready for merge")
print("ready for merge")



ready for merge


In [27]:
# Inspect the columns available in the Ireland timeline dataset before mapping them to the unified schema.
ir_timeline_df.columns



Index(['keyword', 'page_type', 'page_link', 'department_data',
       'published_date', 'updated_date', 'title', 'content', 'content_links',
       'campaigns_links', 'part_of_links', 'documents', 'businesses',
       'citizens', 'workers', 'category', 'subcategory', 'type_of_measure',
       'funding'],
      dtype='object')

In [28]:
# create columns
# The unified "content" is the title followed by the original content.
# NOTE(review): "content" is both an input and the output here, so re-running this cell
# prepends the title a second time — run it once per freshly loaded dataframe.
CONTENT_COLUMNS = ["title", "content"]
ir_timeline_df["content"] = ir_timeline_df[CONTENT_COLUMNS].agg(
    lambda x: " ".join(item if item else "" for item in x),
    axis=1)
# Ireland documents get their own source tag; the value used before ("ds_eu_timeline")
# was copied from the EU timeline cell and made the two sources indistinguishable.
create_new_column_with_defined_value(ir_timeline_df, "doc_source", "ds_ireland_timeline")
create_new_column_with_defined_value(ir_timeline_df, "country", "Ireland")
create_new_column_with_defined_value(ir_timeline_df, "pwdb_actors", "National government")
make_target_group_l1_column(ir_timeline_df)
# The unified schema requires a "date" column, which the Ireland index does not provide.
# NOTE(review): "published_date" is used as the document date — confirm it is preferred over "updated_date".
ir_timeline_df["date"] = ir_timeline_df["published_date"]

# Embeddings are computed after "content" has been assembled, since they are derived from it.
make_document_embeddings_column(ir_timeline_df)
make_topic_embeddings_column(ir_timeline_df)
# create specific dataset columns
# Every source-specific column starts as an empty placeholder; a later cell fills the Ireland ones.

for column in SPECIFIC_DATASET_COLUMNS:
    create_new_column_with_defined_value(ir_timeline_df, column_name=column, empty_array=True)

# Display the placeholder columns to confirm they were created.
ir_timeline_df[SPECIFIC_DATASET_COLUMNS]


Unnamed: 0_level_0,eu_cellar_subject_matter_labels,eu_cellar_resource_type_labels,eu_cellar_directory_code_labels,eu_cellar_author_labels,pwdb_target_group_l2,ireland_keyword,ireland_department_data,ireland_campaign,ireland_page_type,eu_timeline_topic
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,[],[],[],[],[],[],[],[],[],[]
1,[],[],[],[],[],[],[],[],[],[]
2,[],[],[],[],[],[],[],[],[],[]
3,[],[],[],[],[],[],[],[],[],[]
4,[],[],[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...
1916,[],[],[],[],[],[],[],[],[],[]
1917,[],[],[],[],[],[],[],[],[],[]
1918,[],[],[],[],[],[],[],[],[],[]
1919,[],[],[],[],[],[],[],[],[],[]


In [29]:
#add values if the specific column exists in the current dataset
# Explicit source -> unified column mapping. The names cannot be derived by prefixing alone:
# the source column is "campaigns_links" while the unified schema declares "ireland_campaign",
# so prefixing wrote to a column that was never merged and left the schema column empty.
ir_timeline_specific_columns = {"keyword": "ireland_keyword",
                                "department_data": "ireland_department_data",
                                "campaigns_links": "ireland_campaign",
                                "page_type": "ireland_page_type"}
for column_name, unified_column_name in ir_timeline_specific_columns.items():
    # Missing/empty values are normalised to an empty list.
    ir_timeline_df[unified_column_name] = ir_timeline_df[column_name].apply(lambda x: x if x else [])

In [30]:

# Rename the enrichment columns to their unified-schema names (in place).
ir_timeline_df.rename(columns={"category": "pwdb_category",
                               "funding": "pwdb_funding",
                               "type_of_measure": "pwdb_type_of_measure"}, inplace=True)

In [31]:
# Keep only the unified columns, in schema order; this selection requires every common and
# specific column (including "date") to be present on ir_timeline_df.
tmp_ir_timeline_df = pd.DataFrame(ir_timeline_df[COMMON_DATASET_COLUMNS + SPECIFIC_DATASET_COLUMNS])

In [32]:

# Verify if the new version has all the necessary columns
# The check must look at the Ireland frame; testing tmp_eu_timeline_df here (as before)
# only re-validated the previous dataset and could never detect a problem with this one.
if tmp_ir_timeline_df.columns.tolist() == COMMON_DATASET_COLUMNS + SPECIFIC_DATASET_COLUMNS:
    print("ready for merge")
else:
    raise ValueError("Some columns are missing!! Not ready for merge")


ready for merge


In [33]:
def replace_non_english_content(text, min_probability: float = 0.95):
    """Return ``text`` when it is confidently English, otherwise ``None``.

    :param text: candidate document text; ``None``, non-string and blank values yield ``None``
    :param min_probability: the detected probability of English must be strictly greater
        than this value (defaults to the previously fixed 0.95)
    :return: the unchanged ``text`` or ``None``
    """
    # Nothing to detect: blank or non-string input cannot be English content.
    if not isinstance(text, str) or not text.strip():
        return None

    from langdetect.lang_detect_exception import LangDetectException

    try:
        candidates = detect_langs(text)
    except LangDetectException:
        # The detector could not extract any language features from the text
        # (presumably e.g. digits or punctuation only); treat it as non-English
        # instead of aborting the whole column transformation.
        return None
    if not candidates:
        return None

    # The first candidate is taken as the best guess; read its fields directly
    # rather than parsing its "lang:probability" string form.
    best_guess = candidates[0]
    if best_guess.lang == "en" and best_guess.prob > min_probability:
        return text
    return None




In [34]:
# Drop rows that lack usable English content, a title or a date (each frame is filtered in place).
data_frames = [tmp_eu_cellar_df, tmp_eu_timeline_df, tmp_ir_timeline_df, tmp_pwdb_df]
for data_frame in data_frames:
    data_frame.columns = COMMON_DATASET_COLUMNS + SPECIFIC_DATASET_COLUMNS
    # Two passes over "content": blank strings become None, then non-English text becomes None.
    data_frame["content"] = (
        data_frame["content"]
        .apply(lambda text: text if text not in ["", " "] else None)
        .apply(replace_non_english_content)
    )
    data_frame.dropna(subset=['content', 'title', 'date'], how="any", inplace=True)






In [35]:
# Stack the four filtered frames into the unified dataset. Each frame keeps its own index,
# so the result mixes index styles (visible in the output below).
unified_datasets_df = pd.DataFrame(pd.concat(data_frames))
# NOTE(review): this displays the whole frame; .head() would keep the notebook output small.
unified_datasets_df



Unnamed: 0_level_0,title,content,date,doc_source,country,pwdb_category,pwdb_target_group_l1,pwdb_funding,pwdb_type_of_measure,pwdb_actors,...,eu_cellar_subject_matter_labels,eu_cellar_resource_type_labels,eu_cellar_directory_code_labels,eu_cellar_author_labels,pwdb_target_group_l2,ireland_keyword,ireland_department_data,ireland_campaign,ireland_page_type,eu_timeline_topic
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Prior notification of a concentration (Case M....,Prior notification of a concentration (Case M....,2020-01-30,ds_eu_cellar,European Union,Supporting businesses to get back to normal,[],National funds,Legislations or other statutory regulations,"EU (Council, EC, EP)",...,"[Competition, Concentrations between undertaki...",[Announcements],[],"[Directorate-General for Competition, European...",[],[],[],[],[],[]
1,COMMISSION STAFF WORKING DOCUMENT […] Accompan...,COMMISSION STAFF WORKING DOCUMENT […] Accompan...,2021-08-30,ds_eu_cellar,European Union,"Promoting the economic, labour market and soci...",[],European Funds|National funds|Regional funds,Legislations or other statutory regulations,"EU (Council, EC, EP)",...,"[Commercial policy, Dumping]",[Staff working document],[],"[Directorate-General for Trade, European Commi...",[],[],[],[],[],[]
2,P9_TA(2020)0157 Amending Regulations (EU) No 5...,P9_TA(2020)0157 Amending Regulations (EU) No 5...,2020-06-18,ds_eu_cellar,European Union,Supporting businesses to get back to normal,[],No special funding required,Legislations or other statutory regulations,"EU (Council, EC, EP)",...,"[Freedom of establishment, Internal market - P...",[Legislative resolution],[],"[Committee on Economic and Monetary Affairs, E...",[],[],[],[],[],[]
3,Opinion No 6/2020 (pursuant to Article 287(4) ...,Opinion No 6/2020 (pursuant to Article 287(4) ...,2020-09-07,ds_eu_cellar,European Union,"Promoting the economic, labour market and soci...",[],No special funding required,Legislations or other statutory regulations,"EU (Council, EC, EP)",...,[Economic policy],[Opinion],[],[European Court of Auditors],[],[],[],[],[],[]
4,Council Recommendation (EU) 2021/816 20 May 20...,Council Recommendation (EU) 2021/816 20 May 20...,2021-05-20,ds_eu_cellar,European Union,Ensuring business continuity and support for e...,[],No special funding required,Legislations or other statutory regulations,"EU (Council, EC, EP)",...,"[Area of freedom, security and justice, Border...",[Recommendation],[],[Council of the European Union],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tika/330d076853d0731e295d75f70841b445e4bc45061a8ae1507766fa26d778cc6f,Government support to assist the sports sector...,Government support to assist the sports sector...,2020-04-09,ds_pwdb,Croatia,Employment protection and retention,[businesses],[National funds],Legislations or other statutory regulations,[National government],...,[],[],[],[],[Sector specific set of companies],[],[],[],[],[]
tika/ffaa29a24828dd01c81d66ece0f8f3c21451083be7a0b6d2f4ed954b9378b0e6,National public sector collective agreement 2021,National public sector collective agreement 20...,2021-01-01,ds_pwdb,Lithuania,"Protection of workers, adaptation of workplace",[workers],"[Employer, National funds]",Bipartite collective agreements,"[National government, Trade unions]",...,[],[],[],[],[Other groups of workers],[],[],[],[],[]
tika/3df93a8fb254187e8783d19760b465a98d88863e59a513b35a4d8062248e4190,"Lifting of overtime work restrictions, extensi...","Lifting of overtime work restrictions, extensi...",2020-03-14,ds_pwdb,Greece,Ensuring business continuity and support for e...,[workers],[Companies],Legislations or other statutory regulations,"[National government, Company / Companies]",...,[],[],[],[],[Employees in standard employment],[],[],[],[],[]
tika/71fc1cad3a3a8b9de4da7080acc3a8251fa8ff8c5b7514f082f18c1cf69ab140,Exceptional regime suspending overtime limits,Exceptional regime suspending overtime limits ...,2020-03-14,ds_pwdb,Portugal,Ensuring business continuity and support for e...,[workers],[No special funding required],Legislations or other statutory regulations,"[National government, Local / regional governm...",...,[],[],[],[],"[Other groups of workers, Workers in essential...",[],[],[],[],[]


In [59]:
def filter_func(x):
    """Split a '|'-delimited string into a list of its parts.

    Any value that is not exactly of type ``str`` is returned unchanged.
    """
    return x.split('|') if type(x) is str else x


# Normalise funding: '|'-delimited strings become lists; other values are passed through by filter_func.
unified_datasets_df.pwdb_funding = unified_datasets_df.pwdb_funding.apply(filter_func)

In [65]:
# Normalise actors: any value that is not already a list is wrapped in a single-element list.
unified_datasets_df.pwdb_actors = unified_datasets_df.pwdb_actors.apply(lambda x: x if type(x) == list else [x])

In [67]:
# Spot-check that every actors entry is now a list.
unified_datasets_df.pwdb_actors.values

array([list(['EU (Council, EC, EP)']), list(['EU (Council, EC, EP)']),
       list(['EU (Council, EC, EP)']), ...,
       list(['National government', 'Company / Companies']),
       list(['National government', 'Local / regional government', 'Public support service providers']),
       list(['National government', 'Trade unions', "Employers' organisations"])],
      dtype=object)

In [68]:
# Work on a copy so that later changes to the upload frame do not touch unified_datasets_df.
test_unified_datasets_df = pd.DataFrame(unified_datasets_df.copy())

In [69]:
# Discard the per-source indices (numeric for some sources, "tika/..." ids for PWDB) and renumber rows.
test_unified_datasets_df.reset_index(inplace=True, drop=True)


In [72]:
# Upload the unified dataset to its index.
# NOTE(review): this requests a store from the registry again instead of reusing es_store.
store_registry.es_index_store().put_dataframe(index_name=config.UNIFIED_DATASET_ELASTIC_SEARCH_INDEX_NAME,
                                              content=test_unified_datasets_df)




 96% (4512 of 4690) |################### | Elapsed Time: 0:00:00 ETA:   0:00:00

4690

In [73]:
from typing import List
from sem_covid.adapters.abstract_store import IndexStoreABC

# Names of the unified-schema columns that the transformer writes directly.
CONTENT_COLUMN_NAME = 'content'
TITLE_COLUMN_NAME = 'title'
DATE_COLUMN_NAME = 'date'
DOCUMENT_SOURCE_COLUMN_NAME = 'doc_source'
COUNTRY_COLUMN_NAME = 'country'
PWDB_ACTORS_COLUMN_NAME = "pwdb_actors"

#PWDB CONSTANTS
PWDB_CONTENT_COLUMNS = ['title', 'background_info_description', 'content_of_measure_description',
                        'use_of_measure_description', 'involvement_of_social_partners_description']
PWDB_DOC_SOURCE = 'ds_pwdb'
PWDB_RENAME_COLUMNS_MAPPING = {"category": "pwdb_category", "funding": "pwdb_funding",
                               "type_of_measure": "pwdb_type_of_measure", "actors": "pwdb_actors", "start_date": "date"}

#EU_CELLAR CONSTANTS
EU_CELLAR_CONTENT_COLUMNS = ["title", "content"]
EU_CELLAR_DOC_SOURCE = 'ds_eu_cellar'
EU_CELLAR_COUNTRY_NAME = "European Union"
EU_CELLAR_PWDB_ACTORS = "EU (Council, EC, EP)"
# "dates_document" supplies the unified "date" column (the step-by-step cell for EU Cellar
# copies it by hand); without this entry the pipeline has no "date" column for this dataset.
EU_CELLAR_RENAME_COLUMNS_MAPPING = {"category": "pwdb_category", "funding": "pwdb_funding",
                                    "type_of_measure": "pwdb_type_of_measure",
                                    "dates_document": "date"}

#EU_TIMELINE CONSTANTS
# The EU timeline dataset already has a "date" column, so no date mapping is needed.
EU_TIMELINE_CONTENT_COLUMNS = ["title", "abstract", "detail_content"]
EU_TIMELINE_DOC_SOURCE = 'ds_eu_timeline'
EU_TIMELINE_COUNTRY_NAME = "European Union"
EU_TIMELINE_PWDB_ACTORS = "EU (Council, EC, EP)"
EU_TIMELINE_RENAME_COLUMNS_MAPPING = {"category": "pwdb_category",
                                      "funding": "pwdb_funding",
                                      "type_of_measure": "pwdb_type_of_measure"}

#IRELAND_TIMELINE CONSTANTS
IRELAND_TIMELINE_CONTENT_COLUMNS = ["title", "content"]
IRELAND_TIMELINE_DOC_SOURCE = 'ds_ireland_timeline'
IRELAND_TIMELINE_COUNTRY_NAME = "Ireland"
IRELAND_TIMELINE_PWDB_ACTORS = "National government"
# The Ireland dataset has no "date" column, but the unified schema requires one.
# NOTE(review): "published_date" is used as the document date — confirm it is preferred over "updated_date".
IRELAND_TIMELINE_RENAME_COLUMNS_MAPPING = {"category": "pwdb_category",
                                           "funding": "pwdb_funding",
                                           "type_of_measure": "pwdb_type_of_measure",
                                           "published_date": "date"}

SyntaxError: invalid syntax (269289113.py, line 6)

In [None]:
def pwdb_replace_function(dataset: pd.DataFrame):
    """Fill the PWDB-specific column "pwdb_target_group_l2" from "target_groups".

    :param dataset: PWDB frame with a "target_groups" column; modified in place
    """
    level_two_groups = dataset["target_groups"]
    dataset["pwdb_target_group_l2"] = level_two_groups


def eu_timeline_replace_function(dataset: pd.DataFrame):
    """Fill the EU-timeline-specific column "eu_timeline_topic" from "topics".

    :param dataset: EU timeline frame with a "topics" column; modified in place
    """
    timeline_topics = dataset["topics"]
    dataset["eu_timeline_topic"] = timeline_topics


def eu_cellar_replace_function(dataset: pd.DataFrame):
    """Fill the EU-Cellar-specific unified columns from their source columns.

    The source and unified names are mapped explicitly because they do not differ by a
    prefix alone: the source column is "directory_codes_labels" whereas the unified
    schema declares "eu_cellar_directory_code_labels". Deriving the target by prefixing
    wrote to a column outside the schema and left the schema column empty.

    Missing or empty source values are normalised to an empty list.

    :param dataset: EU Cellar frame holding the four source label columns; modified in place
    """
    source_to_unified_column = {"subject_matter_labels": "eu_cellar_subject_matter_labels",
                                "resource_type_labels": "eu_cellar_resource_type_labels",
                                "directory_codes_labels": "eu_cellar_directory_code_labels",
                                "author_labels": "eu_cellar_author_labels"}
    for source_column, unified_column in source_to_unified_column.items():
        dataset[unified_column] = dataset[source_column].apply(lambda x: x if x else [])


def ireland_timeline_replace_function(dataset: pd.DataFrame):
    """Fill the Ireland-specific unified columns from their source columns.

    The source and unified names are mapped explicitly because they do not differ by a
    prefix alone: the source column is "campaigns_links" whereas the unified schema
    declares "ireland_campaign". Deriving the target by prefixing wrote to a column
    outside the schema and left the schema column empty.

    Missing or empty source values are normalised to an empty list.

    :param dataset: Ireland timeline frame holding the four source columns; modified in place
    """
    source_to_unified_column = {"keyword": "ireland_keyword",
                                "department_data": "ireland_department_data",
                                "campaigns_links": "ireland_campaign",
                                "page_type": "ireland_page_type"}
    for source_column, unified_column in source_to_unified_column.items():
        dataset[unified_column] = dataset[source_column].apply(lambda x: x if x else [])



In [None]:
class DefaultDatasetStructureTransformer:
    """Reshape one source dataframe into the unified dataset column layout.

    The dataframe passed in is modified in place. ``execute`` runs the three steps in
    order: create the common and placeholder columns, fill the dataset-specific columns
    through ``replace_function``, then rename columns according to ``rename_columns_mapping``.
    """

    def __init__(self, dataset: pd.DataFrame,
                 content_columns: List[str],
                 doc_source: str,
                 replace_function: callable,
                 rename_columns_mapping: dict,
                 country: str = None,
                 pwdb_actors: str = None
                 ):
        """
        :param dataset: source dataframe to transform (mutated)
        :param content_columns: columns whose text is concatenated into the content column
        :param doc_source: constant written to the document-source column
        :param replace_function: callable receiving the dataframe; fills dataset-specific columns
        :param rename_columns_mapping: old-name -> new-name mapping applied last
        :param country: optional constant for the country column (skipped when falsy)
        :param pwdb_actors: optional constant for the actors column (skipped when falsy)
        """
        self.dataset = dataset
        self.content_columns = content_columns
        self.doc_source = doc_source
        self.replace_function = replace_function
        self.rename_columns_mapping = rename_columns_mapping
        self.country = country
        self.pwdb_actors = pwdb_actors

    def create_columns(self):
        """Build content, constant, target-group, embedding and placeholder columns."""
        frame = self.dataset
        # Concatenate the text columns row by row; missing/empty parts contribute "".
        frame[CONTENT_COLUMN_NAME] = frame[self.content_columns].agg(
            lambda parts: " ".join(part if part else "" for part in parts),
            axis=1)
        create_new_column_with_defined_value(frame, DOCUMENT_SOURCE_COLUMN_NAME, self.doc_source)
        # Country and actors are only written when a constant was supplied.
        if self.country:
            create_new_column_with_defined_value(frame, COUNTRY_COLUMN_NAME, self.country)
        if self.pwdb_actors:
            create_new_column_with_defined_value(frame, PWDB_ACTORS_COLUMN_NAME, self.pwdb_actors)
        make_target_group_l1_column(frame)
        make_document_embeddings_column(frame)
        make_topic_embeddings_column(frame)

        # Every dataset-specific column starts out as an empty placeholder.
        for specific_column in SPECIFIC_DATASET_COLUMNS:
            create_new_column_with_defined_value(frame, column_name=specific_column, empty_array=True)

    def replace_values(self):
        """Let the dataset-specific callable fill its columns."""
        fill_specific_columns = self.replace_function
        fill_specific_columns(self.dataset)

    def rename_columns(self):
        """Rename columns in place to their unified-schema names."""
        self.dataset.rename(columns=self.rename_columns_mapping, inplace=True)

    def execute(self) -> pd.DataFrame:
        """Run all steps in order and return the (mutated) dataframe."""
        for step in (self.create_columns, self.replace_values, self.rename_columns):
            step()
        return self.dataset


class UnifiedDatasetPipeline:
    """Extract / transform / load pipeline that merges the four source datasets.

    ``extract`` reads the source dataframes from the index store, ``transform`` reshapes
    and filters them and concatenates the result into ``unified_dataset``, and ``load``
    writes that dataframe back to the unified index.
    """

    def __init__(self, es_store: IndexStoreABC):
        """
        :param es_store: index store used both for reading the sources and for writing the result
        """
        self.es_store = es_store
        # Empty placeholders until extract() / transform() populate them.
        self.pwdb_df = pd.DataFrame()
        self.eu_cellar_df = pd.DataFrame()
        self.eu_timeline_df = pd.DataFrame()
        self.ir_timeline_df = pd.DataFrame()
        self.unified_dataset = pd.DataFrame()

    def get_steps(self) -> list:
        """Return the pipeline stages in the order they must be called."""
        steps = [self.extract, self.transform, self.load]
        return steps

    def extract(self):
        """Read the four source dataframes from the index store."""
        # PWDB comes from its base index; the other sources come from the "_enriched" variant.
        fetch = self.es_store.get_dataframe
        self.pwdb_df = fetch(index_name=config.PWDB_ELASTIC_SEARCH_INDEX_NAME)
        self.eu_cellar_df = fetch(index_name=config.EU_CELLAR_ELASTIC_SEARCH_INDEX_NAME + "_enriched")
        self.eu_timeline_df = fetch(index_name=config.EU_TIMELINE_ELASTIC_SEARCH_INDEX_NAME + "_enriched")
        self.ir_timeline_df = fetch(index_name=config.IRELAND_TIMELINE_ELASTIC_SEARCH_INDEX_NAME + "_enriched")

    def transform(self):
        """Reshape each source, drop unusable rows and build ``unified_dataset``."""
        # 1. Bring every source into the unified column layout.
        pwdb_transformer = DefaultDatasetStructureTransformer(
            dataset=self.pwdb_df,
            content_columns=PWDB_CONTENT_COLUMNS,
            doc_source=PWDB_DOC_SOURCE,
            replace_function=pwdb_replace_function,
            rename_columns_mapping=PWDB_RENAME_COLUMNS_MAPPING
        )
        self.pwdb_df = pwdb_transformer.execute()

        eu_cellar_transformer = DefaultDatasetStructureTransformer(
            dataset=self.eu_cellar_df,
            content_columns=EU_CELLAR_CONTENT_COLUMNS,
            doc_source=EU_CELLAR_DOC_SOURCE,
            replace_function=eu_cellar_replace_function,
            rename_columns_mapping=EU_CELLAR_RENAME_COLUMNS_MAPPING,
            country=EU_CELLAR_COUNTRY_NAME,
            pwdb_actors=EU_CELLAR_PWDB_ACTORS
        )
        self.eu_cellar_df = eu_cellar_transformer.execute()

        eu_timeline_transformer = DefaultDatasetStructureTransformer(
            dataset=self.eu_timeline_df,
            content_columns=EU_TIMELINE_CONTENT_COLUMNS,
            doc_source=EU_TIMELINE_DOC_SOURCE,
            replace_function=eu_timeline_replace_function,
            rename_columns_mapping=EU_TIMELINE_RENAME_COLUMNS_MAPPING,
            country=EU_TIMELINE_COUNTRY_NAME,
            pwdb_actors=EU_TIMELINE_PWDB_ACTORS
        )
        self.eu_timeline_df = eu_timeline_transformer.execute()

        ir_timeline_transformer = DefaultDatasetStructureTransformer(
            dataset=self.ir_timeline_df,
            content_columns=IRELAND_TIMELINE_CONTENT_COLUMNS,
            doc_source=IRELAND_TIMELINE_DOC_SOURCE,
            replace_function=ireland_timeline_replace_function,
            rename_columns_mapping=IRELAND_TIMELINE_RENAME_COLUMNS_MAPPING,
            country=IRELAND_TIMELINE_COUNTRY_NAME,
            pwdb_actors=IRELAND_TIMELINE_PWDB_ACTORS
        )
        self.ir_timeline_df = ir_timeline_transformer.execute()

        # 2. Keep only the unified columns, working on copies of the transformed frames.
        unified_columns = COMMON_DATASET_COLUMNS + SPECIFIC_DATASET_COLUMNS
        data_frames = []
        for transformed in (self.pwdb_df, self.eu_cellar_df, self.eu_timeline_df, self.ir_timeline_df):
            data_frames.append(pd.DataFrame(transformed[unified_columns].copy()))

        def blank_to_none(text):
            return text if text not in ["", " "] else None

        # 3. Blank content -> None, non-English content -> None, then drop incomplete rows.
        for data_frame in data_frames:
            data_frame.columns = COMMON_DATASET_COLUMNS + SPECIFIC_DATASET_COLUMNS
            data_frame[CONTENT_COLUMN_NAME] = data_frame[CONTENT_COLUMN_NAME].apply(blank_to_none)
            data_frame[CONTENT_COLUMN_NAME] = data_frame[CONTENT_COLUMN_NAME].apply(replace_non_english_content)
            data_frame.dropna(subset=[CONTENT_COLUMN_NAME, 'title', 'date'], how="any", inplace=True)

        # 4. Merge and normalise the list-valued columns.
        self.unified_dataset = pd.DataFrame(pd.concat(data_frames))
        # '|'-delimited funding strings become lists; other values are left as they are.
        self.unified_dataset["pwdb_funding"] = self.unified_dataset["pwdb_funding"].apply(
            lambda funding: funding.split('|') if type(funding) == str else funding)
        # Any actors value that is not already a list is wrapped in a single-element list.
        self.unified_dataset["pwdb_actors"] = self.unified_dataset["pwdb_actors"].apply(
            lambda actors: actors if type(actors) == list else [actors])
        #Note:the index is reset because enriched datasets have numeric indexing
        self.unified_dataset.reset_index(inplace=True, drop=True)

    def load(self):
        """Write ``unified_dataset`` to the unified index."""
        target_index = config.UNIFIED_DATASET_ELASTIC_SEARCH_INDEX_NAME
        self.es_store.put_dataframe(index_name=target_index,
                                    content=self.unified_dataset)


