In [1]:
import json
import os
import re
import sys

import numpy as np
import pandas as pd

# Location of the sem-covid checkout; it is both added to the import path
# and used as the working directory of the notebook.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

# Make the project importable. dict.fromkeys de-duplicates while keeping the
# existing search order, so re-running this cell neither grows sys.path nor
# reshuffles import precedence (a set() round-trip has arbitrary order).
sys.path.append(PROJECT_ROOT)
sys.path = list(dict.fromkeys(sys.path))

# The sem_covid imports below run only after the working directory is switched.
os.chdir(PROJECT_ROOT)

from langdetect import detect, detect_langs, DetectorFactory

from sem_covid import config
from sem_covid.services.store_registry import StoreRegistry, store_registry

# Fixed langdetect seed so that language detection gives repeatable results.
DetectorFactory.seed = 0


In [2]:
# Handle to the Elasticsearch index store; used below to read the source datasets
es_store = store_registry.es_index_store()

In [3]:
# Fetch the four source datasets from Elasticsearch, in this order:
# PWDB, EU Cellar, EU timeline, Ireland timeline
pwdb_df, eu_cellar_df, eu_timeline_df, ir_timeline_df = (
    es_store.get_dataframe(index_name=source_index)
    for source_index in (
        config.PWDB_ELASTIC_SEARCH_INDEX_NAME,
        config.EU_CELLAR_ELASTIC_SEARCH_INDEX_NAME,
        config.EU_TIMELINE_ELASTIC_SEARCH_INDEX_NAME,
        config.IRELAND_TIMELINE_ELASTIC_SEARCH_INDEX_NAME,
    )
)


100% (1381 of 1381) |####################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (3175 of 3175) |####################| Elapsed Time: 0:00:01 Time:  0:00:01
100% (210 of 210) |######################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (1921 of 1921) |####################| Elapsed Time: 0:00:00 Time:  0:00:00


In [4]:
# List the columns available in the PWDB dataframe
pwdb_df.columns


Index(['identifier', 'title', 'title_national_language', 'country',
       'start_date', 'end_date', 'date_type', 'type_of_measure',
       'status_of_regulation', 'category', 'subcategory', 'creation_date',
       'background_info_description', 'content_of_measure_description',
       'use_of_measure_description', 'actors', 'target_groups', 'funding',
       'involvement_of_social_partners_description',
       'social_partner_involvement_form', 'social_partner_role',
       'is_sector_specific', 'private_or_public_sector',
       'is_occupation_specific', 'sectors', 'occupations', 'sources',
       'businesses', 'citizens', 'workers'],
      dtype='object')

In [5]:
# Identify the free-text columns of the PWDB dataset: those whose name contains "description"
textual_colums = list(filter(lambda column_name: "description" in column_name, pwdb_df.columns))
print(textual_colums)

['background_info_description', 'content_of_measure_description', 'use_of_measure_description', 'involvement_of_social_partners_description']


In [6]:
# Preview only the free-text ("description") columns of the PWDB dataset
pwdb_df[textual_colums]


Unnamed: 0_level_0,background_info_description,content_of_measure_description,use_of_measure_description,involvement_of_social_partners_description
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tika/2ec585e04df9d361a951e7d26ce5d3ab06e4d17612e36ee84696c1451c972c4b,"This measure, introduced in the framework of t...","According to the Employment Relationship Act, ...",Companies that are not eligible are:\r\r* Stat...,No information available yet.
tika/497730ad75916dd4a3174469d0946ccf6218adfd9513cfffb52038a3dce3fa8d,Following up the decree no. 6 issued on 23 Feb...,The protocol introduced general rules to follo...,The protocol applies to all the kind of transp...,The protocol was negotiated by the Ministry fo...
tika/5f74df01de688c3083e10eb50ebd865b081ee3595a662118c3d1d4ca1ad09bc2,Regional ELY Centers (Centers for Economic Dev...,Business development aid granted by ELY Center...,"As of 28 August 2020, approximately 21,000 com...",There is no information regarding whether and ...
tika/ae1904ac24001af20cfaf9ce898605e95122371e26ae96b7337d114098819c33,Aiming at responding to the specific nature of...,The Order 3863-B/2020 of 27 March 2020 provide...,Not available,No involvement of social partners.
tika/3bafed913ae25dd3152fec477a007abfa079507146ee1b7f8b0a9abad10dc2aa,The Council of Ministers has approved a Royal ...,This set of measures has been articulated arou...,No data available.,No involvement has been reported.
...,...,...,...,...
tika/330d076853d0731e295d75f70841b445e4bc45061a8ae1507766fa26d778cc6f,"As sport events have been prohibited, the fina...",This Decision establishes measures to assist t...,The usage is still unknown but it can be asses...,
tika/ffaa29a24828dd01c81d66ece0f8f3c21451083be7a0b6d2f4ed954b9378b0e6,"On 16 October 2020, the Government of the Repu...","The NCA increased the BAOS, which is used to c...",According to the Ministry of Social Security a...,The NCA was signed by the Ministry of Social S...
tika/3df93a8fb254187e8783d19760b465a98d88863e59a513b35a4d8062248e4190,The government with a Legislative Act (Offici...,During the period when there is the risk of ou...,The number of employees affected by the measur...,
tika/71fc1cad3a3a8b9de4da7080acc3a8251fa8ff8c5b7514f082f18c1cf69ab140,"The Decree Law No. 10-A/2020 of 13 March, sett...","This measure applies to all bodies, agencies a...",Not evaluated yet,


In [7]:
# Merge the textual columns of each row into one new "merged_content" column.
# Missing (None/NaN) and blank fields are skipped so they do not end up in the
# text as the literal words "None"/"nan" or as stray separators.
pwdb_df["merged_content"] = pwdb_df[textual_colums].apply(
    lambda row: ' '.join(
        str(field) for field in row.values
        if pd.notna(field) and str(field).strip()
    ),
    axis=1,
)
pwdb_df["merged_content"]

_id
tika/2ec585e04df9d361a951e7d26ce5d3ab06e4d17612e36ee84696c1451c972c4b    This measure, introduced in the framework of t...
tika/497730ad75916dd4a3174469d0946ccf6218adfd9513cfffb52038a3dce3fa8d    Following up the decree no. 6 issued on 23 Feb...
tika/5f74df01de688c3083e10eb50ebd865b081ee3595a662118c3d1d4ca1ad09bc2    Regional ELY Centers (Centers for Economic Dev...
tika/ae1904ac24001af20cfaf9ce898605e95122371e26ae96b7337d114098819c33    Aiming at responding to the specific nature of...
tika/3bafed913ae25dd3152fec477a007abfa079507146ee1b7f8b0a9abad10dc2aa    The Council of Ministers has approved a Royal ...
                                                                                               ...                        
tika/330d076853d0731e295d75f70841b445e4bc45061a8ae1507766fa26d778cc6f    As sport events have been prohibited, the fina...
tika/ffaa29a24828dd01c81d66ece0f8f3c21451083be7a0b6d2f4ed954b9378b0e6    On 16 October 2020, the Government of the Repu...
tika/3df93a8

In [8]:
# Which PWDB columns hold dates? Select every column whose name contains "date"
date_columns = list(filter(lambda column_name: "date" in column_name, pwdb_df.columns))
pwdb_df.loc[:, date_columns]

Unnamed: 0_level_0,start_date,end_date,date_type,creation_date
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tika/2ec585e04df9d361a951e7d26ce5d3ab06e4d17612e36ee84696c1451c972c4b,2020-03-13,2021-06-30,,2020-04-10
tika/497730ad75916dd4a3174469d0946ccf6218adfd9513cfffb52038a3dce3fa8d,2020-05-04,,,2020-09-28
tika/5f74df01de688c3083e10eb50ebd865b081ee3595a662118c3d1d4ca1ad09bc2,2020-03-31,2020-06-08,,2020-04-10
tika/ae1904ac24001af20cfaf9ce898605e95122371e26ae96b7337d114098819c33,2020-03-28,,,2020-04-09
tika/3bafed913ae25dd3152fec477a007abfa079507146ee1b7f8b0a9abad10dc2aa,2020-07-07,,,2020-10-29
...,...,...,...,...
tika/330d076853d0731e295d75f70841b445e4bc45061a8ae1507766fa26d778cc6f,2020-04-09,2020-05-29,,2020-04-12
tika/ffaa29a24828dd01c81d66ece0f8f3c21451083be7a0b6d2f4ed954b9378b0e6,2021-01-01,2021-12-31,,2020-10-23
tika/3df93a8fb254187e8783d19760b465a98d88863e59a513b35a4d8062248e4190,2020-03-14,2020-06-13,,2020-04-16
tika/71fc1cad3a3a8b9de4da7080acc3a8251fa8ff8c5b7514f082f18c1cf69ab140,2020-03-14,,,2020-04-06


In [9]:
# Build the PWDB subset: creation date, title and merged text,
# tagged with the source it came from
tmp_pwdb_df = pd.DataFrame(
    pwdb_df[["creation_date", "title", "merged_content"]]
).assign(doc_source="pwdb")


In [10]:
# Inspect the PWDB subset (creation date, title, merged text, source tag)
tmp_pwdb_df

Unnamed: 0_level_0,creation_date,title,merged_content,doc_source
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tika/2ec585e04df9d361a951e7d26ce5d3ab06e4d17612e36ee84696c1451c972c4b,2020-04-10,Temporary layoff scheme and reimbursement of r...,"This measure, introduced in the framework of t...",pwdb
tika/497730ad75916dd4a3174469d0946ccf6218adfd9513cfffb52038a3dce3fa8d,2020-09-28,Protocol for the containment of the COVID-19 s...,Following up the decree no. 6 issued on 23 Feb...,pwdb
tika/5f74df01de688c3083e10eb50ebd865b081ee3595a662118c3d1d4ca1ad09bc2,2020-04-10,ELY centres' business development aid to micro...,Regional ELY Centers (Centers for Economic Dev...,pwdb
tika/ae1904ac24001af20cfaf9ce898605e95122371e26ae96b7337d114098819c33,2020-04-09,Regularisation of immigrants' presence on nati...,Aiming at responding to the specific nature of...,pwdb
tika/3bafed913ae25dd3152fec477a007abfa079507146ee1b7f8b0a9abad10dc2aa,2020-10-29,Economic reactivation measures to face the imp...,The Council of Ministers has approved a Royal ...,pwdb
...,...,...,...,...
tika/330d076853d0731e295d75f70841b445e4bc45061a8ae1507766fa26d778cc6f,2020-04-12,Government support to assist the sports sector...,"As sport events have been prohibited, the fina...",pwdb
tika/ffaa29a24828dd01c81d66ece0f8f3c21451083be7a0b6d2f4ed954b9378b0e6,2020-10-23,National public sector collective agreement 2021,"On 16 October 2020, the Government of the Repu...",pwdb
tika/3df93a8fb254187e8783d19760b465a98d88863e59a513b35a4d8062248e4190,2020-04-16,"Lifting of overtime work restrictions, extensi...",The government with a Legislative Act (Offici...,pwdb
tika/71fc1cad3a3a8b9de4da7080acc3a8251fa8ff8c5b7514f082f18c1cf69ab140,2020-04-06,Exceptional regime suspending overtime limits,"The Decree Law No. 10-A/2020 of 13 March, sett...",pwdb


In [11]:
# List the columns available in the EU Cellar dataframe
eu_cellar_df.columns

Index(['work', 'title', 'cdm_types', 'cdm_type_labels', 'resource_types',
       'resource_type_labels', 'eurovoc_concepts', 'eurovoc_concept_labels',
       'subject_matters', 'subject_matter_labels', 'directory_codes',
       'directory_codes_labels', 'celex_numbers', 'legal_elis', 'id_documents',
       'same_as_uris', 'authors', 'author_labels', 'full_ojs', 'oj_sectors',
       'internal_comments', 'is_in_force', 'dates_document', 'dates_created',
       'legal_dates_entry_into_force', 'legal_dates_signature', 'manifs_pdf',
       'manifs_html', 'pdfs_to_download', 'htmls_to_download', 'dossiers',
       'related_works', 'work_sequences', 'eu_cellar_core',
       'eu_cellar_extended', 'metadata', 'content_path', 'content',
       'language'],
      dtype='object')

In [12]:
# Which EU Cellar columns hold dates? Select every column whose name contains "date".
# The list is named date_columns, as in the equivalent cells for the other datasets.
date_columns = [col for col in eu_cellar_df.columns if "date" in col]
eu_cellar_df[date_columns]


Unnamed: 0_level_0,dates_document,dates_created,legal_dates_entry_into_force,legal_dates_signature
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0e3d35dcad11d1d80f8fb1c35599be27fb3e864874ab1ac679fcd004723bc2e3,2020-01-30,,,
39dac43e4fffa3fc60906f3563e4d1547aea4a0f94e17cd28ec144b117e25df6,2021-08-30,,,
00f65d205b8df943ae8acbd34b729c3da11da1646e7e324a6c517f95f950a5d9,2020-06-18,,,
002086bd15c9aba8b8b3cdf88498e25735cf66043ddab8877aca317256615aff,2020-09-07,,,
014589ceb95203c13ee4bd097ddf2f164657cb6aeb1650db280b2d8c7a57977d,2021-05-20,,,
...,...,...,...,...
5d1566cde8e0ff00d5e907e551d9202c714e320039c29609324e0eb579395aa1,2020-12-10,,,
f63ec4df58c38a804c983d295869f22e89515ef32c686841d43b7a9618e106d9,2021-02-24,,,
5c798a5a6aeb16a370e018faa10f1f79ce040d1adc8e0a2c4d06470a3d20f52b,2020-12-29,,,
f6334c565e8ab7de7306c50e94d8ef10c6cf38e8d39dc7d3be72aa244c522f22,2020-06-18,,,


In [13]:
# Build the EU Cellar subset: document date, title and content,
# tagged with the source it came from
tmp_cellar_df = pd.DataFrame(
    eu_cellar_df[["dates_document", "title", "content"]]
).assign(doc_source="eu_cellar")

In [14]:
# Inspect the EU Cellar subset (document date, title, content, source tag)
tmp_cellar_df


Unnamed: 0_level_0,dates_document,title,content,doc_source
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0e3d35dcad11d1d80f8fb1c35599be27fb3e864874ab1ac679fcd004723bc2e3,2020-01-30,Prior notification of a concentration (Case M....,30.1.2020 EN Official Journal of the European ...,eu_cellar
39dac43e4fffa3fc60906f3563e4d1547aea4a0f94e17cd28ec144b117e25df6,2021-08-30,COMMISSION STAFF WORKING DOCUMENT […] Accompan...,"EUROPEAN COMMISSION Brussels, 30.8.2021 SWD(20...",eu_cellar
00f65d205b8df943ae8acbd34b729c3da11da1646e7e324a6c517f95f950a5d9,2020-06-18,P9_TA(2020)0157 Amending Regulations (EU) No 5...,8.9.2021 EN Official Journal of the European U...,eu_cellar
002086bd15c9aba8b8b3cdf88498e25735cf66043ddab8877aca317256615aff,2020-09-07,Opinion No 6/2020 (pursuant to Article 287(4) ...,20.10.2020 EN Official Journal of the European...,eu_cellar
014589ceb95203c13ee4bd097ddf2f164657cb6aeb1650db280b2d8c7a57977d,2021-05-20,Council Recommendation (EU) 2021/816 20 May 20...,21.5.2021 EN Official Journal of the European ...,eu_cellar
...,...,...,...,...
5d1566cde8e0ff00d5e907e551d9202c714e320039c29609324e0eb579395aa1,2020-12-10,Case C-774/19: Judgment of the Court (Sixth Ch...,15.2.2021 EN Official Journal of the European ...,eu_cellar
f63ec4df58c38a804c983d295869f22e89515ef32c686841d43b7a9618e106d9,2021-02-24,Calls for proposals and related activities und...,24.2.2021 EN Official Journal of the European ...,eu_cellar
5c798a5a6aeb16a370e018faa10f1f79ce040d1adc8e0a2c4d06470a3d20f52b,2020-12-29,,,eu_cellar
f6334c565e8ab7de7306c50e94d8ef10c6cf38e8d39dc7d3be72aa244c522f22,2020-06-18,European Parliament resolution of 18 June 2020...,8.9.2021 EN Official Journal of the European U...,eu_cellar


In [15]:
# List the columns available in the EU timeline dataframe
eu_timeline_df.columns

Index(['month_name', 'date', 'title', 'abstract', 'presscorner_links',
       'all_links', 'detail_link', 'detail_type', 'detail_date',
       'detail_location', 'detail_content', 'detail_title', 'detail_pdf_link',
       'press_contacts', 'topics', 'for_more_information_links'],
      dtype='object')

In [16]:
# Which EU timeline columns hold dates? Select every column whose name contains "date"
date_columns = list(filter(lambda column_name: "date" in column_name, eu_timeline_df.columns))
eu_timeline_df.loc[:, date_columns]


Unnamed: 0_level_0,date,detail_date
_id,Unnamed: 1_level_1,Unnamed: 2_level_1
01dc34ca5f6bfa5316012d6f89a4256b8d2ce6e3a12210b0406903dd15bf36ac,2021-01-28,
04d776c061243576ffc1e1c5a724d41ecca2ed12339b2ff473c5aa2ff1a071c7,2021-06-22,2021-06-22
04eb55005a803c7353e5d5ed339d7f3a72cc8da6e49f2b8b152bf8fb40a1398f,2020-06-16,2020-06-17
073e22ec2b44a5f2f6289f09dc954ecd6ca6cbb4a4b33ffd88c3c150572569c0,2020-04-15,2020-04-15
0a3650bef3282d09d971dd92dc01570b7ecbdb67a4bdd0ff1c8cf8443a06b937,2020-12-18,
...,...,...
fa793c764aaf89c283f747036c9d73fd66f0a7e3692be56b8ac985610bef2417,2021-06-29,2021-06-29
fb9c4d1634da94c73fd529b44ec40057c28a5c925a8b8e71d73c49e5a815784d,2020-03-13,2020-03-13
fbad1d421cf80d3e0e4c6b6d3d60a2a6ec8bed4ee19e07cd15d963fa74352420,2021-01-28,2020-03-19
fbeff236835d3cfd394f9c9fd5b63eb524c9cdab6e68c9370758f3e35592973e,2020-07-06,2020-07-06


In [17]:
# Build the EU timeline subset: date, title and detailed content,
# tagged with the source it came from
tmp_eu_timeline_df = pd.DataFrame(
    eu_timeline_df[["date", "title", "detail_content"]]
).assign(doc_source="eu_timeline")

In [18]:
# Inspect the EU timeline subset (date, title, detailed content, source tag)
tmp_eu_timeline_df

Unnamed: 0_level_0,date,title,detail_content,doc_source
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
01dc34ca5f6bfa5316012d6f89a4256b8d2ce6e3a12210b0406903dd15bf36ac,2021-01-28,EU countries adopt guidelines on proof of vacc...,,eu_timeline
04d776c061243576ffc1e1c5a724d41ecca2ed12339b2ff473c5aa2ff1a071c7,2021-06-22,Commission endorses Latvia's €1.8 billion reco...,The European Commission has today adopted a po...,eu_timeline
04eb55005a803c7353e5d5ed339d7f3a72cc8da6e49f2b8b152bf8fb40a1398f,2020-06-16,Commission unveils EU vaccines strategy,"Today, to help protect people everywhere, the ...",eu_timeline
073e22ec2b44a5f2f6289f09dc954ecd6ca6cbb4a4b33ffd88c3c150572569c0,2020-04-15,European roadmap shows path towards common lif...,"Today, the Commission, in cooperation with the...",eu_timeline
0a3650bef3282d09d971dd92dc01570b7ecbdb67a4bdd0ff1c8cf8443a06b937,2020-12-18,Commission puts forward rules on rapid antigen...,,eu_timeline
...,...,...,...,...
fa793c764aaf89c283f747036c9d73fd66f0a7e3692be56b8ac985610bef2417,2021-06-29,Commission identifies five promising candidate...,The EU Strategy on COVID-19 Therapeutics deliv...,eu_timeline
fb9c4d1634da94c73fd529b44ec40057c28a5c925a8b8e71d73c49e5a815784d,2020-03-13,Setting out coordinated response to counter th...,COVID-19 is a severe public health emergency f...,eu_timeline
fbad1d421cf80d3e0e4c6b6d3d60a2a6ec8bed4ee19e07cd15d963fa74352420,2021-01-28,Prolonging State aid Temporary Framework to fu...,The European Commission has adopted a,eu_timeline
fbeff236835d3cfd394f9c9fd5b63eb524c9cdab6e68c9370758f3e35592973e,2020-07-06,Commission and EIB provide CureVac with a €75 ...,,eu_timeline


In [19]:
# List the columns available in the Ireland timeline dataframe
ir_timeline_df.columns



Index(['keyword', 'page_type', 'page_link', 'department_data',
       'published_date', 'updated_date', 'title', 'content', 'content_links',
       'campaigns_links', 'part_of_links', 'documents'],
      dtype='object')

In [20]:
# Which Ireland timeline columns hold dates? Select every column whose name contains "date"
date_columns = list(filter(lambda column_name: "date" in column_name, ir_timeline_df.columns))
ir_timeline_df.loc[:, date_columns]



Unnamed: 0_level_0,published_date,updated_date
_id,Unnamed: 1_level_1,Unnamed: 2_level_1
41c0f398402f522980ec6e3ae49ef3b60b7139c1deb0ba75db830e77f62d5bdd,2021-03-24,2021-03-24
41d85d20ea2ffeaba4c27e36def4c03986ad83679fb62a07de6214abf0309ae1,2020-07-30,2020-07-30
420468c03e6550d8111344aa4d28d7e7cfd70037b16f8cab3451ff52d6bbd7b5,2020-06-06,2020-06-06
42081c8a3c63d62e4b0ef27f6a3386848b45d71665318ef14642649bb9b11b3e,2021-07-21,2021-07-23
421dd88edfd80b1fbc6c350565455e1a218774c9d4f1d74bbe2047d972187736,2020-12-10,2020-12-11
...,...,...
ff4ea2ef933287c4125d898245e878ae9fcb1b356b3c60de288dd97020bdd3a5,2021-01-22,2021-01-23
ff7508dca9aaf53565b08dc6826102bff9176584afc245b9947c18dae946506a,2021-08-25,2021-08-27
ff82338ae7c8129669562babc1c0d7a893a9c59f6d2bd00cefe853539721367e,2020-03-20,2020-03-20
ffef06958ad76786347e3aa545351f708105f6db826708b4523cd16ea150e702,2021-05-24,2021-06-01


In [21]:
# Build the Ireland timeline subset: publication date, title and content,
# tagged with the source it came from
tmp_ir_timeline_df = pd.DataFrame(
    ir_timeline_df[["published_date", "title", "content"]]
).assign(doc_source="ireland_timeline")

In [22]:
# Inspect the Ireland timeline subset (publication date, title, content, source tag)
tmp_ir_timeline_df


Unnamed: 0_level_0,published_date,title,content,doc_source
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
41c0f398402f522980ec6e3ae49ef3b60b7139c1deb0ba75db830e77f62d5bdd,2021-03-24,Statement by Minister for Education Norma Fole...,The State Examinations Commission is today iss...,ireland_timeline
41d85d20ea2ffeaba4c27e36def4c03986ad83679fb62a07de6214abf0309ae1,2020-07-30,From pencil and paper to drones and satellites...,July 31 2020 marks the 175th anniversary of th...,ireland_timeline
420468c03e6550d8111344aa4d28d7e7cfd70037b16f8cab3451ff52d6bbd7b5,2020-06-06,Return to Sport Expert Group recommends furthe...,"The Minister for Transport, Tourism and Sport,...",ireland_timeline
42081c8a3c63d62e4b0ef27f6a3386848b45d71665318ef14642649bb9b11b3e,2021-07-21,Government to put “special focus” on Drogheda ...,Government agrees that funding applications un...,ireland_timeline
421dd88edfd80b1fbc6c350565455e1a218774c9d4f1d74bbe2047d972187736,2020-12-10,Statement from the National Public Health Emer...,1. Hospital statistics 2. Gender of patients 3...,ireland_timeline
...,...,...,...,...
ff4ea2ef933287c4125d898245e878ae9fcb1b356b3c60de288dd97020bdd3a5,2021-01-22,Statement from the National Public Health Emer...,The Health Protection Surveillance Centre has ...,ireland_timeline
ff7508dca9aaf53565b08dc6826102bff9176584afc245b9947c18dae946506a,2021-08-25,Ministers McConalogue and Heydon launch Code o...,"The Minister for Agriculture, Food and the Mar...",ireland_timeline
ff82338ae7c8129669562babc1c0d7a893a9c59f6d2bd00cefe853539721367e,2020-03-20,Press Release on Civil Defence in the context ...,Civil Defence (Cosaint Shibhialta) is a statut...,ireland_timeline
ffef06958ad76786347e3aa545351f708105f6db826708b4523cd16ea150e702,2021-05-24,Minister O’Gorman launches ‘LGBTI+ Youth in Ir...,"Minister for Children, Equality, Disability, I...",ireland_timeline


In [23]:
def replace_non_english_content(text, language="en", min_probability=0.95):
    """Return ``text`` if it is confidently detected as ``language``, else ``None``.

    Args:
        text: Document content to check. ``None``, non-string values and
            blank strings are treated as having no usable content.
        language: Language code that the first detection candidate must match.
        min_probability: The candidate's probability must be strictly
            greater than this value for the text to be kept.

    Returns:
        ``text`` unchanged when the first candidate reported by
        ``detect_langs`` is ``language`` with a probability above
        ``min_probability``; ``None`` in every other case, including when
        no language can be detected at all.
    """
    # Nothing to run detection on; langdetect raises on feature-less input.
    if not isinstance(text, str) or not text.strip():
        return None

    from langdetect.lang_detect_exception import LangDetectException

    try:
        candidates = detect_langs(text)
    except LangDetectException:
        # e.g. text made only of digits or punctuation
        return None
    if not candidates:
        return None

    # Read the structured attributes instead of parsing the "lang:prob" repr.
    best_guess = candidates[0]
    if best_guess.lang == language and best_guess.prob > min_probability:
        return text
    return None




In [24]:
# Give the four per-source frames one shared schema, then keep only the rows
# that have a date, a title and confidently-English content
data_frames = [tmp_cellar_df, tmp_eu_timeline_df, tmp_ir_timeline_df, tmp_pwdb_df]
for data_frame in data_frames:
    # Positional rename: every frame was built as (date, title, text, source)
    data_frame.columns = ["Date", "Title", "Content", "Document_source"]
    # Empty strings and NaN become None, then non-English text becomes None too
    data_frame["Content"] = (
        data_frame["Content"]
        .apply(lambda value: None if value == "" else value)
        .replace({np.nan: None})
        .apply(replace_non_english_content)
    )
    # Drop, in place, every row that lacks any of the three required fields
    data_frame.dropna(how="any", subset=["Content", "Title", "Date"], inplace=True)




In [25]:
# Stack the per-source frames row-wise into a single unified dataframe
unified_datasets_df = pd.concat(data_frames, axis=0)
unified_datasets_df




Unnamed: 0_level_0,Date,Title,Content,Document_source
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0e3d35dcad11d1d80f8fb1c35599be27fb3e864874ab1ac679fcd004723bc2e3,2020-01-30,Prior notification of a concentration (Case M....,30.1.2020 EN Official Journal of the European ...,eu_cellar
39dac43e4fffa3fc60906f3563e4d1547aea4a0f94e17cd28ec144b117e25df6,2021-08-30,COMMISSION STAFF WORKING DOCUMENT […] Accompan...,"EUROPEAN COMMISSION Brussels, 30.8.2021 SWD(20...",eu_cellar
00f65d205b8df943ae8acbd34b729c3da11da1646e7e324a6c517f95f950a5d9,2020-06-18,P9_TA(2020)0157 Amending Regulations (EU) No 5...,8.9.2021 EN Official Journal of the European U...,eu_cellar
002086bd15c9aba8b8b3cdf88498e25735cf66043ddab8877aca317256615aff,2020-09-07,Opinion No 6/2020 (pursuant to Article 287(4) ...,20.10.2020 EN Official Journal of the European...,eu_cellar
014589ceb95203c13ee4bd097ddf2f164657cb6aeb1650db280b2d8c7a57977d,2021-05-20,Council Recommendation (EU) 2021/816 20 May 20...,21.5.2021 EN Official Journal of the European ...,eu_cellar
...,...,...,...,...
tika/330d076853d0731e295d75f70841b445e4bc45061a8ae1507766fa26d778cc6f,2020-04-12,Government support to assist the sports sector...,"As sport events have been prohibited, the fina...",pwdb
tika/ffaa29a24828dd01c81d66ece0f8f3c21451083be7a0b6d2f4ed954b9378b0e6,2020-10-23,National public sector collective agreement 2021,"On 16 October 2020, the Government of the Repu...",pwdb
tika/3df93a8fb254187e8783d19760b465a98d88863e59a513b35a4d8062248e4190,2020-04-16,"Lifting of overtime work restrictions, extensi...",The government with a Legislative Act (Offici...,pwdb
tika/71fc1cad3a3a8b9de4da7080acc3a8251fa8ff8c5b7514f082f18c1cf69ab140,2020-04-06,Exceptional regime suspending overtime limits,"The Decree Law No. 10-A/2020 of 13 March, sett...",pwdb


In [27]:
# Upload the unified dataset to its Elasticsearch index, reusing the store
# handle created at the top of the notebook instead of building a second one
es_store.put_dataframe(index_name=config.UNIFIED_DATASET_ELASTIC_SEARCH_INDEX_NAME,
                       content=unified_datasets_df)


 98% (6116 of 6194) |################### | Elapsed Time: 0:00:00 ETA:   0:00:00

6194