In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

import re
import string

import json
import pathlib

DATA_FOLDER = pathlib.Path("/home/jovyan/work/Dan/data")
JSON_FILE = DATA_FOLDER / "covid19db.json"

# Data manipulation
import pandas as pd

from nltk.corpus import stopwords

# lightweight and flexible JSON processor
import jq

ModuleNotFoundError: No module named 'jq'

In [2]:
with open(JSON_FILE) as file:
    content = file.read()
    data = json.loads(content)

In [3]:
transformation_rules = '''{
"Identifier": .recordId,
"Title": .fieldData.title,
"Title (national language)": .fieldData.title_nationalLanguage,
"Country": .fieldData.calc_country,
"Start date": .fieldData.d_startDate,
"End date": .fieldData.d_endDate,
"Date type": .fieldData.dateType,
"Type of measure": .fieldData.calc_type,
"Status of regulation": .fieldData.statusOfRegulation,
"Category": .fieldData.calc_minorCategory,
"Subcategory": .fieldData.calc_subMinorCategory,
"Case added": .fieldData.calc_creationDay,
"Background information": .fieldData.descriptionBackgroundInfo,
"Content of measure": .fieldData.descriptionContentOfMeasure,
"Use of measure": .fieldData.descriptionUseOfMeasure,
"Actors": [.portalData.actors[] |  ."actors::name" ],
"Target groups": [.portalData.targetGroups[] | ."targetGroups::name"],
"Funding": [.portalData.funding[] | ."funding::name" ],
"Views of social partners": .fieldData.descriptionInvolvementOfSocialPartners,
"Form of social partner involvement": .fieldData.socialPartnerform,
"Role of social partners": .fieldData.socialPartnerrole,
"Is sector specific": .fieldData.isSector,
"Private or public sector": .fieldData.sector_privateOrPublic,
"Is occupation specific": .fieldData.isOccupation,
"Sectors": [.portalData.sectors[] | ."sectors::name" ],
"Occupations": [.portalData.occupations[] | .],
"Sources": [.portalData.sources[] | ."sources::url" ],
}'''

jq_transformation_program = (".[] | "+str(transformation_rules)).replace("\n","")
transformer = jq.compile(jq_transformation_program)

In [4]:
new_data = transformer.input(data).all()
df_new = pd.DataFrame.from_records(new_data)

In [5]:
def reduce_array_column(df, column, new_column=None):
    """
        assuming that the column contains array objects,
        reduces thse arrays to a string of concatenated values
        :df: the pandas DataFrame
        :column: the column with array values
        :new_column: the new column where the concatenated strings are placed;
                     If the new_column is None then the original column is replaced
    """

    if new_column:
        df[new_column] = df[column].apply( lambda x: ", ".join(sorted(x)) )
    else:
        df[column] = df[column].apply( lambda x: ", ".join(sorted(x)))
    return df

In [6]:
reduce_array_column(df_new,"Target groups")

Unnamed: 0,Identifier,Title,Title (national language),Country,Start date,End date,Date type,Type of measure,Status of regulation,Category,...,Funding,Views of social partners,Form of social partner involvement,Role of social partners,Is sector specific,Private or public sector,Is occupation specific,Sectors,Occupations,Sources
0,95,Hardship case fund: Safety net for self-employed,Härtefall-Fonds: Sicherheitnetz für Selbststän...,Austria,03/27/2020,04/30/2021,Temporary,Legislations or other statutory regulations,Entirely new measure,Income protection beyond short-time work,...,[National funds],The Federal Economic Chamber was involved.,,,No,Not specified,No,[],[],[https://www.wko.at/service/haertefall-fonds-e...
1,96,State support for tourism - Access to finance,Massnahmenpaket fuer den Tourismus - Bank,Austria,03/06/2020,12/31/2020,Temporary,Legislations or other statutory regulations,New aspects included into existing measure,Supporting businesses to stay afloat,...,[National funds],The social partners were consulted.,,,Yes,Not specified,No,"[Accommodation, Food and beverage service acti...",[],"[https://orf.at/stories/3159574/, https://www...."
2,98,Bank guarantees for SMEs and one-person enterp...,AWS Überbrückungsfinanzierung/garantie fuer EP...,Austria,03/04/2020,,Open ended,Legislations or other statutory regulations,Entirely new measure,Supporting businesses to stay afloat,...,[National funds],consulted,,,No,Not specified,No,[],[],[https://www.wko.at/service/coronavirus-ueberb...
3,100,Emergency measures relating to short-time working,Mesures d'urgence en matière d'activité partielle,France,03/27/2020,,Open ended,Legislations or other statutory regulations,New aspects included into existing measure,Employment protection and retention,...,[National funds],,,,No,Not specified,No,[],[],[https://www.legifrance.gouv.fr/affichTexte.do...
4,101,Airbus agreement for making up unworked hours ...,un accord chez Airbus pour organiser la récupé...,France,03/31/2020,12/31/2020,Temporary,Bipartite collective agreements,Entirely new measure,"Protection of workers, adaptation of workplace",...,[No special funding required],,,,Yes,Not specified,No,[Manufacture of other transport equipment],[],[https://news.industriall-europe.eu/content/do...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
930,1599,Taxation measures for enterprises,Belastingmaatregelen bedrijven,Netherlands,10/01/2020,,Open ended,Legislations or other statutory regulations,New aspects included into existing measure,Supporting businesses to stay afloat,...,"[National funds, No special funding required]",Employer’s organisations and trade unions are ...,,,No,Not specified,No,[],[],[https://www.rijksoverheid.nl/onderwerpen/bela...
931,1607,Winwin-loan,Winwinlening,Belgium,10/06/2020,,Open ended,Legislations or other statutory regulations,New aspects included into existing measure,Supporting businesses to stay afloat,...,"[Regional funds, Other]",No clear indication of the involvement of the ...,,,No,Only private sector,No,[],[],[https://www.vlaio.be/nl/subsidies-financierin...
932,1608,Support measures for businesses closed since N...,Ondersteunende maatregelen voor ondernemingen ...,Belgium,11/02/2020,,Open ended,Legislations or other statutory regulations,Entirely new measure,Supporting businesses to stay afloat,...,[Regional funds],The involvement of the social partners is not ...,,,No,Only private sector,No,[],[],[http://economie.wallonie.be/content/nouvelles...
933,1609,New loans available to enterprises in Brussels,Nieuwe kredieten beschikbaar voor Brusselse on...,Belgium,05/13/2020,,Open ended,Legislations or other statutory regulations,Entirely new measure,Supporting businesses to stay afloat,...,[Regional funds],The involvement of the social partners is not ...,,,Yes,Only private sector,No,"[Accommodation, Food and beverage service acti...",[],[https://1819.brussels/nl/blog/financeinvestbr...


In [8]:
#df_new.to_pickle(DATA_FOLDER / "pwdb/pickle/df_full.pkl")

In [148]:
df = df_new[['Title', 'Background information', 'Content of measure',
             'Category', 'Type of measure', 'Target groups']]
df.head()

Unnamed: 0,Title,Background information,Content of measure,Category,Type of measure,Target groups
0,Hardship case fund: Safety net for self-employed,As part of the €4 billion fund to mitigate the...,The support is a one-off payment and does not ...,Income protection beyond short-time work,Legislations or other statutory regulations,"One person or microenterprises, Self-employed,..."
1,State support for tourism - Access to finance,As the tourism industry was among the first se...,"Initially, bank guarantees amounting to €100 m...",Supporting businesses to stay afloat,Legislations or other statutory regulations,"SMEs, Sector specific set of companies"
2,Bank guarantees for SMEs and one-person enterp...,"On 4 March, following consultation of the soci...",The measure is targeted at small and medium si...,Supporting businesses to stay afloat,Legislations or other statutory regulations,"One person or microenterprises, SMEs"
3,Emergency measures relating to short-time working,Link to case FR-2020-10/462\r\r\rAn ordinance ...,The new ordinance guarantees partial time work...,Employment protection and retention,Legislations or other statutory regulations,"Employees in standard employment, Other groups..."
4,Airbus agreement for making up unworked hours ...,"On 20 March, the Airbus Group signed an agreem...",This agreement qualifies as unworked time the ...,"Protection of workers, adaptation of workplace",Bipartite collective agreements,"Employees in standard employment, Larger corpo..."


In [149]:
df_columns = df['Title'].map(str) + df['Background information'].map(str) + df['Content of measure'].map(str)
df_columns = pd.DataFrame(df_columns, columns=['Concatinate Data'])
df['Concatinated Data'] = df_columns

In [150]:
df.head()


Unnamed: 0,Title,Background information,Content of measure,Category,Type of measure,Target groups,Concatinated Data
0,Hardship case fund: Safety net for self-employed,As part of the €4 billion fund to mitigate the...,The support is a one-off payment and does not ...,Income protection beyond short-time work,Legislations or other statutory regulations,"One person or microenterprises, Self-employed,...",Hardship case fund: Safety net for self-employ...
1,State support for tourism - Access to finance,As the tourism industry was among the first se...,"Initially, bank guarantees amounting to €100 m...",Supporting businesses to stay afloat,Legislations or other statutory regulations,"SMEs, Sector specific set of companies",State support for tourism - Access to financeA...
2,Bank guarantees for SMEs and one-person enterp...,"On 4 March, following consultation of the soci...",The measure is targeted at small and medium si...,Supporting businesses to stay afloat,Legislations or other statutory regulations,"One person or microenterprises, SMEs",Bank guarantees for SMEs and one-person enterp...
3,Emergency measures relating to short-time working,Link to case FR-2020-10/462\r\r\rAn ordinance ...,The new ordinance guarantees partial time work...,Employment protection and retention,Legislations or other statutory regulations,"Employees in standard employment, Other groups...",Emergency measures relating to short-time work...
4,Airbus agreement for making up unworked hours ...,"On 20 March, the Airbus Group signed an agreem...",This agreement qualifies as unworked time the ...,"Protection of workers, adaptation of workplace",Bipartite collective agreements,"Employees in standard employment, Larger corpo...",Airbus agreement for making up unworked hours ...


In [151]:
def cleaning(text):
    """cleaner function"""
    stopword = stopwords.words('english')
    # set text to lowercase
    text = text.lower()
    # remove links
    text = re.sub(r"^https?:\/\/.*[\r\n]*", '', text)
    # remove "new line" symbol
    text = re.sub('\n', '', text)
    # Match every decimal digits and every character marked as letters in Unicode database
    text = re.sub('\w*\d\w*', '', text)
    # Delete square brackets
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[‘’“”…]', '', text)
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = ''.join(text)
    text = re.split('\W+', text)
    text = [word for word in text if word not in stopword]

    return text

In [152]:
clean = lambda x: cleaning(x)
df['Concatinated Data (clean)'] = df['Concatinated Data'].apply(clean)

In [153]:
df.head()


Unnamed: 0,Title,Background information,Content of measure,Category,Type of measure,Target groups,Concatinated Data,Concatinated Data (clean)
0,Hardship case fund: Safety net for self-employed,As part of the €4 billion fund to mitigate the...,The support is a one-off payment and does not ...,Income protection beyond short-time work,Legislations or other statutory regulations,"One person or microenterprises, Self-employed,...",Hardship case fund: Safety net for self-employ...,"[hardship, case, fund, safety, net, selfemploy..."
1,State support for tourism - Access to finance,As the tourism industry was among the first se...,"Initially, bank guarantees amounting to €100 m...",Supporting businesses to stay afloat,Legislations or other statutory regulations,"SMEs, Sector specific set of companies",State support for tourism - Access to financeA...,"[state, support, tourism, access, financeas, t..."
2,Bank guarantees for SMEs and one-person enterp...,"On 4 March, following consultation of the soci...",The measure is targeted at small and medium si...,Supporting businesses to stay afloat,Legislations or other statutory regulations,"One person or microenterprises, SMEs",Bank guarantees for SMEs and one-person enterp...,"[bank, guarantees, smes, oneperson, enterprise..."
3,Emergency measures relating to short-time working,Link to case FR-2020-10/462\r\r\rAn ordinance ...,The new ordinance guarantees partial time work...,Employment protection and retention,Legislations or other statutory regulations,"Employees in standard employment, Other groups...",Emergency measures relating to short-time work...,"[emergency, measures, relating, shorttime, wor..."
4,Airbus agreement for making up unworked hours ...,"On 20 March, the Airbus Group signed an agreem...",This agreement qualifies as unworked time the ...,"Protection of workers, adaptation of workplace",Bipartite collective agreements,"Employees in standard employment, Larger corpo...",Airbus agreement for making up unworked hours ...,"[airbus, agreement, making, unworked, hours, p..."


In [156]:
#df.to_pickle(DATA_FOLDER / "pwdb/pickle/df.pkl")