# EDA on textual data

## Import libraries

In [765]:
import pathlib as path
from tqdm import tqdm

import pandas as pd
from collections import  Counter

import spacy
from cleantext import clean
import re

import plotly.express as px
from IPython.display import display,Markdown

## Define constants

In [766]:
# Directory that holds both the input dataset and the generated report.
FOLDER = path.Path("/home/jovyan/data/")

# File names, resolved against FOLDER below.
SRC_FILE_NAME = "covid19db.json"
OUTPUT_FILE_NAME = "eda_pwdb_result.html"

# Full paths built from the folder and the file names above.
SRC_FILE_PATH = FOLDER.joinpath(SRC_FILE_NAME)
OUTPUT_FILE_PATH = FOLDER.joinpath(OUTPUT_FILE_NAME)

# Dataset columns that are selected for the combined textual EDA.
TEXTUAL_COLUMNS = [
    'title',
    'background_info_description',
    'content_of_measure_description',
    'use_of_measure_description',
    'involvement_of_social_partners_description',
]


## Load PWDB dataset & SpaCy NLP component

In [767]:
# `df` stays None when the source file is missing; in that case the spaCy
# pipeline is not loaded either, so `nlp` is left undefined.
df = None
if not SRC_FILE_PATH.exists():
    print("Input source file is invalid!")
else:
    df = pd.read_json(SRC_FILE_PATH)
    # The lemmatizer component is excluded from the loaded pipeline.
    nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer"])

## Functions to calculate frequency for textual data

In [768]:
def calculate_frequency(data : pd.Series,title : str,relative = False):
    """Build a table with the ten most common values found in *data*.

    Args:
        data: values to count; missing values are ignored. The Series passed
            in is left untouched.
        title: name given to the column that holds the counted values.
        relative: when true, counts are converted to percentages of the sum
            of the counts that made it into the table (i.e. of the top ten,
            not of all observations), rounded to two decimals.

    Returns:
        A DataFrame with two columns: *title* and either 'Absolute freq'
        or 'Relative freq'.
    """
    observation_type_name = 'Relative freq' if relative else 'Absolute freq'
    # Drop missing values on a copy: an in-place dropna would silently
    # shrink the Series owned by the caller.
    values = data.dropna()
    observation = pd.DataFrame(Counter(values).most_common(10),
                               columns=[title, observation_type_name])
    # Nothing to normalise when no value was counted.
    if relative and not observation.empty:
        total = observation[observation_type_name].sum()
        percentages = observation[observation_type_name] / (total / 100)
        observation[observation_type_name] = percentages.round(2)
    return observation

## Function to get spaCy NLP doc for each row from series of strings

In [769]:
def get_nlp_docs(data : pd.Series):
    """Run the module-level spaCy pipeline `nlp` over every text in *data*.

    Args:
        data: series of strings to process.

    Returns:
        A list with one spaCy doc per element of *data*, in the same order.
    """
    # nlp.pipe processes the texts in batches instead of one call per row.
    return list(nlp.pipe(data))

## Function to get entity names from series of strings

In [770]:
def get_named_entities(data: pd.Series,docs : list = None):
    """Collect the entity label of every entity found in the documents.

    Args:
        data: series of strings; only parsed when *docs* is not supplied.
        docs: already parsed documents; each must expose `ents` whose items
            carry a `label_` attribute.

    Returns:
        A string Series with one label per entity, in document order.
    """
    parsed_docs = get_nlp_docs(data) if docs is None else docs
    labels = []
    for parsed in parsed_docs:
        labels.extend(entity.label_ for entity in parsed.ents)
    return pd.Series(labels, dtype=str)

## Function to get list of words labeled with entity_name

In [771]:
def get_entity_words(data : pd.Series, entity_name : str = 'ORG',docs : list = None):
    """Collect the text of every entity whose label equals *entity_name*.

    Args:
        data: series of strings; only parsed when *docs* is not supplied.
        entity_name: entity label to keep.
        docs: already parsed documents; each must expose `ents` whose items
            carry `label_` and `text` attributes.

    Returns:
        A string Series with the matching entity texts, in document order.
    """
    parsed_docs = get_nlp_docs(data) if docs is None else docs
    matches = []
    for parsed in parsed_docs:
        for entity in parsed.ents:
            if entity.label_ == entity_name:
                matches.append(entity.text)
    return pd.Series(matches, dtype=str)

## Function which removes stop words from a series of strings

In [772]:
def remove_stopwords(data : pd.Series):
    """Drop the stop words of the module-level `nlp` pipeline from each string.

    Each row is split on whitespace, tokens present in
    `nlp.Defaults.stop_words` are discarded and the remaining tokens are
    joined back with single spaces.

    Returns:
        A string Series with one filtered text per input row.
    """
    stop_words = nlp.Defaults.stop_words
    filtered_rows = [
        " ".join(token for token in row.split() if token not in stop_words)
        for row in data
    ]
    return pd.Series(filtered_rows, dtype=str)

## Function to get N grams from series of strings

In [773]:
def get_ngrams(data : pd.Series, n : int,stopwords : bool = True):
    """List every run of *n* consecutive whitespace-separated tokens.

    Args:
        data: series of strings.
        n: number of tokens per n-gram.
        stopwords: when false, stop words are removed from the texts before
            the n-grams are built; when true the texts are used as they are.

    Returns:
        A string Series holding the n-grams of all rows, tokens joined with
        a single space. Rows with fewer than *n* tokens contribute nothing.
    """
    source = data if stopwords else remove_stopwords(data)
    ngrams = []
    for tokens in source.str.split():
        window_count = len(tokens) - n + 1
        for start in range(window_count):
            ngrams.append(" ".join(tokens[start:start + n]))
    return pd.Series(ngrams, dtype=str)

## Function to get list of noun phrases

In [774]:
def get_noun_phrases(data : pd.Series,docs : list = None):
    """Collect the noun chunks of every document as plain strings.

    Args:
        data: series of strings; only parsed when *docs* is not supplied.
        docs: already parsed documents; each must expose `noun_chunks`.

    Returns:
        A string Series with `str(chunk)` for every noun chunk, in
        document order.
    """
    parsed_docs = get_nlp_docs(data) if docs is None else docs
    phrases = []
    for parsed in parsed_docs:
        phrases.extend(str(chunk) for chunk in parsed.noun_chunks)
    return pd.Series(phrases, dtype=str)

## Function to get list of words without stop words

In [775]:
def get_words(data : pd.Series):
    """Return every whitespace-separated token of *data* that is not a stop word.

    All rows are first joined into one text; tokens present in
    `nlp.Defaults.stop_words` (module-level spaCy pipeline) are skipped.

    Returns:
        A string Series with the remaining tokens in their original order.
    """
    tokens = " ".join(data).split()
    stop_words = nlp.Defaults.stop_words
    kept = []
    for token in tokens:
        if token not in stop_words:
            kept.append(token)
    return pd.Series(kept, dtype=str)

## Function to delete punctuation from text

In [776]:
def delete_punctuation(text : str):
    """Strip the listed punctuation characters and collapse whitespace.

    The text is split on every character of the pattern's character class
    (punctuation or whitespace, plus any whitespace that follows it); empty
    pieces are discarded and the rest is joined with single spaces.

    Returns:
        The cleaned text.
    """
    regex_filter = r'[,;:\*`#\'\"^&~@=+_.()?\[\]!\s]\s*'
    pieces = re.split(regex_filter, text)
    kept = [piece for piece in pieces if piece]
    return " ".join(kept)

## Function to clear textual data
- remove punctuation
- transform text to lowercase
- remove textual "artifacts"
- remove emails
- remove urls
- remove phone numbers


In [777]:
def prepare_text_data(data : pd.Series ):
    """Clean every text of *data* and strip its punctuation.

    Missing values are dropped and list-like entries are exploded into
    separate rows first. Each remaining text goes through `clean` with the
    URL, e-mail and phone-number options enabled, then through
    `delete_punctuation`.

    Returns:
        A string Series with the cleaned texts (re-indexed from zero).
    """
    flattened = data.dropna().explode()
    cleaned = []
    for raw_text in flattened:
        sanitized = clean(raw_text,
                          no_urls=True,
                          no_emails=True,
                          no_phone_numbers=True,
                          )
        cleaned.append(delete_punctuation(sanitized))
    return pd.Series(cleaned, dtype=str)

## Function to plot a bar chart of observations

In [778]:
def plot_bar_chart(observations: pd.DataFrame,chart_title: str):
    """Build a Plotly Express bar chart from a two-column table.

    The second column of *observations* is used for the x axis and the
    first column for the y axis.

    Returns:
        The figure created by `px.bar`.
    """
    column_names = observations.columns
    value_column = column_names[1]
    label_column = column_names[0]
    return px.bar(observations, x=value_column, y=label_column, title=chart_title)

## Function to plot a pie chart of observations

In [779]:
def plot_pie_chart(observations: pd.DataFrame,chart_title: str):
    """Build a Plotly Express pie chart from a two-column table.

    The second column of *observations* supplies the slice values and the
    first column the slice names.

    Returns:
        The figure created by `px.pie`.
    """
    column_names = observations.columns
    value_column = column_names[1]
    label_column = column_names[0]
    return px.pie(observations, values=value_column, names=label_column, title=chart_title)

## Function to display result of textual EDA

In [780]:
def eda_display_result(result : pd.DataFrame,chart_title : str):
    """Render one EDA result: heading, table, bar chart and pie chart.

    Nothing is displayed when *result* holds no data.

    Args:
        result: frequency table to show (first column labels, second
            column values, as expected by the plot helpers).
        chart_title: text used as the Markdown heading and as the title of
            both charts.
    """
    if result.empty:
        return
    display(Markdown(chart_title))
    # Display the table itself: iterating a DataFrame (as tuple(result)
    # does) yields only its column labels, not its rows.
    display(result)
    plot_bar_chart(result, chart_title).show()
    plot_pie_chart(result, chart_title).show()

## Textual EDA for words classified with the same entity name

In [781]:
def eda_entity_words( data : pd.Series,data_title : str, docs ):
    """Display the relative frequency of entity texts labelled 'ORG'.

    Args:
        data: cleaned texts of one column.
        data_title: column name, used for the table column and the heading.
        docs: parsed documents matching *data*.
    """
    org_words = get_entity_words(data, entity_name='ORG', docs=docs)
    frequencies = calculate_frequency(org_words, data_title, relative=True)
    heading = "Entity words for "+data_title
    eda_display_result(frequencies, heading)

## Textual EDA for words frequency

In [782]:
def eda_words_freq( data : pd.Series,data_title : str):
    """Display the relative frequency of the non-stop-word tokens of *data*.

    Args:
        data: cleaned texts of one column.
        data_title: column name, used for the table column and the heading.
    """
    words = get_words(data)
    frequencies = calculate_frequency(words, data_title, relative=True)
    heading = "Words frequency for "+data_title
    eda_display_result(frequencies, heading)


## Textual EDA for entity names

In [783]:
def eda_named_entities( data : pd.Series,data_title : str, docs ):
    """Display the relative frequency of the entity labels found in *docs*.

    Args:
        data: cleaned texts of one column.
        data_title: column name, used for the table column and the heading.
        docs: parsed documents matching *data*.
    """
    labels = get_named_entities(data, docs=docs)
    frequencies = calculate_frequency(labels, data_title, relative=True)
    heading = "Named entities for "+data_title
    eda_display_result(frequencies, heading)

## Textual EDA for noun phrases

In [784]:
def eda_noun_phrases( data : pd.Series,data_title : str, docs ):
    """Display the relative frequency of the noun phrases found in *docs*.

    Args:
        data: cleaned texts of one column.
        data_title: column name, used for the table column and the heading.
        docs: parsed documents matching *data*.
    """
    phrases = get_noun_phrases(data, docs=docs)
    frequencies = calculate_frequency(phrases, data_title, relative=True)
    heading = "Noun phrases for "+data_title
    eda_display_result(frequencies, heading)

## Textual EDA for N grams

In [785]:
def eda_n_grams( data : pd.Series,data_title : str, n_grams):
    """Display the relative frequency of the n-grams of *data*.

    Args:
        data: cleaned texts of one column.
        data_title: column name, used for the table column and the heading.
        n_grams: number of tokens per n-gram.
    """
    ngrams = get_ngrams(data, n_grams)
    frequencies = calculate_frequency(ngrams, data_title, relative=True)
    heading = "N grams for "+data_title
    eda_display_result(frequencies, heading)

## Textual EDA for N grams without stop words

In [786]:
def eda_n_grams_without_stopwords( data : pd.Series,data_title : str, n_grams):
    """Display the relative frequency of n-grams built after stop-word removal.

    Args:
        data: cleaned texts of one column.
        data_title: column name, used for the table column and the heading.
        n_grams: number of tokens per n-gram.
    """
    ngrams = get_ngrams(data, n_grams, stopwords=False)
    frequencies = calculate_frequency(ngrams, data_title, relative=True)
    heading = "N grams without stopwords for "+data_title
    eda_display_result(frequencies, heading)

## Combined textual EDA

In [787]:
def eda_textual(data : pd.DataFrame):
    """Run every textual EDA step on each column of *data*.

    For each column the texts are cleaned and parsed once; the parsed
    documents are shared by the noun-phrase and entity analyses. A tqdm
    progress bar labelled with the current column tracks the loop.
    """
    progress = tqdm(data.columns)
    for name in progress:
        progress.set_description('Eda on textual data ['+name+']')
        texts = prepare_text_data(data[name])
        parsed_docs = get_nlp_docs(texts)

        # Token-based analyses (trigrams with and without stop words).
        eda_words_freq(texts, name)
        eda_n_grams(texts, name, 3)
        eda_n_grams_without_stopwords(texts, name, 3)

        # Analyses that reuse the parsed documents.
        eda_noun_phrases(texts, name, parsed_docs)
        eda_named_entities(texts, name, parsed_docs)
        eda_entity_words(texts, name, parsed_docs)

## Execute combined textual EDA on specific columns from PWDB dataset

In [788]:
# `df` is None when the source file could not be loaded (a message has
# already been printed in that case); indexing None would raise TypeError.
if df is not None:
    eda_textual(df[TEXTUAL_COLUMNS])




Eda on textual data [involvement_of_social_partners_description]: 100%|██████████| 5/5 [00:57<00:00, 11.51s/it]


Words frequency for title

Unnamed: 0,title,Relative freq
0,covid-19,17.16
1,support,15.68
2,workers,9.38
3,social,8.92
4,temporary,8.81
5,measures,8.58
6,companies,8.47
7,working,8.12
8,agreement,7.55
9,employees,7.32


N grams for title

Unnamed: 0,title,Relative freq
0,the covid-19 pandemic,15.31
1,in case of,12.24
2,health and safety,11.22
3,people with disabilities,11.22
4,social security contributions,9.18
5,measures to support,9.18
6,of the covid-19,8.16
7,financial support for,8.16
8,affected by the,8.16
9,exceptional and temporary,7.14


N grams without stopwords for title

Unnamed: 0,title,Relative freq
0,social security contributions,20.0
1,context covid-19 pandemic,11.11
2,- exceptional temporary,8.89
3,exceptional temporary measure,8.89
4,sector collective agreement,8.89
5,national social package,8.89
6,social partners address,8.89
7,partners address economic,8.89
8,address economic challenges,8.89
9,payment social security,6.67


Noun phrases for title

Unnamed: 0,title,Relative freq
0,covid-19,16.18
1,companies,15.86
2,employees,11.97
3,workers,9.71
4,measures,9.71
5,support,8.41
6,people,7.77
7,extension,6.8
8,work,6.8
9,businesses,6.8


Named entities for title

Unnamed: 0,title,Relative freq
0,PERSON,42.45
1,CARDINAL,26.98
2,DATE,15.47
3,ORDINAL,6.12
4,TIME,4.32
5,NORP,1.8
6,PERCENT,1.44
7,ORG,0.72
8,GPE,0.72


Entity words for title

Unnamed: 0,title,Relative freq
0,cooperatives,100.0


Words frequency for background_info_description

Unnamed: 0,background_info_description,Relative freq
0,2020,17.41
1,covid-19,15.93
2,measures,11.98
3,government,10.25
4,support,9.15
5,social,7.15
6,march,7.09
7,companies,7.05
8,pandemic,7.05
9,crisis,6.92


N grams for background_info_description

Unnamed: 0,background_info_description,Relative freq
0,in order to,17.08
1,of the covid-19,13.42
2,the covid-19 pandemic,13.12
3,due to the,12.75
4,the covid-19 crisis,9.62
5,the ministry of,7.68
6,as well as,7.23
7,the spread of,6.94
8,affected by the,6.19
9,as a result,5.97


N grams without stopwords for background_info_description

Unnamed: 0,background_info_description,Relative freq
0,20 march 2020,13.74
1,resolution council ministers,11.85
2,17 march 2020,10.43
3,consequences spread covid-19,9.95
4,social security contributions,9.48
5,1 october 2020,9.48
6,31 december 2020,9.0
7,ustawa z dnia,9.0
8,cultural creative sector,8.53
9,minister education culture,8.53


Noun phrases for background_info_description

Unnamed: 0,background_info_description,Relative freq
0,the government,15.24
1,it,15.1
2,who,11.28
3,order,10.92
4,covid-19,10.47
5,employees,8.13
6,companies,7.96
7,the measure,7.42
8,they,7.28
9,measures,6.2


Named entities for background_info_description

Unnamed: 0,background_info_description,Relative freq
0,DATE,36.25
1,CARDINAL,33.86
2,PERSON,17.06
3,ORDINAL,4.82
4,PERCENT,3.71
5,TIME,1.35
6,LAW,1.24
7,MONEY,0.72
8,QUANTITY,0.52
9,GPE,0.48


Entity words for background_info_description

Unnamed: 0,background_info_description,Relative freq
0,cooperatives,50.0
1,budget2020,16.67
2,march2020,16.67
3,of14,16.67


Words frequency for content_of_measure_description

Unnamed: 0,content_of_measure_description,Relative freq
0,2020,17.05
1,employees,12.84
2,work,9.62
3,support,9.41
4,covid-19,9.0
5,period,8.83
6,social,8.57
7,companies,8.49
8,000,8.19
9,1,8.01


N grams for content_of_measure_description

Unnamed: 0,content_of_measure_description,Relative freq
0,the amount of,13.75
1,in order to,13.45
2,the end of,11.46
3,as well as,10.72
4,due to the,10.2
5,according to the,9.16
6,amount of the,8.35
7,a maximum of,7.83
8,of up to,7.61
9,the ministry of,7.46


N grams without stopwords for content_of_measure_description

Unnamed: 0,content_of_measure_description,Relative freq
0,31 december 2020,24.59
1,social security contributions,16.07
2,30 june 2020,10.49
3,personal income tax,8.2
4,1 march 2020,7.87
5,31 december 2019,7.54
6,1 april 2020,7.21
7,30 september 2020,6.23
8,small medium-sized enterprises,6.23
9,16 march 2020,5.57


Noun phrases for content_of_measure_description

Unnamed: 0,content_of_measure_description,Relative freq
0,it,18.03
1,who,16.18
2,they,12.76
3,employees,11.13
4,the company,8.48
5,companies,7.54
6,the employer,7.37
7,the measure,6.49
8,the amount,6.13
9,the government,5.88


Named entities for content_of_measure_description

Unnamed: 0,content_of_measure_description,Relative freq
0,DATE,39.15
1,CARDINAL,35.29
2,PERCENT,10.45
3,PERSON,5.56
4,ORDINAL,3.34
5,TIME,3.3
6,MONEY,1.23
7,QUANTITY,0.81
8,LAW,0.58
9,GPE,0.28


Entity words for content_of_measure_description

Unnamed: 0,content_of_measure_description,Relative freq
0,illnesscovid-19,50.0
1,cooperatives,25.0
2,covid-19,25.0


Words frequency for use_of_measure_description

Unnamed: 0,use_of_measure_description,Relative freq
0,2020,15.7
1,million,12.93
2,000,12.71
3,available,10.73
4,information,9.82
5,employees,8.53
6,total,8.1
7,measure,7.8
8,1,7.68
9,number,6.0


N grams for use_of_measure_description

Unnamed: 0,use_of_measure_description,Relative freq
0,the number of,15.44
1,no information available,14.36
2,according to the,14.23
3,the end of,10.6
4,a total of,8.19
5,there is no,7.79
6,the total amount,7.52
7,no information to,7.52
8,total amount of,7.25
9,no data available,7.11


N grams without stopwords for use_of_measure_description

Unnamed: 0,use_of_measure_description,Relative freq
0,accommodation food services,12.22
1,social security contributions,11.11
2,according data provided,11.11
3,ministry social security,10.0
4,social security labour,10.0
5,31 december 2020,10.0
6,potentially eligible population,8.89
7,tax year 2018,8.89
8,arts entertainment recreation,8.89
9,food services sector,8.89


Noun phrases for use_of_measure_description

Unnamed: 0,use_of_measure_description,Relative freq
0,no information,20.19
1,it,17.19
2,the measure,13.52
3,the number,10.18
4,no data,7.27
5,the end,6.76
6,employees,6.59
7,the scheme,6.42
8,who,6.16
9,they,5.73


Named entities for use_of_measure_description

Unnamed: 0,use_of_measure_description,Relative freq
0,CARDINAL,53.47
1,DATE,27.16
2,PERCENT,9.23
3,MONEY,3.81
4,ORDINAL,2.23
5,PERSON,2.06
6,QUANTITY,1.2
7,TIME,0.54
8,PRODUCT,0.2
9,GPE,0.09


Words frequency for involvement_of_social_partners_description

Unnamed: 0,involvement_of_social_partners_description,Relative freq
0,social,26.01
1,partners,18.64
2,measure,9.75
3,government,8.28
4,involved,7.74
5,measures,7.26
6,trade,6.67
7,involvement,6.11
8,unions,5.52
9,consulted,4.02


N grams for involvement_of_social_partners_description

Unnamed: 0,involvement_of_social_partners_description,Relative freq
0,the social partners,20.1
1,social partners were,16.08
2,involved in the,11.18
3,of the measure,9.9
4,economic and social,8.24
5,were not involved,8.24
6,not involved in,6.96
7,partners were not,6.86
8,social partners have,6.37
9,of social partners,6.08


N grams without stopwords for involvement_of_social_partners_description

Unnamed: 0,involvement_of_social_partners_description,Relative freq
0,social partners involved,24.95
1,involvement social partners,14.5
2,economic social council,13.01
3,social partners consulted,10.23
4,level social partners,7.46
5,standing committee social,6.61
6,committee social concertation,6.61
7,social partners informed,5.54
8,design implementation monitoring,5.54
9,peak-level social partners,5.54


Noun phrases for involvement_of_social_partners_description

Unnamed: 0,involvement_of_social_partners_description,Relative freq
0,social partners,18.55
1,the measure,13.78
2,the social partners,13.51
3,the government,12.28
4,it,8.46
5,no involvement,8.32
6,they,8.25
7,this measure,6.34
8,measures,5.8
9,no information,4.71


Named entities for involvement_of_social_partners_description

Unnamed: 0,involvement_of_social_partners_description,Relative freq
0,CARDINAL,32.81
1,DATE,30.28
2,ORDINAL,17.98
3,PERSON,15.46
4,PERCENT,1.26
5,NORP,0.79
6,GPE,0.63
7,TIME,0.47
8,LAW,0.32
