# EDA on textual data

## Import libraries

In [14]:
from tqdm import tqdm

import pandas as pd
from collections import  Counter

import spacy
from cleantext import clean
import re
from sklearn.feature_extraction.text import TfidfVectorizer

import plotly.express as px
from IPython.display import display,Markdown
from sem_covid.services.data_registry import Dataset
from gensim.parsing.preprocessing import remove_stopwords

ModuleNotFoundError: No module named 'sem_covid'

## Define constants

In [116]:

TEXTUAL_COLUMNS = ['title','background_info_description','content_of_measure_description','use_of_measure_description','involvement_of_social_partners_description']


## Load PWDB dataset & SpaCy NLP component

In [117]:
df = Dataset.PWDB.fetch()
nlp = spacy.load("en_core_web_sm",exclude=["lemmatizer"])
tokenizer = nlp.tokenizer

## Functions to calculate frequency for textual data

In [118]:
def calculate_frequency(data : pd.Series,title : str,relative = False):
    observation_type_name = 'Absolute freq' if not relative else 'Relative freq'
    data.dropna(inplace=True)
    observation = pd.DataFrame(Counter(data).most_common(10),columns=[title,observation_type_name])
    if relative:
        observation[observation_type_name]/=observation[observation_type_name].sum()/100
        observation[observation_type_name]=round(observation[observation_type_name],2)
    return observation

## Function to get spaCy NLP doc for each row from series of strings

In [119]:
def get_nlp_docs(data : pd.Series):
    return [nlp(row) for row in data]

## Function to get entity names from series of strings

In [120]:
def get_named_entities(data: pd.Series,docs : list = None):
    if docs is None:
        docs = get_nlp_docs(data)
    result = [e.label_ for doc in docs for e in doc.ents]
    return pd.Series(result,dtype=str)

## Function to get list of words labeled with entity_name

In [121]:
def get_entity_words(data: pd.Series, entity_class: str = 'ORG', docs: list = None):
    if docs is None:
        docs = get_nlp_docs(data)
    result = [e.text for doc in docs for e in doc.ents if e.label_== entity_class]
    return pd.Series(result,dtype=str)

## Function to get TF-IDF for series of strings

In [123]:
def calculate_tf_idf(data : pd.Series,title : str):
    vectorizer = TfidfVectorizer()
    documents = data.apply(remove_stopwords)
    vectors = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names()
    dense = vectors.todense()
    denselist = dense.tolist()
    tmp_df = pd.DataFrame(denselist, columns=feature_names)
    tmp_df = tmp_df.max().sort_values(ascending=False).reset_index()
    tmp_df.columns = [title,"TF-IDF"]
    return tmp_df


## Function to get N grams from series of strings

In [124]:
def get_ngrams(data : pd.Series, n : int,stopwords : bool = True):
    if not stopwords:
        data = data.apply(remove_stopwords)
    result = [ " ".join(text[i:i+n])
               for text in data.str.split()
               for i in range(len(text)-n+1)]
    return pd.Series(result,dtype=str)

## Function to get list of noun phrases

In [125]:
def get_noun_phrases(data : pd.Series,docs : list = None):
    if docs is None:
        docs = get_nlp_docs(data)
    result = [str(n) for doc in docs for n in doc.noun_chunks ]
    return pd.Series(result,dtype=str)

## Function to get list of words without stop words

In [126]:
def get_words(data : pd.Series):
    text = " ".join(data)
    stop_words = nlp.Defaults.stop_words
    result = [ word for word in text.split() if word not in stop_words]
    return pd.Series(result,dtype=str)

## Function to delete punctuation form text

In [127]:
def delete_punctuation(text : str):
    regex_filter = r'[,;:\*`#\'\"^&~@=+_.()?\[\]!\s]\s*'
    text = " ".join(list(filter(None,re.split(regex_filter,text))))
    return text

## Function to clear textual data
- remove stop-words
- transform text to lowercase
- remove textual "artifacts"
- remove emails
- remove urls
- remove phone numbers


In [128]:
def prepare_text_data(data : pd.Series ):
    data = data.dropna().explode()
    result = [ delete_punctuation(
                clean(text,
                     no_urls=True,
                     no_emails=True,
                     no_phone_numbers=True,
                     )
                )
              for text in data
             ]
    return pd.Series(result,dtype=str)

## Function for plot bar chart on observations

In [129]:
def plot_bar_chart(observations: pd.DataFrame,chart_title: str):
    columns = observations.columns
    return px.bar(observations,x=columns[1],y=columns[0],title=chart_title)

## Function for plot pie chart on observations

In [130]:
def plot_pie_chart(observations: pd.DataFrame,chart_title: str):
    columns = observations.columns
    return px.pie(observations,values=columns[1],names=columns[0],title=chart_title )

## Function to display result of textual EDA

In [131]:
def eda_display_result(result : pd.DataFrame,chart_title : str, bar_char : bool = True,pie_chart : bool = True):
    if result.size > 0 :
        display(Markdown(chart_title))
        display(tuple(result))
        if bar_char:
            plot_bar_chart(result, chart_title).show()
        if pie_chart:
            plot_pie_chart(result, chart_title).show()

## Textual EDA for words classified with the same entity name

In [132]:
def eda_entity_words( data : pd.Series,data_title : str, docs ):
    result = calculate_frequency(get_entity_words(data,'ORG',docs),data_title,True)
    eda_display_result(result,"Entity words for "+data_title)

## Textual EDA for words frequency

In [133]:
def eda_words_freq( data : pd.Series,data_title : str):
    result = calculate_frequency(get_words(data),data_title,True)
    eda_display_result(result,"Words frequency for "+data_title)


## Textual EDA for entity names

In [134]:
def eda_named_entities( data : pd.Series,data_title : str, docs ):
    result = calculate_frequency(get_named_entities(data,docs),data_title,True)
    eda_display_result(result,"Named entities for "+data_title)

## Textual EDA for noun phrases

In [135]:
def eda_noun_phrases( data : pd.Series,data_title : str, docs ):
    result = calculate_frequency(get_noun_phrases(data,docs),data_title,True)
    eda_display_result(result,"Noun phrases for "+data_title)

## Textual EDA for N grams

In [136]:
def eda_n_grams( data : pd.Series,data_title : str, n_grams):
    result = calculate_frequency(get_ngrams(data,n_grams),data_title,True)
    eda_display_result(result,"N grams for "+data_title)

## Textual EDA for N grams without stop words

In [137]:
def eda_n_grams_without_stopwords( data : pd.Series,data_title : str, n_grams):
    result = calculate_frequency(get_ngrams(data,n_grams,False),data_title,True)
    eda_display_result(result,"N grams without stopwords for "+data_title)

## Textual EDA for top 10 TF-IDF from column

In [144]:
def eda_tf_idf(data : pd.Series, data_title : str):
    result = calculate_tf_idf(data,data_title)
    eda_display_result(result.head(10),"TOP 10 TF-IDF for "+data_title,pie_chart=False)

## Combined textual EDA

In [146]:
def eda_textual(data : pd.DataFrame):
    pbar = tqdm(data.columns)
    for column_name in pbar:
        pbar.set_description('Eda on textual data ['+column_name+']')
        column_data = prepare_text_data(data[column_name])
        docs = get_nlp_docs(column_data)
        eda_words_freq(column_data,column_name)
        eda_n_grams(column_data,column_name,3)
        eda_n_grams_without_stopwords(column_data,column_name,3)
        eda_noun_phrases(column_data,column_name,docs)
        eda_named_entities(column_data,column_name,docs)
        eda_entity_words(column_data,column_name,docs)
        eda_tf_idf(column_data,column_name)

## Execute combined textual EDA on specific columns from PWDB dataset

In [147]:
eda_textual(df[TEXTUAL_COLUMNS])




Eda on textual data [involvement_of_social_partners_description]: 100%|██████████| 5/5 [00:59<00:00, 11.88s/it]


Words frequency for title

('title', 'Relative freq')

N grams for title

('title', 'Relative freq')

N grams without stopwords for title

('title', 'Relative freq')

Noun phrases for title

('title', 'Relative freq')

Named entities for title

('title', 'Relative freq')

Entity words for title

('title', 'Relative freq')

TOP 10 TF-IDF for title

('title', 'TF-IDF')

Words frequency for background_info_description

('background_info_description', 'Relative freq')

N grams for background_info_description

('background_info_description', 'Relative freq')

N grams without stopwords for background_info_description

('background_info_description', 'Relative freq')

Noun phrases for background_info_description

('background_info_description', 'Relative freq')

Named entities for background_info_description

('background_info_description', 'Relative freq')

Entity words for background_info_description

('background_info_description', 'Relative freq')

TOP 10 TF-IDF for background_info_description

('background_info_description', 'TF-IDF')

Words frequency for content_of_measure_description

('content_of_measure_description', 'Relative freq')

N grams for content_of_measure_description

('content_of_measure_description', 'Relative freq')

N grams without stopwords for content_of_measure_description

('content_of_measure_description', 'Relative freq')

Noun phrases for content_of_measure_description

('content_of_measure_description', 'Relative freq')

Named entities for content_of_measure_description

('content_of_measure_description', 'Relative freq')

Entity words for content_of_measure_description

('content_of_measure_description', 'Relative freq')

TOP 10 TF-IDF for content_of_measure_description

('content_of_measure_description', 'TF-IDF')

Words frequency for use_of_measure_description

('use_of_measure_description', 'Relative freq')

N grams for use_of_measure_description

('use_of_measure_description', 'Relative freq')

N grams without stopwords for use_of_measure_description

('use_of_measure_description', 'Relative freq')

Noun phrases for use_of_measure_description

('use_of_measure_description', 'Relative freq')

Named entities for use_of_measure_description

('use_of_measure_description', 'Relative freq')

TOP 10 TF-IDF for use_of_measure_description

('use_of_measure_description', 'TF-IDF')

Words frequency for involvement_of_social_partners_description

('involvement_of_social_partners_description', 'Relative freq')

N grams for involvement_of_social_partners_description

('involvement_of_social_partners_description', 'Relative freq')

N grams without stopwords for involvement_of_social_partners_description

('involvement_of_social_partners_description', 'Relative freq')

Noun phrases for involvement_of_social_partners_description

('involvement_of_social_partners_description', 'Relative freq')

Named entities for involvement_of_social_partners_description

('involvement_of_social_partners_description', 'Relative freq')

TOP 10 TF-IDF for involvement_of_social_partners_description

('involvement_of_social_partners_description', 'TF-IDF')