# EDA on textual data

In [None]:
def get_str_columns(df : pd.DataFrame):
    """Return the names of the columns whose first value is a string.

    Only the first row of each column is inspected.

    Args:
        df: Frame whose columns are classified.

    Returns:
        List of column labels, in ``df.columns`` order. Empty when ``df``
        has no rows or no columns.
    """
    # Without a first row there is nothing to inspect.
    if df.empty:
        return []
    # ``iloc[0]`` reads the first row by position. ``df[column][0]`` is a label
    # lookup and fails whenever the index does not contain the label 0.
    return [column for column in df.columns
            if isinstance(df[column].iloc[0], str)]


In [None]:
def plot_sentence_length_histogram(df : pd.DataFrame, title : str):
    """Build a histogram of the entry lengths reported by ``df.str.len()``.

    Args:
        df: Object exposing the pandas ``.str`` accessor.
        title: Label appended to the chart title.

    Returns:
        A one-element list holding the plotly express histogram figure.
    """
    lengths = pd.Series(df.str.len())
    figure = px.histogram(lengths, title='Sentence len for: ' + title)
    return [figure]


In [None]:
def plot_top_non_stopwords_barchart(text,title : str):
    """Chart the most frequent tokens of ``text`` that are not English stopwords.

    Every entry is split on whitespace and its tokens are lower-cased.
    The 40 most frequent tokens are selected first and the stopwords are
    removed from that selection afterwards, so the chart may contain fewer
    than 40 bars.

    Args:
        text: Object exposing the pandas ``.str`` accessor.
        title: Label appended to the chart title.

    Returns:
        A one-element list holding the plotly express bar chart
        (counts on the x axis, tokens on the y axis).
    """
    english_stopwords = set(stopwords.words('english'))
    token_lists = text.str.split().values.tolist()
    # Entries that could not be split (e.g. missing values) are not iterable
    # and are skipped.
    tokens = [
        str.lower(token)
        for token_list in token_lists
        if isinstance(token_list, Iterable)
        for token in token_list
    ]
    kept = [
        (token, frequency)
        for token, frequency in Counter(tokens).most_common()[:40]
        if token not in english_stopwords
    ]
    words = [token for token, _ in kept]
    frequencies = [frequency for _, frequency in kept]
    figure = px.bar(x=frequencies, y=words, title='Word frec for : ' + title)
    return [figure]

In [None]:
def plot_top_ngrams_barchart(text,title : str ,n=2):
    """Chart the ten most frequent word n-grams found in ``text``.

    Args:
        text: Iterable of documents; entries that are not ``str`` are ignored.
        title: Label appended to the chart title.
        n: Number of words per n-gram (passed as ``ngram_range=(n, n)``).

    Returns:
        A one-element list holding the plotly express bar chart, or an empty
        list when the chart could not be built (the error is printed).
    """
    plots = []
    # Whole documents are vectorised: n-grams with n >= 2 can only be found in
    # multi-word strings, so a corpus of single words yields an empty
    # vocabulary. CountVectorizer lower-cases its input by default.
    documents = [row for row in text if isinstance(row, str)]
    try:
        vectorizer = CountVectorizer(ngram_range=(n, n)).fit(documents)
        totals = vectorizer.transform(documents).sum(axis=0)
        ngram_counts = sorted(
            ((ngram, totals[0, idx]) for ngram, idx in vectorizer.vocabulary_.items()),
            key=lambda pair: pair[1],
            reverse=True,
        )[:10]
        ngrams, counts = map(list, zip(*ngram_counts))
        plots.append(px.bar(x=counts, y=ngrams, title='Top '+str(n)+' grams : '+title))
    except Exception as err:
        # Best effort: one failing column must not abort the whole report,
        # but the cause is reported instead of being hidden.
        print("Some error for textual data : ", title, "-", err)
    return plots


In [None]:
def plot_named_entity_barchart(text,title:str):
    """Chart how often each named-entity label occurs in ``text``.

    Every ``str`` entry is processed with the spaCy ``en_core_web_sm``
    pipeline and the labels of the detected entities are counted.

    Args:
        text: Iterable of documents; entries that are not ``str`` are ignored.
        title: Label appended to the chart title.

    Returns:
        A one-element list holding the plotly express bar chart, or an empty
        list when no entity was found.
    """
    pipeline = spacy.load("en_core_web_sm")
    labels = [
        entity.label_
        for row in text
        if type(row) is str
        for entity in pipeline(row).ents
    ]
    if not labels:
        return []
    ranked = Counter(labels).most_common()
    label_names = [name for name, _ in ranked]
    label_counts = [total for _, total in ranked]
    return [px.bar(x=label_counts, y=label_names, title='Named entity : ' + title)]

In [None]:
def plot_most_common_named_entity_barchart(text,title, entity="ORG"):
    """Chart the ten most frequent entity texts carrying the label ``entity``.

    Every ``str`` entry is processed with the spaCy ``en_core_web_sm``
    pipeline; the lower-cased text of each entity whose label equals
    ``entity`` is counted.

    Args:
        text: Iterable of documents; entries that are not ``str`` are ignored.
        title: Label appended to the chart title.
        entity: Entity label to keep.

    Returns:
        A one-element list holding the plotly express bar chart, or an empty
        list when no matching entity was found.
    """
    pipeline = spacy.load("en_core_web_sm")
    mentions = [
        found.text.lower()
        for row in text
        if type(row) is str
        for found in pipeline(row).ents
        if found.label_ == entity
    ]
    if not mentions:
        return []
    ranked = Counter(mentions).most_common(10)
    mention_texts = [mention for mention, _ in ranked]
    mention_counts = [total for _, total in ranked]
    chart_title = 'Most common named entity ['+entity+'] : '+title
    return [px.bar(x=mention_counts, y=mention_texts, title=chart_title)]

In [None]:
def plot_most_common_noun_phrases_barchart(text,title):
    """Chart the ten most frequent noun phrases found in ``text``.

    Every ``str`` entry is processed with the spaCy ``en_core_web_sm``
    pipeline and the lower-cased text of each noun chunk is counted.

    Args:
        text: Iterable of documents; entries that are not ``str`` are ignored.
        title: Label appended to the chart title.

    Returns:
        A one-element list holding the plotly express bar chart, or an empty
        list when no noun phrase was found.
    """
    nlp = spacy.load("en_core_web_sm")
    noun_phrases = []
    for row in text:
        if not isinstance(row, str):
            continue
        for noun_phrase in nlp(row).noun_chunks:
            noun_phrases.append(noun_phrase.text.lower())
    plots = []
    if noun_phrases:
        ranked = Counter(noun_phrases).most_common(10)
        phrases, counts = map(list, zip(*ranked))
        plots.append(px.bar(x=counts, y=phrases, title='Most common noun phrases: '+title))
    return plots

In [None]:
def eda_on_text_data(df: pd.DataFrame):
    """Run the textual analyses on every string column of ``df``.

    The string columns are selected with ``get_str_columns``; for each of
    them the enabled plot builders are submitted to a thread pool and their
    figures are gathered as they complete.

    Args:
        df: Frame holding the textual columns.

    Returns:
        List of all figures produced for all string columns.
    """
    str_column_names = get_str_columns(df)
    plots = []
    pbar = tqdm(str_column_names)
    # One pool serves all columns instead of creating a new pool per column.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for column_name in pbar:
            # The second positional argument of set_description is ``refresh``,
            # so the column name has to be part of the description string.
            pbar.set_description('Eda on text data : ' + str(column_name))
            column = df[column_name]
            futures = [
                executor.submit(plot_top_non_stopwords_barchart, column, column_name),
                # Further analyses, currently disabled:
                # executor.submit(plot_top_ngrams_barchart, column, column_name, 3),
                # executor.submit(plot_named_entity_barchart, column, column_name),
                # executor.submit(plot_most_common_named_entity_barchart, column, column_name, 'ORG'),
                # executor.submit(plot_most_common_noun_phrases_barchart, column, column_name),
            ]
            # Collect this column's figures before moving on to the next one.
            for future in concurrent.futures.as_completed(futures):
                plots += future.result()
    return plots

In [None]:
def execute_eda(src_path : path.PosixPath, output_path:path.PosixPath):
    """Run the categorical and textual EDA on a JSON dataset and publish the plots.

    Args:
        src_path: Path of the JSON file read with ``pd.read_json``.
        output_path: Path of the HTML report to write. When ``None`` the
            figures are rendered with ``plot.show()`` instead.

    Returns:
        None. A message is printed, and nothing else is done, when
        ``src_path`` does not exist or when the directory of ``output_path``
        does not exist.
    """
    if not src_path.exists():
        print('This src_path:',src_path,'is invalid!')
        return
    # Validate the destination before the expensive analysis runs. Only the
    # parent directory has to exist: the report file itself is created by
    # open(..., 'w').
    if output_path is not None and not output_path.parent.exists():
        print('This output_path:',output_path,'is invalid!')
        return

    df = pd.read_json(src_path)
    specific_categorical_columns = ['country', 'category', 'subcategory',
                                    'actors', 'target_groups', 'funding', 'sectors']
    specific_textual_columns = ['title',
                                'background_info_description',
                                'content_of_measure_description',
                                'use_of_measure_description',
                                'involvement_of_social_partners_description']
    plots = []
    # The two analyses are independent, so they are submitted together and
    # their figures are collected in completion order.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(eda_on_categorical_data, df[specific_categorical_columns]),
                   executor.submit(eda_on_text_data, df[specific_textual_columns])]
        for future in concurrent.futures.as_completed(futures):
            plots += future.result()

    pbar = tqdm(plots)
    if output_path is None:
        pbar.set_description('Render plots in jupyter notebook.')
        for plot in pbar:
            plot.show()
        return

    # Explicit UTF-8 so the report does not depend on the platform default
    # encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        pbar.set_description('Generate HTML report.')
        for plot in pbar:
            f.write(plot.to_html(full_html=False, include_plotlyjs='cdn'))


In [None]:
class EdaPWDB():
    """Exploratory data analysis of a JSON dataset.

    Usage: construct with the dataset path, call ``execute()`` to build the
    figures, then ``show()`` or ``export_to_html()`` to publish them.

    Attributes:
        df: Frame read from ``src_path``, or ``None`` when the path is invalid.
        plots: Figures produced by the last ``execute()`` call.
    """

    # Columns analysed by eda_on_categorical_data.
    CATEGORICAL_COLUMNS = ['country', 'category', 'subcategory',
                           'actors', 'target_groups', 'funding']
    # Columns analysed by eda_on_text_data.
    TEXTUAL_COLUMNS = ['title',
                       'background_info_description',
                       'content_of_measure_description',
                       'use_of_measure_description',
                       'involvement_of_social_partners_description']

    def __init__(self, src_path : path.PosixPath):
        """Load the dataset.

        Args:
            src_path: Path of the JSON file read with ``pd.read_json``. When it
                does not exist a message is printed and ``df`` stays ``None``.
        """
        # Always define the attributes so the other methods never hit an
        # AttributeError after a failed load.
        self.plots = []
        self.df = None
        if src_path.exists():
            self.df = pd.read_json(src_path)
        else:
            print('This src_path:',src_path,'is invalid!')

    def execute(self):
        """Run the categorical and textual EDA and store the figures in ``plots``.

        Figures from a previous call are replaced, so running the method
        again does not duplicate them.
        """
        if self.df is None:
            print('No data loaded, nothing to analyse.')
            return
        plots = []
        # The two analyses are independent, so they are submitted together and
        # their figures are collected in completion order.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(eda_on_categorical_data, self.df[self.CATEGORICAL_COLUMNS]),
                       executor.submit(eda_on_text_data, self.df[self.TEXTUAL_COLUMNS])]
            for future in concurrent.futures.as_completed(futures):
                plots += future.result()
        self.plots = plots

    def show(self):
        """Render every stored figure with ``plot.show()``."""
        progress_bar = tqdm(self.plots)
        progress_bar.set_description('Render plots in jupyter notebook.')
        for plot in progress_bar:
            plot.show()

    def export_to_html(self,output_path : path.PosixPath):
        """Write every stored figure to one HTML file.

        Args:
            output_path: Path of the report; the file is created or overwritten.
        """
        progress_bar = tqdm(self.plots)
        # Explicit UTF-8 so the report does not depend on the platform default
        # encoding.
        with open(output_path, 'w', encoding='utf-8') as f:
            progress_bar.set_description('Generate HTML report.')
            for plot in progress_bar:
                f.write(plot.to_html(full_html=False, include_plotlyjs='cdn'))



In [None]:
#execute_eda(SRC_FILE_PATH,None)


In [None]:
# Load the dataset at SRC_FILE_PATH (defined outside this section) into an EdaPWDB helper.
eda_pwdb = EdaPWDB(SRC_FILE_PATH)


In [None]:
# Run the categorical and textual analyses; the resulting figures are collected in eda_pwdb.plots.
eda_pwdb.execute()

In [None]:
#eda_pwdb.show()
#eda_pwdb.export_to_html(OUTPUT_FILE_PATH)


# EDA on categorical data

In [None]:
def get_list_from_series(series : pd.Series):
    """Flatten the list-valued entries of ``series`` into a single list.

    Entries whose type is not exactly ``list`` are skipped.

    Args:
        series: Iterable of entries, typically a column of lists.

    Returns:
        The items of all list entries, in encounter order.
    """
    return [item for entry in series if type(entry) is list for item in entry]

In [None]:
def prepare_series_from_dataframe(df : pd.DataFrame):
    """Flatten every list-valued column of ``df``.

    A column is treated as list-valued when its first value is a list; only
    the first row is inspected.

    Args:
        df: Frame whose columns are examined.

    Returns:
        Dict mapping each list-valued column label to the flat list built by
        ``get_list_from_series``. Empty when no column qualifies or ``df`` has
        no rows.
    """
    result_dict = {}
    for column in df.columns:
        series = df[column]
        # Duplicate labels make df[column] a DataFrame, and an empty column
        # has no first value to inspect; both are skipped.
        if not isinstance(series, pd.Series) or series.empty:
            continue
        # ``iloc[0]`` reads the first row by position. ``series[0]`` is a label
        # lookup and fails when the index does not contain the label 0.
        if isinstance(series.iloc[0], list):
            result_dict[column] = get_list_from_series(series)
    return result_dict

In [None]:
def eda_on_categorical_data(df : pd.DataFrame):
    """Build a bar chart and a pie chart of value frequencies per categorical column.

    The columns are prepared with ``prepare_series_from_dataframe``; for each
    prepared column the values are counted and charted, most frequent first.

    Args:
        df: Frame holding the categorical columns.

    Returns:
        List of plotly express figures, a bar chart followed by a pie chart
        for every column that could be processed. A column that raises is
        reported with ``print`` and skipped.
    """
    prepared_data = prepare_series_from_dataframe(df)
    plots = []
    pbar = tqdm(prepared_data.keys())
    for key in pbar:
        # The second positional argument of set_description is ``refresh``,
        # so the column name has to be part of the description string.
        pbar.set_description('Eda on categorical data : ' + str(key))
        values = prepared_data[key]
        if isinstance(values, dict):
            continue
        try:
            # most_common() orders by descending count, keeping first-seen
            # order among equal counts.
            ranked = Counter(values).most_common()
            column_stat = pd.DataFrame({
                'Data': [value for value, _ in ranked],
                'Count': [count for _, count in ranked],
            })
            plots.append(px.bar(column_stat, x='Data', y='Count', title=key))
            plots.append(px.pie(column_stat, names='Data', values='Count', title=key))
        except Exception as err:
            # Best effort: one failing column (e.g. unhashable values) must not
            # abort the report, but the cause is reported.
            print('Some error for categorial data : ', key, '-', err)
    return plots