In [1]:
# Silence warnings for the rest of the session so they do not clutter the
# notebook output (presumably that is what shutup.please() does - verify
# against the package documentation).
import shutup
shutup.please()

In [2]:
# importing libraries

# nlp libraries
import spacy
import de_core_news_sm
from wordcloud import WordCloud
from collections import Counter
from textblob_de import TextBlobDE as TextBlob
import string
import re
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
from gensim import corpora
import pyLDAvis
import pandas as pd
import nltk

# visualization libraries
import plotly.subplots as sp
import plotly.graph_objects as go


# interactive widgets
import ipywidgets as widgets
from ipywidgets import Layout



In [3]:
# Fetch NLTK's 'punkt' data package; the trailing ';' hides the call's return
# value in the cell output.
# NOTE(review): no cell visible here calls an NLTK tokenizer - confirm this
# download is still needed.
nltk.download('punkt');

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michaelstark/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# jupyter matplotlib backend magic: render matplotlib figures inline in the cell output
# NOTE(review): the dashboard in this notebook is drawn with plotly - confirm this magic is still required
%matplotlib inline

In [5]:
# function for reading in the dataset according to year input

def read_textfile(year):
    """Load the article dataset of one year.

    Args:
        year: Year used to build the file name ``fem_df{year}.csv``, which is
            resolved relative to the current working directory.

    Returns:
        pandas.DataFrame: The parsed CSV file.
    """
    return pd.read_csv(f'fem_df{year}.csv')


In [6]:
# functions performing nlp - analysis

# Load spaCy's small German pipeline once at module level; the NLP helper
# functions in this notebook all share this `nlp` object.
# (No `global` statement is needed here: at module scope it has no effect.)
nlp = de_core_news_sm.load()


In [7]:
# function for cleaning and splitting input texts for further processing

def file_io(filename):
    """Split a raw article text into cleaned, sentence-like fragments.

    Despite its name, ``filename`` is the article text itself, not a path.
    The text is split on every ``.``. Line breaks inside a fragment are
    replaced by a single space so that the words on either side of the break
    are not fused together, surrounding whitespace is stripped, and fragments
    that end up empty (for example the one after the final full stop) are
    dropped so they are not analysed as spurious sentences.

    NOTE(review): splitting on ``.`` also cuts at abbreviations and decimal
    numbers; a real sentence tokenizer would be more accurate.

    Args:
        filename (str): Text to split.

    Returns:
        list[str]: The non-empty fragments in their original order.
    """
    cleaned = (re.sub(r'\n+', ' ', part).strip() for part in filename.split('.'))
    return [fragment for fragment in cleaned if fragment]


In [8]:
# function that performs pos-tagging on sentence and returns list with pos-tags

def pos_tagger(sent):
    """Tag a text with the shared spaCy pipeline.

    Args:
        sent (str): Text to analyse.

    Returns:
        list[str]: The ``tag_`` attribute of every token, in document order.
    """
    return [tok.tag_ for tok in nlp(sent)]


In [9]:
# function that searches string for named entities and returns list of entities

def recog_ne(sent):
    """Find the named entities of a text with the shared spaCy pipeline.

    Args:
        sent (str): Text to analyse.

    Returns:
        list[str]: The label (``label_``) of every entity found, in document
        order. The labels are returned, not the entity texts.
    """
    return [entity.label_ for entity in nlp(sent).ents]


In [10]:
# function that performs sentiment analysis on a given string and returns polarity

def sent_analysis(sent):
    """Run sentiment analysis on a text.

    Args:
        sent (str): Text to analyse.

    Returns:
        The first element of the ``sentiment`` result of the German TextBlob,
        which the surrounding notebook treats as the polarity.
    """
    sentiment = TextBlob(sent).sentiment
    return sentiment[0]


In [11]:
# function that creates a wordcloud derived from topic modeling

def wc_from_tm(input_string, num_topics=1, random_state=42):
    """Render a word cloud from an LDA topic model fitted on a single text.

    The text is lemmatised with the shared spaCy pipeline, an LDA model with
    ``num_topics`` topics is trained on it, and the top words of the topic
    that dominates the text are drawn as a word cloud.

    Args:
        input_string (str): Text to model.
        num_topics (int): Number of LDA topics to fit.
        random_state (int or None): Seed passed to both the LDA model and the
            word cloud layout so repeated calls give the same image. Pass
            ``None`` for unseeded behaviour.

    Returns:
        The word cloud image as returned by ``WordCloud.to_array()``.

    Raises:
        ValueError: If the text contains no usable tokens.
    """
    doc = nlp(input_string)

    # Keep content-bearing lemmas only. Whitespace tokens (e.g. line breaks)
    # are neither stop words nor punctuation, so they must be excluded
    # explicitly or they take up slots among the topic's top words.
    tokens = [token.lemma_ for token in doc
              if not (token.is_stop or token.is_punct or token.is_space)]
    if not tokens:
        raise ValueError('input_string contains no tokens to build a topic model from')

    # Single-document corpus: one dictionary, one bag-of-words vector.
    dictionary = corpora.Dictionary([tokens])
    bow = [dictionary.doc2bow(tokens)]

    lda_model = LdaModel(bow, num_topics=num_topics, id2word=dictionary,
                         random_state=random_state)

    # Only one image is returned, so render only the most probable topic
    # instead of rendering every topic and keeping whichever came last.
    topic_distribution = lda_model.get_document_topics(bow[0],
                                                       minimum_probability=0.0)
    dominant_topic_id, _ = max(topic_distribution, key=lambda pair: pair[1])

    topic_words = " ".join(word for word, _ in lda_model.show_topic(dominant_topic_id))
    wordcloud = WordCloud(background_color='white',
                          width=400,
                          height=200,
                          max_words=200,
                          max_font_size=50,
                          min_font_size=12,
                          random_state=random_state).generate(topic_words)

    return wordcloud.to_array()

In [12]:
# utility function for creating an ordered, counted dictionary

def count_dict(obj_to_count_from):
    """Count the items of an iterable and order them by frequency.

    Args:
        obj_to_count_from: Iterable of hashable items.

    Returns:
        dict: Maps each distinct item to its count, ordered from most to
        least frequent; items with equal counts keep first-seen order.
    """
    return dict(Counter(obj_to_count_from).most_common())


In [13]:
# function for visualizing the data

def visualize_data(text_num=0, year=2019):
    """Show a 2x2 Plotly dashboard with NLP statistics for one article.

    The panels are: a bar chart of POS-tag counts, a pie chart of named-entity
    labels, a line plot of the sentiment polarity per sentence fragment, and
    a word cloud derived from topic modeling.

    Args:
        text_num (int): Zero-based row position of the article in the
            dataset of ``year``.
        year (int): Year whose dataset (``fem_df{year}.csv``, column
            ``articles_{year}``) is read.

    Returns:
        None. The figure is displayed as a side effect. When ``text_num`` is
        outside the dataset nothing is shown.
    """
    # Read the dataset once per call.
    articles = read_textfile(year)

    # Valid positions are 0 .. len - 1; text_num == len(articles) must be
    # rejected too, otherwise the lookup below fails.
    if not 0 <= text_num < len(articles):
        return

    # Positional lookup, consistent with the positional bounds check above.
    text_to_vis = articles[f'articles_{year}'].iloc[text_num]

    # prepare necessary data
    text_to_analyze = file_io(text_to_vis)

    # POS tags are counted over the whole article in one pass
    pos_tag_count = count_dict(pos_tagger(text_to_vis))

    # named entities and sentiment are computed per sentence fragment
    ne = [label for sen in text_to_analyze for label in recog_ne(sen)]
    counted_na_ent = count_dict(ne)

    sent_an = [sent_analysis(sent) for sent in text_to_analyze]

    # make dataframes for visualizations (dict views are materialised as
    # lists so the columns are plain sequences)
    df_pt = pd.DataFrame({'Pos Tags': list(pos_tag_count.keys()),
                          'Count': list(pos_tag_count.values())})

    df_na = pd.DataFrame({'Named Entities': list(counted_na_ent.keys()),
                          'Count': list(counted_na_ent.values())})

    df_sa = pd.DataFrame({'Sentence No.': list(range(len(sent_an))),
                          'Sentiment Polarity': sent_an})

    wc_array = wc_from_tm(text_to_vis)

    # making the subplots
    fig_sb = sp.make_subplots(rows=2,
                              cols=2,
                              subplot_titles=("Pos Tags",
                                              "Named Entities",
                                              "Sentiment Analysis",
                                              "WordCloud from Topic Modeling"),
                              specs=[[{'type': 'bar'}, {'type': 'domain'}],
                                     [{'type': 'xy'}, {}]])

    # first one = barplot pos_tags; bars are coloured by descending rank
    fig_pt = go.Bar(x=df_pt['Pos Tags'],
                    y=df_pt['Count'],
                    marker_color=list(reversed(range(len(df_pt)))))

    # second one = pieplot named entities
    fig_na = go.Pie(labels=df_na['Named Entities'],
                    values=df_na['Count'])

    # third one = line plot sentiment analysis
    fig_sa = go.Scatter(x=df_sa['Sentence No.'],
                        y=df_sa['Sentiment Polarity'])

    # fourth one = word cloud image
    fig_wc = go.Image(z=wc_array, hoverinfo='none')

    # adding to main plot
    fig_sb.add_trace(fig_pt, row=1, col=1)
    fig_sb.add_trace(fig_na, row=1, col=2)
    fig_sb.add_trace(fig_sa, row=2, col=1)
    fig_sb.add_trace(fig_wc, row=2, col=2)

    # styling plot
    fig_sb.update_xaxes(tickfont_size=10, row=1, col=1)
    fig_sb.update_xaxes(visible=False, row=2, col=1)
    fig_sb.update_xaxes(visible=False, row=2, col=2)

    # fixed y-range for the sentiment panel
    fig_sb.update_yaxes(range=[-1, 1], row=2, col=1)
    fig_sb.update_yaxes(visible=False, row=2, col=2)

    fig_sb.update_annotations(yshift=12,
                              font=dict(family='Courier New, monospace',
                                        size=14, color='Black'))

    fig_sb.update_layout(
        width=1350,
        height=600,
        title='NLP Dashboard for Media Coverage of Femicides in Austria',
        title_font=dict(
            family='Courier New, monospace',
            size=18,
            color='Black'
        ),
        showlegend=False
    )

    return fig_sb.show()

In [14]:
# creating widgets


# bounded integer input selecting the article's row number
text_select = widgets.BoundedIntText(
    description='Article No.',
    value=0,
    min=0,
    max=50,
    step=1,
    disabled=False,
)

# bounded integer input selecting the dataset year (2019 to 2022)
year_select = widgets.BoundedIntText(
    description='Year:',
    value=2019,
    min=2019,
    max=2022,
    step=1,
    disabled=False,
)

<h2 style="text-align:center; font-family:Courier New, monospace; font-weight:bold;">F.E.M.</h2>

In [15]:
# wire both input widgets to the dashboard function; changing either value
# calls visualize_data again with the new selection
int_act = widgets.interact(
    visualize_data,
    text_num=text_select,
    year=year_select,
)

interactive(children=(BoundedIntText(value=0, description='Article No.', max=50), BoundedIntText(value=2019, d…