In [16]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from tqdm.notebook import tqdm
import math
from difflib import SequenceMatcher

nlp = spacy.load("nl_core_news_lg")
from util.conferences import get_conference_data, correct_cwd

In [17]:
TF_IDF_CSV_PATH = 'output/tf_idf.csv'
TF_CSV_PATH = 'output/tf.csv'
CORPUS_PATH = 'output/corpus.csv'

In [18]:
correct_cwd()

In [19]:
def calculate_tfidf(text_by_speaker: tuple, include_idf=True) -> tuple:
    """

    Calculates the tfidf per speaker per conference

    :return: a tuple containing Rutte texts and De Jonge texts respectively

    """
    corpus = pd.DataFrame(columns=['word'])
    nr_of_conferences = len(text_by_speaker[0])

    # Create Dataframe with Word Counts
    for i in tqdm(range(nr_of_conferences)):
        full_conference_text = text_by_speaker[0][i]['text'] + text_by_speaker[1][i]['text']
        word = [token.lemma_ for token in nlp(full_conference_text) if
                 not (token.is_stop or token.is_punct or token.is_space)]

        word_count = Counter(word)

        new_word = list(set(word_count.keys()) - set(corpus['word']))
        corpus = corpus.append(pd.DataFrame({'word': new_word}), ignore_index=True)

        wordlist = []
        for word in corpus['word']:
            if word in word_count.keys():
                wordlist.append(word_count[word])
            else:
                wordlist.append(0)

        corpus[text_by_speaker[0][i]['date']] = wordlist

    corpus.set_index('word', inplace=True)
    corpus.fillna(0, inplace=True)

    corpus.to_csv(CORPUS_PATH)

    tf_idf = {k: [] for k in corpus.columns}

    # Create Dataframe with Relative Word Frequencies
    for index, row in tqdm(corpus.iterrows(), total=len(corpus)):
        docs_with = np.count_nonzero(row)

        for colname, count in row.items():
            total_uniques = np.count_nonzero(corpus[colname])
            tf = count / total_uniques
            result = tf
            if include_idf:
                idf = math.log(len(corpus.columns) / docs_with)
                result *= idf

            tf_idf[colname].append(result)

    tf_idf_df = pd.DataFrame.from_dict(tf_idf)
    tf_idf_df.set_index(corpus.index, inplace=True)

    if include_idf:
        tf_idf_df.to_csv(TF_IDF_CSV_PATH)
        print(f"Saved results to 'output/corpus.csv and '{TF_IDF_CSV_PATH}'")
    else:
        tf_idf_df.to_csv(TF_CSV_PATH)
        print(f"Saved results to 'output/corpus.csv' and '{TF_CSV_PATH}'")



In [20]:
if not os.path.isfile(TF_IDF_CSV_PATH) or True:
    calculate_tfidf(get_conference_data(include_journalist_questions=True), include_idf=True)
    calculate_tfidf(get_conference_data(include_journalist_questions=True), include_idf=False)

  0%|          | 0/62 [00:00<?, ?it/s]

  0%|          | 0/10079 [00:00<?, ?it/s]

Saved results to 'output/corpus.csv and 'output/tf_idf.csv'


  0%|          | 0/62 [00:00<?, ?it/s]

  0%|          | 0/10079 [00:00<?, ?it/s]

Saved results to 'output/corpus.csv' and 'output/tf.csv'


In [21]:
tf_idf_matrix = pd.read_csv(TF_IDF_CSV_PATH)
tf_idf_matrix

Unnamed: 0,word,2020-03-06,2020-03-09,2020-03-12,2020-03-13,2020-03-15,2020-03-17,2020-03-19,2020-03-20,2020-03-23,...,2021-07-09,2021-07-13,2021-08-13,2021-09-14,2021-11-03,2021-11-12,2021-11-27,2021-12-14,2021-12-18,2022-01-14
0,bruinsen,0.005654,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,raad,0.004280,0.0,0.0,0.001831,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.002504,0.000000,0.00000,0.000000,0.001126,0.000000,0.000000,0.000000,0.000000
2,hopen,0.000279,0.0,0.0,0.000358,0.0,0.0,0.000461,0.000352,0.000399,...,0.000814,0.000000,0.001005,0.00121,0.000442,0.000660,0.000557,0.000809,0.000409,0.000913
3,ontzettend,0.001489,0.0,0.0,0.001275,0.0,0.0,0.002464,0.003762,0.000000,...,0.001933,0.000871,0.000000,0.00000,0.000000,0.000784,0.000849,0.000393,0.000000,0.000000
4,triest,0.007509,0.0,0.0,0.000000,0.0,0.0,0.004140,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10074,crowdmanagement,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004627
10075,22,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004627
10076,meubelwinkel,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004627
10077,naja,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004627


In [22]:
def similar_in_meaning(token_0, token_2):
    return token_0.similarity(token_2) > 0.9

def merge_similar_words(df):
    for i, row in df.iterrows():
        token = nlp(row.word)[0]
        if token.has_vector:
            for j, row_other in df.iloc[i + 1:].iterrows():
                token_other = nlp(row_other.word)[0]
                if token_other.has_vector and similar_in_meaning(token, token_other):
                    print(similar_in_meaning(token, token_other))
                    print(f"{i}: {row.word} is similar to {j}: {row_other.word}")
                    # merge
                    df.loc[i] += df.loc[j]
                    df.loc[i][0] = row.word
                    df.drop(j, inplace=True)


# reduced_tf_idf = tf_idf_matrix.copy()
# merge_similar_words(reduced_tf_idf)
# tf_idf_matrix.to_csv('corrected_tf_idf')

In [23]:
WORD_COUNT = 20

for column in tf_idf_matrix.columns[1:]:
    freqs = tf_idf_matrix[column]
    sorted_by_column = tf_idf_matrix.sort_values(by=[column], ascending=False)
    print(sorted_by_column[['word', column]].head(WORD_COUNT))
    print('\n')

              word  2020-03-06
171        turkije    0.056449
616         grieks    0.033922
569           2016    0.023520
7      griekenland    0.022614
537        stootje    0.016961
479       tekening    0.016961
432     humanitair    0.016961
318   staatsbezoek    0.016961
152  deskundigheid    0.015018
711    nabestaande    0.014025
307       regering    0.013608
146      nationaal    0.012961
647          sfeer    0.012497
683    slachtoffer    0.012446
101          grens    0.012226
483      vraagstuk    0.011864
572  gerechtigheid    0.011307
524         lassen    0.011307
612            5.5    0.011307
718   persaandacht    0.011307


                 word  2020-03-09
794           brabant    0.099653
103          schudden    0.035257
875       nuchterheid    0.033737
813    redelijkerwijs    0.033737
740          maatwerk    0.031323
296         indamfase    0.029873
768              mild    0.029829
730               zev    0.027898
843             bosch    0.022491
837    