In [24]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from tqdm.notebook import tqdm
import math
from difflib import SequenceMatcher

nlp = spacy.load("nl_core_news_lg")
from util.conferences import get_conference_data, correct_cwd

In [25]:
# Locations of the CSV files written by calculate_tfidf (relative paths).
# Weights with the idf factor applied (include_idf=True).
TF_IDF_CSV_PATH = 'output/tf_idf.csv'
# Plain term-frequency weights (include_idf=False).
TF_CSV_PATH = 'output/tf.csv'
# Raw lemma counts per conference.
CORPUS_PATH = 'output/corpus.csv'

In [26]:
# NOTE(review): presumably adjusts the working directory so that the relative
# 'output/...' paths resolve - confirm in util.conferences.
correct_cwd()

In [27]:
def calculate_tfidf(text_by_speaker: tuple, include_idf=True) -> None:
    """
    Calculates the tf(-idf) weight of every lemma per conference and saves it as CSV.

    The texts of both speakers are concatenated per conference, lemmatised with the
    module-level spaCy pipeline ``nlp`` and counted, ignoring stop words, punctuation
    and whitespace tokens. The raw counts are written to CORPUS_PATH; the weights are
    written to TF_IDF_CSV_PATH (``include_idf=True``) or TF_CSV_PATH
    (``include_idf=False``).

    :param text_by_speaker: a tuple containing Rutte texts and De Jonge texts
        respectively; both are indexed by conference and every entry provides a
        'text' and a 'date' key (the 'date' of the first speaker's entry names the column)
    :param include_idf: when True the term frequency is multiplied by
        ``log(nr_of_conferences / nr_of_conferences_containing_the_lemma)``
    :return: None, the results are only written to disk
    """
    nr_of_conferences = len(text_by_speaker[0])

    # Count the lemmas of every conference
    counts_by_date = {}
    for i in tqdm(range(nr_of_conferences)):
        full_conference_text = text_by_speaker[0][i]['text'] + text_by_speaker[1][i]['text']
        lemmas = [token.lemma_ for token in nlp(full_conference_text) if
                  not (token.is_stop or token.is_punct or token.is_space)]
        counts_by_date[text_by_speaker[0][i]['date']] = Counter(lemmas)

    # Vocabulary in order of first appearance; dict.fromkeys removes duplicates while
    # keeping that order, so the row order is the same on every run.
    vocabulary = list(dict.fromkeys(
        lemma for counts in counts_by_date.values() for lemma in counts))

    # Build the count matrix once (rows: lemmas, columns: conferences) instead of
    # growing a DataFrame inside the loop. A Counter yields 0 for absent lemmas.
    # Counts are stored as float so corpus.csv keeps its '3.0' style formatting.
    corpus = pd.DataFrame(
        {date: [counts[lemma] for lemma in vocabulary]
         for date, counts in counts_by_date.items()},
        index=pd.Index(vocabulary, name='word'),
        dtype=float,
    )
    corpus.to_csv(CORPUS_PATH)

    contains_lemma = corpus != 0

    # NOTE: the term frequency is normalised by the number of *distinct* lemmas of a
    # conference (its non-zero rows), not by its total number of tokens.
    weights = corpus / contains_lemma.sum(axis=0)

    if include_idf:
        nr_of_documents = len(corpus.columns)
        # Every lemma of the vocabulary occurs in at least one conference, so the
        # divisor is never zero.
        idf = contains_lemma.sum(axis=1).map(
            lambda docs_with: math.log(nr_of_documents / docs_with))
        weights = weights.mul(idf, axis=0)

    output_path = TF_IDF_CSV_PATH if include_idf else TF_CSV_PATH
    weights.to_csv(output_path)
    print(f"Saved results to '{CORPUS_PATH}' and '{output_path}'")



In [28]:
# Both weight files are derived from the same conference data, so they are
# regenerated together as soon as either of them is missing. The data is
# loaded once and shared by both calls.
if not (os.path.isfile(TF_IDF_CSV_PATH) and os.path.isfile(TF_CSV_PATH)):
    conference_data = get_conference_data(include_journalist_questions=True)
    calculate_tfidf(conference_data, include_idf=True)
    calculate_tfidf(conference_data, include_idf=False)

In [29]:
# Load the tf-idf weights written by calculate_tfidf. No index_col is passed, so
# 'word' is read as an ordinary column next to one score column per conference.
tf_idf_matrix = pd.read_csv(TF_IDF_CSV_PATH)
# Bare last expression: the notebook renders the frame as the cell output.
tf_idf_matrix

Unnamed: 0,word,2020-09-01,2020-09-18,2020-09-28,2020-10-13,2020-10-27,2020-11-03,2020-11-17,2020-12-08,2021-01-12,...,2021-07-09,2021-07-13,2021-08-13,2021-09-14,2021-11-03,2021-11-12,2021-11-27,2021-12-14,2021-12-18,2022-01-14
0,inzet,0.001829,0.00000,0.001929,0.000000,0.000000,0.000000,0.0,0.000000,0.000850,...,0.001893,0.000000,0.000000,0.000844,0.001540,0.000768,0.000000,0.001539,0.000000,0.000000
1,voorstaan,0.001702,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.001672,0.000000,...,0.001761,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,raken,0.000123,0.00032,0.000194,0.000053,0.000124,0.000304,0.0,0.000362,0.000171,...,0.000572,0.000344,0.000192,0.000283,0.000465,0.000464,0.000223,0.000155,0.000057,0.000481
3,prioriteit,0.001106,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.001087,0.001029,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.002888
4,behandelfase,0.002893,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7745,crowdmanagement,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.003775
7746,zwaarstwegend,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.003775
7747,platleggen,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.003775
7748,71,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.003775


In [30]:
def similar_in_meaning(token_0, token_2, threshold=0.9):
    """
    Tells whether two tokens are close enough in meaning to count as the same word.

    :param token_0: object exposing ``similarity(other)``, e.g. a spaCy token
    :param token_2: the token ``token_0`` is compared with
    :param threshold: score that has to be exceeded (exclusive), 0.9 by default
    :return: True when ``token_0.similarity(token_2)`` is strictly greater than ``threshold``
    """
    return token_0.similarity(token_2) > threshold

def merge_similar_words(df, pipeline=None, is_similar=None):
    """
    Merges rows whose words are similar in meaning, modifying ``df`` in place.

    Rows are visited in order. Every later row whose word is similar to the word of
    the current row is absorbed: its numeric columns are added to the current row and
    the absorbed row is removed. The current row keeps its own word. Words that are
    not strings, that yield an empty document or whose first token has no vector are
    left untouched.

    :param df: frame with a 'word' column and numeric score columns; index labels are
        assumed to be unique (as produced by ``pd.read_csv`` without ``index_col``)
    :param pipeline: callable turning a word into a sequence of tokens; defaults to
        the module-level spaCy pipeline ``nlp``
    :param is_similar: predicate taking two tokens; defaults to ``similar_in_meaning``
    :return: None, ``df`` is changed in place
    """
    if pipeline is None:
        pipeline = nlp
    if is_similar is None:
        is_similar = similar_in_meaning

    # Parse every word exactly once instead of once per compared pair.
    candidates = []
    for label, word in df['word'].items():
        if not isinstance(word, str):
            continue  # e.g. NaN read from an empty CSV cell
        doc = pipeline(word)
        if len(doc) > 0 and doc[0].has_vector:
            candidates.append((label, word, doc[0]))

    # Only the numeric columns are summed; the 'word' column must stay as it is.
    score_columns = [column for column in df.select_dtypes(include='number').columns
                     if column != 'word']

    absorbed = set()
    for position, (label, word, token) in enumerate(candidates):
        if label in absorbed:
            continue  # this row was already merged into an earlier one
        for other_label, other_word, other_token in candidates[position + 1:]:
            if other_label in absorbed or not is_similar(token, other_token):
                continue
            print(f"{label}: {word} is similar to {other_label}: {other_word}")
            if score_columns:
                df.loc[label, score_columns] = (
                    df.loc[label, score_columns] + df.loc[other_label, score_columns])
            absorbed.add(other_label)

    # Rows are removed only after the scan, so nothing is dropped while it is
    # still being iterated over.
    df.drop(index=list(absorbed), inplace=True)


# Optional post-processing step, currently disabled:
# reduced_tf_idf = tf_idf_matrix.copy()
# merge_similar_words(reduced_tf_idf)
# reduced_tf_idf.to_csv('corrected_tf_idf')

In [31]:
WORD_COUNT = 20  # number of highest-scoring words shown per conference

# Every column except 'word' holds the scores of one conference.
for column in tf_idf_matrix.columns.drop('word'):
    # Only the two displayed columns are sorted; the row order is the same as
    # when sorting the whole frame by this column.
    top_words = (
        tf_idf_matrix[['word', column]]
        .sort_values(by=column, ascending=False)
        .head(WORD_COUNT)
    )
    print(top_words)
    print('\n')

                      word  2020-09-01
680              richtlijn    0.027287
978             preventief    0.023388
171         asymptomatisch    0.017541
165                  boete    0.013432
1067                 regio    0.012262
458              evaluatie    0.012211
685         voortschrijden    0.012081
503             navolgbaar    0.011571
1114  beschermingsmiddelen    0.010211
74            laboratorium    0.009190
1017            innovatief    0.009190
232                    vog    0.008679
868                 heilig    0.008679
191                 extern    0.008679
579               uitbraak    0.008232
251              opvatting    0.008121
791                 vergen    0.007745
1004                 proef    0.007327
867                 waarde    0.007327
598                  razen    0.006892


                     word  2020-09-18
680             richtlijn    0.038590
1438             vluchtig    0.024117
1067                regio    0.022361
681             schaarste  