In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from tqdm.notebook import tqdm
import math
from difflib import SequenceMatcher

nlp = spacy.load("nl_core_news_lg")
from util.conferences import get_conference_data, correct_cwd

In [2]:
TF_IDF_CSV_PATH = 'output/tf_idf.csv'
TF_CSV_PATH = 'output/tf.csv'
CORPUS_PATH = 'output/corpus.csv'

In [3]:
correct_cwd()

In [4]:
def calculate_tfidf(text_by_speaker: tuple, include_idf=True) -> tuple:
    """

    Calculates the tfidf per speaker per conference

    :return: a tuple containing Rutte texts and De Jonge texts respectively

    """
    corpus = pd.DataFrame(columns=['word'])
    nr_of_conferences = len(text_by_speaker[0])

    # Create Dataframe with Word Counts
    for i in tqdm(range(nr_of_conferences)):
        full_conference_text = text_by_speaker[0][i]['text'] + text_by_speaker[1][i]['text']
        word = [token.lemma_ for token in nlp(full_conference_text) if
                 not (token.is_stop or token.is_punct or token.is_space)]

        word_count = Counter(word)

        new_word = list(set(word_count.keys()) - set(corpus['word']))
        corpus = corpus.append(pd.DataFrame({'word': new_word}), ignore_index=True)

        wordlist = []
        for word in corpus['word']:
            if word in word_count.keys():
                wordlist.append(word_count[word])
            else:
                wordlist.append(0)

        corpus[text_by_speaker[0][i]['date']] = wordlist

    corpus.set_index('word', inplace=True)
    corpus.fillna(0, inplace=True)

    corpus.to_csv(CORPUS_PATH)

    tf_idf = {k: [] for k in corpus.columns}

    # Create Dataframe with Relative Word Frequencies
    for index, row in tqdm(corpus.iterrows(), total=len(corpus)):
        docs_with = np.count_nonzero(row)

        for colname, count in row.items():
            total_uniques = np.count_nonzero(corpus[colname])
            tf = count / total_uniques
            result = tf
            if include_idf:
                idf = math.log(len(corpus.columns) / docs_with)
                result *= idf

            tf_idf[colname].append(result)

    tf_idf_df = pd.DataFrame.from_dict(tf_idf)
    tf_idf_df.set_index(corpus.index, inplace=True)

    if include_idf:
        tf_idf_df.to_csv(TF_IDF_CSV_PATH)
        print(f"Saved results to 'output/corpus.csv and '{TF_IDF_CSV_PATH}'")
    else:
        tf_idf_df.to_csv(TF_CSV_PATH)
        print(f"Saved results to 'output/corpus.csv' and '{TF_CSV_PATH}'")



In [5]:
if not os.path.isfile(TF_IDF_CSV_PATH) or True:
    calculate_tfidf(get_conference_data(include_journalist_questions=True), include_idf=True)
    calculate_tfidf(get_conference_data(include_journalist_questions=True), include_idf=False)

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/10078 [00:00<?, ?it/s]

Saved results to 'output/corpus.csv and 'output/tf_idf.csv'


  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/10078 [00:00<?, ?it/s]

Saved results to 'output/corpus.csv' and 'output/tf.csv'


In [6]:
tf_idf_matrix = pd.read_csv(TF_IDF_CSV_PATH)
tf_idf_matrix

Unnamed: 0,word,2020-03-06,2020-03-09,2020-03-12,2020-03-13,2020-03-19,2020-03-20,2020-03-23,2020-03-25,2020-03-27,...,2021-07-09,2021-07-13,2021-08-13,2021-09-14,2021-11-03,2021-11-12,2021-11-27,2021-12-14,2021-12-18,2022-01-14
0,voelen,0.000700,0.011135,0.000000,0.000000,0.000780,0.001768,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000810,0.001108,0.001841,0.000399,0.000738,0.000411,0.005727
1,deskundig,0.002608,0.005188,0.002538,0.004464,0.000581,0.007466,0.0,0.000964,0.000000,...,0.000338,0.0,0.000342,0.000302,0.000275,0.000000,0.001487,0.000825,0.000306,0.000000
2,aanstaande,0.001575,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.014556,0.003071,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000898,0.000000,0.001849,0.000000
3,reisadvies,0.003987,0.000000,0.000000,0.000000,0.000000,0.001679,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.001052,0.000000,0.002274,0.000000,0.000000,0.000000
4,aandacht,0.003598,0.000000,0.000000,0.002053,0.005346,0.000000,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000684,0.001265,0.002111,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10073,visite,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004590
10074,a.,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004590
10075,risiconemen,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004590
10076,afwijzen,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004590


In [7]:
def similar_in_meaning(token_0, token_2):
    return token_0.similarity(token_2) > 0.9

def merge_similar_words(df):
    for i, row in df.iterrows():
        token = nlp(row.word)[0]
        if token.has_vector:
            for j, row_other in df.iloc[i + 1:].iterrows():
                token_other = nlp(row_other.word)[0]
                if token_other.has_vector and similar_in_meaning(token, token_other):
                    print(similar_in_meaning(token, token_other))
                    print(f"{i}: {row.word} is similar to {j}: {row_other.word}")
                    # merge
                    df.loc[i] += df.loc[j]
                    df.loc[i][0] = row.word
                    df.drop(j, inplace=True)


# reduced_tf_idf = tf_idf_matrix.copy()
# merge_similar_words(reduced_tf_idf)
# tf_idf_matrix.to_csv('corrected_tf_idf')

In [8]:
WORD_COUNT = 20

for column in tf_idf_matrix.columns[1:]:
    freqs = tf_idf_matrix[column]
    sorted_by_column = tf_idf_matrix.sort_values(by=[column], ascending=False)
    print(sorted_by_column[['word', column]].head(WORD_COUNT))
    print('\n')

              word  2020-03-06
502        turkije    0.055910
458         grieks    0.033652
678           2016    0.023296
622    griekenland    0.022435
186   staatsbezoek    0.016826
403        stootje    0.016826
465       tekening    0.016826
464     humanitair    0.016826
218  deskundigheid    0.014839
189    nabestaande    0.013801
397       regering    0.013293
252      nationaal    0.012602
342    slachtoffer    0.012311
270          sfeer    0.012272
637          grens    0.011821
110      vraagstuk    0.011505
590    competentie    0.011217
686    strafproces    0.011217
568  gerechtigheid    0.011217
507             vn    0.011217


                 word  2020-03-09
861           brabant    0.097241
39           schudden    0.034363
852       nuchterheid    0.033469
885    redelijkerwijs    0.033469
770          maatwerk    0.030698
145         indamfase    0.029516
847              mild    0.029293
887               zev    0.027451
892             bosch    0.022313
231    