In [1]:
import math
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from tqdm.notebook import tqdm
import os

# Load the spaCy pipeline once, when this cell runs; calculate_tfidf uses it to
# tokenise and lemmatise the conference texts.
nlp = spacy.load("nl_core_news_sm")
from util.conferences import get_conference_data, correct_cwd

In [None]:
# Output files written by calculate_tfidf; both paths are relative to the
# current working directory.
# TF-IDF score per word (rows) and per conference date (columns).
TF_IDF_CSV_PATH = 'output/tf_idf.csv'
# Raw lemma counts per word (rows) and per conference date (columns).
CORPUS_PATH = 'output/corpus.csv'

In [2]:
# Project helper from util.conferences. Presumably it changes the working
# directory so that the relative 'output/...' paths resolve — TODO confirm
# against its definition.
correct_cwd()

In [3]:
def calculate_tfidf(text_by_speaker: tuple) -> None:
    """
    Build a word-count corpus and a TF-IDF matrix per conference and save both as CSV.

    For every conference the texts of both speakers are joined, lemmatised with
    ``nlp`` and counted; stop words, punctuation and whitespace tokens are
    ignored. The counts are written to ``CORPUS_PATH`` and the TF-IDF scores to
    ``TF_IDF_CSV_PATH``. Both files have one row per lemma (index ``words``) and
    one column per conference date.

    The score of a word in a conference is::

        (count / number of distinct words in the conference)
            * log(number of conferences / number of conferences containing the word)

    Note that the term frequency is normalised by the number of *distinct*
    words of the conference, not by its total number of tokens.

    :param text_by_speaker: a tuple containing Rutte texts and De Jonge texts
        respectively. Both are sequences in the same conference order; entry
        ``i`` of each is a mapping with the ``'text'`` of conference ``i``, and
        the entries of the first sequence also provide its ``'date'``.
    :return: None; the results are only written to disk.
    """
    first_speaker, second_speaker = text_by_speaker[0], text_by_speaker[1]

    # Count lemmas per conference, keyed by date. A repeated date replaces the
    # earlier conference, just like assigning the same column twice would.
    counts_by_date = {}
    for i in tqdm(range(len(first_speaker))):
        # Join with a space so the last word of one speaker and the first word
        # of the other cannot be glued into a single token.
        full_conference_text = ' '.join((first_speaker[i]['text'], second_speaker[i]['text']))
        counts_by_date[first_speaker[i]['date']] = Counter(
            token.lemma_ for token in nlp(full_conference_text)
            if not (token.is_stop or token.is_punct or token.is_space)
        )

    # Vocabulary in order of first appearance (dict keys keep insertion order).
    vocabulary = list(dict.fromkeys(
        word for counts in counts_by_date.values() for word in counts
    ))

    # Build the whole count matrix in one go instead of growing a DataFrame
    # inside the loop (DataFrame.append no longer exists in pandas 2.x).
    # A Counter returns 0 for words that do not occur in that conference.
    corpus = pd.DataFrame(
        {date: [counts[word] for word in vocabulary] for date, counts in counts_by_date.items()},
        index=pd.Index(vocabulary, name='words'),
        dtype='int64',
    )

    # Make sure the target directories exist before writing, so the expensive
    # spaCy pass above is not lost to a missing 'output' folder.
    for path in (CORPUS_PATH, TF_IDF_CSV_PATH):
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

    corpus.to_csv(CORPUS_PATH)

    # Both totals are computed once for the whole matrix rather than once per cell.
    is_present = corpus != 0
    distinct_words_per_conference = is_present.sum(axis='index')
    conferences_with_word = is_present.sum(axis='columns')

    # Every word in the vocabulary occurs in at least one conference, so the
    # divisor inside the logarithm is never zero.
    nr_of_conferences = len(corpus.columns)
    idf = pd.Series(
        [math.log(nr_of_conferences / n) for n in conferences_with_word],
        index=corpus.index,
        dtype=float,
    )

    # Term frequency per column, scaled by the inverse document frequency per row.
    # A conference without any counted word yields NaN scores (0 / 0).
    tf_idf_df = (
        corpus
        .div(distinct_words_per_conference, axis='columns')
        .mul(idf, axis='index')
    )

    tf_idf_df.to_csv(TF_IDF_CSV_PATH)

    print(f"Saved results to '{CORPUS_PATH}' and '{TF_IDF_CSV_PATH}'")


In [4]:
# Only recompute when the TF-IDF file does not exist yet; the existing CSV acts
# as a cache. Delete it to force a recalculation. Note that only the TF-IDF
# file is checked here, not the corpus file.
if not os.path.isfile(TF_IDF_CSV_PATH):
    calculate_tfidf(get_conference_data())

In [None]:
# Load the saved TF-IDF scores. No index_col is given, so the 'words' index
# written by calculate_tfidf comes back as a regular column.
# NOTE(review): pass index_col='words' if a word-indexed frame is wanted — confirm
# what the cells using tf_idf_matrix expect.
tf_idf_matrix = pd.read_csv(TF_IDF_CSV_PATH)