# Imports

In [43]:
import sys 
sys.path.append('..')

from itertools import chain
import math

from util.nos_articles import *
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import spacy
# Shared spaCy pipeline; create_corpus below feeds its texts through nlp.pipe.
# Alternative model, kept commented out for quick switching:
# nlp = spacy.load("nl_core_news_sm")
nlp = spacy.load("nl_core_news_lg")

# Download Articles

In [129]:
# One 'YYYY-MM-DD' string per calendar day from 2020-04-01 through 2020-12-31.
dates = (
    pd.date_range(start='2020-04-01', end='2020-12-31', freq='D')
    .strftime('%Y-%m-%d')
    .tolist()
)

In [39]:
# Only run if you want to download new articles
# download_articles(dates=dates, max_articles=5000, refresh_articles=False)

# Load Articles

In [131]:
# Load the stored articles for every date in `dates`.
# NOTE(review): load_nos_texts is not defined in this notebook -- presumably it
# comes from the `util.nos_articles` star import; confirm.
# The result is used below as a mapping whose values are collections of
# articles (see the "Prepare Data" cell).
articles_dict = load_nos_texts(dates=dates)
# articles_dict['2020-04-01']

  0%|          | 0/275 [00:00<?, ?it/s]

# Prepare Data

In [5]:
# Flatten the per-key article collections into a single list and collapse each
# article (an iterable of text pieces) into one space-separated string.
list_of_articles = [
    ' '.join(article_parts)
    for article_parts in chain.from_iterable(articles_dict.values())
]

# list_of_text = [item for sublist in list_of_articles for item in sublist]
# print(len(list_of_text))
# chunk_size =  int(len(list_of_text)/500)
# print(chunk_size)
# chunks = [list_of_text[x:x+chunk_size] for x in range(0, len(list_of_text), chunk_size)]
# chunks[1]
# string_chunks = [' '.join(x) for x in chunks]
# len(list_of_text)
# stringed_text = ' '.join(list_of_text)
# stringed_text[:500]
# len(string_chunks)

# Calculate Average Sentence Length

In [46]:
# `stringed_text` was previously only assigned in a cell that is now commented
# out, so on a fresh kernel this cell failed with NameError. Rebuild it here:
# every entry of `list_of_articles` is already one string per article, so
# joining them gives the whole collection as a single text.
stringed_text = ' '.join(list_of_articles)
average_sentence_length = get_average_sentence_length(stringed_text)
average_sentence_length

17.202796550050405

# Create Corpus

In [47]:
# Destination CSV written by create_corpus(..., save=True).
# NOTE(review): the file name says "without_stopwords", but create_corpus only
# drops stop words when it is called with stopwords=True -- confirm that the
# file on disk was produced with that flag.
CORPUS_PATH = '../output/NOS_corpus_5000_articles_without_stopwords.csv'

In [49]:
def create_corpus(full_text: list, save=False, stopwords=False) -> pd.DataFrame:
    """Build a lemma-frequency table for a collection of texts.

    Every text is run through the notebook-level spaCy pipeline ``nlp`` and the
    lemma of each token is counted. Punctuation and whitespace tokens are
    always skipped.

    Args:
        full_text: List of raw text strings, one entry per document.
        save: When True, write the resulting table to ``CORPUS_PATH`` as CSV.
        stopwords: When True, tokens that spaCy flags as stop words are
            *excluded* from the counts; when False (default) they are counted.

    Returns:
        DataFrame indexed by ``word`` with a single integer column ``nos``
        holding the number of occurrences, sorted by descending count.
    """
    # Local import: no cell in this notebook imports Counter explicitly.
    from collections import Counter

    # Raise the maximum text length accepted by the pipeline.
    # Note that this mutates the shared `nlp` object.
    nlp.max_length = 1500000

    # Count lemmas while streaming over the parsed documents, so that the
    # spaCy Doc objects do not all have to be held in memory at once.
    word_count = Counter()
    for doc in tqdm(nlp.pipe(full_text), total=len(full_text)):
        word_count.update(
            token.lemma_
            for token in doc
            if not (token.is_punct or token.is_space or (stopwords and token.is_stop))
        )

    # most_common() already yields (word, count) pairs in descending count
    # order. DataFrame.append, used previously, was removed in pandas 2.0.
    corpus = pd.DataFrame(word_count.most_common(), columns=['word', 'nos'])
    # Explicit cast keeps the column integer-typed even for empty input.
    corpus = corpus.astype({'nos': 'int64'}).set_index('word')

    if save:
        corpus.to_csv(CORPUS_PATH)

    return corpus

In [184]:
# string_chunks[1][:5000].split(' ')
corpus = create_corpus(list_of_articles, save=False)
%time

Wall time: 0 ns


  0%|          | 0/4990 [00:00<?, ?it/s]

  0%|          | 0/4990 [00:00<?, ?it/s]

Unnamed: 0_level_0,nos
word,Unnamed: 1_level_1
jeugdig,1
winnen,187
katinka,2
veiligheidsbril,2
tijdstraf,1
...,...
wvggz,1
ryanair,26
werk-voucher,1
ruit,11


In [50]:
# Total number of counted tokens (sum of the 'nos' column) and the number of
# distinct words (one row per word, so the count of non-null 'nos' entries).
print(f"Document has {corpus['nos'].sum()} words, with {corpus['nos'].count()} unique values.")

Document has 855629 words, with 53323 unique values.


# Create tf & tf-idf

In [14]:
# Output locations for the TF-IDF and plain TF tables; the calculate_tfidf
# calls below pass them as `specified_path`.
TF_IDF_CSV_PATH = '../output/NOS_tf_idf_5000_articles_without_stopwords.csv'
TF_CSV_PATH = '../output/NOS_tf_5000_articles_without_stopwords.csv'

In [29]:
# Reload the saved corpus from the location create_corpus writes to, using the
# 'word' column as the index.
# keep_default_na=False: the index holds arbitrary words, and with the default
# settings pandas parses entries such as "null", "nan" or "NA" as missing
# values, leaving NaN labels in the index.
corpus = pd.read_csv(CORPUS_PATH, index_col='word', keep_default_na=False)

In [40]:
def calculate_tfidf(corpus, specified_path, include_idf=False, save=False):
    """Convert a word-count table into relative frequencies (TF) or TF-IDF.

    Each row of ``corpus`` is a word and each column is one document/subset.

    * TF: the word's count divided by the number of words that have a
      non-zero count in that column.
    * IDF (only when ``include_idf`` is True):
      ``log(number_of_columns / number_of_columns_containing_the_word)``.
      The TF value is multiplied by it. With a single column this factor is
      ``log(1 / 1) = 0`` for every word that occurs.

    Args:
        corpus: DataFrame of counts; must contain a ``'nos'`` column, which is
            used for sorting the result.
        specified_path: CSV path the result is written to when ``save`` is True.
        include_idf: When True, weight the term frequencies by the IDF factor.
        save: When True, write the result to ``specified_path`` and print a
            confirmation.

    Returns:
        DataFrame of floats with the same index and columns as ``corpus``,
        sorted by the ``'nos'`` column in descending order.
    """
    present = corpus.ne(0)

    # NOTE(review): the denominator is the number of *distinct* words in the
    # column, not the column's total token count. Kept unchanged so existing
    # outputs stay comparable -- confirm this is the intended normalisation.
    words_per_column = present.sum(axis=0)
    # An all-zero column only contains zero counts, so dividing by 1 instead
    # of 0 yields 0.0 rather than a division error.
    safe_denominator = words_per_column.where(words_per_column > 0, 1)
    tf_idf_df = corpus.div(safe_denominator, axis=1)

    if include_idf:
        n_docs = len(corpus.columns)
        docs_with = present.sum(axis=1)
        # A word absent from every column gets ratio 1 -> idf 0, instead of
        # dividing by zero.
        idf = np.log(n_docs / docs_with.where(docs_with > 0, n_docs))
        # Plain array: multiply row-by-row without any index alignment.
        tf_idf_df = tf_idf_df.mul(idf.to_numpy(), axis=0)

    tf_idf_df = tf_idf_df.astype(float).sort_values(by='nos', ascending=False)

    if save:
        # Write to the path the caller asked for (and that the message
        # reports), not to a hard-coded notebook-level constant.
        tf_idf_df.to_csv(specified_path)
        if include_idf:
            print(f"Saved TF-IDF DataFrame to '{specified_path}'")
        else:
            print(f"Saved TF DataFrame to '{specified_path}'")

    return tf_idf_df

In [41]:
# Plain term frequencies (no IDF weighting), saved to TF_CSV_PATH.
tf_df = calculate_tfidf(corpus=corpus, include_idf=False, specified_path=TF_CSV_PATH, save=True)

  0%|          | 0/53323 [00:00<?, ?it/s]

Saved TF DataFrame to '../output/NOS_tf_5000_articles_without_stopwords.csv'


In [44]:
# TF-IDF variant, saved to TF_IDF_CSV_PATH.
# NOTE(review): the IDF factor is log(n_columns / n_columns_containing_word).
# If `corpus` has only the single 'nos' column, that is log(1 / 1) = 0 for
# every word, so every value in this table is 0 -- confirm this is intended.
tf_idf_df = calculate_tfidf(corpus=corpus, include_idf=True, specified_path=TF_IDF_CSV_PATH, save=True)

  0%|          | 0/53323 [00:00<?, ?it/s]

Saved TF-IDF DataFrame to '../output/NOS_tf_idf_5000_articles_without_stopwords.csv'


# Basic Word Usage