In [1]:
import os
import json
import numpy as np

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Find top TFIDF Words

In [3]:
# Collect the names (not full paths) of all JSON files in the input directory.
# os.listdir makes no ordering guarantee; the list is sorted before it is read.
files = [
    entry
    for entry in os.listdir("./Months_Cleaned/")
    if entry.endswith(".json")
]

In [4]:
# Order the files by characters 3..6 of their name, breaking ties by the full
# name (presumably those characters hold the year -- TODO confirm against the
# naming scheme of the files in ./Months_Cleaned/).
files = sorted(files, key=lambda name: (name[3:7], name))

# One token list per file, in the sorted file order.
corpus = []
for file in files:
    with open(f"./Months_Cleaned/{file}", 'r', encoding="utf-8") as f:
        raw_tokens = json.load(f)
    # Remove leading/trailing periods from every token.
    corpus.append([token.strip('.') for token in raw_tokens])

In [5]:
def identity_tokenizer(text):
    """Return ``text`` unchanged.

    The documents are already tokenized, so this pass-through is handed to
    the vectorizer in place of a real tokenizer.
    """
    return text

# The documents are passed in as token lists, so the vectorizer is configured
# to use the pass-through tokenizer and to skip lowercasing.
transfomer = TfidfVectorizer(
    lowercase=False,
    tokenizer=identity_tokenizer,
)

In [6]:
# Fit the vectorizer on the corpus and build the TF-IDF document-term matrix
# (one row per entry of `corpus`).
tfidf = transfomer.fit_transform(corpus)

In [7]:
# Matrix dimensions: (number of documents, number of distinct terms).
tfidf.shape

(84, 34863)

In [8]:
def get_highest_tfidf_terms():
    """Return the highest-weighted TF-IDF term for every document.

    Reads the module-level ``transfomer`` (the fitted vectorizer) and
    ``tfidf`` (its document-term matrix).

    Returns:
        Array of feature names selected by the per-row argmax of ``tfidf``.
    """
    # ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement and fall back only on older installations.
    if hasattr(transfomer, "get_feature_names_out"):
        names = transfomer.get_feature_names_out()
    else:
        names = transfomer.get_feature_names()
    # dtype=str yields the same unicode array the list-based API produced.
    feature_names = np.array(names, dtype=str)
    return feature_names[np.argmax(tfidf, axis=1)]

def get_highest_n_tfidf_terms(n=5):
    """Return the ``n`` highest-weighted TF-IDF terms of every document.

    Reads the module-level ``transfomer`` (the fitted vectorizer) and
    ``tfidf`` (its document-term matrix).

    Args:
        n: Number of terms to return per document (default 5).

    Returns:
        A list with one array of feature names per row of ``tfidf``, ordered
        from highest to lowest weight.
    """
    # ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement and fall back only on older installations.
    if hasattr(transfomer, "get_feature_names_out"):
        names = transfomer.get_feature_names_out()
    else:
        names = transfomer.get_feature_names()
    # dtype=str yields the same unicode array the list-based API produced.
    feature_array = np.array(names, dtype=str)
    # argsort is ascending; flipping along the term axis gives, per row, the
    # column indices in order of descending weight.
    tfidf_sorting = np.flip(np.argsort(tfidf.toarray(), axis=1), axis=1)
    return [feature_array[row[:n]] for row in tfidf_sorting]

def filter_leere_begriffe(top_words, max_words=15):
    """Remove generic filler terms from each document's word list.

    Args:
        top_words: Iterable of per-document word sequences.
        max_words: Maximum number of words kept per document after filtering.
            Defaults to 15, the previously hard-coded limit; ``None`` keeps
            every remaining word.

    Returns:
        A list containing one list of words per document. The relative order
        of the surviving words is preserved.
    """
    # "leere Begriffe" is German for "empty terms": words that are dropped
    # from every document's result.
    leere_begriffe = ("mensch", "deutsch", "deutschland", "land", "regierung",
                      "neu", "woche", "präsident", "maßnahme", "anderer")
    return [
        [word for word in words if word not in leere_begriffe][:max_words]
        for words in top_words
    ]

In [9]:
# Request 25 candidate terms per document so that, after the filter removes the
# excluded terms, up to 15 words per document can still be kept.
top_tfidf_words = filter_leere_begriffe(get_highest_n_tfidf_terms(n=25))

# Save as JSON

In [10]:
from collections import defaultdict
import datetime


In [11]:
# Build {year: {month_abbreviation: top_words}}.
# The entries of `top_tfidf_words` carry no date, so each date is derived
# purely from the entry's position, counting consecutive months from the start
# month below. This assumes the entries are in chronological order with no
# missing month -- TODO confirm against the file ordering used to build the corpus.
START_YEAR, START_MONTH = 2014, 6

year_month_topwords = defaultdict(dict)

for offset, topwords in enumerate(top_tfidf_words):
    # Count months from January of START_YEAR; divmod handles the year rollover.
    year_delta, month_index = divmod(START_MONTH - 1 + offset, 12)
    date = datetime.date(START_YEAR + year_delta, month_index + 1, 1)
    # NOTE: "%b" is locale-dependent, so the month keys follow the locale
    # active in the kernel.
    year_month_topwords[date.year][date.strftime("%b")] = topwords

In [12]:
# Write the nested year -> month -> words mapping to disk. The json module
# stores the integer year keys as strings.
with open("./year_month_topwords.json", 'w') as out_file:
    json.dump(year_month_topwords, fp=out_file)