In [None]:
import os
import sys

sys.path.append('/Users/lukaskrabbe/Developement/PyCharm/kn/src')

import pymongo
import pandas as pd
import matplotlib.pyplot as plt
from kneed import KneeLocator
import numpy as np
from HanTa import HanoverTagger as ht
import nltk
import math
import warnings

from helpers.log import get_logger
from helpers.secrets import get_secret_from_env

# Make the notebook's parent directory importable. The entry is appended at
# most once, so re-running the cell does not grow sys.path.
module_path = os.path.abspath(os.path.join('..'))
already_registered = any(entry == module_path for entry in sys.path)
if not already_registered:
    sys.path.append(module_path)

In [None]:
# Load the MongoDB credentials through the project's secrets helper; the
# result is indexed with the keys 'user' and 'password' below.
secret = get_secret_from_env(secret="MONGO_USER_SECRET", path='../../secrets/')

# The credentials are handed over as keyword arguments instead of being
# interpolated into the URI: a '@', ':' or '/' inside the user name or the
# password would otherwise have to be percent-escaped to keep the URI valid.
client = pymongo.MongoClient(
    "mongodb://81.169.252.177:27017/?authMechanism=DEFAULT&tls=false",
    username=secret['user'],
    password=secret['password'],
)
kn_db = client.kn_db
kn_collection = kn_db.get_collection("kn_data")

# find_one() returns None when the collection holds no document. Testing the
# document for truthiness makes that case fail with the intended message
# instead of a TypeError raised by len(None).
first_document = kn_collection.find_one({})
assert first_document, "Error, no Data or DB-Connection"

In [None]:
# Query every article filed under one of the two Kiel resorts and materialise
# the cursor right away.
kiel_resort_filter = {
    #'releaseDate': day,
    'resort': {'$in': ['Kiel Aufschlag', 'Regionales Kiel']},
}
kiel_articles = list(kn_collection.find(kiel_resort_filter))

# Prefix that is stripped from the start of a body (matched case-insensitively).
dateline = ' kiel. '

article_list = []
for article in kiel_articles:
    body = article['body']
    has_dateline = body.lower().startswith(dateline)
    text = body[len(dateline):] if has_dateline else body

    article_list.append(text)
    print(text)
    print('----')
print(f"Got {len(article_list)} articles.")

In [None]:
# Build a corpus-wide vocabulary with HanTa and derive corpus-specific stop
# words: every word ranked at or before the knee of the rank/frequency curve
# is flagged as a stop word.
tagger = ht.HanoverTagger('morphmodel_ger.pgz')

words = {}      # word -> number of occurrences over all articles
words_art = {}  # word -> tag reported the first time the word was seen
for article in article_list:
    word_list = nltk.word_tokenize(article)
    lemmata = tagger.tag_sent(word_list, taglevel=1)

    for word, ground_word, word_art in lemmata:
        # Tokens tagged 'NE' are counted under their surface form, every
        # other token under the second tuple element returned by the tagger.
        if word_art != 'NE':
            word = ground_word

        if len(word) > 1 and not word.startswith('www'):
            words[word] = words.get(word, 0) + 1
            words_art.setdefault(word, word_art)

# Explicit columns keep the frame well-formed even for an empty vocabulary.
words = pd.DataFrame({'word': list(words.keys()), 'count': list(words.values())})
words = words.sort_values(by=["count"], ascending=False).reset_index(drop=True)

kn = KneeLocator(words.index, words['count'], S=2.5, curve='convex', direction='decreasing')

if kn.knee is None:
    # Without a knee there is no cut-off rank; comparing the index against
    # None would raise, so no word is flagged and the situation is reported.
    warnings.warn("KneeLocator found no knee; no stop words were derived.")
    words['stop_word'] = False
else:
    words['stop_word'] = words.index <= kn.knee
words['word_art'] = words['word'].map(words_art)
stop_words = list(words.loc[words['stop_word'], 'word'])

plt.plot(words.index, words['count'])
if kn.knee is not None:
    # Vertical marker at the knee rank, spanning the observed counts.
    plt.plot([kn.knee] * len(words), list(words['count']))
plt.show()

# The ratios are scaled by 100 because they are printed with a '%' sign.
stop_word_occurrences = words.loc[words['stop_word'], 'count'].sum()
total_occurrences = words['count'].sum()
print(f"Summe von Stop Words: {len(stop_words)}/{len(words)} ({round(100 * len(stop_words) / len(words), 2)} %)")
print(f"Vorkommen von Stop Words: {stop_word_occurrences}/{total_occurrences} ({round(100 * stop_word_occurrences / total_occurrences, 2)} %)")


In [None]:
# Produce two token lists per article: one without stop words
# (clean_article_list) and one restricted to tokens tagged 'NN' or 'NE'
# (noun_article_list).

# Set lookup: the membership test runs twice per token, and scanning the
# stop word list each time would scale with its length.
stop_word_lookup = set(stop_words)

clean_article_list = []
noun_article_list = []
for article in article_list:
    # Only purely alphabetic tokens are passed to the tagger.
    word_list = [word for word in nltk.word_tokenize(article) if word.isalpha()]
    lemmata = tagger.tag_sent(word_list, taglevel=1)

    clean_article = []
    noun_article = []
    for word, ground_word, word_art in lemmata:
        # A token is dropped when either its surface form or the tagger's
        # second tuple element is a stop word, or when it is a single character.
        if len(word) <= 1 or word in stop_word_lookup or ground_word in stop_word_lookup:
            continue
        clean_article.append(word)
        if word_art in ('NN', 'NE'):
            noun_article.append(word)
    clean_article_list.append(clean_article)
    noun_article_list.append(noun_article)

# Show the first article in all three forms; skipped for an empty corpus.
if article_list:
    print(article_list[0])
    print(clean_article_list[0])
    print(noun_article_list[0])

In [None]:
# Build a word x article matrix for the cleaned tokens. Each cell holds the
# share of a word's corpus-wide occurrences that fall into that article
# (occurrences in the article / occurrences over all articles).

# The filter is process-wide and therefore stays in effect for later cells.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Corpus-wide occurrences per word.
clean_word_totals = {}
for article in clean_article_list:
    for word in article:
        clean_word_totals[word] = clean_word_totals.get(word, 0) + 1

total_counts = (
    pd.DataFrame({'word': list(clean_word_totals.keys()), 'count': list(clean_word_totals.values())})
    .sort_values(by=["count"], ascending=False)
    .reset_index(drop=True)
    .set_index('word')['count']
)

# One column per article, computed with a single value_counts() pass instead
# of calling article.count(word) for every token. Words that do not occur in
# an article get 0.
article_shares = {}
for i, article in enumerate(clean_article_list):
    in_article = pd.Series(article, dtype='object').value_counts()
    article_shares['article_' + str(i) + '_count'] = (
        in_article.reindex(total_counts.index).div(total_counts).fillna(0)
    )

# Assembling the frame in one step yields numeric columns and avoids inserting
# columns one at a time.
clean_words = pd.DataFrame(article_shares, index=total_counts.index)
clean_words_t = clean_words.transpose()

In [None]:
# Build the noun table: one row per noun, a 'count' column with its corpus-wide
# occurrences and one 'article_<i>_count' column per article with the
# occurrences inside that article (0 when the noun is absent).

noun_totals = {}
for article in noun_article_list:
    for word in article:
        noun_totals[word] = noun_totals.get(word, 0) + 1

noun_words = pd.DataFrame({'word': list(noun_totals.keys()), 'count': list(noun_totals.values())})
noun_words = noun_words.sort_values(by=["count"], ascending=False).reset_index(drop=True)
noun_words = noun_words.set_index('word')

# A single value_counts() pass per article replaces article.count(word) for
# every token; reindexing with fill_value=0 keeps the columns integer-typed.
article_counts = {}
for i, article in enumerate(noun_article_list):
    in_article = pd.Series(article, dtype='object').value_counts()
    article_counts['article_' + str(i) + '_count'] = (
        in_article.reindex(noun_words.index, fill_value=0).astype('int64')
    )

# Attach all article columns at once instead of inserting them one by one.
noun_words = pd.concat(
    [noun_words, pd.DataFrame(article_counts, index=noun_words.index)],
    axis=1,
)

In [None]:
# Corpus-level quantities used for the tf/idf computation.

# Total number of noun tokens over all articles. The previous comprehension
# had its two `for` clauses reversed and therefore iterated over whatever
# `article` variable an earlier loop had left behind.
# NOTE(review): this rebinds `words`, which held the vocabulary DataFrame
# built earlier in the notebook.
words = sum(len(article) for article in noun_article_list)

# Largest number of times any noun occurs within a single article.
count_columns = ['article_' + str(i) + '_count' for i in range(len(noun_article_list))]
max_number_of_occ = noun_words[count_columns].max().max()

# Number of articles in the corpus.
documents = len(noun_article_list)

In [None]:
# Compute tf and idf for every noun/article pair.
#   article_<i>_tf   = occurrences in article i / max_number_of_occ
#   article_<i>_idf  = log(documents / number of articles containing the noun)
# The products are collected in `noun_tfidf` (columns article_<i>_tfidf);
# the previous code evaluated the product with a row lookup and discarded it.
count_columns = ['article_' + str(i) + '_count' for i in range(len(noun_article_list))]
article_counts = noun_words[count_columns].astype(float)

# NOTE(review): tf keeps the normalisation by the corpus-wide maximum
# (max_number_of_occ). The per-article maximum was computed before but never
# used; confirm which of the two is intended.
tf = article_counts / max_number_of_occ

# Every noun in the table occurs in at least one article, so the document
# frequency is never 0.
document_frequency = (article_counts > 0).sum(axis=1)
idf = np.log(documents / document_frequency)

tf_idf_columns = {}
tfidf_products = {}
for i, count_column in enumerate(count_columns):
    prefix = 'article_' + str(i)
    tf_idf_columns[prefix + '_tf'] = tf[count_column]
    # idf depends on the noun only, so each article column carries the same values.
    tf_idf_columns[prefix + '_idf'] = idf
    tfidf_products[prefix + '_tfidf'] = tf[count_column] * idf

tf_idf_columns = pd.DataFrame(tf_idf_columns, index=noun_words.index)
noun_tfidf = pd.DataFrame(tfidf_products, index=noun_words.index)

# Columns left by an earlier run are dropped first so the cell can be re-run
# without producing duplicate column labels.
noun_words = pd.concat(
    [noun_words.drop(columns=list(tf_idf_columns.columns), errors='ignore'), tf_idf_columns],
    axis=1,
)

In [None]:
# Display the current state of the noun table as the cell output.
noun_words

In [None]:
from sklearn.cluster import KMeans

# Cluster the nouns on the columns of the noun table.
# A 'cluster' column left by an earlier run of this cell is excluded, so that
# previously assigned labels are never fed back in as a feature.
cluster_features = noun_words.drop(columns='cluster', errors='ignore')

# random_state pins the centroid initialisation so that a re-run reproduces
# the same labels. n_clusters is capped because KMeans cannot form more
# clusters than there are rows.
km = KMeans(n_clusters=min(50, len(cluster_features)), random_state=42)

km = km.fit(cluster_features)
noun_words['cluster'] = km.predict(cluster_features)
noun_words

In [None]:
from sklearn.decomposition import PCA

# Print the cumulative explained variance ratio for 2, 3, ... components.
# PCA accepts at most min(n_samples, n_features) components, so the range is
# bounded by that value as well as by the original upper limit
# len(noun_words). The model is fitted once with the largest admissible
# number of components; the first x entries of explained_variance_ratio_
# then give the figure for x components, so no refit per x is needed.
max_components = min(noun_words.shape)
upper_bound = min(len(noun_words), max_components + 1)

if upper_bound > 2:
    pca_full = PCA(n_components=max_components)
    pca_full.fit(noun_words)
    cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

    for x in range(2, upper_bound):
        print(cumulative_variance[x - 1])



In [None]:
# Project the noun table onto two principal components and list, per input
# column, the absolute weight it carries in each of the two components.
pca_2 = PCA(n_components=2)
pca_2_result = pca_2.fit_transform(noun_words)

abs_components = abs(pca_2.components_)
dataset_pca = pd.DataFrame(data=abs_components, columns=noun_words.columns)

# Transposed for display: one row per input column, one column per component.
dataset_pca.T