In [None]:
import os
import re
import sys
import warnings

import hdbscan
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pymongo
import seaborn as sns
import umap
from HanTa import HanoverTagger as ht
from kneed import KneeLocator
from matplotlib import colors as mcolors
from matplotlib.pyplot import figure
from nltk.stem.snowball import SnowballStemmer
from pandas.core.common import SettingWithCopyWarning
from scipy.sparse.csr import csr_matrix
from scipy.sparse.lil import lil_matrix
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm_notebook

# Import the project-local secrets helper. If the package cannot be imported,
# the parent of the current working directory is appended to sys.path and the
# import is retried once.
try:
    from helpers.secrets import get_secret_from_env
except ImportError:
    parent_dir = os.path.abspath(os.path.join(".."))
    sys.path.append(parent_dir)
    from helpers.secrets import get_secret_from_env

# Silence these warning categories for the remainder of the session.
for ignored_category in (
    SettingWithCopyWarning,
    FutureWarning,
    DeprecationWarning,
    UserWarning,
):
    warnings.simplefilter(action="ignore", category=ignored_category)

# Generate Dataset

In [None]:
# German morphology model used later for lemmatisation and POS tagging.
tagger = ht.HanoverTagger("morphmodel_ger.pgz")

# Fetch every NLTK resource the notebook relies on. Besides the stop-word
# corpus, `tokenize` calls nltk.sent_tokenize / nltk.word_tokenize, which need
# the Punkt sentence models; without them a fresh environment fails with a
# LookupError. Older NLTK releases ship the models as "punkt", newer ones as
# "punkt_tab", so both are requested.
for nltk_resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(nltk_resource)

stopwords = nltk.corpus.stopwords.words("german")
stemmer = SnowballStemmer("german")

In [None]:
# Load the MongoDB credentials; the code below reads the "user" and "password"
# entries of the returned mapping.
secret = get_secret_from_env(secret="MONGO_USER_SECRET", path="../../secrets/")

# NOTE(review): the credentials are interpolated into the URI verbatim. PyMongo
# requires user names / passwords inside a URI to be percent-escaped, so a
# secret containing characters such as "@", ":" or "/" will be rejected --
# confirm the stored secret is URI-safe.
client = pymongo.MongoClient(
    f"mongodb://{secret['user']}:{secret['password']}@81.169.252.177:27017/?authMechanism=DEFAULT&tls=false"
)
kn_db = client.kn_db
kn_collection = kn_db.get_collection("kn_data")

# Sanity check: the collection must contain at least one document.
# find_one() returns None for an empty collection, so test for None directly
# instead of calling len() on the result (which raised a TypeError and hid the
# assertion message).
assert kn_collection.find_one({}) is not None, "Error, no Data or DB-Connection"

In [None]:
# Materialise every document whose "city" field equals "Kiel" as a list.
kiel_query = {"city": "Kiel"}
all_articles = [document for document in kn_collection.find(kiel_query)]
print("Got %s Articles!" % len(all_articles))

In [None]:
# Display the first fetched article document to inspect its fields.
all_articles[0]

In [None]:
def stem(tokens):
    """Return the stem of every token, in order, using the module-level ``stemmer``."""
    return [stemmer.stem(token) for token in tokens]


def tokenize(text, language="german"):
    """Split ``text`` into lower-cased word tokens that contain a letter.

    The text is first split into sentences and then into words, so that
    punctuation ends up as separate tokens, which are then filtered out.

    Args:
        text: Raw text to tokenize.
        language: Name of the NLTK Punkt model used for sentence and word
            splitting. Defaults to ``"german"`` because the corpus is German;
            NLTK's own default (English) mis-handles German abbreviations and
            ordinal dates.

    Returns:
        list[str]: Lower-cased tokens containing at least one letter. Purely
        numeric tokens and raw punctuation are dropped.
    """
    tokens = [
        word.lower()
        for sent in nltk.sent_tokenize(text, language=language)
        for word in nltk.word_tokenize(sent, language=language)
    ]
    # ``[^\W\d_]`` matches any Unicode letter, so tokens consisting only of
    # non-ASCII letters (umlauts, ß, ...) are kept; the former ``[a-zA-Z]``
    # test silently dropped them.
    return [token for token in tokens if re.search(r"[^\W\d_]", token)]

In [None]:
# Frequency tables filled in a single pass over the corpus:
#   words     -> occurrences per vocabulary key (surface form for tag "NE",
#                lemma for everything else; always lower-cased)
#   words_art -> tag recorded the first time a vocabulary key is seen
#   nouns     -> occurrences per lower-cased surface form of tokens whose tag
#                starts with "N"
words = {}
words_art = {}
clean_articles = []
nouns = {}

# Weekday names are skipped entirely, whether they show up as surface form or
# as lemma.
weekday_names = (
    "montag",
    "dienstag",
    "mittwoch",
    "donnerstag",
    "freitag",
    "samstag",
    "sonntag",
    "sonnabend",
)

for article in tqdm_notebook(all_articles):
    lemmata = tagger.tag_sent(tokenize(article["body"]), taglevel=1)

    for surface, lemma, pos_tag in lemmata:
        surface_lc = surface.lower()
        lemma_lc = lemma.lower()
        if surface_lc in weekday_names or lemma_lc in weekday_names:
            continue

        if pos_tag.startswith("N"):
            nouns[surface_lc] = nouns.get(surface_lc, 0) + 1

        key = surface_lc if pos_tag == "NE" else lemma_lc

        # Keep only purely alphabetic keys longer than one character that do
        # not look like a web address.
        if len(key) <= 1 or key.startswith("www") or not key.isalpha():
            continue
        if key not in words:
            words_art[key] = pos_tag
        words[key] = words.get(key, 0) + 1

In [None]:
# Turn the frequency dict into a table sorted by descending count. After
# reset_index(drop=True) the row index is the frequency rank (0 = most
# frequent).
# NOTE: this rebinds `words` from a dict to a DataFrame, so the cell can only
# be re-run after the counting cell has been re-run.
words = pd.DataFrame.from_dict(words, orient="index").reset_index()
words.columns = ["word", "count"]
words = words.sort_values(by=["count"], ascending=False).reset_index(drop=True)
words["word_art"] = words["word"].map(words_art)

# Locate the knee of the rank/frequency curve; every word ranked at or before
# the knee is treated as a corpus-specific stop word.
kn = KneeLocator(
    words.index, words["count"], S=2.5, curve="convex", direction="decreasing"
)

if kn.knee is None:
    # No knee detected: flag nothing instead of failing on a comparison
    # against None.
    print("No knee found in the word frequency curve; no stop words flagged.")
    words["stop_word"] = False
else:
    words["stop_word"] = words.index <= kn.knee
stop_words = words.loc[words["stop_word"], "word"].tolist()

plt.plot(words.index, words["count"], label="word frequency")
if kn.knee is not None:
    # Vertical marker at the knee rank.
    plt.axvline(kn.knee, color="C1", label="knee")
plt.xlabel("frequency rank")
plt.ylabel("count")
plt.legend()
plt.show()

stop_word_hits = words.loc[words["stop_word"], "count"].sum()
total_hits = words["count"].sum()

# The ratios are multiplied by 100 so the printed value matches its "%" label.
print(
    f"Summe von Stop Words: {len(stop_words)}/{len(words)} ({round(100 * len(stop_words) / len(words), 2)} %)"
)
print(
    f"Vorkommen von Stop Words: {stop_word_hits}/{total_hits} ({round(100 * stop_word_hits / total_hits, 2)} %)"
)

In [None]:
# Turn the noun frequency dict into a table sorted by descending count. After
# reset_index(drop=True) the row index is the frequency rank (0 = most
# frequent).
# NOTE: this rebinds `nouns` from a dict to a DataFrame, so the cell can only
# be re-run after the counting cell has been re-run.
nouns = pd.DataFrame.from_dict(nouns, orient="index").reset_index()
nouns.columns = ["word", "count"]
nouns = nouns.sort_values(by=["count"], ascending=False).reset_index(drop=True)

# Locate the knee of the rank/frequency curve; every noun ranked at or before
# the knee is treated as a stop noun.
kn = KneeLocator(
    nouns.index, nouns["count"], S=2.5, curve="convex", direction="decreasing"
)

if kn.knee is None:
    # No knee detected: flag nothing instead of failing on a comparison
    # against None.
    print("No knee found in the noun frequency curve; no stop nouns flagged.")
    nouns["stop_word"] = False
else:
    nouns["stop_word"] = nouns.index <= kn.knee
stop_nouns = nouns.loc[nouns["stop_word"], "word"].tolist()

plt.plot(nouns.index, nouns["count"], label="noun frequency")
if kn.knee is not None:
    # Vertical marker at the knee rank.
    plt.axvline(kn.knee, color="C1", label="knee")
plt.xlabel("frequency rank")
plt.ylabel("count")
plt.legend()
plt.show()

stop_noun_hits = nouns.loc[nouns["stop_word"], "count"].sum()
total_noun_hits = nouns["count"].sum()

# The ratios are multiplied by 100 so the printed value matches its "%" label.
print(
    f"Summe von Stop Nouns: {len(stop_nouns)}/{len(nouns)} ({round(100 * len(stop_nouns) / len(nouns), 2)} %)"
)
print(
    f"Vorkommen von Stop Nouns: {stop_noun_hits}/{total_noun_hits} ({round(100 * stop_noun_hits / total_noun_hits, 2)} %)"
)

In [None]:
# Build two filtered texts per article and store them on the article dict:
#   clean_body -> all kept tokens (stop words removed, tag "NE" always kept)
#   noun_body  -> kept tokens whose tag starts with "N", minus stop nouns
weekday_names = frozenset(
    (
        "montag",
        "dienstag",
        "mittwoch",
        "donnerstag",
        "freitag",
        "samstag",
        "sonntag",
        "sonnabend",
    )
)
# Sets give constant-time membership tests inside the per-token loop.
stop_word_set = set(stop_words)
stop_noun_set = set(stop_nouns)

for article in tqdm_notebook(all_articles):
    lemmata = tagger.tag_sent(tokenize(article["body"]), taglevel=1)

    clean_tokens = []
    noun_tokens = []
    for word, ground_word, word_art in lemmata:
        # Both stop lists hold lower-cased entries, so surface form and lemma
        # are lower-cased before every comparison (the lemma used to be
        # compared as returned by the tagger, which could miss on case).
        surface = word.lower()
        lemma = ground_word.lower()

        if surface in weekday_names or lemma in weekday_names:
            continue

        # Tokens tagged "NE" are always kept; any other token is dropped
        # completely (including from noun_body) when it is a stop word.
        if word_art != "NE" and (surface in stop_word_set or lemma in stop_word_set):
            continue
        clean_tokens.append(surface)

        if word_art.startswith("N") and not (
            surface in stop_noun_set or lemma in stop_noun_set
        ):
            noun_tokens.append(surface)

    # NOTE(review): the surface form is written here, while the vocabulary
    # built in the counting cell uses lemmas for every tag except "NE" --
    # confirm that inflected forms missing from the vocabulary are meant to be
    # ignored by the vectorizer.
    # Each token is followed by a single space, matching the previous output.
    article["clean_body"] = "".join(token + " " for token in clean_tokens)
    article["noun_body"] = "".join(token + " " for token in noun_tokens)

In [None]:
# Vocabulary for the full-text representation: every alphabetic word that was
# not flagged as a stop word. The entries are sorted so that the feature order
# (and thus every column index derived from it) is identical on every run;
# list(set(...)) produced an order that changes between kernel sessions.
all_vocabs = sorted(
    {vocab for vocab in words.loc[~words["stop_word"], "word"] if vocab.isalpha()}
)
print("Length of Vocabulary is %s words" % len(all_vocabs))

# Vocabulary for the noun-only representation. Unlike all_vocabs, the
# stop-noun flag is not applied here.
nouns_vocabs = sorted({vocab for vocab in nouns["word"] if vocab.isalpha()})
print("Length of Noun-Vocabulary is %s words" % len(nouns_vocabs))

# Vectorize Data with TF-IDF

In [None]:
# Switch between the noun-only and the full-text representation. `vocabs` is
# the vocabulary handed to the vectorizer and `body` the article key whose
# text gets vectorized.
NOUN = False
body = "noun_body" if NOUN else "clean_body"
vocabs = nouns_vocabs if NOUN else all_vocabs
print("Use Noun-based Textual Data" if NOUN else "Use full Textual Data")

In [None]:
# Count term occurrences restricted to the fixed vocabulary, then re-weight
# the raw counts with TF-IDF.
documents = [article[body] for article in all_articles]

count_vectorizer = CountVectorizer(vocabulary=vocabs)
article_vector = count_vectorizer.fit_transform(documents)

tfidf_transformer = TfidfTransformer()
tfidf_vector = tfidf_transformer.fit_transform(article_vector)

# Visualize Data

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Apply the seaborn "white" style in notebook context and make 14x10 the
# default figure size for the plots that follow.
sns.set(style="white", context="notebook", rc={"figure.figsize": (14, 10)})

In [None]:
# Project the TF-IDF vectors to two dimensions; this embedding is only used as
# x/y coordinates for the scatter plots below.
plot_embedding = umap.UMAP(n_components=2).fit_transform(tfidf_vector.toarray())

# Report the shape of the embedding computed here. The previous code printed
# `cluster_embeddings.shape`, a variable that is only created further down, so
# a fresh "Run All" failed with a NameError at this point.
print(
    "Reduction embedding Shape: (%s, %s) (used for Plotting)" % plot_embedding.shape
)

In [None]:
# Baseline: K-Means on the TF-IDF vectors, drawn on the 2-D UMAP projection.
k = 3
km = KMeans(n_clusters=k, algorithm="elkan").fit(tfidf_vector)
clusters = km.labels_.tolist()

df = pd.DataFrame(
    {"x": plot_embedding[:, 0], "y": plot_embedding[:, 1], "label": clusters}
)
groups = df.groupby("label")

# One scatter series per cluster; the legend numbers clusters starting at 1.
for name, group in groups:
    plt.scatter(group["x"], group["y"], marker="o", s=12, label=f"Cluster {name + 1}")

plt.gca().set_aspect("equal", "datalim")
plt.legend()
plt.title(f"UMAP projection of Dataset with K({k})-Means Clustering", fontsize=24)
plt.show()

# HDBSCAN

In [None]:
# Density-based clustering directly on the TF-IDF vectors.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=8,
    min_samples=1,
    # cluster_selection_epsilon=0.8,
    # cluster_selection_method='leaf',
)
clusterer.fit(tfidf_vector)

# Pair every article's 2-D plot coordinates with its cluster label; the label
# -1 is handled as noise throughout this notebook.
df = pd.DataFrame(
    dict(x=plot_embedding[:, 0], y=plot_embedding[:, 1], label=clusterer.labels_)
)
groups = df.groupby("label")

# Count real clusters and noise points separately. The former
# groups.get_group(-1) raised a KeyError when no point was labelled noise, and
# len(groups) counted the noise label as a cluster.
is_noise = df["label"] == -1
print("Clusters: %s" % df.loc[~is_noise, "label"].nunique())
print("Noise: %s" % int(is_noise.sum()))

In [None]:
# Scatter plot of the clustering on the 2-D projection: noise (label -1) is
# drawn as small red dots, every cluster as its own larger-dot series.
for name, group in groups:
    point_style = {"s": 2, "c": "red"} if name == -1 else {"s": 12}
    plt.scatter(
        group.x,
        group.y,
        marker="o",
        label="Cluster " + str(name),
        **point_style,
    )

plt.gca().set_aspect("equal", "datalim")
plt.legend()
plt.show()

In [None]:
# List the article titles of every cluster, skipping the noise group (-1).
# The frame's index labels are used as positions into all_articles.
for cluster_id, members in groups:
    if cluster_id != -1:
        print("Cluster %s" % cluster_id)
        print("Size: %s" % len(members))
        print("Articles:")
        for article_pos in members.index:
            print(all_articles[article_pos]["title"])

# HDBSCAN with UMAP Reduction

In [None]:
# Reduce the TF-IDF vectors to 20 dimensions; this embedding is the input of
# the clustering step that follows.
dense_tfidf = tfidf_vector.toarray()
cluster_embeddings = umap.UMAP(n_components=20).fit_transform(dense_tfidf)

shape_message = "Reduction embedding Shape: (%s, %s) (used for Clustering)"
print(shape_message % cluster_embeddings.shape)

In [None]:
# Density-based clustering on the 20-dimensional UMAP embedding.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=6,
    min_samples=2,
    cluster_selection_epsilon=0.1,
    # cluster_selection_method='leaf',
)
clusterer.fit(cluster_embeddings)

# Pair every article's 2-D plot coordinates with its cluster label; the label
# -1 is handled as noise throughout this notebook.
df = pd.DataFrame(
    dict(x=plot_embedding[:, 0], y=plot_embedding[:, 1], label=clusterer.labels_)
)
groups = df.groupby("label")

# Count real clusters and noise points separately. The former
# groups.get_group(-1) raised a KeyError when no point was labelled noise, and
# len(groups) counted the noise label as a cluster.
is_noise = df["label"] == -1
print("Clusters: %s" % df.loc[~is_noise, "label"].nunique())
print("Noise: %s" % int(is_noise.sum()))

In [None]:
# Draw the single-linkage tree of the fitted clusterer on a wide figure.
plt.figure(figsize=(15, 6), dpi=80)
slt = clusterer.single_linkage_tree_
tree_plot_options = {"cmap": "viridis", "colorbar": True}
slt.plot(**tree_plot_options)
plt.show()

In [None]:
# Draw the condensed tree of the fitted clusterer with the selected clusters
# highlighted in the seaborn palette colours.
plt.figure(figsize=(15, 6), dpi=80)
slt = clusterer.condensed_tree_
condensed_plot_options = {
    "cmap": "viridis",
    "colorbar": True,
    "select_clusters": True,
    "selection_palette": sns.color_palette(),
}
slt.plot(**condensed_plot_options)
plt.show()

In [None]:
# Scatter plot of the clustering on the 2-D projection: noise (label -1) is
# drawn as small red dots, every cluster as its own larger-dot series.
for name, group in groups:
    point_style = {"s": 2, "c": "red"} if name == -1 else {"s": 12}
    plt.scatter(
        group.x,
        group.y,
        marker="o",
        label="Cluster " + str(name),
        **point_style,
    )

plt.gca().set_aspect("equal", "datalim")
plt.legend()
plt.show()

In [None]:
# List the article titles of every cluster (noise group -1 skipped), each
# title tab-indented and each cluster closed by a separator line.
separator = "-" * 80 + "\n"
for cluster_id, members in groups:
    if cluster_id != -1:
        print("Cluster %s" % cluster_id)
        print("Size: %s" % len(members))
        print("Articles: \n")
        for article_pos in members.index:
            print("\t" + all_articles[article_pos]["title"])

        print(separator)

In [None]:
# For every cluster (noise group -1 skipped) print its article titles and the
# terms with the highest summed TF-IDF weight across the cluster's articles.
top_n = 5

for name, group in groups:
    if name == -1:
        continue
    print("Cluster %s" % name)
    print("Size: %s" % group.shape[0])
    print("Articles: \n")

    for index in group.index:
        print("\t" + all_articles[index]["title"])

    # Select the cluster's rows from the sparse TF-IDF matrix in one step and
    # sum them per term; the result is flattened to a 1-D array with one
    # entry per vocabulary term.
    # https://stackoverflow.com/questions/26089893/sum-csr-matrix-rows-and-get-result-as-array
    tfidf_cluster_vector = np.asarray(
        tfidf_vector[group.index.to_numpy()].sum(axis=0)
    ).ravel()

    top_n_indices = tfidf_cluster_vector.argsort()[-top_n:][::-1]
    # Column indices refer to the vocabulary that was handed to the
    # vectorizer (`vocabs`). Looking them up in `all_vocabs` returned wrong
    # words whenever the noun-based representation was selected.
    top_n_words = [vocabs[term_index] for term_index in top_n_indices]

    print("Top %s words: " % top_n)
    print("\t" + ", ".join(top_n_words))

    print("-" * 80 + "\n")