In [None]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import pymongo

# Prefer the public ``pandas.errors`` location of SettingWithCopyWarning and
# fall back to the private ``pandas.core.common`` path used by older releases.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # NOTE(review): presumably a pandas release that no longer defines this
        # warning class at all; in that case there is nothing to silence.
        SettingWithCopyWarning = None

# If the project helpers are not importable as-is, put the parent directory on
# sys.path and retry the import.
try:
    from helpers.secrets import get_secret_from_env
except ImportError:
    sys.path.append(os.path.abspath(os.path.join("..")))
    from helpers.secrets import get_secret_from_env

# Silence noisy warning categories for the whole notebook session.
# Be aware that this also hides deprecation notices about APIs used below.
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=DeprecationWarning)

In [None]:
# Connection settings of the article database.
MONGO_HOST = "81.169.252.177"
MONGO_PORT = 27017

secret = get_secret_from_env(secret="MONGO_USER_SECRET", path="../../secrets/")

# Credentials are handed over as keyword arguments instead of being formatted
# into the URI: reserved characters such as "@", ":" or "/" inside a user name
# or password would otherwise need percent-escaping to form a valid URI.
client = pymongo.MongoClient(
    host=MONGO_HOST,
    port=MONGO_PORT,
    username=secret["user"],
    password=secret["password"],
    authMechanism="DEFAULT",
    tls=False,
)
kn_db = client.kn_db
kn_collection = kn_db.get_collection("kn_data")

# find_one() returns None when the collection is empty; calling len() on that
# would raise a TypeError and hide the intended message, so test for None.
assert kn_collection.find_one({}) is not None, "Error, no Data or DB-Connection"

In [None]:
# Count the documents per (resort, releaseDate) combination inside MongoDB.
group_by_resort_and_date = {
    "$group": {
        "_id": {"resort": "$resort", "releaseDate": "$releaseDate"},
        "count": {"$sum": 1},
    }
}
grouped_docs = list(kn_collection.aggregate([group_by_resort_and_date]))

# After the rename, "resort" still holds the complete grouping key (a dict with
# the resort and the release date) and "anz" holds the document count.
df_resorts = pd.DataFrame(grouped_docs).rename(
    columns={"_id": "resort", "count": "anz"}
)
df_resorts.head()

In [None]:
# Count the documents per (resort, releaseDate) combination inside MongoDB.
cursor = kn_collection.aggregate(
    [
        {
            "$group": {
                "_id": {"resort": "$resort", "releaseDate": "$releaseDate"},
                "count": {"$sum": 1},
            }
        }
    ]
)

# Flatten the compound "_id" by key rather than by position, so a grouping key
# that lacks one of the two fields cannot shift values into the wrong column.
records = [
    {
        "resort": doc["_id"].get("resort"),
        "date_col": doc["_id"].get("releaseDate"),
        "count": doc["count"],
    }
    for doc in cursor
]
df_resorts = pd.DataFrame(records, columns=["resort", "date_col", "count"])
df_resorts["date_col"] = pd.to_datetime(df_resorts["date_col"])
df_resorts["day_of_week"] = df_resorts["date_col"].dt.day_name()

# ``Series.dt.week`` is deprecated and missing from newer pandas releases; use
# the ISO calendar instead. Year and week are both taken from it so that days
# around New Year are attributed to the year their ISO week belongs to
# (combining the calendar year with the ISO week mislabels those days).
iso_calendar = df_resorts["date_col"].dt.isocalendar()
df_resorts["year"] = iso_calendar["year"]
df_resorts["week"] = iso_calendar["week"]

In [None]:
# Bar chart: total article count per (year, week), labelled "<year>_<week>".
weekly_counts = df_resorts.groupby(["year", "week"], as_index=False)["count"].sum()
week_labels = (
    weekly_counts["year"].astype(str) + "_" + weekly_counts["week"].astype(str)
)
plt.bar(week_labels, weekly_counts["count"])
plt.show()

# Line chart: total article count per release date, one rotated tick per date.
daily_counts = df_resorts.groupby(["date_col", "week"], as_index=False)["count"].sum()
date_labels = daily_counts["date_col"].astype(str)
plt.plot(date_labels, daily_counts["count"])
plt.xticks(range(len(daily_counts)), date_labels, rotation=90)
plt.show()

In [None]:
# Day names in the spelling produced by ``Series.dt.day_name()``.
WEEK_DAYS = (
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
)

# One bar chart per weekday: mean of the per-date article counts of each resort.
for week_day in WEEK_DAYS:
    on_week_day = df_resorts["day_of_week"] == week_day
    week_day_df = (
        df_resorts.loc[on_week_day, ["resort", "count"]].groupby("resort").mean()
    )
    plt.bar(week_day_df.index, week_day_df["count"], align="center")
    plt.xticks(range(len(week_day_df)), week_day_df.index, rotation=90)
    # The plotted value is a mean, not a sum, so the title says so.
    plt.title(f"Durchschnittliche Anzahl der Artikel nach Resort {week_day}")
    plt.show()

In [None]:
# Load every article and, separately, only the articles that carry an
# "author" field.
all_articles = kn_collection.find({})
all_articles = pd.DataFrame(list(all_articles))

author_article = kn_collection.find({"author": {"$exists": True}})
author_article = pd.DataFrame(list(author_article))

# Scale the ratio to a percentage so the number matches the "%" in the message.
author_share = 100 * len(author_article) / len(all_articles)
print(f"{round(author_share, 2)} % of the articles have an author!")

# Expand the "author" field into one column per entry. This is done once and
# reused for both the column count and the join.
author_columns = author_article["author"].apply(pd.Series)
max_num_authors = len(author_columns.columns)
print(f"Maximum number of authors: {max_num_authors}")
author_article = author_article.join(author_columns).drop("author", axis=1)
author_article

In [None]:
# Stack every "author_<i>" column (together with city and resort) under one
# common "author" column, then keep only the rows that actually name an author.
authors = pd.concat(
    [
        author_article[[f"author_{slot}", "city", "resort"]].rename(
            columns={f"author_{slot}": "author"}
        )
        for slot in range(max_num_authors)
    ]
)
authors = authors.dropna(subset=["author"])

In [None]:
# Show the long-format author table (columns: author, city, resort).
authors

In [None]:
def _most_common(values):
    """Return the most frequent non-null entry of ``values``, or None if there is none."""
    counts = values.value_counts()
    return None if counts.empty else counts.idxmax()


# One groupby pass over the long-format table replaces re-filtering the whole
# frame three times for every single author.
author_rows = [
    (
        author,
        len(articles),
        _most_common(articles["resort"]),
        _most_common(articles["city"]),
    )
    for author, articles in authors.groupby("author", sort=False)
]

# NOTE: this rebinds ``authors`` from the long-format table to the per-author
# summary, so the cell that builds the long-format table has to be re-run
# before this cell can be executed a second time.
authors = pd.DataFrame(
    author_rows, columns=["author", "num_articles", "top_resort", "top_place"]
)
# A stable sort keeps authors with equal counts in order of first appearance,
# which makes the ranking reproducible between runs.
authors = authors.sort_values(
    "num_articles", ascending=False, kind="mergesort"
).reset_index(drop=True)
authors

In [None]:
# Bar chart of the n authors with the most articles (``authors`` is already
# sorted by "num_articles" in descending order).
n = 40
top_n = authors.head(n)
author_labels = top_n["author"].astype(str)
plt.bar(author_labels, top_n["num_articles"])
plt.xticks(range(len(top_n)), author_labels, rotation=90)
plt.title(f"Number of Articles per Author top-{n}")
plt.show()

In [None]:
# Sum the article counts of all authors sharing the same "top_place" and plot
# the n largest sums. Each author's articles are attributed entirely to that
# author's most frequent city.
n = 10
articles_by_place = authors.groupby("top_place", as_index=False)["num_articles"].sum()
top_n = (
    articles_by_place.sort_values("num_articles", ascending=False)
    .reset_index(drop=True)
    .head(n)
)
bar_positions = top_n.index.astype(str)
place_labels = top_n["top_place"].astype(str)
plt.bar(bar_positions, top_n["num_articles"])
plt.xticks(range(len(top_n)), place_labels, rotation=90)
plt.title(f"Number of Articles per City top-{n}")
plt.show()

# Kiel

In [None]:
# Average and total article body length for every author whose most frequent
# city is Kiel.
kiel_authors = authors[authors["top_place"] == "Kiel"]
author_article_len = []
for author in kiel_authors["author"].unique():
    # Skip entries whose name contains "fotos".
    if "fotos" in author.lower():
        continue

    # Dot notation matches the author in any "author_<i>" slot. Comparing the
    # whole embedded "author" document for equality would only match documents
    # whose "author" field consists of exactly that single key/value pair.
    query = {
        "$or": [{f"author.author_{i}": author} for i in range(0, max_num_authors)]
    }
    # Only the body is needed, so ask the server for that field alone.
    body_lengths = [
        len(article["body"]) for article in kn_collection.find(query, {"body": 1})
    ]

    # Both values are computed unconditionally, so an author without any
    # matching article gets 0 / 0 instead of an undefined or stale total.
    full_len = sum(body_lengths)
    avg_len = full_len / len(body_lengths) if body_lengths else 0
    author_article_len.append((author, avg_len, full_len))

author_article_len = pd.DataFrame(
    author_article_len, columns=["author", "avg_len", "full_len"]
)
author_article_len = author_article_len.sort_values(
    "avg_len", ascending=False
).reset_index(drop=True)

def _bar_per_author(frame, value_column, title):
    """Draw one bar per row of ``frame``, labelled with the rotated author names."""
    author_labels = frame["author"].astype(str)
    plt.bar(author_labels, frame[value_column])
    plt.xticks(range(len(frame)), author_labels, rotation=90)
    plt.title(title)
    plt.show()


# Article counts of the authors whose most frequent city is Kiel.
kiel_authors = authors[authors["top_place"] == "Kiel"]
_bar_per_author(kiel_authors, "num_articles", "Number of Articles per Author in Kiel")

# Average body length per author, in the current row order of the table.
_bar_per_author(
    author_article_len, "avg_len", "Average Length of Article per Author in Kiel"
)

# Re-rank by total body length before drawing the last chart.
author_article_len = author_article_len.sort_values(
    "full_len", ascending=False
).reset_index(drop=True)
_bar_per_author(
    author_article_len, "full_len", "Number of Letters for each Author in Kiel"
)