In [None]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import pymongo
from pandas.core.common import SettingWithCopyWarning

# --- Import path setup -------------------------------------------------------
# The project-local ``helpers`` package is presumably resolved through
# ``sys.path`` (TODO confirm it is not installed as a package). The search path
# is therefore extended BEFORE ``helpers`` is imported; with the import placed
# first, a fresh kernel (Restart & Run All) can fail with ModuleNotFoundError.

# NOTE(review): machine-specific absolute path, kept for backward compatibility.
# Prefer relying on the relative ``module_path`` below.
LOCAL_SRC_PATH = "/Users/lukaskrabbe/Developement/PyCharm/kn/src"

# Parent directory of the current working directory.
module_path = os.path.abspath(os.path.join(".."))

# Append each entry only once so re-running this cell does not grow sys.path.
for extra_path in (LOCAL_SRC_PATH, module_path):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

# Must stay below the sys.path setup (see comment above); the markers keep
# linters/formatters from hoisting it back to the top of the cell.
from helpers.secrets import get_secret_from_env  # noqa: E402  # isort:skip

# Silence pandas chained-assignment warnings and FutureWarnings in the output.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
# --- MongoDB connection ------------------------------------------------------
# NOTE(review): host/port are hard-coded and the connection is unencrypted
# (tls disabled) while sending credentials; consider moving these to
# configuration and enabling TLS.
MONGO_HOST = "81.169.252.177"
MONGO_PORT = 27017

# The secret is indexed with the keys "user" and "password" below.
secret = get_secret_from_env(secret="MONGO_USER_SECRET", path="../../secrets/")

# Credentials are passed as keyword options instead of being interpolated into
# the URI: reserved characters (e.g. "@", ":", "/") in the user name or
# password would otherwise have to be percent-escaped to yield a valid URI.
client = pymongo.MongoClient(
    host=MONGO_HOST,
    port=MONGO_PORT,
    username=secret["user"],
    password=secret["password"],
    authMechanism="DEFAULT",
    tls=False,
)
kn_db = client.kn_db
kn_collection = kn_db.get_collection("kn_data")

# Sanity check: the collection must contain at least one non-empty document.
# ``find_one`` returns None for an empty collection, so test truthiness rather
# than ``len(...)`` (which would raise TypeError instead of this message).
first_document = kn_collection.find_one({})
assert first_document, "Error, no Data or DB-Connection"

In [None]:
# Display names of the parties covered by this analysis.
parteien = [
    "SPD",
    "CDU",
    "Die Grünen",
    "FDP",
    "AFD",
    "Die Linken",
]

# Pull every document of the collection into memory once; all later cells
# work on this in-memory list.
all_articles = [document for document in kn_collection.find({})]
print(f"Got {len(all_articles)} articles")

In [None]:
# --- Collect party mentions --------------------------------------------------
# Lower-case search keyword -> label written to the "parties" column.
# Matching is a plain substring test on the lower-cased article body, so an
# article is counted for every party whose keyword occurs anywhere in its text
# (one article can therefore appear in several party frames).
PARTY_KEYWORDS = {
    "spd": "spd",
    "cdu": "cdu",
    "fdp": "fdp",
    "afd": "afd",
    "grünen": "grn",
    "linken": "lnk",
}
MENTION_COLUMNS = ["parties", "releaseDate", "resort", "city"]

# label -> list of (label, releaseDate, resort, city) records
mentions = {label: [] for label in PARTY_KEYWORDS.values()}

for article in all_articles:
    # Lower-case the body once per article instead of once per party.
    text = article["body"].lower()
    for keyword, label in PARTY_KEYWORDS.items():
        if keyword in text:
            mentions[label].append(
                (
                    label,
                    article["releaseDate"],
                    article["resort"],
                    # "city" is optional in the documents; missing -> None.
                    article.get("city"),
                )
            )

# One DataFrame per party, bound to the same names the later cells use.
spd, cdu, fdp, afd, grn, lnk = (
    pd.DataFrame(mentions[label], columns=MENTION_COLUMNS)
    for label in ("spd", "cdu", "fdp", "afd", "grn", "lnk")
)

In [None]:
# --- Party mentions over time ------------------------------------------------
# Column name in the count tables -> per-party mention frame built earlier.
party_frames = {"spd": spd, "cdu": cdu, "fdp": fdp, "afd": afd, "grn": grn, "lnk": lnk}

# One single-column frame per party: number of matching articles per
# release date (index "releaseDate", column named after the party).
spd_, cdu_, fdp_, afd_, grn_, lnk_ = (
    frame.groupby("releaseDate").size().rename(party).to_frame()
    for party, frame in party_frames.items()
)

# Daily counts for all parties side by side; days on which a party was not
# mentioned are filled with 0.
parties = pd.concat([spd_, cdu_, fdp_, afd_, grn_, lnk_], axis=1).fillna(0)
parties.index = pd.to_datetime(parties.index)
# The union of the per-party indexes is not guaranteed to be chronological;
# sort it so the line plot does not jump back and forth on the x axis.
parties = parties.sort_index()

# (column, legend label, line colour) in the original plotting order.
line_styles = [
    ("spd", "SPD", "r"),
    ("cdu", "CDU", "k"),
    ("fdp", "FDP", "y"),
    ("grn", "Grüne", "green"),
    ("lnk", "Links", "deeppink"),
    ("afd", "AFD", "peru"),
]
for column, legend_label, colour in line_styles:
    plt.plot(parties[column], label=legend_label, color=colour)
plt.xticks(rotation=90)
plt.legend()
plt.title("Number of Articles containing Political Parties per Day")
plt.show()

# Weekly aggregation.
# ``DatetimeIndex.week`` is deprecated (removed in pandas 2.0); use
# ``isocalendar()`` (pandas >= 1.1). The ISO *year* is used together with the
# ISO week so that days around New Year are assigned to the correct week, and
# the week number is zero-padded so the string labels sort chronologically.
iso_calendar = parties.index.isocalendar()
week_labels = (
    iso_calendar["year"].astype(str)
    + "_"
    + iso_calendar["week"].astype(str).str.zfill(2)
)
parties_weekly = (
    parties.assign(week_nr=week_labels.to_numpy())
    .groupby("week_nr")
    .sum()
    .reset_index()
)

parties_weekly.plot(
    x="week_nr", y=["spd", "cdu", "grn", "fdp", "lnk", "afd"], kind="bar"
)
plt.title("Number of Articles containing Political Parties per Week")
plt.show()

# Share of each party in the total number of party mentions.
party_totals = parties_weekly.drop(columns="week_nr").sum()
plt.pie(party_totals, labels=party_totals.keys(), autopct="%1.1f%%")
plt.title("Overall proportion of political Parties on all Articles")
plt.show()

# ``parties`` is left holding the daily counts (datetime index, one column per
# party). The weekly table lives in ``parties_weekly``, so no rebuild is needed
# and the "lnk" column is no longer dropped at the end of the cell.

In [None]:
# --- Party mentions per resort (newspaper section) ---------------------------
# Column name in the count table -> per-party mention frame built earlier.
# Every party is counted from its OWN frame; previously the "lnk" column was
# computed from the ``grn`` frame and merely duplicated the Greens' numbers.
party_frames = {"spd": spd, "cdu": cdu, "fdp": fdp, "afd": afd, "grn": grn, "lnk": lnk}

# One single-column frame per party: number of matching articles per resort
# (index "resort", column named after the party).
spd_, cdu_, fdp_, afd_, grn_, lnk_ = (
    frame.groupby("resort").size().rename(party).to_frame()
    for party, frame in party_frames.items()
)

# Resorts in which a party was never mentioned are filled with 0; the resort
# is turned back into a regular column so it can serve as the x axis.
parties = pd.concat([spd_, cdu_, fdp_, afd_, grn_, lnk_], axis=1).fillna(0)
parties = parties.reset_index()

parties.plot(x="resort", y=["spd", "cdu", "grn", "fdp", "lnk", "afd"], kind="bar")
# The chart is grouped by resort, not by week.
plt.title("Number of Articles containing Political Parties per Resort")
plt.show()

In [None]:
# --- Party shares per group of resorts ---------------------------------------
# Each entry is (display title, list of resort names belonging to the group).
sh_headings = (
    "Schleswig Holstein",
    ["SH Aufschlag", "SH Wetter", "Schleswig Holstein", "Wirts. Reg."],
)
kiel_headings = (
    "Kiel",
    ["Kiel Aufschlag", "Lokales Kiel", "Regionales Kiel", "Umland Kiel"],
)
bund_headings = ("Bundesweit", ["Politik", "Blickpunkt", "Wirtschaft"])
overall_headings = ("Overall", ["Titel", "Sonntag", "Panorama"])
leserbriefe = ("Leserbriefe", ["Leserbriefe"])
rest_headings = (
    "Rest",
    ["ePaper-Magazin", "Kultur Aufschlag", "Reise", "Ratgeber", "Sport"],
)

# All party mentions stacked into one long frame (one row per article/party).
parties = pd.concat([spd, cdu, fdp, afd, grn, lnk], axis=0)

# One pie chart per resort group: share of each party among the mentions.
for title, headings in [
    sh_headings,
    kiel_headings,
    bund_headings,
    overall_headings,
    leserbriefe,
    rest_headings,
]:
    articles_in_heading = parties[parties["resort"].isin(headings)]
    party_share = articles_in_heading.groupby("parties").size()
    if party_share.empty:
        # Nothing to draw for this group; skip instead of plotting an empty pie.
        print(f"No party mentions found in {title}")
        continue

    plt.pie(party_share, labels=party_share.keys(), autopct="%1.1f%%")
    plt.title(f"Overall proportion of political Parties on articles in {title}")
    plt.show()

# Grouped bar chart for the three regional groups.
party_order = ["spd", "cdu", "grn", "fdp", "lnk", "afd"]
resort_to_group = {
    resort: title
    for title, headings in [sh_headings, kiel_headings, bund_headings]
    for resort in headings
}
# Rows whose resort belongs to none of the three groups get NaN and are dropped.
regional_mentions = parties.assign(
    group=parties["resort"].map(resort_to_group)
).dropna(subset=["group"])

# Rows = group, columns = party, values = number of articles. Reindexing
# guarantees that every party column exists (filled with 0), so the plot below
# cannot fail with a KeyError when a party has no article in these groups.
tmp = (
    regional_mentions.groupby(["group", "parties"])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=party_order, fill_value=0)
)

tmp.reset_index().plot(x="group", y=party_order, kind="bar")
plt.title("Number of Articles containing Political Parties per Region")
plt.show()