In [None]:
import os

from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import pymongo
import seaborn as sns
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import ast

In [None]:
# Wide default figure size (width, height in inches) for every plot in this notebook.
plt.rcParams['figure.figsize'] = (14, 6)
# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set()

In [None]:
# German NLP resources shared by the cells below.
# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words("german"))
# Snowball stemmer for German (not used by the cells visible in this notebook).
stemmer = SnowballStemmer("german")

In [None]:
# --- Configuration -----------------------------------------------------------
LANGUAGE = 'de'
CURRENT_PATH = Path.cwd()

# The MongoDB connection target is read from the environment so that no
# credentials live in the notebook; a missing variable raises KeyError here.
MONGO_DB = os.environ['MONGO_DB']

# --- Database handles --------------------------------------------------------
# NOTE: despite its name, `mydb` is the MongoClient; `newspaper` is the database
# and `collection` is the per-language collection inside it.
mydb = pymongo.MongoClient(MONGO_DB)
newspaper = mydb['newspaper']
collection = newspaper[LANGUAGE]

In [None]:
# On-disk cache of the processed articles; avoids re-querying MongoDB and
# re-tokenizing every article on each run. Delete the file to force a rebuild.
CACHE_FILE = Path('collection.csv')
RECORD_COLUMNS = ['title', 'author', 'published_at', 'month', 'year', 'words', 'source', 'url']


def _article_record(entry):
    """Turn one MongoDB article document into a flat record.

    Parameters
    ----------
    entry : dict
        Article document. ``published_at`` must be a datetime (the query below
        guarantees this); ``title``, ``authors``, ``url`` and ``text`` are read
        with ``.get`` and may be missing.

    Returns
    -------
    tuple of (dict, set)
        The record (keys as listed in ``RECORD_COLUMNS``) and the set of
        lowercase tokens that were dropped because they are stop words.
    """
    published = entry['published_at']
    url = entry.get('url')

    # For "scheme://host/path" the third '/'-separated piece is the host.
    # A missing or malformed URL yields None instead of aborting the whole load.
    url_parts = (url or '').split('/')
    source = url_parts[2] if len(url_parts) > 2 else None

    # Keep purely alphabetic tokens, lowercased, then drop stop words.
    # A null `text` is treated as an empty article.
    # NOTE(review): word_tokenize is called without a `language` argument even
    # though the stop words are German - confirm this is intended.
    tokens = [w.lower() for w in word_tokenize(entry.get('text') or '') if w.isalpha()]
    cleaned_words = [w for w in tokens if w not in stop_words]

    record = dict(
        title=entry.get('title'),
        author=entry.get('authors'),
        published_at=published.strftime('%Y-%m-%d'),
        month=published.month,
        year=published.year,
        words=cleaned_words,
        source=source,
        url=url,
    )
    return record, set(tokens) - set(cleaned_words)


if CACHE_FILE.exists():
    collection_df = pd.read_csv(CACHE_FILE)
    # Lists are stored in the CSV as their repr; parse them back into lists.
    collection_df = collection_df.assign(words=collection_df['words'].apply(ast.literal_eval))
else:
    filtered = []  # per-article sets of removed stop words (kept for inspection)
    records = []
    for entry in collection.find({"text": {"$exists": True}, "published_at": {"$type": "date"}}):
        record, removed_words = _article_record(entry)
        records.append(record)
        filtered.append(removed_words)

    # Build the frame once from plain dicts; passing `columns` keeps the schema
    # stable even when the query returned no documents.
    collection_df = pd.DataFrame(records, columns=RECORD_COLUMNS)
    if not collection_df.empty:
        # Do not cache an empty result, otherwise later runs would never
        # query the database again.
        collection_df.to_csv(CACHE_FILE, index=False)

# Computed on both paths: the frequency cell below filters on `length > 0`.
collection_df = collection_df.assign(length=collection_df['words'].apply(len))

In [None]:
# Reference timestamp for the look-back windows below.
today = pd.to_datetime('now')

# Cut-off dates as 'YYYY-MM-DD' strings, the same format as `published_at`,
# so they can be compared against that column directly.
a_week_ago, a_month_ago, six_months_ago = (
    (today - pd.Timedelta(days=n_days)).strftime('%Y-%m-%d')
    for n_days in (7, 30, 180)
)

In [None]:
# Non-empty articles published within the last 30 days. `published_at` holds
# 'YYYY-MM-DD' strings, so the comparison with `a_month_ago` is lexicographic.
recent_articles = collection_df.query('published_at >= @a_month_ago and length > 0')

# One FreqDist per article. Materialised as a list rather than a generator so
# that the aggregation cell below can be re-run; a generator would already be
# exhausted and silently produce an empty result the second time.
all_freq = [FreqDist(words) for words in recent_articles['words']]

In [None]:
# Fold the per-article counts into a single corpus-wide distribution.
freq = FreqDist()
for article_freq in all_freq:
    freq.update(article_freq)

# Ten most frequent words across the selected articles.
freq.most_common(10)

In [None]:
# Number of articles per (year, month). `size()` counts rows, so articles whose
# title is missing are included; `['title'].count()` would skip them and
# undercount.
n_papers_per_months = collection_df.groupby(['year', 'month']).size().rename('n_papers')

# The trailing `;` suppresses the Axes repr, matching the other plotting cell.
n_papers_per_months.plot.bar(title='Articles per month');

In [None]:
# Article count per source domain, most frequent first.
n_sources = collection_df['source'].value_counts()
# The trailing `;` hides the Axes repr in the cell output.
n_sources.plot(kind='bar');