In [None]:
import os
from pathlib import Path

import pandas as pd
import pymongo
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Apply the seaborn theme first: sns.set() rewrites matplotlib rcParams, and
# (at least in older seaborn releases) that includes figure.figsize, which
# would silently discard a size configured beforehand.
sns.set()
plt.rcParams['figure.figsize'] = (14, 6)

In [None]:
# German stop words, held in a set for the membership tests done while cleaning.
stop_words = {*stopwords.words("german")}
# Snowball stemmer configured for German.
stemmer = SnowballStemmer(language="german")

In [None]:
CURRENT_PATH = Path.cwd()
LANGUAGE = 'de'
# The connection string is read from the MONGO_DB environment variable so that
# real credentials never have to be written into the notebook. The previous
# placeholder is kept as the fallback when the variable is not set.
MONGO_DB = os.environ.get('MONGO_DB', 'mongo_db_string')
mydb = pymongo.MongoClient(MONGO_DB)
# Database 'newspaper', one collection per language code.
newspaper = mydb['newspaper']
collection = newspaper[LANGUAGE]

In [None]:
# Column order of the resulting frame; passing it explicitly also yields a
# well-formed (empty) frame when the collection has no usable entries.
columns = ['headline', 'author', 'published_at', 'month', 'year',
           'words', 'source', 'url']

filtered = list()  # per article: the set of stop words that were removed
records = []       # per article: one plain dict, turned into a frame once
n_skipped = 0
for entry in collection.find():
    published = entry.get('published_at') or entry.get('fetched_at')
    url = entry.get('url')
    text = entry.get('text')

    # The attribute access / method calls below raise on missing values, so
    # entries without a date, a URL or a text body are skipped and counted.
    if not published or not url or text is None:
        n_skipped += 1
        continue

    # Tokenize with the German sentence model (the default is English), keep
    # purely alphabetic tokens in lower case, then drop stop words.
    words = [w.lower() for w in word_tokenize(text, language='german') if w.isalpha()]
    cleaned_words = [w for w in words if w not in stop_words]

    records.append(dict(
        headline=entry.get('headline'),
        author=entry.get('authors'),
        published_at=published.strftime('%Y-%m-%d'),
        month=published.month,
        year=published.year,
        words=cleaned_words,
        # Third '/'-separated piece of the URL, i.e. the host part of
        # 'scheme://host/...'.
        source=url.split('/')[2],
        url=url,
    ))
    filtered.append(set(words) - set(cleaned_words))

# Build the frame in one go instead of concatenating one Series per article.
collection_df = pd.DataFrame(records, columns=columns)

if n_skipped:
    print(f'Skipped {n_skipped} entries without date, url or text')

In [None]:
# Preview the first five articles.
collection_df.head(n=5)

In [None]:
# Number of articles per (year, month), counted via non-null headlines.
n_papers_per_months = (
    collection_df
    .groupby(['year', 'month'])['headline']
    .count()
    .rename('n_papers')
)
n_papers_per_months.plot(kind='bar')

In [None]:
# Number of articles per source host.
n_sources = collection_df.loc[:, 'source'].value_counts()
n_sources.plot(kind='bar');

In [None]:
def get_most_common_words(df, top=10, print_=True):
    """Count word occurrences across the ``words`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``words`` column holds one list of tokens per row.
    top : int, default 10
        Number of most frequent words to keep.
    print_ : bool, default True
        When true, print the result before returning it.

    Returns
    -------
    pandas.Series
        Maps word to count, ordered from most to least frequent. The Series
        is empty (dtype int64) when ``df`` has no rows or no words.
    """
    if df.empty:
        counts = FreqDist()
    else:
        # Feed the tokens lazily instead of `df['words'].sum()`, which builds
        # one ever-growing list by repeated list concatenation.
        counts = FreqDist(word for words in df['words'] for word in words)

    # Explicit integer dtype keeps the result consistent for empty input too.
    most_common = pd.Series(dict(counts.most_common(top)), dtype='int64')

    if print_:
        print(f'Most common {top} words: \n', most_common)

    return most_common


In [None]:
# Top words over the whole collection (defaults spelled out: ten words, printed).
all_most_common = get_most_common_words(collection_df, top=10, print_=True)

In [None]:
# Cut-off dates as 'YYYY-MM-DD' strings, 7 / 30 / 180 days before now.
today = pd.to_datetime('now')
a_week_ago, a_month_ago, six_months_ago = (
    (today - pd.Timedelta(days=n_days)).strftime('%Y-%m-%d')
    for n_days in (7, 30, 180)
)


In [None]:
# Articles dated on or after the one-week cut-off, then their top words.
a_week_ago_df = collection_df.loc[
    lambda frame: frame['published_at'] >= a_week_ago
]
a_week_most_common = get_most_common_words(a_week_ago_df)

In [None]:
# Articles dated on or after the one-month cut-off, then their top words.
a_month_ago_df = collection_df.loc[
    lambda frame: frame['published_at'] >= a_month_ago
]
a_month_most_common = get_most_common_words(a_month_ago_df)


In [None]:
# Articles dated on or after the six-month cut-off, then their top words.
six_months_ago_df = collection_df.loc[
    lambda frame: frame['published_at'] >= six_months_ago
]
six_months_most_common = get_most_common_words(six_months_ago_df)

In [None]:
# Top words per (year, month). Printing inside the helper is switched off so
# only the combined result below is shown.
monthly_most_common = (
    collection_df
    .groupby(['year', 'month'])
    .apply(get_most_common_words, print_=False)
)
print('Most common words for each month: \n', monthly_most_common)