In [None]:
import os

from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import pymongo
import seaborn as sns
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

In [None]:
# Default size (width, height) for every figure created after this cell.
plt.rcParams.update({'figure.figsize': (14, 6)})

# Switch matplotlib over to seaborn's default theme.
sns.set()

In [None]:
# Directory the kernel is running in.
CURRENT_PATH = Path.cwd()

# The connection target is read from the environment; a missing MONGO_DB
# variable raises KeyError right here.
MONGO_DB = os.environ['MONGO_DB']

# NOTE: `mydb` holds the MongoClient itself; `newspaper` is the database
# handle the remaining cells query.
mydb = pymongo.MongoClient(host=MONGO_DB)
newspaper = mydb.get_database('newspaper')

In [None]:
# Handle on the 'TARGET' collection, which is queried by its `language` field below.
target_collection = newspaper['TARGET']

# Every other collection name in the database is treated as one language.
LANGUAGES = [
    collection_name
    for collection_name in newspaper.list_collection_names()
    if collection_name != 'TARGET'
]

In [None]:
# Number of TARGET documents per language, indexed by language name.
# One count_documents query per language, filtered on the `language` field.
# Built in a single expression so `source_counts` is only ever a Series
# (the name is not first bound to a dict and then rebound).
source_counts = pd.Series(
    {
        language: target_collection.count_documents({'language': language})
        for language in LANGUAGES
    },
    dtype='int64',  # counts are integers; also keeps an empty result numeric
)

# Label the chart so it stands on its own when the notebook is skimmed;
# the trailing semicolon suppresses the repr of the last expression.
ax = source_counts.plot.bar()
ax.set(
    xlabel='language',
    ylabel='documents in TARGET',
    title='TARGET documents per language',
);


In [None]:
# Mongo filter behind each per-collection metric. The key order here is the
# order of the metrics inside every language's entry.
metric_filters = {
    # every document in the collection
    'n_papers': {},
    # documents that have no `text` field
    'n_todos': {"text": {"$exists": False}},
    # documents whose text is exactly this string
    # (presumably the marker written when fetching failed -- TODO confirm)
    'n_fails': {"text": "Could not be fetched"},
    # documents whose `published_at` is null; in MongoDB an equality match
    # on null also matches documents where the field is missing
    'n_no_pas': {"published_at": None},
}

# language name -> {metric name -> document count}
language_info = {}
for language in LANGUAGES:
    language_collection = newspaper[language]
    language_info[language] = {
        metric: language_collection.count_documents(query)
        for metric, query in metric_filters.items()
    }

In [None]:
# Show the summary as a table: one column per language, one row per metric.
pd.DataFrame.from_dict(language_info, orient='columns')