# Imports

In [17]:
import math
import os
import pathlib
from collections import Counter

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

# Lift pandas' row limit so displayed DataFrames/Series are never truncated.
pd.set_option('display.max_rows', None)

# Code

In [18]:
# Site root, per-day archive prefix and local download folder used by the scraper.
NOS_URL = 'https://nos.nl'
NOS_ARCHIVE_URL = 'https://nos.nl/nieuws/archief/'
ARTICLE_OUTPUT_FOLDER = '../input/articles'

# Make sure the download folder (including missing parents) exists before scraping.
os.makedirs(ARTICLE_OUTPUT_FOLDER, exist_ok=True)

# Example archive page for one day: https://nos.nl/nieuws/archief/2020-05-25

In [25]:
# Every calendar day of the scraping window as a 'YYYY-MM-DD' string, the format
# the archive URLs use. The end date is fixed; datetime.today() was noted as an alternative.
dates = pd.date_range(start='2020-04-01', end='2020-05-30', freq='D').strftime('%Y-%m-%d').tolist()

### Get Article URLs

In [28]:
def get_article_urls(dates):
    """Collect the article links listed on the NOS archive page of each date.

    For every date the page ``NOS_ARCHIVE_URL + date`` is fetched and every
    ``<a href=...>`` whose target contains ``/artikel/`` is kept. Links are
    made absolute against ``NOS_URL`` and de-duplicated in page order.

    :param dates: sequence of date strings that are appended to ``NOS_ARCHIVE_URL``
    :return: dict mapping each date to a list of absolute article URLs; the
        list is empty when the page could not be fetched or lists no articles
    """
    from urllib.parse import urljoin

    article_urls_dict = {}

    for date in tqdm(dates, total=len(dates)):
        try:
            # A timeout keeps one hanging request from stalling the whole run.
            response = requests.get(NOS_ARCHIVE_URL + date, timeout=30)
            response.raise_for_status()
        except requests.RequestException as err:
            # Best effort: report the failing day and carry on with the rest.
            print(f'Could not fetch archive page for {date}: {err}')
            article_urls_dict[date] = []
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        hrefs = [
            anchor['href']
            for anchor in soup.find_all('a', href=True)
            if '/artikel/' in anchor['href']
        ]
        # dict.fromkeys drops repeated links while keeping their first-seen order;
        # urljoin also copes with hrefs that are already absolute.
        article_urls_dict[date] = [urljoin(NOS_URL, href) for href in dict.fromkeys(hrefs)]

    return article_urls_dict


article_urls = get_article_urls(dates)

  0%|          | 0/60 [00:00<?, ?it/s]

### Get Article Texts

In [13]:
# Scratch check: leaving two nested loops at once after the fifth inner step.
l1 = ['1', '2', '3']
l2 = l1
i = 0
limit_reached = False
for x in l1:
    print('_', x)
    for y in l2:
        i += 1
        print('==', y)
        if i == 5:
            limit_reached = True
            break
    # Propagate the inner break to the outer loop.
    if limit_reached:
        break

print('__')

_ 1
== 1
== 2
== 3
_ 2
== 1
== 2
_ 3
== 1
== 2
== 3
__


In [32]:
def get_article_text(article_url, date):
    """Download one article and store its paragraph text as a UTF-8 ``.txt`` file.

    The file is written to ``ARTICLE_OUTPUT_FOLDER/<date>/<slug>.txt``, where
    ``<slug>`` is everything after ``/artikel/`` in ``article_url``. Nothing is
    downloaded when that file already exists, so the function can be re-run to
    resume an interrupted scrape.

    :param article_url: URL of the article page
    :param date: date string used as the name of the sub-folder
    :return: None; failures are reported with ``print`` and otherwise skipped
    """
    marker = '/artikel/'
    slug = article_url[article_url.find(marker) + len(marker):]
    file_name = f"{ARTICLE_OUTPUT_FOLDER}/{date}/{slug}.txt"

    if os.path.exists(file_name):
        return

    try:
        response_text = requests.get(article_url, timeout=30)
        # Do not cache an error page as if it were the article.
        response_text.raise_for_status()
    except requests.RequestException as err:
        print(f'Could not download {article_url}: {err}')
        return

    soup = BeautifulSoup(response_text.content, "html.parser")
    # NOTE(review): this class name looks auto-generated; if the site markup
    # changes, no paragraph matches and an empty file is written - confirm.
    raw_paragraphs = soup.find_all('p', class_="text_3v_J6Y0G")
    texts = [p.get_text() for p in raw_paragraphs]

    try:
        # Create the date folder here so the function does not depend on its caller.
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, "w", encoding='utf-8') as txt_file:
            txt_file.write('\n'.join(texts))
    except OSError as err:
        # Only file-system errors are expected; include the reason (e.g. path too long).
        print(f'Could not save {file_name}: {err}')

In [33]:
# Download every collected article into one sub-folder per archive date.
for date in tqdm(article_urls.keys(), total=len(article_urls)):
    os.makedirs(f"{ARTICLE_OUTPUT_FOLDER}/{date}", exist_ok=True)
    for article_url in tqdm(article_urls[date]):
        get_article_text(article_url, date)

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

Could not save ../input/articles/2020-04-15/2330512-wekdienst-15-4-persconferentie-kabinet-over-coronamaatregelen-verkiezingen-in-zuid-korea.txt


  0%|          | 0/41 [00:00<?, ?it/s]

Could not save ../input/articles/2020-04-16/2330685-coronacijfers-van-16-april-aantal-doden-daalt-minder-hard-dan-aantal-ziekenhuisopnames.txt


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

### Number of Articles

In [34]:
# Total number of files below the article folder, across all date sub-folders.
file_count = sum(map(len, (walk_entry[2] for walk_entry in os.walk('../input/articles/'))))
print(file_count)

1116


In [26]:
import os
# Dates of the scraping window that have no entry in the article folder yet.
article_files = os.listdir('../input/articles')
list(filter(lambda day: day not in article_files, dates))

['2020-05-01',
 '2020-05-02',
 '2020-05-03',
 '2020-05-04',
 '2020-05-05',
 '2020-05-06',
 '2020-05-07',
 '2020-05-08',
 '2020-05-09',
 '2020-05-10',
 '2020-05-11',
 '2020-05-12',
 '2020-05-13',
 '2020-05-14',
 '2020-05-15',
 '2020-05-16',
 '2020-05-17',
 '2020-05-18',
 '2020-05-19',
 '2020-05-20',
 '2020-05-21',
 '2020-05-22',
 '2020-05-23',
 '2020-05-24',
 '2020-05-25',
 '2020-05-26',
 '2020-05-27',
 '2020-05-28',
 '2020-05-29',
 '2020-05-30']

In [31]:
# Read the stored article texts back in, for the first three dates only.
# text_dict maps a date to a list of articles; each article is its list of lines.
text_dict = {}
for date in list(article_urls)[:3]:
    text_dict[date] = []
    for article in article_urls[date]:
        # Same file-name scheme as get_article_text: the part after '/artikel/' plus '.txt'.
        full_path_article = ARTICLE_OUTPUT_FOLDER + '/' + date + '/' + article[article.find('/artikel/')+9:] + '.txt'
        if not os.path.exists(full_path_article):
            # Articles that could not be saved earlier are skipped.
            continue
        with open(full_path_article, "r", encoding='utf-8') as f:
            text_dict[date].append(list(f))
print(text_dict.keys())

dict_keys(['2020-04-01', '2020-04-02', '2020-04-03'])


In [43]:
import spacy
# Load the spaCy pipeline "nl_core_news_sm"; calculate_tfidf uses `nlp` to
# lemmatise text and to filter stop words, punctuation and whitespace tokens.
nlp = spacy.load("nl_core_news_sm")

In [65]:
# Folder for all generated CSV files. It is created up front (like the article
# folder) so the to_csv calls in calculate_tfidf cannot fail on a missing directory.
OUTPUT_FOLDER = '../output'
pathlib.Path(OUTPUT_FOLDER).mkdir(parents=True, exist_ok=True)

TF_IDF_CSV_PATH = f'{OUTPUT_FOLDER}/NOS_tf_idf.csv'  # scores written when include_idf=True
TF_CSV_PATH = f'{OUTPUT_FOLDER}/NOS_tf.csv'  # scores written when include_idf=False
CORPUS_PATH = f'{OUTPUT_FOLDER}/NOS_corpus.csv'  # raw word counts

In [63]:
def calculate_tfidf(full_text: "str | list", include_idf=True) -> pd.DataFrame:
    """Count the lemmas of one text and save the counts to ``CORPUS_PATH``.

    Despite its name, this version produces raw word counts only; no tf or idf
    weighting is applied. Stop words, punctuation and whitespace tokens are
    left out of the count.

    NOTE(review): a later cell defines another function with this same name;
    whichever definition is executed last is the one in effect.

    :param full_text: the text to analyse, either one string or a list of
        strings (a list is joined with single spaces before analysis)
    :param include_idf: unused here; kept so the signature stays unchanged
    :return: DataFrame indexed by ``word`` with one integer column ``nos``
        holding how often each lemma occurs
    """
    # spaCy expects a single string, so accept the list form by joining it.
    if not isinstance(full_text, str):
        full_text = ' '.join(full_text)

    lemmas = [
        token.lemma_
        for token in nlp(full_text)
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    word_count = Counter(lemmas)

    # Build the frame straight from the counter; DataFrame.append no longer
    # exists in pandas 2 and the counter already holds exactly one row per word.
    corpus = pd.Series(word_count, name='nos', dtype='int64').rename_axis('word').to_frame()

    corpus.to_csv(CORPUS_PATH)
    return corpus


In [102]:
# Sanity check: rebuild the path of every stored article from its folder and
# file name and print whether that path exists.
for x in os.listdir('../input/articles'):
    for y in os.listdir(f"../input/articles/{x}"):
        print(os.path.exists('../input/articles/' + x + '/' + y))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [93]:
# Walk the article folder tree and print the list of file names found in each
# directory. `m` receives the sub-directory names from os.walk and is not used.
for directory, m, files in os.walk('../input/articles'):
    print(files)
    # For each file, take the last component of the walked path, i.e. the name
    # of the folder holding the file. The value is only assigned here and is
    # not used any further in this cell.
    for file in files:
        _, directory_name = os.path.split(directory)

[]
['2328999-vier-aanhoudingen-na-machete-aanval-in-rotterdam.txt', '2329000-biograaf-haalt-uit-naar-egoistische-harry-en-meghan.txt', '2329008-wekdienst-1-4-weer-coronahoesters-voor-rechter-en-kamer-bijgepraat-door-rivm.txt', '2329010-deze-man-helpt-honderden-amsterdamse-kinderen-met-huiswerk-en-veel-meer.txt', '2329013-begrip-voor-verlengen-sluiting-scholen-zorgen-over-kwetsbare-leerlingen-blijven.txt', '2329015-berlijn-helpt-kleine-bedrijven-door-coronacrisis-klein-wirtschaftswunder.txt', '2329017-hema-neemt-maatregelen-voortbestaan-in-gevaar.txt', '2329023-fred-westerbeke-krijgt-onderscheiding-op-initiatief-van-mh17-nabestaanden.txt', '2329024-aftellen-naar-de-bevrijding-met-nos-bevrijdingssite.txt', '2329025-roken-opnieuw-duurder-rookverbod-horeca-nog-onzichtbaar-door-sluiting.txt', '2329032-coronamaatregelen-in-italie-en-spanje-lijken-langzaam-effect-te-hebben.txt', '2329034-bolsonaro-kiest-andere-toon-corona-grootste-uitdaging-voor-deze-generatie.txt', '2329035-van-dissel-volhou

In [37]:
from collections import Counter

# Combine the articles of the first two days and flatten them into one list of lines.
t = text_dict['2020-04-01'] + text_dict['2020-04-02']
flat_list = []
for sublist in t:
    flat_list.extend(sublist)
len(flat_list)

# Intended next step (not run yet): calculate_tfidf(' '.join(flat_list))

630

In [1]:
def calculate_tfidf(text_by_speaker: tuple, include_idf=True) -> pd.DataFrame:
    """
    Calculate term frequencies, optionally weighted by idf, per conference.

    For each conference the texts of both speakers are concatenated, lemmatised
    with ``nlp`` and counted; stop words, punctuation and whitespace tokens are
    skipped. The raw counts are written to ``CORPUS_PATH`` and the scores to
    ``TF_IDF_CSV_PATH`` (``include_idf=True``) or ``TF_CSV_PATH`` (otherwise).

    Score per word and conference:
      * tf  = count / number of distinct words in that conference
      * idf = ln(number of conferences / number of conferences containing the word)

    NOTE(review): an earlier cell defines a function with this same name;
    whichever definition is executed last is the one in effect.

    :param text_by_speaker: indexable holding two sequences (one per speaker)
        of dicts with the keys ``'text'`` and ``'date'``. Entry ``i`` of both
        sequences is treated as the same conference, and the first speaker's
        ``'date'`` labels the column - assumes both sequences are aligned and
        equally long; verify against the caller.
    :param include_idf: multiply tf by idf when True, return plain tf otherwise
    :return: DataFrame of scores indexed by ``word`` with one column per
        conference date; a conference without any counted word scores 0.0
    """
    first_speaker, second_speaker = text_by_speaker[0], text_by_speaker[1]

    # One Counter per conference, keyed by the date that labels its column.
    counts_by_date = {}
    for i in tqdm(range(len(first_speaker))):
        full_conference_text = first_speaker[i]['text'] + second_speaker[i]['text']
        lemmas = [
            token.lemma_
            for token in nlp(full_conference_text)
            if not (token.is_stop or token.is_punct or token.is_space)
        ]
        counts_by_date[first_speaker[i]['date']] = Counter(lemmas)

    # Building the frame in one go replaces the DataFrame.append loop (removed
    # in pandas 2); words missing from a conference come out as NaN -> 0.
    corpus = pd.DataFrame(counts_by_date).fillna(0).astype('int64')
    corpus.index.name = 'word'
    corpus.to_csv(CORPUS_PATH)

    # Vectorised scoring: the per-column and per-row non-zero counts are each
    # computed once instead of once per cell.
    present = corpus.ne(0)
    distinct_words_per_conference = present.sum(axis=0)
    tf_idf_df = corpus.div(distinct_words_per_conference, axis='columns')

    if include_idf:
        conferences_with_word = present.sum(axis=1)
        idf = np.log(len(corpus.columns) / conferences_with_word)
        tf_idf_df = tf_idf_df.mul(idf, axis='index')

    # A conference without counted words divides 0 by 0; report that as 0.0.
    tf_idf_df = tf_idf_df.fillna(0.0)

    if include_idf:
        tf_idf_df.to_csv(TF_IDF_CSV_PATH)
        print(f"Saved results to '{CORPUS_PATH}' and '{TF_IDF_CSV_PATH}'")
    else:
        tf_idf_df.to_csv(TF_CSV_PATH)
        print(f"Saved results to '{CORPUS_PATH}' and '{TF_CSV_PATH}'")

    return tf_idf_df

In [None]:
# NOTE(review): the definition above requires a `text_by_speaker` argument, so
# this argument-less call raises TypeError as written; the cell was never run.
calculate_tfidf()