In [1]:
import json
import csv
from newspaper import Article, fulltext

import spacy
from spacy import displacy

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from tqdm import tqdm

In [3]:
# method to download articles
def download_article(url):
    try:
        article = Article(url)
        article.download()
        return article
    except Exception as e:
        # print(f"Exception occurred during downloading for URL {url}: {e}")
        return None

# method to parse articles
def parse_article(article):
    try:
        article.parse()
        return [article.source_url, article.title, article.text.replace('\n', ' '), ", ".join(article.authors), str(article.publish_date)]
    except Exception as e:
        # print(f"Exception occurred during parsing: {e}")
        return [article.source_url, "Parse Error", None, None, None]


# write to csv file
def write_to_csv(filename, rows):
    with open(filename, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(rows)


def process_batch(thread_executor, process_executor, urls, csv_file):
    # Multithreaded downloading
    future_to_url = {thread_executor.submit(download_article, url): url for url in urls}
    download_results = [future.result() for future in as_completed(future_to_url) if future.result() is not None]

    # Multiprocessing parsing
    future_to_article = {process_executor.submit(parse_article, article): article for article in download_results}
    parsed_results = [future.result() for future in as_completed(future_to_article)]

    # Write to CSV
    write_to_csv(csv_file, parsed_results)
        
# method to download and parse articles
def get_article_text(channel):
    json_file = channel + '_urls.json'
    news_json = json.loads(open(json_file).read())
    print(f"Found {len(news_json)} articles for {channel}...")
    csv_file = channel + '_articles.csv'

    # Initialize CSV file with headers
    with open(csv_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['url', 'title', 'text', 'authors', 'publish_date'])

    print(f"Downloading and parsing articles for {channel}...")
    batch_size = 500
    # Multithreaded downloading using 10 processors * 5 threads = 50 threads
    # Multiprocessing parsing using 10 processors
    with ThreadPoolExecutor(max_workers=50) as thread_executor, ProcessPoolExecutor() as process_executor:
        for i in range(0, len(news_json), batch_size):
            batch_urls = news_json[i:i+batch_size]
            process_batch(thread_executor, process_executor, batch_urls, csv_file)
            print(f"Done with {i} articles")

In [4]:
%%time

articles = get_article_text('cnn')

Found 32958 articles for cnn...
Downloading and parsing articles for cnn...
Done with 0 articles
Done with 500 articles
Done with 1000 articles
Done with 1500 articles
Done with 2000 articles
Done with 2500 articles
Done with 3000 articles
Done with 3500 articles
Done with 4000 articles
Done with 4500 articles
Done with 5000 articles
Done with 5500 articles
Done with 6000 articles
Done with 6500 articles
Done with 7000 articles
Done with 7500 articles
Done with 8000 articles
Done with 8500 articles
Done with 9000 articles
Done with 9500 articles
Done with 10000 articles
Done with 10500 articles
Done with 11000 articles
Done with 11500 articles
Done with 12000 articles
Done with 12500 articles
Done with 13000 articles
Done with 13500 articles
Done with 14000 articles
Done with 14500 articles
Done with 15000 articles
Done with 15500 articles
Done with 16000 articles
Done with 16500 articles
Done with 17000 articles
Done with 17500 articles
Done with 18000 articles
Done with 18500 article

In [5]:
%%time

articles = get_article_text('abc')

Found 62738 articles for abc...
Downloading and parsing articles for abc...
Done with 0 articles
Done with 500 articles
Done with 1000 articles
Done with 1500 articles
Done with 2000 articles
Done with 2500 articles
Done with 3000 articles
Done with 3500 articles
Done with 4000 articles
Done with 4500 articles
Done with 5000 articles
Done with 5500 articles
Done with 6000 articles
Done with 6500 articles
Done with 7000 articles
Done with 7500 articles
Done with 8000 articles
Done with 8500 articles
Done with 9000 articles
Done with 9500 articles
Done with 10000 articles
Done with 10500 articles
Done with 11000 articles
Done with 11500 articles
Done with 12000 articles
Done with 12500 articles
Done with 13000 articles
Done with 13500 articles
Done with 14000 articles
Done with 14500 articles
Done with 15000 articles
Done with 15500 articles
Done with 16000 articles
Done with 16500 articles
Done with 17000 articles
Done with 17500 articles
Done with 18000 articles
Done with 18500 article

In [6]:
%%time

articles = get_article_text('cbs')

Found 37907 articles for cbs...
Downloading and parsing articles for cbs...
Done with 0 articles
Done with 500 articles
Done with 1000 articles
Done with 1500 articles
Done with 2000 articles
Done with 2500 articles
Done with 3000 articles
Done with 3500 articles
Done with 4000 articles
Done with 4500 articles
Done with 5000 articles
Done with 5500 articles
Done with 6000 articles
Done with 6500 articles
Done with 7000 articles
Done with 7500 articles
Done with 8000 articles
Done with 8500 articles
Done with 9000 articles
Done with 9500 articles
Done with 10000 articles
Done with 10500 articles
Done with 11000 articles
Done with 11500 articles
Done with 12000 articles
Done with 12500 articles
Done with 13000 articles
Done with 13500 articles
Done with 14000 articles
Done with 14500 articles
Done with 15000 articles
Done with 15500 articles
Done with 16000 articles
Done with 16500 articles
Done with 17000 articles
Done with 17500 articles
Done with 18000 articles
Done with 18500 article

In [7]:
%%time

articles = get_article_text('nbc')

Found 17968 articles for nbc...
Downloading and parsing articles for nbc...
Done with 0 articles
Done with 500 articles
Done with 1000 articles
Done with 1500 articles
Done with 2000 articles
Done with 2500 articles
Done with 3000 articles
Done with 3500 articles
Done with 4000 articles
Done with 4500 articles
Done with 5000 articles
Done with 5500 articles
Done with 6000 articles
Done with 6500 articles
Done with 7000 articles
Done with 7500 articles
Done with 8000 articles
Done with 8500 articles
Done with 9000 articles
Done with 9500 articles
Done with 10000 articles
Done with 10500 articles
Done with 11000 articles
Done with 11500 articles
Done with 12000 articles
Done with 12500 articles
Done with 13000 articles
Done with 13500 articles
Done with 14000 articles
Done with 14500 articles
Done with 15000 articles
Done with 15500 articles
Done with 16000 articles
Done with 16500 articles
Done with 17000 articles
Done with 17500 articles
CPU times: user 1h 9min 18s, sys: 19min 53s, tot

In [8]:
%%time

articles = get_article_text('npr')

Found 15108 articles for npr...
Downloading and parsing articles for npr...
Done with 0 articles
Done with 500 articles
Done with 1000 articles
Done with 1500 articles
Done with 2000 articles
Done with 2500 articles
Done with 3000 articles
Done with 3500 articles
Done with 4000 articles
Done with 4500 articles
Done with 5000 articles
Done with 5500 articles
Done with 6000 articles
Done with 6500 articles
Done with 7000 articles
Done with 7500 articles
Done with 8000 articles
Done with 8500 articles
Done with 9000 articles
Done with 9500 articles
Done with 10000 articles
Done with 10500 articles
Done with 11000 articles
Done with 11500 articles
Done with 12000 articles
Done with 12500 articles
Done with 13000 articles
Done with 13500 articles
Done with 14000 articles
Done with 14500 articles
Done with 15000 articles
CPU times: user 59min 22s, sys: 16min 47s, total: 1h 16min 10s
Wall time: 41min


In [9]:
%%time

articles = get_article_text('nyt')

Found 26738 articles for nyt...
Downloading and parsing articles for nyt...


Building prefix dict from /home/distill1994/workspace/top_news/.venv/lib/python3.10/site-packages/jieba/dict.txt ...
Building prefix dict from /home/distill1994/workspace/top_news/.venv/lib/python3.10/site-packages/jieba/dict.txt ...
Building prefix dict from /home/distill1994/workspace/top_news/.venv/lib/python3.10/site-packages/jieba/dict.txt ...
Dumping model to file cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Loading model cost 9.437125444412231 seconds.
Prefix dict has been built succesfully.
Loading model cost 8.908550262451172 seconds.
Prefix dict has been built succesfully.
Loading model cost 7.9084861278533936 seconds.
Prefix dict has been built succesfully.


Done with 0 articles
Done with 500 articles


Building prefix dict from /home/distill1994/workspace/top_news/.venv/lib/python3.10/site-packages/jieba/dict.txt ...
Loading model from cache /tmp/jieba.cache
Loading model cost 18.981618881225586 seconds.
Prefix dict has been built succesfully.


Done with 1000 articles
Done with 1500 articles


Building prefix dict from /home/distill1994/workspace/top_news/.venv/lib/python3.10/site-packages/jieba/dict.txt ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.8977100849151611 seconds.
Prefix dict has been built succesfully.


Done with 2000 articles


Building prefix dict from /home/distill1994/workspace/top_news/.venv/lib/python3.10/site-packages/jieba/dict.txt ...
Loading model from cache /tmp/jieba.cache
Loading model cost 15.222313404083252 seconds.
Prefix dict has been built succesfully.


Done with 2500 articles
Done with 3000 articles
Done with 3500 articles


Building prefix dict from /home/distill1994/workspace/top_news/.venv/lib/python3.10/site-packages/jieba/dict.txt ...
Loading model from cache /tmp/jieba.cache
Loading model cost 18.67027711868286 seconds.
Prefix dict has been built succesfully.


Done with 4000 articles


Building prefix dict from /home/distill1994/workspace/top_news/.venv/lib/python3.10/site-packages/jieba/dict.txt ...
Loading model from cache /tmp/jieba.cache
Loading model cost 4.090705394744873 seconds.
Prefix dict has been built succesfully.


Done with 4500 articles
Done with 5000 articles


Building prefix dict from /home/distill1994/workspace/top_news/.venv/lib/python3.10/site-packages/jieba/dict.txt ...
Loading model from cache /tmp/jieba.cache
Loading model cost 27.434269666671753 seconds.
Prefix dict has been built succesfully.


Done with 5500 articles
Done with 6000 articles
Done with 6500 articles
Done with 7000 articles
Done with 7500 articles
Done with 8000 articles
Done with 8500 articles
Done with 9000 articles
Done with 9500 articles
Done with 10000 articles
Done with 10500 articles


Building prefix dict from /home/distill1994/workspace/top_news/.venv/lib/python3.10/site-packages/jieba/dict.txt ...
Loading model from cache /tmp/jieba.cache
Loading model cost 12.549663305282593 seconds.
Prefix dict has been built succesfully.


Done with 11000 articles
Done with 11500 articles
Done with 12000 articles
Done with 12500 articles
Done with 13000 articles
Done with 13500 articles
Done with 14000 articles
Done with 14500 articles
Done with 15000 articles
Done with 15500 articles
Done with 16000 articles
Done with 16500 articles
Done with 17000 articles
Done with 17500 articles
Done with 18000 articles
Done with 18500 articles
Done with 19000 articles
Done with 19500 articles
Done with 20000 articles
Done with 20500 articles
Done with 21000 articles
Done with 21500 articles
Done with 22000 articles
Done with 22500 articles
Done with 23000 articles
Done with 23500 articles
Done with 24000 articles
Done with 24500 articles
Done with 25000 articles
Done with 25500 articles
Done with 26000 articles
Done with 26500 articles
CPU times: user 1h 53min 50s, sys: 33min 53s, total: 2h 27min 44s
Wall time: 1h 20min 27s


In [10]:
%%time

articles = get_article_text('usat')

Found 40637 articles for usat...
Downloading and parsing articles for usat...
Done with 0 articles
Done with 500 articles
Done with 1000 articles
Done with 1500 articles
Done with 2000 articles
Done with 2500 articles
Done with 3000 articles
Done with 3500 articles
Done with 4000 articles
Done with 4500 articles
Done with 5000 articles
Done with 5500 articles
Done with 6000 articles
Done with 6500 articles
Done with 7000 articles
Done with 7500 articles
Done with 8000 articles
Done with 8500 articles
Done with 9000 articles
Done with 9500 articles
Done with 10000 articles
Done with 10500 articles
Done with 11000 articles
Done with 11500 articles
Done with 12000 articles
Done with 12500 articles
Done with 13000 articles
Done with 13500 articles
Done with 14000 articles
Done with 14500 articles
Done with 15000 articles
Done with 15500 articles
Done with 16000 articles
Done with 16500 articles
Done with 17000 articles
Done with 17500 articles
Done with 18000 articles
Done with 18500 artic

In [11]:
%%time

articles = get_article_text('wapo')

Found 33764 articles for wapo...
Downloading and parsing articles for wapo...
Done with 0 articles
Done with 500 articles
Done with 1000 articles
Done with 1500 articles
Done with 2000 articles
Done with 2500 articles
Done with 3000 articles
Done with 3500 articles
Done with 4000 articles
Done with 4500 articles
Done with 5000 articles
Done with 5500 articles
Done with 6000 articles
Done with 6500 articles
Done with 7000 articles
Done with 7500 articles
Done with 8000 articles
Done with 8500 articles
Done with 9000 articles
Done with 9500 articles
Done with 10000 articles
Done with 10500 articles
Done with 11000 articles
Done with 11500 articles
Done with 12000 articles
Done with 12500 articles
Done with 13000 articles
Done with 13500 articles
Done with 14000 articles
Done with 14500 articles
Done with 15000 articles
Done with 15500 articles
Done with 16000 articles
Done with 16500 articles
Done with 17000 articles
Done with 17500 articles
Done with 18000 articles
Done with 18500 artic

In [12]:
%%time

articles = get_article_text('politico')

Found 2456 articles for politico...
Downloading and parsing articles for politico...
Done with 0 articles
Done with 500 articles
Done with 1000 articles
Done with 1500 articles
Done with 2000 articles
CPU times: user 9min 40s, sys: 4min 1s, total: 13min 42s
Wall time: 7min 13s


In [13]:
%%time

articles = get_article_text('propub')

Found 559 articles for propub...
Downloading and parsing articles for propub...
Done with 0 articles
Done with 500 articles
CPU times: user 2min 27s, sys: 1min 1s, total: 3min 28s
Wall time: 1min 51s


In [14]:
%%time

articles = get_article_text('lat')

Found 19510 articles for lat...
Downloading and parsing articles for lat...
Done with 0 articles
Done with 500 articles
Done with 1000 articles
Done with 1500 articles
Done with 2000 articles
Done with 2500 articles
Done with 3000 articles
Done with 3500 articles
Done with 4000 articles
Done with 4500 articles
Done with 5000 articles
Done with 5500 articles
Done with 6000 articles
Done with 6500 articles
Done with 7000 articles
Done with 7500 articles
Done with 8000 articles
Done with 8500 articles
Done with 9000 articles
Done with 9500 articles
Done with 10000 articles
Done with 10500 articles
Done with 11000 articles
Done with 11500 articles
Done with 12000 articles
Done with 12500 articles
Done with 13000 articles
Done with 13500 articles
Done with 14000 articles
Done with 14500 articles
Done with 15000 articles
Done with 15500 articles
Done with 16000 articles
Done with 16500 articles
Done with 17000 articles
Done with 17500 articles
Done with 18000 articles
Done with 18500 article