In [12]:
# Imports
import requests
import pandas as pd
import os
import tempfile
import glob

import newspaper
from newspaper import fulltext, Article

from sqlalchemy.engine.url import URL
from sqlalchemy import create_engine

from bs4 import BeautifulSoup
from dotenv import load_dotenv

### Connect to PostgreSQL

In [2]:
# Load .env file
load_dotenv('.env')

# Get credentials from .env.
# NOTE(review): load_dotenv() does not override variables that already exist
# in the process environment by default, and USERNAME is commonly pre-set by
# the OS/shell -- confirm the values picked up here really come from .env.
config = dict(
    # 'postgresql' is the canonical dialect name; the legacy 'postgres' alias
    # is rejected by SQLAlchemy 1.4 and later.
    drivername='postgresql',
    username=os.getenv("USERNAME"),
    password=os.getenv("PASSWORD"),
    host=os.getenv("HOST"),
    database=os.getenv("DATABASE")
)

# Create the connection URL. URL.create() is the public constructor from
# SQLAlchemy 1.4 on; older releases only support calling URL(...) directly.
url = URL.create(**config) if hasattr(URL, "create") else URL(**config)

# repr() renders the URL with the password masked, so the secret never ends
# up in the saved notebook output.
print(repr(url))

# Create the engine
engine = create_engine(url)

postgres://postgres:***@localhost/news


In [3]:
def fetch_cnn_articles(source_url, category_url):
    """Build the news source and return the articles of one category.

    Prints the size and category URLs of the paper built from ``source_url``,
    then builds a second paper from ``category_url`` and returns its articles.

    Args:
        source_url: Root URL of the news site (e.g. 'http://cnn.com').
        category_url: URL of the category to collect articles from.

    Returns:
        list of the article objects found for ``category_url``; they are
        returned as built, without being downloaded or parsed here.
    """
    # Build the paper from the caller-supplied site instead of a hard-coded URL.
    # NOTE(review): newspaper's config option is `memoize_articles`; `memoize`
    # looks like it is silently ignored, so caching presumably stays enabled
    # for this build. Left unchanged because the cached-articles section later
    # in the notebook reads that cache -- confirm intent before switching to
    # memoize_articles=False.
    paper = newspaper.build(source_url, memoize=False)
    # Print paper size
    print('CNN paper size: {}'.format(paper.size()))
    # Get paper news categories
    categories = list(paper.category_urls())
    print('CNN categories: {}'.format(categories))
    # Build for the specific category, without article memoization
    category_paper = newspaper.build(category_url, language='en', memoize_articles=False)
    # Put all articles from the target category in a list
    articles = list(category_paper.articles)
    print('{} articles in {}'.format(len(articles), category_url))
    return articles

def process_cnn_articles(articles):
    """Download, parse and run NLP on articles, collecting their metadata.

    Every article is downloaded and parsed; articles without any author are
    then skipped, so NLP only runs on the ones that are kept.

    Args:
        articles: Iterable of article objects exposing ``download()``,
            ``parse()``, ``nlp()`` and the attributes ``title``, ``authors``,
            ``publish_date``, ``text``, ``summary`` and ``keywords``.

    Returns:
        list of dicts, one per kept article, with the keys ``title``,
        ``authors``, ``date``, ``full_text``, ``summary`` and ``keywords``
        (in that order).
    """
    metadata = []
    for article in articles:
        # Download and parse articles
        article.download()
        article.parse()
        # Only keep articles with authors
        if not article.authors:
            continue
        # Metadata available straight after parsing
        parsed_dict = {
            "title": article.title,
            "authors": article.authors,
            "date": article.publish_date,
            "full_text": article.text,
        }
        # summary and keywords are read only after nlp() has run
        article.nlp()
        parsed_dict.update(
            summary=article.summary,
            keywords=article.keywords,
        )
        metadata.append(parsed_dict)
    print('Fetched metadata for {} CNN articles'.format(len(metadata)))
    return metadata

### Get new articles from CNN

In [4]:
# Site root and the category page to scrape
source_url, category_url = 'http://cnn.com', 'http://cnn.com/politics'

# Collect the category's articles, then extract metadata from each of them
cnn_articles = fetch_cnn_articles(source_url=source_url, category_url=category_url)
cnn_metadata = process_cnn_articles(articles=cnn_articles)

# One row per article dictionary
cnn_df = pd.DataFrame(data=cnn_metadata)

CNN paper size: 1034
CNN categories: ['http://cnn.it', 'http://cnn.com', 'http://cnn.com/weather', 'http://arabic.cnn.com', 'http://cnn.com/middle-east', 'http://cnn.com/asia', 'http://cnn.com/transcripts', 'http://cnn.com/opinions', 'http://cnn.com/india', 'https://www.cnn.com', 'http://cnn.com/uk', 'http://cnn.com/accessibility', 'http://cnn.com/videos', 'http://cnn.com/style', 'http://cnn.com/africa', 'http://cnn.com/europe', 'https://money.cnn.com', 'http://cnn.com/us', 'http://edition.cnn.com', 'http://cnnespanol.cnn.com', 'http://us.cnn.com', 'http://cnn.com/australia', 'http://cnn.com/vr', 'http://cnn.com/health', 'http://cnn.com/world', 'http://cnn.com/more', 'http://cnn.com/china', 'http://cnn.com/travel', 'http://cnn.com/tour', 'http://cnn.com/business', 'http://cnn.com/collection', 'http://cnn.com/politics', 'http://cnn.com/americas', 'http://cnn.com/entertainment']
1034 articles in http://cnn.com/politics
Fetched metadata for 11 CNN articles


In [6]:
# Append the article rows to the 'news_politics' table in PostgreSQL
cnn_df.to_sql(name='news_politics', con=engine, if_exists='append')

### Add cached articles to PostgreSQL

In [17]:
# Folder under the system temp directory that holds the 'memoized' (cached) articles
cached_article_path = os.path.join(
    tempfile.gettempdir(),
    '.newspaper_scraper',
    'memoized',
)

# Show the files in that folder
os.listdir(path=cached_article_path)

['cnn.com.txt']

In [24]:
# Load cached articles by file name; the context manager closes the file
# even if reading raises
with open(os.path.join(cached_article_path, 'cnn.com.txt'), "r") as my_file:
    # One list entry per line of the cache file
    content = my_file.read().split("\n")
len(content)

In [None]:
# Preview the first entries only; displaying the whole list would flood the
# notebook output
content[:10]

In [None]:
# # Put all keywords in a list
# keywords = [item for sublist in df.keywords.tolist() for item in sublist]
# len(keywords)

# # Put keyword counts into a data frame
# keywords_counts = pd.DataFrame(pd.DataFrame(keywords, columns=['keyword'])['keyword'].value_counts()).sort_values('keyword', ascending=False)

# # Keyword counts are not robust

In [None]:
# # Extract paper name (source)
# source = string.split('//')[1].split('.com')[0]