In [1]:
from bs4 import BeautifulSoup as soup
# needs a separate library in order to access the site b/c BS4 
#    isn't an HTTP client
import urllib.request as url
import re
from datetime import datetime

import pandas as pd

# stuff to access the Google API client
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request



In [160]:
# Loads the geographic reference table from a Google Sheet (read-only access
# through the Sheets v4 API) into the module-level DataFrame `df`.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']
SAMPLE_SPREADSHEET_ID = '19X7PwbLrKLGJySaFSHr5Pze-VRM-6l_9EOqXDu43T3k'
SAMPLE_RANGE_NAME = 'Sheet1!A1:E171'

creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
# NOTE(review): pickle.load runs arbitrary code embedded in the file, so
# token.pickle must only ever come from this machine's own auth flow.
if os.path.exists('token.pickle'):
    with open('token.pickle', 'rb') as token:
        creds = pickle.load(token)
# If there are no (valid) credentials available, refresh them when possible,
# otherwise let the user log in through the local-server OAuth flow.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES)
        creds = flow.run_local_server(port=0)
    # Save the credentials for the next run
    with open('token.pickle', 'wb') as token:
        pickle.dump(creds, token)

service = build('sheets', 'v4', credentials=creds)

sheet = service.spreadsheets()

result = sheet.values().get(spreadsheetId=SAMPLE_SPREADSHEET_ID,
                            range=SAMPLE_RANGE_NAME).execute()
values = result.get('values', [])

# Without this guard an empty response only surfaces two lines further down
# as an opaque IndexError from `df.iloc[0]`.
if not values:
    raise ValueError(
        f"No data returned for range {SAMPLE_RANGE_NAME!r} of spreadsheet "
        f"{SAMPLE_SPREADSHEET_ID!r}; cannot build the geographic table."
    )

df = pd.DataFrame(values)

# replaces first row of dataframe with the headers b/c it's not automatic
# (the remaining rows keep their original index labels, starting at 1)
headers = df.iloc[0]
df = df[1:]
df.columns = headers

In [161]:
def get_foreign_policy():
    """Fetch Foreign Policy's "latest" page and return its post entries.

    Downloads https://foreignpolicy.com/category/latest/, parses it with
    Beautiful Soup and collects the direct children of the page's
    ``div.blog-list`` element. Prints how many entries were found.

    Returns:
        list: the children of ``div.blog-list`` in page order, without the
        text nodes that consist of exactly one newline.

    Raises:
        RuntimeError: if the page contains no ``div`` with class
            ``blog-list`` (for example after a layout change).
    """
    # The parser reads the whole response while constructing the soup, so the
    # connection can be closed as soon as the `with` block ends.
    with url.urlopen('https://foreignpolicy.com/category/latest/') as html:
        data = soup(html, 'html.parser')

    blog_list = data.find("div", class_="blog-list")
    if blog_list is None:
        raise RuntimeError(
            "Could not find the 'blog-list' container on Foreign Policy's "
            "latest page; the site layout may have changed."
        )

    # foreign policy does this weird thing where they have one of the
    #    class's children as '\n' instead of just padding each entry.
    entries = [child for child in blog_list.children if child != '\n']

    # TODO: verification function for the scraped list (the original note
    #    was left unfinished)

    print(f"There are {len(entries)} posts on Foreign Policy's latest page.")
    return entries

In [162]:
# Fetch the raw entries in their own cell: 'get_foreign_policy' and
#    'extract_fp_data' are kept separate for now because it's necessary to
#    experiment on individual entries at the moment.
# NOTE(review): 'extract_fp_data' is not defined in the visible part of this
#    notebook; per-entry extraction appears to be done by ForeignPolicyData
#    -- confirm and update the name.
new_posts = get_foreign_policy()

There are 19 posts on Foreign Policy's latest page.


In [163]:
# the scrape commands and helper scripts are specific to a website's layout
class ForeignPolicyData(object):
    """Fields scraped from a single entry of Foreign Policy's post list.

    Attributes:
        link: ``href`` of the entry's second ``<a>`` element.
        title: text of the entry's ``<h3>``.
        subtitle: text of the entry's first ``<p>``.
        date: publication date as ``MM/DD/YYYY``, read from the linked
            article page (see ``get_date``).
        author: text of the first ``<a class="author">`` with runs of two or
            more spaces/newlines removed.
    """

    def __init__(self, post):
        """Extract all fields from ``post``.

        Args:
            post: the Beautiful Soup node of one list entry.

        Raises:
            AttributeError, IndexError: none of the lookups are guarded, so
                an entry lacking one of the expected elements fails here.
                Callers use this to skip entries that are not real posts.
        """
        self.link = post.find_all("a")[1].get('href')
        self.title = post.h3.text
        self.subtitle = post.p.text
        self.date = self.get_date(self.link)
        self.author = re.sub(r'[\ \n]{2,}',
                             '',
                             post.find_all("a", class_='author')[0].text)

    # the date's a bit complicated because it's not on the first page
    #    and contains significant formatting that interferes with the
    #    scrape.
    def get_date(self, url_):
        """Download the article at ``url_`` and return its date.

        Args:
            url_: address of the article page.

        Returns:
            str: the text of the page's first ``<time>`` element, normalized
            to ``MM/DD/YYYY``.

        Raises:
            IndexError: if the page has no ``<time>`` element.
            ValueError: if the cleaned text is not of the form
                ``<month name> <day> <year>``.
        """
        # The parser consumes the response while building the soup, so the
        # connection is released as soon as the `with` block ends.
        with url.urlopen(url_) as response:
            page = soup(response, 'html.parser')
        raw_date = page.find_all("time")[0].text
        return self._normalize_date(raw_date)

    @staticmethod
    def _normalize_date(raw_date):
        """Turn ``<time>`` text such as 'June 5, 2020, 10:30 AM' into '06/05/2020'."""
        cleaned = re.sub(r'\n', '', raw_date)
        cleaned = re.sub(r',', '', cleaned)
        # drops a clock time of the form 'hh:mm AM' (plus preceding whitespace)
        cleaned = re.sub(r'\s*..:..\s..', '', cleaned)
        # strptime rejects leading/trailing whitespace left over from the
        # surrounding markup, so trim it before parsing
        cleaned = cleaned.strip()
        return datetime.strptime(cleaned, '%B %d %Y').strftime('%m/%d/%Y')

In [164]:
class AtlasData(object):
    """A single article record, filled in over several passes.

    The scraped fields (link, title, subtitle, date, author) are set at
    construction time; the geographic and thematic fields start out as empty
    strings and are meant to be assigned in later rounds.
    """

    # assumes that a post is a Beautiful Soup object
    def __init__(self, article, site=None):
        """Copy the scraped fields from ``article`` and blank out the rest.

        Args:
            article: for ``site='Foreign Policy'`` the raw post, which is
                first wrapped in ``ForeignPolicyData``; for any other site an
                object that already exposes ``link``, ``title``,
                ``subtitle``, ``date`` and ``author``.
            site: name of the publication the article came from.
        """
        self.site = site

        # only Foreign Policy has a site-specific scraper so far
        source = ForeignPolicyData(article) if site == 'Foreign Policy' else article

        # information for the database
        for field in ('link', 'title', 'subtitle', 'date', 'author'):
            setattr(self, field, getattr(source, field))

        # assigned separate from instantiation in second round
        for field in ('geo_class', 'region', 'bilateral', 'country'):
            setattr(self, field, "")

        # assigned separate from instantiation in third round
        for field in ('topic', 'subsection', 'tone', 'theme', 'subtheme'):
            setattr(self, field, "")

    def __repr__(self):
        """Return site, title, subtitle, date and country on separate lines."""
        return f"{self.site}\n\n{self.title}\n{self.subtitle}\n{self.date}\n{self.country}"

In [165]:
# builds a list of AtlasData objects, one per post that could be scraped
database_entries = list()
# (post, error) pairs for the entries that could not be scraped, kept so they
#   can be inspected afterwards instead of disappearing silently
skipped_posts = list()
for post in new_posts:
    try:
        database_entries.append(AtlasData(post, site='Foreign Policy'))
    # haven't figured out a way to get it to acknowledge if an
    #   attribute isn't present or it hits the end of the posts, so such
    #   entries are skipped -- but recorded
    except (AttributeError, IndexError) as error:
        skipped_posts.append((post, error))

if skipped_posts:
    print(f"Skipped {len(skipped_posts)} of {len(new_posts)} posts that could not be scraped.")

In [166]:
# number of posts that were successfully turned into AtlasData objects
len(database_entries)

18

In [167]:
def sort_entries(database_entries):
    """Split entries into four lists according to their geographic class.

    For every entry, ``get_gpes(entry.title)`` is passed together with the
    module-level ``df`` table to ``assign_geo_class``; the class label that
    comes back decides which list the entry is appended to. The number of
    entries and a per-list summary are printed.

    NOTE(review): ``get_gpes`` and ``assign_geo_class`` are not defined in the
    visible part of this notebook; they (and ``df``) must exist in the kernel
    before this function is called.

    Args:
        database_entries: sized iterable of objects with a ``title``
            attribute.

    Returns:
        tuple: ``(not_fp, invalid, single, multiple)``. Entries whose class is
        none of ``'invalid'``, ``'single'`` or ``'multiple'`` end up in
        ``not_fp``.
    """
    not_fp = list()
    invalid = list()
    single = list()
    multiple = list()

    print(len(database_entries))
    for entry in database_entries:
        # only the class label is needed here; the second value is ignored
        _class, _country = assign_geo_class(get_gpes(entry.title), df)
        if _class == 'invalid':
            invalid.append(entry)
        elif _class == 'single':
            single.append(entry)
        elif _class == 'multiple':
            multiple.append(entry)
        else:
            not_fp.append(entry)

    # adjacent f-strings instead of backslash continuations, which leaked the
    # source indentation into the printed summary
    print(
        f"There are {len(not_fp)} articles that were not Foreign Policy-related,\n"
        f"There are {len(invalid)} articles that were classified as invalid,\n"
        f"There are {len(single)} articles that were classified as single,\n"
        f"There are {len(multiple)} articles that were classified as multiple."
    )

    return not_fp, invalid, single, multiple

In [168]:
# bucket the scraped entries by the geographic class of their title
not_fp, invalid, single, multiple = sort_entries(database_entries)

18
There are 9 articles that were not Foreign Policy-related, 
             There are 3 articles that were classified as invalid, 
             There are 4 articles that were classified as single, 
             There are 2 articles that were classified as multiple.
