In [1]:
# for site information
from urllib.parse import urlparse
import urllib.request as url
from bs4 import BeautifulSoup as soup

from foreignpolicy_scraper import ForeignPolicyScraper



In [2]:
# Front-page URL of Foreign Policy; a later cell opens this URL and collects
# the headline links found on the page.
foreign_policy = 'https://foreignpolicy.com/'

In [3]:
import pandas as pd
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

In [4]:
# this accesses the google database I use to get geographic info
SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']

ATLAS_ID = '1RC8N4wosbYuf5ikCRpL9H5aV1uuxJumHCDO95zH8osA'
ATLAS_RANGE = 'Main!A1:P1170'

# Cached OAuth credentials written by an earlier authorization run.
TOKEN_PATH = 'token.pickle'

# Fail early with an actionable message: without this guard a missing token
# file left `creds` unbound and the cell died later with a NameError.
if not os.path.exists(TOKEN_PATH):
    raise FileNotFoundError(
        f"{TOKEN_PATH!r} not found: authorize against the Google Sheets API "
        "first so the cached credentials exist."
    )

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a token file that you created yourself.
with open(TOKEN_PATH, 'rb') as token:
    creds = pickle.load(token)

service = build('sheets', 'v4', credentials=creds)
sheet = service.spreadsheets()

# Fetch the requested range; 'values' is read with a default of [] so a
# response without that key yields an empty list rather than a KeyError.
atlas_result = sheet.values().get(spreadsheetId=ATLAS_ID,
                                range=ATLAS_RANGE).execute()
atlas_values = atlas_result.get('values', [])

def create_df_with_first_row_headers(sheets_values):
    """Build a DataFrame from row-oriented values, using the first row as headers.

    Parameters
    ----------
    sheets_values : list of lists
        Rows of cell values; the first row holds the column names.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with the first row promoted to column labels.
        The row index is not reset, so the first data row keeps index 1.
        An empty input returns an empty DataFrame.
    """
    # With no rows there is no header row to promote; `df.iloc[0]` would
    # raise IndexError, so return an empty frame instead.
    if len(sheets_values) == 0:
        return pd.DataFrame()

    df = pd.DataFrame(sheets_values)

    # replaces first row of dataframe with the headers b/c it's not automatic
    headers = df.iloc[0]
    # Explicit copy so relabelling the columns never touches the parent frame.
    df = df[1:].copy()
    df.columns = headers

    return df

atlas_df = create_df_with_first_row_headers(atlas_values)

def check_for_duplicate(item, col, df):
    """Return True when at least one row of `df` has `item` in column `col`."""
    matches = df[col] == item
    return bool(matches.any())
    
def remove_list_duplicates(_list):
    """Lazily yield each distinct item of `_list` once, in first-seen order.

    Items must be hashable because they are tracked in a set.
    """
    already_yielded = set()
    for item in _list:
        if item in already_yielded:
            continue
        yield item
        already_yielded.add(item)

# The FP main page lists some articles twice: once as a full https link and
#    once as a link that just begins with the date. Keeping only the https
#    form stops those repeats from slipping past the dedup step.
def remove_non_https(_list):
    """Lazily yield the items of `_list` whose URL scheme is exactly 'https'."""
    yield from (link for link in _list if urlparse(link).scheme == 'https')

# also removes invalid articles, like their featured projects
def remove_old_articles(_list, month_as_int, year_as_int=None):
    """Lazily yield the links whose dated path is not older than the cutoff.

    A link is only considered when its URL path starts with a four-digit
    year segment followed by a two-digit month segment (``/YYYY/MM/...``).
    Anything else is dropped, which also removes non-article pages.

    Parameters
    ----------
    _list : iterable of str
        Candidate URLs.
    month_as_int : int
        Earliest month (1-12) to keep.
    year_as_int : int, optional
        Earliest year to keep. When given, the cutoff is the (year, month)
        pair, so filtering stays correct across a year boundary. When
        omitted, only the month is compared, as before.

    Yields
    ------
    str
        Links dated on or after the cutoff, in input order.
    """
    for link in _list:
        try:
            segments = urlparse(link).path.split('/')
        except ValueError:
            # urlparse rejects some malformed URLs; treat them as invalid.
            continue

        # A dated path splits into ['', 'YYYY', 'MM', ...].
        if len(segments) < 3:
            continue
        year_text, month_text = segments[1], segments[2]

        # Validate the segments rather than slicing fixed offsets, so a path
        # like '/video12/...' is not mistaken for a month-12 article.
        if len(year_text) != 4 or not year_text.isdecimal():
            continue
        if len(month_text) != 2 or not month_text.isdecimal():
            continue

        year, month = int(year_text), int(month_text)
        if not 1 <= month <= 12:
            continue

        if year_as_int is None:
            is_recent = month >= month_as_int
        else:
            is_recent = (year, month) >= (year_as_int, month_as_int)

        if is_recent:
            yield link

def remove_atlas_duplicates(_list):
    """Lazily yield the links that do not already appear in atlas_df's 'Link' column."""
    for link in _list:
        if not check_for_duplicate(link, 'Link', atlas_df):
            yield link

# _list is a list of urls from the Foreign Policy page.
def remove_non_applicables(_list):
    """Lazily yield ``(url, label)`` for every url whose scraper label is not 'none'.

    Parameters
    ----------
    _list : iterable of str
        URLs to classify with ForeignPolicyScraper.

    Yields
    ------
    tuple
        ``(url, label)`` pairs, in input order, for the urls that are kept.
    """
    for link in _list:
        # Build the scraper once per link and reuse its label for both the
        # test and the yielded tuple, rather than constructing it twice.
        # NOTE(review): construction presumably fetches the page - confirm.
        label = ForeignPolicyScraper(link).label
        if label != 'none':
            yield (link, label)

In [5]:
# Parse the front page inside a `with` block so the HTTP response is closed
# as soon as the document has been read.
with url.urlopen(foreign_policy) as response:
    front_page = soup(response, 'html.parser')

# Headline anchors, looked up once. href=True skips anchors that carry no
# link target, which would otherwise raise KeyError on anchor['href'].
headline_anchors = front_page.find_all('a', class_='hed-heading', href=True)
raw_links = [anchor['href'] for anchor in headline_anchors]

# Articles dated in an earlier month than this are dropped.
OLDEST_MONTH = 9

# remember, these are all FP specific, they don't work for other sites
# Each stage is a lazy generator; nothing is scraped until the final list().
unique_links = remove_list_duplicates(raw_links)
https_links = remove_non_https(unique_links)
recent_links = remove_old_articles(https_links, OLDEST_MONTH)
new_links = remove_atlas_duplicates(recent_links)
links = list(remove_non_applicables(new_links))

In [7]:
# Spot check: `links` holds (url, label) tuples, so links[0][0] is the URL of
# the first kept article. A fresh scraper is built for it and its `country`
# attribute is displayed. Raises IndexError if no link survived the filters.
ForeignPolicyScraper(links[0][0]).country

'Russia'