In [34]:
# for site information
from urllib.parse import urlparse
import urllib.request as url
from bs4 import BeautifulSoup as soup

In [7]:
# URL of the Foreign Policy front page; opened with urlopen and parsed in a later cell.
foreign_policy = 'https://foreignpolicy.com/'

In [3]:
import pandas as pd
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request



In [74]:
# Load the pickled Google API credentials from disk.
# Fail early with a clear message: without this guard `creds` would be
# undefined and the `build(...)` call below would die with a NameError.
if not os.path.exists('token.pickle'):
    raise FileNotFoundError(
        "token.pickle not found; cannot authenticate with the Google Sheets API"
    )

# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load a token.pickle that was created locally by this project.
with open('token.pickle', 'rb') as token:
    creds = pickle.load(token)

# this accesses the google database I use to get geographic info
# (read-only spreadsheet scope; SCOPES is not referenced again in this cell)
SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']

# Spreadsheet id and the A1-notation range that is read from it.
ATLAS_ID = '1RC8N4wosbYuf5ikCRpL9H5aV1uuxJumHCDO95zH8osA'
ATLAS_RANGE = 'Main!A1:P1170'

service = build('sheets', 'v4', credentials=creds)
sheet = service.spreadsheets()

atlas_result = sheet.values().get(spreadsheetId=ATLAS_ID,
                                range=ATLAS_RANGE).execute()
# Fall back to an empty list when the response carries no 'values' key.
atlas_values = atlas_result.get('values', [])

def create_df_with_first_row_headers(sheets_values):
    """Build a DataFrame from row data, using the first row as column headers.

    Parameters
    ----------
    sheets_values : sequence of rows
        Row-major values, e.g. the ``'values'`` list of a Sheets API response.
        The first row supplies the column labels.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, labelled with the first row's values. The row
        index is not reset, so the first data row keeps index label 1.
        An empty input yields an empty DataFrame.
    """
    # An empty sheet has no header row; `df.iloc[0]` would raise IndexError.
    if len(sheets_values) == 0:
        return pd.DataFrame()

    df = pd.DataFrame(sheets_values)

    # replaces first row of dataframe with the headers b/c it's not automatic
    headers = df.iloc[0]
    df = df[1:]
    df.columns = headers

    return df

# Atlas sheet values as a DataFrame, with the first row used as column headers.
atlas_df = create_df_with_first_row_headers(atlas_values)

def check_for_duplicate(item, col, df):
    """Return True when at least one row of ``df`` has ``item`` in column ``col``."""
    matching_rows = df.loc[df[col] == item]
    return len(matching_rows) > 0
    
def remove_list_duplicates(_list):
    """Return a new list with repeated items dropped, keeping first-seen order."""
    unique_items = []
    for candidate in _list:
        # Membership test on a list, so items do not need to be hashable.
        if candidate not in unique_items:
            unique_items.append(candidate)
    return unique_items

# The FP front page repeats some links in a form that starts with a date
#    rather than a scheme; those copies get in the way of deduplication,
#    so only links whose scheme is https are kept.
def remove_non_https(_list):
    """Return the items of ``_list`` whose parsed URL scheme is 'https'."""
    return [link for link in _list if urlparse(link).scheme == 'https']

# also removes invalid articles, like their featured projects
def remove_old_articles(_list, month_as_int, year_as_int=None):
    """Keep only links whose dated URL path is at or after a cutoff.

    The month is read from characters 6-7 of the URL path (and the year from
    characters 1-4), which matches paths shaped like ``/YYYY/MM/DD/slug/``.

    Parameters
    ----------
    _list : iterable of str
        Candidate article URLs.
    month_as_int : int
        Cutoff month; links with an earlier month are dropped.
    year_as_int : int, optional
        Cutoff year. When omitted, only the month is compared, so a link from
        a later month of an earlier year still passes. When given, the
        (year, month) pair is compared against the cutoff pair instead.

    Returns
    -------
    list
        Links at or after the cutoff, in input order. Links whose path has no
        parsable number in the expected position are dropped.
    """
    rmv_old = []
    for item in _list:
        try:
            path = urlparse(item).path
            month = int(path[6:8])
            # Only parse the year when a year cutoff was requested, so the
            # month-only behaviour stays exactly as before.
            year = int(path[1:5]) if year_as_int is not None else None
        except ValueError:
            # Not a dated article path (or an unparsable URL): skip it.
            continue

        if year_as_int is None:
            too_old = month < month_as_int
        else:
            too_old = (year, month) < (year_as_int, month_as_int)

        if not too_old:
            rmv_old.append(item)

    return rmv_old

def remove_atlas_duplicates(_list):
    """Return the items of ``_list`` not already present in ``atlas_df['Link']``."""
    return [
        link
        for link in _list
        if not check_for_duplicate(link, 'Link', atlas_df)
    ]

In [75]:
# Download and parse the front page. BeautifulSoup reads the response while
# it is constructed, so the `with` block can close the connection right after.
with url.urlopen(foreign_policy) as response:
    front_page = soup(response, 'html.parser')

In [76]:
# Number of <a> tags with class 'hed-heading' on the parsed front page.
len(front_page.find_all('a', class_='hed-heading'))

59

In [77]:
# Pull the href out of every <a class="hed-heading"> tag on the front page.
links = [anchor['href'] for anchor in front_page.find_all('a', class_='hed-heading')]

# Filter in the same order as before: exact duplicates, non-https links,
# links older than month 9, then links already present in the atlas sheet.
links = remove_atlas_duplicates(
    remove_old_articles(
        remove_non_https(remove_list_duplicates(links)),
        9,
    )
)
len(links)

22

In [78]:
# Display the links that survived all of the filters above.
links

['https://foreignpolicy.com/2020/09/23/germanys-lateral-thinkers-unite/',
 'https://foreignpolicy.com/2020/09/23/italy-mismanagment-covid-19-health-pandemic/',
 'https://foreignpolicy.com/2020/09/23/zambia-is-on-the-verge-of-a-pandemic-related-debt-default/',
 'https://foreignpolicy.com/2020/09/22/pompeo-trump-2020-elections-state-department-trips-used-for-political-purposes-democrats-accuse-diplomacy-wisconsin-texas-speech/',
 'https://foreignpolicy.com/2020/09/22/united-states-need-new-strategic-mindset-infinite-perspective/',
 'https://foreignpolicy.com/2020/09/22/taliban-afghanistan-mining-peace-talks/',
 'https://foreignpolicy.com/2020/09/22/the-ethiopian-egyptian-water-war-has-begun/',
 'https://foreignpolicy.com/2020/09/21/what-the-u-n-is-good-for-or-could-be/',
 'https://foreignpolicy.com/2020/09/22/critics-misrepresent-commission-unalienable-rights/',
 'https://foreignpolicy.com/2020/09/21/much-maligned-but-still-necessary-the-u-n-at-75/',
 'https://foreignpolicy.com/2020/09/2