In [2]:
# for site information
import urllib.request as url
from bs4 import BeautifulSoup as soup
from datetime import datetime
import re

# for geographic information
import spacy
nlp = spacy.load('en_core_web_sm')
import pandas as pd

# for taxonomy information

# stuff to access the Google API client
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request



In [3]:
# Load the cached OAuth credentials saved by a previous authorization run.
# `creds` is bound up front so the name always exists: previously a missing
# token.pickle left it undefined and the later build(...) cell failed with a
# NameError that gave no hint about the real cause.
creds = None
if os.path.exists('token.pickle'):
    with open('token.pickle', 'rb') as token:
        # NOTE(review): pickle.load can execute arbitrary code -- only load a
        # token file that this project itself wrote.
        creds = pickle.load(token)

In [4]:
# Configuration for the Google Sheets that supply the geographic lookup data
# (the "google database" used to get geographic info).

# OAuth scope string for read-only Sheets access. Not referenced anywhere in
# this cell -- presumably it was used by the authorization flow that produced
# token.pickle; verify before changing it.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']

# Spreadsheet ID and A1-notation range requested for the atlas sheet.
# The range is fixed, so rows past 1168 / columns past P are not fetched.
ATLAS_ID = '1RC8N4wosbYuf5ikCRpL9H5aV1uuxJumHCDO95zH8osA'
ATLAS_RANGE = 'Main!A1:P1168'

# Spreadsheet ID and A1-notation range requested for the geotag sheet.
GEOTAG_ID = '19X7PwbLrKLGJySaFSHr5Pze-VRM-6l_9EOqXDu43T3k'
GEOTAG_RANGE = 'Sheet1!A1:E171'

# Build the Sheets v4 client from the `creds` object loaded in an earlier cell
# and keep a handle on its spreadsheets() resource for the requests below.
service = build('sheets', 'v4', credentials=creds)
sheet = service.spreadsheets()

In [5]:
# Request the configured cell range from each spreadsheet. Each .execute()
# performs the HTTP call and returns the response as a dict.
atlas_result = (
    sheet.values()
    .get(spreadsheetId=ATLAS_ID, range=ATLAS_RANGE)
    .execute()
)
geotag_result = (
    sheet.values()
    .get(spreadsheetId=GEOTAG_ID, range=GEOTAG_RANGE)
    .execute()
)

In [6]:
# Pull the row lists out of each response, falling back to an empty list
# when a response carries no 'values' key.
atlas_values, geotag_values = (
    response.get('values', [])
    for response in (atlas_result, geotag_result)
)

In [7]:
def create_df_with_first_row_headers(sheets_values):
    """Build a DataFrame from a list of rows, using the first row as headers.

    Parameters
    ----------
    sheets_values : list of list
        Rows of cell values; the first row holds the column names.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with columns labelled by the first row. The row
        index is left exactly as slicing produces it (it starts at 1, not 0).
        An empty DataFrame is returned when there are no rows at all.
    """
    # No rows means no header row either; return an empty frame instead of
    # letting .iloc[0] raise IndexError.
    if sheets_values is None or len(sheets_values) == 0:
        return pd.DataFrame()

    df = pd.DataFrame(sheets_values)

    # pandas does not promote the first row automatically: take it as the
    # header, then drop it from the data.
    headers = df.iloc[0]
    df = df[1:]
    df.columns = headers

    return df

In [8]:
# Turn the raw row lists into DataFrames whose columns are named by each
# sheet's first row.
atlas_df = create_df_with_first_row_headers(atlas_values)
geotag_df = create_df_with_first_row_headers(geotag_values)

In [19]:
def check_for_duplicate(item, col, df):
    """Report whether `item` already occurs in column `col` of `df`.

    Parameters
    ----------
    item : object
        Value to look for (compared with ``==``).
    col : str
        Label of the column to search.
    df : pandas.DataFrame
        Frame to search in.

    Returns
    -------
    bool
        True if at least one row of ``df[col]`` equals `item`, else False.
    """
    matches = df[col] == item
    return bool(matches.any())

In [23]:
# URL of the article that the following cells check and scrape.
link = 'https://foreignpolicy.com/2020/09/22/taliban-afghanistan-mining-peace-talks/'

In [26]:
# True if this URL already appears in the 'Link' column of the atlas sheet.
check_for_duplicate(link, 'Link', atlas_df)

False

In [27]:
# Download the page and parse it. The context manager closes the HTTP
# response as soon as BeautifulSoup has read it, instead of leaving the
# connection open until it is garbage-collected.
with url.urlopen(link) as response:
    article = soup(response, 'html.parser')

In [32]:
# Headline: text of the first <h1> element with class 'hed'.
# Raises AttributeError if the page has no such element (find returns None).
article.find('h1', class_='hed').text

'The Taliban, at Least, Are Striking Gold in Afghanistan'

In [33]:
# Subheading: text of the first <h2> element with class 'dek-heading'.
# Raises AttributeError if the page has no such element (find returns None).
article.find('h2', class_='dek-heading').text

'The militant group mines almost half a billion dollars a year from Afghan soil—and wants more.'

In [43]:
# The date is a bit complicated because it's not on the first page
# and contains significant formatting that interferes with the
# scrape.
def get_date(date):
    """Convert a scraped date string to ``MM/DD/YYYY``.

    Parameters
    ----------
    date : str
        Raw text of the form ``'<Month name> <day>, <year>'``, optionally
        followed by a clock time (e.g. ``', 4:26 PM'``) and padded with
        newlines or spaces.

    Returns
    -------
    str
        The date formatted with ``'%m/%d/%Y'``, e.g. ``'09/22/2020'``.

    Raises
    ------
    ValueError
        If the cleaned text does not match ``'%B %d %Y'``. Note that ``%B``
        matches month names in the current locale.
    """

    def clean_date(raw):
        # Remove newlines and commas outright.
        cleaned = re.sub(r'\n', '', raw)
        cleaned = re.sub(r',', '', cleaned)
        # Remove anything shaped like "hh:mm xx" plus the whitespace before
        # it. The dots match any character, so this is a loose match.
        cleaned = re.sub(r'\s*..:..\s..', '', cleaned)
        # strptime rejects leading and trailing whitespace, so padding left
        # over from the page markup must be stripped before parsing.
        return cleaned.strip()

    parsed = datetime.strptime(clean_date(date), '%B %d %Y')
    return parsed.strftime('%m/%d/%Y')

In [44]:
# Publication date: parse the text of the first <time> element on the page.
get_date(article.find_all('time')[0].text)

'09/22/2020'

In [57]:
# Author name: text of the next <a> in the document after the parent of the
# <span class="pre"> element, with every run of two or more spaces/newlines
# removed. The pattern is a raw string so the regex engine handles '\n';
# the previous '\ ' was an invalid string escape that warns on newer Pythons.
# find_next is the current name for the deprecated findNext alias.
re.sub(r'[ \n]{2,}', '', article.find('span', class_='pre').parent.find_next('a').text)

'Lynne O’Donnell'