In [2]:
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt
from bs4 import BeautifulSoup, element
import time as tm
import os
import requests

In [3]:
# characters deleted from every extracted text field (article, title,
# published date, author): , ^ \ ( ) | " and the newline character.
# The backslash is written as an explicit `\\` because `\(` is not a valid
# escape sequence and triggers a SyntaxWarning on recent Python versions.
chars_to_remove = ",^\\()|\"\n"

In [4]:
# this function downloads archived webpages from https://wayback.archive-it.org
# and saves each snapshot as a local html file
def get_html(collection_id, folder, data):
    """Download archived snapshots of the pages listed in ``data`` into ``folder``.

    For every unique url in ``data['url']`` the archive's listing page
    (``https://wayback.archive-it.org/<collection_id>/*/<url>``) is fetched.
    Every anchor on that page that carries an ``onclick`` attribute is treated
    as a snapshot link whose text is the capture date in ``'%b %d, %Y'`` form.
    A snapshot is downloaded when its date appears anywhere in
    ``data['date']`` (the check is against all dates, not only the dates paired
    with that url) and no file exists yet at the target path.

    Files are written to ``<folder>/<YYYYMMDD>_<url-key>.html`` where the
    url-key is the url reduced to its digits, ASCII letters, ``-`` and ``.``.

    Parameters
    ----------
    collection_id : int or str
        Archive-It collection identifier inserted into the url.
    folder : str
        Local directory for the downloaded html; created (including missing
        parent directories) when it does not exist.
    data : pandas.DataFrame
        Must provide a ``'date'`` column accepted by ``pd.to_datetime`` and a
        ``'url'`` column of strings.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the text of an ``onclick`` anchor is not a ``'%b %d, %Y'`` date.
    KeyError
        If a snapshot anchor that should be downloaded has no ``href``.
    """

    # creates a local directory named 'folder' if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    # the base url for the archive
    ai_url = "https://wayback.archive-it.org"

    # format of the date text shown on each snapshot hyperlink
    date_format = '%b %d, %Y'

    # this gets a unique list of dates and urls so we can avoid duplication
    date_list = pd.to_datetime(data['date'].unique())
    urls = data['url'].unique()

    # this loop iterates through all the urls, searches for the archived webpage
    # then looks for any hyperlinks on the page that are associated with a date in the list
    for url in urls:

        # this is the url of an archived webpage's home
        # (e.g. https://wayback.archive-it.org/12706/*/http://mypaperonline.com)
        home = ai_url + '/' + str(collection_id) + '/*/' + url

        # the url part of the file name is the same for every snapshot of
        # this page, so it is computed once per url
        url_key = ''.join(re.findall(r'(\d+|[a-zA-Z]+|-|\.)', url))

        # this block gets all the date hyperlinks
        page = requests.get(home)
        soup = BeautifulSoup(page.content, "lxml")
        alist = [a for a in soup.find_all("a") if 'onclick' in a.attrs]

        # every link date is parsed exactly once, and before any download for
        # this url starts, so a malformed link fails early
        snapshots = [(dt.strptime(a.text, date_format), a) for a in alist]

        # this loops through all the date hyperlinks and if they're in
        # the date_list it saves them locally
        for snapshot_date, a in snapshots:

            # this creates a local path for the html, it converts URLS+dates
            # into unique simple strings
            path = folder + '/' + snapshot_date.strftime('%Y%m%d') + '_' + url_key + '.html'

            # only fetch snapshots for requested dates that are not on disk yet
            if snapshot_date in date_list and not os.path.isfile(path):
                wp = requests.get('https:' + a.attrs['href'])
                # the context manager closes the file even if the write fails
                with open(path, 'wb') as f:
                    f.write(wp.content)

In [4]:
# loads the sampled pages; get_html reads the 'date' and 'url' columns of this
# frame and the extraction loop below additionally reads 'domain'
chosen = pd.read_csv('final_chosen.csv')

In [5]:
# downloads every snapshot referenced in `chosen` from collection 12706 into
# the local 'chosenSamp' folder and reports the elapsed wall-clock time
start = tm.time()
get_html(12706, 'chosenSamp', chosen)
span = tm.time() - start
print('completed in ', span, ' seconds')

completed in  929.747517824173  seconds


In [6]:
# Extracts the article text, title, publication date and author from every
# locally saved snapshot and stores them in `chosen`.
# `ctr` counts the rows whose 'text' was replaced by the extracted article.
ctr = 0
folder = 'chosenSamp'

# this creates and sets all the new variables to None
chosen['cleaned'] = None
chosen['title'] = None
chosen['published'] = None
chosen['author'] = None

# translation table that deletes every character in chars_to_remove; it is
# built once here instead of once per extracted field
strip_table = str.maketrans('', '', chars_to_remove)

start = tm.time()

for index, row in chosen.iterrows():

    # these arrays are only expected to have len()==1 but may have more
    # if there exist multiple objects; they stay empty for unknown domains
    articles = []
    titles = []
    published = []
    author = []

    # this constructs the path of the html file (same naming scheme that
    # get_html uses when it saves a snapshot)
    path = folder + '/' + ''.join(re.findall(r'\d+', row.date)) + '_' + ''.join(re.findall(r'(\d+|[a-zA-Z]+|-|\.)', row.url)) + '.html'

    # reads the html; the file is closed as soon as it has been parsed,
    # even if parsing raises
    with open(path, 'rb') as f:
        soup = BeautifulSoup(f.read(), "lxml")

    # collects variables from Morristown Life webpages
    if row['domain'] == 'www.mypaperonline.com':
        articles = soup.find_all(class_ = "entry-content")
        titles = soup.find_all(class_ = "entry-title single-post-title")
        published = soup.find_all(attrs = {'itemprop' : 'datePublished'})
        author = soup.find_all(class_ = "author vcard")

    # collects variables from New Brunswick Today webpages
    elif row['domain'] in ['newbrunswicktoday.com', 'www.newbrunswicktoday.com']:
        articles = soup.find_all(attrs = {'property' : 'dc:description'})
        titles = soup.find_all(attrs = {'property' : 'dc:title'})
        published = soup.find_all(attrs = {'property' : 'dc:issued'})
        author = soup.find_all(attrs = {'property' : 'dc:creator'})

    # collects variables from Newark Black
    elif row['domain'] == 'www.newarkblack.com':
        articles = soup.find_all(class_ = 'td-post-content td-pb-padding-side')
        titles = soup.find_all('h1', class_ = 'entry-title')
        published = soup.find_all(attrs = {'itemprop' : 'dateCreated'})
        author = soup.find_all(attrs = {'itemprop' : 'author'})

    # overwrites the text variable with the cleaner extracted version, but
    # only when exactly one article object was found
    if len(articles) == 1:
        ctr = ctr + 1
        text = articles[0].text.translate(strip_table)
        chosen.loc[index, 'text'] = text
        chosen.loc[index, 'cleaned'] = 1
    else:
        chosen.loc[index, 'cleaned'] = 0

    # adds the title variable (first match wins if there are several)
    if len(titles) >= 1:
        chosen.loc[index, 'title'] = titles[0].text.translate(strip_table)

    # adds the published variable (first match wins if there are several)
    if len(published) >= 1:
        chosen.loc[index, 'published'] = published[0].text.translate(strip_table)

    # adds the author variable (first match wins if there are several)
    if len(author) >= 1:
        chosen.loc[index, 'author'] = author[0].text.translate(strip_table)

span = tm.time() - start

print('completed in ', span, ' seconds')


completed in  377.32148933410645  seconds


In [7]:
# number of rows for which exactly one article object was found and the
# 'text' column was overwritten with the extracted version
print(ctr)

1350


In [8]:
# displays the frame including the new cleaned/title/published/author columns
chosen

Unnamed: 0.1,Unnamed: 0,date,domain,url,text,cleaned,title,published,author
0,16,2019-10-08,www.newarkblack.com,http://www.newarkblack.com,NewarkBlack.com - Newark African American News...,0,,September 25 2019,
1,17,2019-10-08,www.newarkblack.com,http://www.newarkblack.com,�,0,,September 25 2019,
2,18,2019-10-08,www.newarkblack.com,http://www.newarkblack.com/?s={search_term_str...,You searched for {search_term_string} - Newark...,0,{search_term_string} - search results,October 7 2019,
3,19,2019-10-08,www.newarkblack.com,http://www.newarkblack.com/advertise,Advertise - NewarkBlack.com CLOSE Home Adverti...,0,,October 7 2019,
4,20,2019-10-08,www.newarkblack.com,http://www.newarkblack.com/category/news,News Archives - NewarkBlack.com CLOSE Home Adv...,0,News,October 3 2019,WA Public Relations Company
...,...,...,...,...,...,...,...,...,...
2408,4553,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/first-library-wi...,By: Michele DiPasquale To some a library may b...,1,First Library Will Help Build Libraries of Babies,October 1 2019,new_view_media
2409,4554,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/hackettstowns-he...,By Dawn M Chiossi Going on since February of ...,1,Hackettstown’s Heath Village Inspires Staff wi...,October 1 2019,new_view_media
2410,4557,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/dont-miss-these-...,BPT – Snacks produce drinks or sweets – if it ...,1,Don’t miss these must-have snacks that define ...,April 3 2016,new_view_media
2411,4565,2019-11-02,www.mypaperonline.com,http://www.mypaperonline.com/10th-anniversary-...,10th Anniversary Golf Classic Scores High For ...,1,10th Anniversary Golf Classic Scores High For ...,August 9 2015,new_view_media


In [9]:
# writes the enriched frame to 'cleaned1.csv' without the 'Unnamed: 0' column
chosen.drop(columns='Unnamed: 0').to_csv('cleaned1.csv')