In [78]:
import pandas as pd
import requests
import numpy as np
import re
from datetime import datetime as dt
from bs4 import BeautifulSoup, element
import time as tm
import os

In [79]:
# list of dates for extraction, formatted as dates are in the dataset (YYYYMMDD)
# Earlier candidate list, kept for reference.
# NOTE(review): the yes/none annotations presumably record whether a date
# produced any matches - confirm before relying on them.
# dates = ['', # none
#          '20191104', # none
#          '20191001', # yes
#          '20191030', # yes
#          '20191024', # yes
#          '20190927', # none
#          '20191012', # yes
#          '20190930', # yes
#          '20191002', # none
#          '20191003', # none
#          '20191004'] # none

dates = ['20191008',
         '20191012',
         '20191020',
         '20191024',
         '20191030',
         '20191101',
         '20191111',
         '20190930',
         '20191001',
         '20191002',
         '20191003',
         '20191004']

# domain names for extraction, will filter www.mypaperonline.com by
# "/category/the-morristown-news" after initial extraction
_HOSTS = ['mypaperonline.com',
          'newbrunswicktoday.com',
          'newarkblack.com']

# every scheme / "www." spelling under which a host may appear in the dataset
_PREFIXES = ['https://www.', 'http://www.', 'https://', 'http://', '', 'www.']

# prefix-major order: all hosts for the first prefix, then the next prefix, ...
doms = [prefix + host for prefix in _PREFIXES for host in _HOSTS]

# characters stripped from article text: , ^ \ ( ) | "
# (the backslash is escaped explicitly; "\(" is an invalid escape sequence)
chars_to_remove = ",^\\()|\""

In [80]:
# save start time for timing extraction
start = tm.time()

# Rows are collected in plain lists and turned into DataFrames once at the end.
# Appending to a DataFrame row by row copies the whole frame each time, and
# DataFrame.append was removed in pandas 2.0.
record_columns = ['date', 'domain', 'url', 'text']
sample_records = []      # domain in doms AND date in dates
all_dates_records = []   # domain in doms, any date
malformed_lines = 0      # lines that do not have the four expected fields

# open 12706-fulltext.txt file for reading
with open('12706-fulltext.txt', 'rb') as raw:

    # check if each line matches a date and a domain
    for line in raw:
        # split line by first 3 commas -> date, domain, url, text
        key = line.decode('UTF-8').split(',', 3)

        # a line with fewer than four fields cannot be turned into a row;
        # count it instead of failing with an IndexError
        if len(key) < 4:
            malformed_lines += 1
            continue

        # remove non-numeric characters from the date
        key[0] = re.sub('[^0-9]', '', key[0])

        # saves to all_dates if in domain list, save to sample if in domain list and date list
        if key[1] in doms:
            record = dict(zip(record_columns, key))
            all_dates_records.append(record)
            if key[0] in dates:
                sample_records.append(record)

# build both frames in one step; `columns` keeps the layout even with no rows
sample = pd.DataFrame(sample_records, columns=record_columns)
all_dates = pd.DataFrame(all_dates_records, columns=record_columns)

# take time difference
total = tm.time() - start

In [81]:
# translation table that deletes every character listed in chars_to_remove
strip_table = str.maketrans('', '', chars_to_remove)

# scrub the problematic characters out of each article's text
text = [article.translate(strip_table) for article in sample['text']]
text_ad = [article.translate(strip_table) for article in all_dates['text']]

# swap in the cleaned text and convert the date strings to datetimes
sample = sample.assign(text=text, date=pd.to_datetime(sample['date']))
all_dates = all_dates.assign(text=text_ad, date=pd.to_datetime(all_dates['date']))

In [82]:
# both frames are filled by the same pass over the file, so they share one timing
elapsed = round(total)
print(f'for chosen sample, extracted {len(sample)} articles in {elapsed} seconds')
print(f'for all dates, extracted {len(all_dates)} articles in {elapsed} seconds')
# this data includes all www.mypaperonline.com webpages from the given dates
for caption, frame in (("chosen sample:", sample), ("all dates:", all_dates)):
    print(caption)
    display(frame)

for chosen sample, extracted 4337 articles in 72 seconds
for all dates, extracted 5641 articles in 72 seconds
chosen sample:


Unnamed: 0,date,domain,url,text
0,2019-09-30,www.mypaperonline.com,https://www.mypaperonline.com/category/the-ran...,The Randolph News Archive My Paper Online Hom...
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
2,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/local-art-exhibi...,Local Art Exhibit And Sale At CCM My Paper On...
3,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/wp-json/oembed/1...,{version:1.0provider_name:My Paper Onlineprovi...
4,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/category/the-hac...,The Hackettstown News Archive My Paper Online...
...,...,...,...,...
4332,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/hauntings-around...,Hauntings Around the Area for Halloween My Pa...
4333,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/west-orange-resi...,West Orange Resident Centenary Student Studies...
4334,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/18-year-old-west...,18-Year-Old West Orange Teen Battles Cancer an...
4335,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/millburn-residen...,Millburn Resident Wins Award for Activism My...


all dates:


Unnamed: 0,date,domain,url,text
0,2019-09-30,www.mypaperonline.com,https://www.mypaperonline.com/category/the-ran...,The Randolph News Archive My Paper Online Hom...
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
2,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/local-art-exhibi...,Local Art Exhibit And Sale At CCM My Paper On...
3,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/wp-json/oembed/1...,{version:1.0provider_name:My Paper Onlineprovi...
4,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/category/the-hac...,The Hackettstown News Archive My Paper Online...
...,...,...,...,...
5636,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/millburn-residen...,Millburn Resident Wins Award for Activism My...
5637,2019-11-01,www.mypaperonline.com,https://www.mypaperonline.com/category/the-mt-...,The Mt. Olive News Archive My Paper Online Ho...
5638,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
5639,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/four-tips-for-a-...,Four Tips For A Safe Holiday Season My Paper ...


In [88]:
def _split_by_site(frame):
    """Split `frame` into (non-mypaperonline rows, mypaperonline rows) by URL.

    The first frame holds rows whose URL does not contain
    'www.mypaperonline.com', plus rows whose URL contains
    '/category/the-morristown-news'. The second holds every row whose URL
    contains 'www.mypaperonline.com'. Both are independent copies.

    Patterns are matched literally (regex=False); as regular expressions the
    dots would match any character.
    """
    on_mpo = frame.url.str.contains('www.mypaperonline.com', regex=False)
    in_morris_category = frame.url.str.contains('/category/the-morristown-news', regex=False)

    # .eq() rather than ~ / bare masks: a missing URL yields NaN from
    # str.contains, and such rows must land in neither group.
    other_sites = frame.loc[on_mpo.eq(False) | in_morris_category.eq(True)].copy()
    mpo_rows = frame.loc[on_mpo.eq(True)].copy()
    return other_sites, mpo_rows


# NOTE(review): only the "www." host spelling is treated as mypaperonline here,
# although doms also accepts "mypaperonline.com" without "www."; such rows end
# up in nonmpo - confirm this is intended.
# NOTE(review): category rows matching '/category/the-morristown-news' are in
# both nonmpo and mpo, so they can reappear when the two are recombined.
nonmpo, mpo = _split_by_site(sample)
nonmpo_alldates, mpo_alldates = _split_by_site(all_dates)

In [90]:
def get_html(collection_id, folder, data, timeout=60):
    """Download archived snapshots of the pages in `data` into `folder`.

    For every unique URL in `data['url']` the Archive-It capture listing
    (https://wayback.archive-it.org/<collection_id>/*/<url>) is fetched. Each
    listing link that carries an `onclick` attribute and whose text parses as
    a date ('%b %d, %Y') present in `data['date']` is downloaded and written to
    `<folder>/<YYYYMMDD>_<slug>`, where the slug is the URL's last path segment
    reduced to letters, digits, '-' and '.'. Existing files are not fetched
    again, so the function can be re-run to resume a partial download.

    Parameters
    ----------
    collection_id : int or str
        Archive-It collection identifier, inserted into the listing URL.
    folder : str
        Output directory; created (including parents) when missing.
    data : pandas.DataFrame
        Needs a 'date' column (datetimes or values pandas can parse as such)
        and a 'url' column.
    timeout : float, optional
        Seconds to wait on each HTTP request before requests raises.

    Returns
    -------
    None

    Notes
    -----
    Dates are compared by calendar day, matching the YYYYMMDD file names.
    NOTE(review): URLs whose last path segment is empty (trailing '/') or that
    share a last segment map to the same file name, so only the first one
    fetched is stored for a given day.
    NOTE(review): the HTTP status is not checked; whatever body the server
    returns is written to disk.
    """
    os.makedirs(folder, exist_ok=True)

    ai_url = "https://wayback.archive-it.org"

    # capture days of interest as YYYYMMDD strings, the same form used in file names
    wanted_days = set(pd.to_datetime(data['date']).dt.strftime('%Y%m%d'))
    urls = data['url'].unique()

    for url in urls:

        home = ai_url + '/' + str(collection_id) + '/*/' + url

        page = requests.get(home, timeout=timeout)
        soup = BeautifulSoup(page.content, "lxml")

        # file-name part derived from the URL; identical for every capture of this URL
        slug = ''.join(re.findall(r'(\d+|[a-zA-Z]+|-|\.)', url.split('/')[-1]))

        for a in soup.find_all("a"):

            # only links carrying an onclick handler are considered
            if 'onclick' not in a.attrs:
                continue

            # skip such links when their text is not a capture date instead of
            # aborting the whole crawl
            try:
                day = dt.strptime(a.text, '%b %d, %Y').strftime('%Y%m%d')
            except ValueError:
                continue

            href = a.attrs.get('href')
            if day not in wanted_days or href is None:
                continue

            path = folder + '/' + day + '_' + slug
            if os.path.isfile(path):
                continue

            # the original code prepends the scheme, i.e. it expects
            # protocol-relative hrefs ('//host/...')
            wp = requests.get('https:' + href, timeout=timeout)
            with open(path, 'wb') as f:
                f.write(wp.content)

In [91]:
# fetch the archived pages (Archive-It collection 12706) for every
# mypaperonline row in the all-dates extraction into the 'mypaper_full' folder
get_html(12706, 'mypaper_full', mpo_alldates)

In [92]:

def label_morris(data, folder):
    """Add a 0/1 'morristown' column to `data`, in place.

    For each row the saved snapshot `<folder>/<YYYYMMDD>_<slug>` is read (the
    slug is the URL's last path segment reduced to letters, digits, '-' and
    '.'). The row is labelled 1 when the first <article> element of that HTML
    has the CSS class 'category-the-morristown-news', and 0 otherwise,
    including when the page has no <article> element or it has no class.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a 'date' column of datetimes and a 'url' column of strings.
        It is modified in place.
    folder : str
        Directory holding the snapshot files.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        When a snapshot file cannot be read; the message names the path and
        the underlying OSError is chained as the cause.
    """
    labels = []

    for date, url in zip(data['date'], data['url']):

        slug = ''.join(re.findall(r'(\d+|[a-zA-Z]+|-|\.)', url.split('/')[-1]))
        path = folder + '/' + date.strftime('%Y%m%d') + '_' + slug

        try:
            with open(path, 'rb') as f:
                html = f.read()
        except OSError as err:
            # say which file is missing instead of an opaque message
            raise RuntimeError('could not read snapshot ' + path) from err

        # pages without an <article>, or without classes on it, are not
        # Morristown articles; other errors (e.g. a missing parser) are left
        # to surface rather than silently labelling every row 0
        article = BeautifulSoup(html, "lxml").find('article')
        classes = article.get('class', []) if article is not None else []

        labels.append(1 if 'category-the-morristown-news' in classes else 0)

    # one positional assignment, so repeated index labels cannot overwrite each other
    data['morristown'] = labels


In [93]:
# add the 'morristown' column to both mypaperonline frames (modified in place),
# reading the saved pages from the 'mypaper_full' folder
label_morris(mpo, 'mypaper_full')
label_morris(mpo_alldates, 'mypaper_full')

In [94]:
# keep only the rows labelled as Morristown, then discard the helper column
is_morris = mpo.morristown == True
morristown = mpo[is_morris].drop(columns='morristown')

is_morris_alldates = mpo_alldates.morristown == True
morristown_alldates = mpo_alldates[is_morris_alldates].drop(columns='morristown')

In [95]:
# show the Morristown rows: chosen sample first, then all dates
display(morristown, morristown_alldates)

Unnamed: 0,date,domain,url,text
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
15,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
191,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/mayo-performing-...,Mayo Performing Arts Center Celebrates 25 Year...
192,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/the-morristown-f...,The Morristown Festival of Books is Back My P...
193,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/international-pa...,International Pain Foundation Announces Our 20...
...,...,...,...,...
3964,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/local-nonprofit-...,Local Nonprofit Foundation Receives Federal Mo...
3965,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/lead-east-introd...,Lead East Introduces New Generation To The ‘50...
3966,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/governor-cody-ai...,Governor Cody Aims To Prevent Teen Suicide My...
3967,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/contest-offers-c...,Contest Offers Cash Prizes To Young Teams With...


Unnamed: 0,date,domain,url,text
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
15,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
152,2019-10-31,www.mypaperonline.com,https://www.mypaperonline.com/how-to-combat-co...,How To Combat Cold And Flu My Paper Online Ho...
153,2019-10-31,www.mypaperonline.com,https://www.mypaperonline.com/helping-older-pe...,Helping Older People to Maintain Independence ...
154,2019-10-31,www.mypaperonline.com,https://www.mypaperonline.com/protect-your-hea...,Protect Your Health Protect Your Toothbrush M...
...,...,...,...,...
5436,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/first-library-wi...,First Library Will Help Build Libraries of Bab...
5437,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/hackettstowns-he...,Hackettstown’s Heath Village Inspires Staff wi...
5440,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/dont-miss-these-...,Don’t miss these must-have snacks that define ...
5448,2019-11-02,www.mypaperonline.com,http://www.mypaperonline.com/10th-anniversary-...,10th Anniversary Golf Classic Scores High For ...


In [96]:
def _combine_rows(other_sites, morris_rows):
    """Stack two row selections of the same source frame, each row at most once.

    pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    Both inputs keep the index labels of the frame they were filtered from, so
    a repeated label is the same source row selected twice (for example a
    '/category/the-morristown-news' page that is also labelled Morristown);
    only its first occurrence is kept.
    """
    combined = pd.concat([other_sites, morris_rows])
    return combined.loc[~combined.index.duplicated(keep='first')]


# final corpora: rows from the other sites plus the Morristown rows
complete = _combine_rows(nonmpo, morristown)
complete_alldates = _combine_rows(nonmpo_alldates, morristown_alldates)