In [20]:
import pandas as pd
import requests
import numpy as np
import re
from datetime import datetime as dt
from bs4 import BeautifulSoup, element
import time as tm
import os

In [21]:
# Dates to extract, formatted the way they appear in the dataset (YYYYMMDD).
#
# Earlier candidate list, kept for reference. The trailing "yes"/"none" notes
# are the original author's annotations (presumably whether that date yielded
# any rows -- TODO confirm):
# dates = ['', # none
#          '20191104', # none
#          '20191001', # yes
#          '20191030', # yes
#          '20191024', # yes
#          '20190927', # none
#          '20191012', # yes
#          '20190930', # yes
#          '20191002', # none
#          '20191003', # none
#          '20191004'] # none

dates = ['20191008',
         '20191012',
         '20191020',
         '20191024',
         '20191030',
         '20191101',
         '20191102',
         '20191111',
         '20190930',
         '20191001',
         '20191002',
         '20191003',
         '20191004']

# Domain names for extraction. www.mypaperonline.com is filtered by
# "/category/the-morristown-news" after the initial extraction.
# Every site is listed under each scheme / "www." spelling that the domain
# column might use; the list is generated so that adding a site is a one-line
# change and no variant can be forgotten.
_dom_prefixes = ['https://www.', 'http://www.', 'https://', 'http://', '', 'www.']
_dom_sites = ['mypaperonline.com', 'newbrunswicktoday.com', 'newarkblack.com']
doms = [prefix + site for prefix in _dom_prefixes for site in _dom_sites]

# Characters deleted from the article text: , ^ \ ( ) | "
# The backslash is written as "\\" on purpose: the previous spelling relied on
# "\(" being an invalid escape sequence that Python happens to keep verbatim,
# which triggers a SyntaxWarning on current Python versions.
chars_to_remove = ",^\\()|\""

In [22]:
# save start time for timing extraction
start = tm.time()

# column layout shared by both result frames
record_columns = ['date', 'domain', 'url', 'text']

# sets give constant-time membership tests inside the per-line loop
wanted_doms = set(doms)
wanted_dates = set(dates)

# Rows are collected in plain lists and turned into DataFrames once at the end.
# Growing a DataFrame row by row copies the whole frame on every row, and
# DataFrame.append no longer exists in pandas >= 2.0.
all_rows = []      # every row whose domain is in the domain list
sample_rows = []   # the subset of those rows whose date is also in the date list

# open 12706-fulltext.txt file for reading (bytes, decoded line by line)
with open('12706-fulltext.txt', 'rb') as raw:
    for line in raw:
        # split line by first 3 commas -> date, domain, url, text
        key = line.decode('UTF-8').split(',', 3)

        # a line with fewer than 4 fields cannot be a record; skip it instead
        # of failing with an IndexError below
        if len(key) < 4:
            continue

        # only rows from the chosen domains are kept at all
        if key[1] not in wanted_doms:
            continue

        # remove non-numeric characters from the date
        date = re.sub('[^0-9]', '', key[0])

        record = {'date': date, 'domain': key[1], 'url': key[2], 'text': key[3]}
        all_rows.append(record)
        if date in wanted_dates:
            sample_rows.append(record)

# passing columns= keeps the expected layout even when no row matched
sample = pd.DataFrame(sample_rows, columns=record_columns)
all_dates = pd.DataFrame(all_rows, columns=record_columns)

# take time difference
total = tm.time() - start

In [23]:
# Translation table that deletes every character listed in chars_to_remove;
# built once and reused for both frames.
drop_table = {ord(ch): "" for ch in chars_to_remove}

# article text with the problematic characters removed
text = [body.translate(drop_table) for body in sample['text']]
text_ad = [body.translate(drop_table) for body in all_dates['text']]

# URLs without surrounding slashes (str.strip removes '/' from both ends,
# which takes off the trailing slash)
url = [link.strip('/') for link in sample['url']]
url_ad = [link.strip('/') for link in all_dates['url']]

# swap the raw columns for the cleaned ones
sample = sample.assign(text=text, url=url)
all_dates = all_dates.assign(text=text_ad, url=url_ad)

# turn the date strings into pandas datetimes
sample['date'] = pd.to_datetime(sample['date'])
all_dates['date'] = pd.to_datetime(all_dates['date'])

In [24]:
# Report how many rows each frame holds. Both frames are filled by the same
# pass over the file, so the same elapsed time is shown for both.
elapsed = round(total)
print(f'for chosen sample, extracted {len(sample)} articles in {elapsed} seconds')
print(f'for all dates, extracted {len(all_dates)} articles in {elapsed} seconds')

# this data includes all www.mypaperonline.com webpages from the given dates
for heading, frame in (("chosen sample:", sample), ("all dates:", all_dates)):
    print(heading)
    display(frame)

for chosen sample, extracted 4758 articles in 73 seconds
for all dates, extracted 5641 articles in 73 seconds
chosen sample:


Unnamed: 0,date,domain,url,text
0,2019-09-30,www.mypaperonline.com,https://www.mypaperonline.com/category/the-ran...,The Randolph News Archive My Paper Online Hom...
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
2,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/local-art-exhibi...,Local Art Exhibit And Sale At CCM My Paper On...
3,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/wp-json/oembed/1...,{version:1.0provider_name:My Paper Onlineprovi...
4,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/category/the-hac...,The Hackettstown News Archive My Paper Online...
...,...,...,...,...
4753,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/millburn-residen...,Millburn Resident Wins Award for Activism My...
4754,2019-11-01,www.mypaperonline.com,https://www.mypaperonline.com/category/the-mt-...,The Mt. Olive News Archive My Paper Online Ho...
4755,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
4756,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/four-tips-for-a-...,Four Tips For A Safe Holiday Season My Paper ...


all dates:


Unnamed: 0,date,domain,url,text
0,2019-09-30,www.mypaperonline.com,https://www.mypaperonline.com/category/the-ran...,The Randolph News Archive My Paper Online Hom...
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
2,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/local-art-exhibi...,Local Art Exhibit And Sale At CCM My Paper On...
3,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/wp-json/oembed/1...,{version:1.0provider_name:My Paper Onlineprovi...
4,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/category/the-hac...,The Hackettstown News Archive My Paper Online...
...,...,...,...,...
5636,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/millburn-residen...,Millburn Resident Wins Award for Activism My...
5637,2019-11-01,www.mypaperonline.com,https://www.mypaperonline.com/category/the-mt-...,The Mt. Olive News Archive My Paper Online Ho...
5638,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
5639,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/four-tips-for-a-...,Four Tips For A Safe Holiday Season My Paper ...


In [26]:
# In this block the mypaperonline.com webpages (besides the Morristown news
# category page) are separated from the rest.
MPO_HOST = 'www.mypaperonline.com'
MORRISTOWN_CATEGORY = '/category/the-morristown-news'


def _split_mypaper(frame):
    """Split ``frame`` into (rows kept as-is, mypaperonline rows needing a label).

    A row needs labelling when its url contains MPO_HOST and does not contain
    MORRISTOWN_CATEGORY. Every other row goes into the first frame, so the two
    returned frames never share a row and together cover ``frame`` exactly.
    Both results are independent copies.
    """
    # regex=False: the host contains '.', which would otherwise be treated as
    # a regular-expression wildcard by str.contains
    is_mpo = frame.url.str.contains(MPO_HOST, regex=False)
    is_category_page = frame.url.str.contains(MORRISTOWN_CATEGORY, regex=False)
    needs_label = is_mpo & ~is_category_page
    return frame.loc[~needs_label].copy(), frame.loc[needs_label].copy()


# The same rule is applied to both frames. Previously mpo_alldates also kept
# the Morristown category rows that are already part of nonmpo_alldates, so
# such a row could show up twice once the frames are recombined.
nonmpo, mpo = _split_mypaper(sample)
nonmpo_alldates, mpo_alldates = _split_mypaper(all_dates)

In [27]:
# this function downloads archived webpages from https://wayback.archive-it.org
# and stores them locally, one html file per (date, url) pair
def get_html(collection_id, folder, data, timeout=30):
    """Save archived snapshots of the pages in ``data`` as local html files.

    For every unique url in ``data`` the archive's capture listing
    (``https://wayback.archive-it.org/<collection_id>/*/<url>``) is fetched.
    Every ``<a>`` element on that page that has both an ``onclick`` and an
    ``href`` attribute and whose text parses as a date of the form
    ``'%b %d, %Y'`` is treated as a snapshot link. A snapshot is downloaded
    when its date occurs in ``data['date']`` and its target file does not
    exist yet.

    Files are written to ``<folder>/<YYYYMMDD>_<slug>.html`` where ``slug`` is
    built from the digits, letters, '-' and '.' of the last path segment of
    the url.

    Parameters
    ----------
    collection_id : int or str
        Archive-It collection identifier used in the listing url.
    folder : str
        Directory for the html files; created if it does not exist.
    data : pandas.DataFrame
        Must provide a ``date`` column (convertible by ``pd.to_datetime``)
        and a ``url`` column.
    timeout : float, optional
        Timeout in seconds passed to every ``requests.get`` call so that a
        stalled connection cannot block the run indefinitely.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` (connection problems, timeouts).
    """
    # creates a local directory named 'folder' if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    # the base url for the archive
    ai_url = "https://wayback.archive-it.org"

    # unique dates and urls so nothing is requested twice
    date_list = pd.to_datetime(data['date'].unique())
    urls = data['url'].unique()

    for url in urls:

        # this is the url of an archived webpage's home
        # (e.g. https://wayback.archive-it.org/12706/*/http://mypaperonline.com)
        home = ai_url + '/' + str(collection_id) + '/*/' + url

        page = requests.get(home, timeout=timeout)
        soup = BeautifulSoup(page.content, "lxml")

        # url-dependent part of the local file name; computed once per url.
        # NOTE(review): only the last path segment is used, so two urls that
        # end in the same segment share a file name for the same date.
        slug = ''.join(re.findall(r'(\d+|[a-zA-Z]+|-|\.)', url.split('/')[-1]))

        for a in soup.find_all("a"):

            # snapshot links carry both attributes; anything else is skipped
            if 'onclick' not in a.attrs or 'href' not in a.attrs:
                continue

            # links whose text is not a date are not snapshot links
            try:
                snapshot_date = dt.strptime(a.text, '%b %d, %Y')
            except ValueError:
                continue

            if snapshot_date not in date_list:
                continue

            path = folder + '/' + snapshot_date.strftime('%Y%m%d') + '_' + slug + '.html'
            if os.path.isfile(path):
                continue

            # 'https:' is prepended as before; the href is presumably
            # protocol-relative ('//wayback...') -- TODO confirm
            wp = requests.get('https:' + a.attrs['href'], timeout=timeout)

            # do not store error pages: an existing file is never fetched
            # again, so a saved error page would stay in place for good
            if not wp.ok:
                print('skipped ' + path + ': HTTP status ' + str(wp.status_code))
                continue

            with open(path, 'wb') as f:
                f.write(wp.content)

In [28]:
# fetch the archived pages of collection 12706 for every row of mpo_alldates
get_html(collection_id=12706, folder='mypaper_full', data=mpo_alldates)

In [29]:

# this function goes through all of the mypaperonline.com webpages
# and labels them according to the css classes of their first article object
def label_morris(data, folder):
    """Add a ``morristown`` column (1/0) to ``data`` in place.

    For every row the locally stored html file
    ``<folder>/<YYYYMMDD>_<slug>.html`` is read, where ``slug`` is built from
    the digits, letters, '-' and '.' of the last path segment of ``row.url``.
    The row is labelled 1 when the first ``<article>`` element of that file
    has the class ``category-the-morristown-news`` or
    ``category-morristown-digital-edition``; it is labelled 0 otherwise,
    including when the file has no ``<article>`` element or that element has
    no ``class`` attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a datetime-like ``date`` column and a string ``url`` column.
        The frame is modified in place; nothing is returned.
    folder : str
        Directory holding the html files.

    Raises
    ------
    Exception
        ``'No HTML: <path>'`` when the html file of a row cannot be read. The
        ``morristown`` column is not written in that case.
    """
    wanted_classes = ('category-the-morristown-news',
                      'category-morristown-digital-edition')

    # one label per row, in row order; written to the frame in one step below
    labels = []

    for date, url in zip(data['date'], data['url']):

        # this creates a local path for the html, it converts URLS+dates
        # into unique simple strings
        slug = ''.join(re.findall(r'(\d+|[a-zA-Z]+|-|\.)', url.split('/')[-1]))
        path = folder + '/' + date.strftime('%Y%m%d') + '_' + slug + '.html'

        # read the local html; the with-block closes the file again, and the
        # error names the missing file
        try:
            with open(path, 'rb') as f:
                html = f.read()
        except OSError as err:
            raise Exception('No HTML: ' + path) from err

        # css classes of the first article object; no article (or an article
        # without classes) means "not a morristown page". Only these two
        # cases are treated as 0 -- any other failure (e.g. a parser problem)
        # is allowed to surface instead of being recorded as a label.
        soup = BeautifulSoup(html, "lxml")
        article = soup.find('article')
        classes = article.attrs.get('class', []) if article is not None else []

        labels.append(1 if any(name in classes for name in wanted_classes) else 0)

    # adds the column for labeling 1/0 if is/is not morristown life article
    data['morristown'] = labels


In [30]:
# label both mypaperonline.com frames using the same local html folder
for frame in (mpo, mpo_alldates):
    label_morris(frame, 'mypaper_full')

In [31]:
# keep only the rows labelled as morristown, then remove the helper column
is_morristown = mpo.morristown == True
morristown = mpo.loc[is_morristown].drop(columns='morristown')

is_morristown_alldates = mpo_alldates.morristown == True
morristown_alldates = mpo_alldates.loc[is_morristown_alldates].drop(columns='morristown')

In [32]:
# show the morristown rows of the chosen sample, then those of all dates
for frame in (morristown, morristown_alldates):
    display(frame)

Unnamed: 0,date,domain,url,text
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
15,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
171,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/category/digital...,Morristown Digital Edition Archive My Paper O...
192,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/mayo-performing-...,Mayo Performing Arts Center Celebrates 25 Year...
193,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/the-morristown-f...,The Morristown Festival of Books is Back My P...
...,...,...,...,...
4553,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/first-library-wi...,First Library Will Help Build Libraries of Bab...
4554,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/hackettstowns-he...,Hackettstown’s Heath Village Inspires Staff wi...
4557,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/dont-miss-these-...,Don’t miss these must-have snacks that define ...
4565,2019-11-02,www.mypaperonline.com,http://www.mypaperonline.com/10th-anniversary-...,10th Anniversary Golf Classic Scores High For ...


Unnamed: 0,date,domain,url,text
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
15,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
152,2019-10-31,www.mypaperonline.com,https://www.mypaperonline.com/how-to-combat-co...,How To Combat Cold And Flu My Paper Online Ho...
153,2019-10-31,www.mypaperonline.com,https://www.mypaperonline.com/helping-older-pe...,Helping Older People to Maintain Independence ...
154,2019-10-31,www.mypaperonline.com,https://www.mypaperonline.com/protect-your-hea...,Protect Your Health Protect Your Toothbrush M...
...,...,...,...,...
5436,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/first-library-wi...,First Library Will Help Build Libraries of Bab...
5437,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/hackettstowns-he...,Hackettstown’s Heath Village Inspires Staff wi...
5440,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/dont-miss-these-...,Don’t miss these must-have snacks that define ...
5448,2019-11-02,www.mypaperonline.com,http://www.mypaperonline.com/10th-anniversary-...,10th Anniversary Golf Classic Scores High For ...


In [33]:
# recombine the mypaperonline.com webpages with the rest
# (pd.concat replaces DataFrame.append, which no longer exists in
# pandas >= 2.0; the original row labels are kept, as before)
complete = pd.concat([nonmpo, morristown])
complete_alldates = pd.concat([nonmpo_alldates, morristown_alldates])

In [34]:
# write both recombined frames to csv files in the working directory
for frame, csv_name in ((complete, 'final_chosen.csv'),
                        (complete_alldates, 'final_chosen_alldates.csv')):
    frame.to_csv(csv_name)

In [None]:
# Spot check: show up to 10 randomly chosen mypaperonline.com rows whose url
# contains "morristown", without the long text column.
morristown_urls = mpo[mpo.url.str.contains('morristown', regex=False)]
# The sample size is capped because asking for more rows than exist raises a
# ValueError; the fixed random_state makes a re-run show the same rows.
display(
    morristown_urls
    .sample(min(10, len(morristown_urls)), random_state=0)
    .drop('text', axis=1)
)

In [19]:
# no limit on the displayed column width
# (the cell below sets display.max_colwidth again, to 100)
pd.options.display.max_colwidth = None

In [20]:
# display width is set to None; column contents are cut off after 100 characters
pd.options.display.width = None
pd.options.display.max_colwidth = 100