In [79]:
import pandas as pd
import requests
import numpy as np
import re
from datetime import datetime as dt
from bs4 import BeautifulSoup, element
import time as tm
import os

In [80]:
# list of dates for extraction, formatted as dates are in the dataset
# dates = ['', # none
#          '20191104', # none
#          '20191001', # yes
#          '20191030', # yes
#          '20191024', # yes
#          '20190927', # none
#          '20191012', # yes
#          '20190930', # yes
#          '20191002', # none
#          '20191003', # none
#          '20191004'] # none
         
dates = ['20191008', 
         '20191012', 
         '20191020', 
         '20191024', 
         '20191030', 
         '20191101', 
         '20191111', 
         '20190930', 
         '20191001', 
         '20191002', 
         '20191003',
         '20191004'] 

# domain names for extraction, will filter www.mypaperonline.com by 
# "/category/the-morristown-news" after initial extraction
doms = ['https://www.mypaperonline.com',
        'https://www.newbrunswicktoday.com',
        'https://www.newarkblack.com',
        'http://www.mypaperonline.com',
        'http://www.newbrunswicktoday.com',
        'http://www.newarkblack.com',
        'https://mypaperonline.com',
        'https://newbrunswicktoday.com',
        'https://newarkblack.com',
        'http://mypaperonline.com',
        'http://newbrunswicktoday.com',
        'http://newarkblack.com',
        'mypaperonline.com',
        'newbrunswicktoday.com',
        'newarkblack.com',
        'www.mypaperonline.com',
        'www.newbrunswicktoday.com',
        'www.newarkblack.com']

chars_to_remove = ",^\()|\""

In [81]:
# save start time for timing extraction
start = tm.time()

# initialize dataframe
sample = pd.DataFrame(columns = ['date','domain','url','text'])
all_dates = pd.DataFrame(columns = ['date','domain','url','text'])

# open 12706-fulltext.txt file for reading
with open('12706-fulltext.txt', 'rb') as raw:

    # read first line
    line = raw.readline()

    # check if each line matches a date and a domain
    while line:
        # split line by first 3 commas
        key = line.decode('UTF-8').split(',', 3)
        # remove non-numeric characters from the date
        key[0] = re.sub('[^0-9]', '', key[0])

        # saves to all_dates if in domain list, save to sample if in domain list and date list
        if key[1] in doms:
            all_dates = all_dates.append({'date': key[0], 'domain': key[1], 'url': key[2], 'text': key[3]}, ignore_index=True)
            if key [0] in dates:
                sample = sample.append({'date': key[0], 'domain': key[1], 'url': key[2], 'text': key[3]}, ignore_index=True)
                
        # read next line
        line = raw.readline()

# take time difference
total = tm.time() - start

In [82]:
# remove problematic characters from text
text = [s.translate ({ord(c): "" for c in chars_to_remove}) for s in sample['text']]
text_ad = [s.translate ({ord(c): "" for c in chars_to_remove}) for s in all_dates['text']]
# replace old text with cleaned text
sample = sample.assign(text = text)
all_dates = all_dates.assign(text = text_ad)
# convert date to standard format
sample['date'] = pd.to_datetime(sample['date'])
all_dates['date'] = pd.to_datetime(all_dates['date'])

In [83]:
print('for chosen sample, extracted ' + str(len(sample)) + ' articles in ' + str(round(total)) + ' seconds')
print('for all dates, extracted ' + str(len(all_dates)) + ' articles in ' + str(round(total)) + ' seconds')
# this data includes all www.mypaperonline.com webpages from the given dates
print("chosen sample:")
display(sample)
print("all dates:")
display(all_dates)

for chosen sample, extracted 4337 articles in 76 seconds
for all dates, extracted 5641 articles in 76 seconds
chosen sample:


Unnamed: 0,date,domain,url,text
0,2019-09-30,www.mypaperonline.com,https://www.mypaperonline.com/category/the-randolph-news,The Randolph News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on Fac...
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-news?paper=the-randolph-news,Local News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on Facebook F...
2,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/local-art-exhibit-and-sale-at-ccm.html,Local Art Exhibit And Sale At CCM My Paper Online Home Disclaimer Privacy Sitemap Contact Frien...
3,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.mypaperonline.com%2...,{version:1.0provider_name:My Paper Onlineprovider_url:https://www.mypaperonline.comtitle:Local A...
4,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/category/the-hackettstown-news,The Hackettstown News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on...
...,...,...,...,...
4332,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/hauntings-around-the-area-for-halloween.html,Hauntings Around the Area for Halloween My Paper Online Home Disclaimer Privacy Sitemap Contact...
4333,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/west-orange-resident-centenary-student-studies-shakespeare-studies...,West Orange Resident Centenary Student Studies Shakespeare Studies Abroad My Paper Online Home ...
4334,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/18-year-old-west-orange-teen-battles-cancer-and-is-featured-in-sto...,18-Year-Old West Orange Teen Battles Cancer and is Featured in Stop & Shop Fundraiser My Paper ...
4335,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/millburn-resident-wins-award-for-activism.html,Millburn Resident Wins Award for Activism My Paper Online Home Disclaimer Privacy Sitemap Cont...


all dates:


Unnamed: 0,date,domain,url,text
0,2019-09-30,www.mypaperonline.com,https://www.mypaperonline.com/category/the-randolph-news,The Randolph News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on Fac...
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-news?paper=the-randolph-news,Local News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on Facebook F...
2,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/local-art-exhibit-and-sale-at-ccm.html,Local Art Exhibit And Sale At CCM My Paper Online Home Disclaimer Privacy Sitemap Contact Frien...
3,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.mypaperonline.com%2...,{version:1.0provider_name:My Paper Onlineprovider_url:https://www.mypaperonline.comtitle:Local A...
4,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/category/the-hackettstown-news,The Hackettstown News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on...
...,...,...,...,...
5636,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/millburn-resident-wins-award-for-activism.html,Millburn Resident Wins Award for Activism My Paper Online Home Disclaimer Privacy Sitemap Cont...
5637,2019-11-01,www.mypaperonline.com,https://www.mypaperonline.com/category/the-mt-olive-news,The Mt. Olive News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on Fa...
5638,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/category/local-news?paper=the-mt-olive-news,Local News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on Facebook F...
5639,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/four-tips-for-a-safe-holiday-season.html,Four Tips For A Safe Holiday Season My Paper Online Home Disclaimer Privacy Sitemap Contact Fri...


In [84]:
# in this block the mypaperonline.com webpages (besides the morristown life homepage) are being separated fromt the rest
nonmpo = sample.loc[(sample.url.str.contains('www.mypaperonline.com') == False) | (sample.url.str.contains('/category/the-morristown-news') == True)].copy()
mpo = sample.loc[(sample.url.str.contains('www.mypaperonline.com') == True) & (sample.url.str.contains('/category/the-morristown-news') == False)].copy()
nonmpo_alldates = all_dates.loc[(all_dates.url.str.contains('www.mypaperonline.com') == False) | (all_dates.url.str.contains('/category/the-morristown-news') == True)].copy()
mpo_alldates = all_dates.loc[(all_dates.url.str.contains('www.mypaperonline.com') == True)].copy()

In [85]:
# this function constructs the url for an archived webpage according to https://wayback.archive-it.org formatting
def get_html(collection_id, folder, data):

    # creates a local directory named 'folder' if it doesn't exist
    if not os.path.isdir(folder):
        os.mkdir(folder)

    # the base url for the archive
    ai_url = "https://wayback.archive-it.org"

    # this gets a unique list of dates and urls so we can avoid duplication
    date_list = pd.to_datetime(data['date'].unique())
    urls = data['url'].unique()

    # this loop iterates through all the urls, searches for the archived webpage
    # then looks for any hyperlinks on the page that are associated with a date in the list
    for url in urls:

        # this is the url of an archived webpage's home 
        # (e.g. https://wayback.archive-it.org/12706/*/http://mypaperonline.com)
        home = ai_url + '/' + str(collection_id) + '/*/' + url

        # this block gets all the date hyperlinks
        page = requests.get(home)
        soup = BeautifulSoup(page.content, "lxml")
        alist = soup.find_all("a")
        alist = [a for a in alist if 'onclick' in a.attrs]
        dates = [dt.strptime(a.text, '%b %d, %Y') for a in alist]

        # this loops through all the date hyperlinks and if they're in
        # the date_list it saves them locally
        for a in alist:

            # this creates a local path for the html, it converts URLS+dates
            # into unique simple strings
            path = folder + '/' + dt.strptime(a.text, '%b %d, %Y').strftime('%Y%m%d') + '_' + ''.join(re.findall('(\d+|[a-zA-Z]+|-|\.)', url.split('/')[-1]))

            # this is where it saves the html ('onclick' in a.attrs is probably redundant)
            if 'onclick' in a.attrs and dt.strptime(a.text, '%b %d, %Y') in date_list and not os.path.isfile(path):
                wp = requests.get('https:' + a.attrs['href'])
                f = open(path, 'wb')
                f.write(wp.content)
                f.close()

In [86]:
get_html(12706, 'mypaper_full', mpo_alldates)

In [87]:

# this function goes through all of the mypaperonline.com webpages
# and labels them if they have an article object labeled "category-the-morristown-news"
def label_morris(data, folder):

    # adds the column for labeling 1/0 if is/is not morristown life article
    data.loc[:,'morristown'] = None

    # this loop interates through every row in the dataframe, identifies
    # the local html file that is that row's webpage and checks to see
    # if that html has an article object labeled "category-the-morristown-news"
    # and populates the morristown column accordingly
    for index, row in data.iterrows():

        # this creates a local path for the html, it converts URLS+dates
        # into unique simple strings
        path = folder + '/' + ''.join(re.findall('\d+', row.date.strftime('%Y-%m-%d'))) + '_' + ''.join(re.findall('(\d+|[a-zA-Z]+|-|\.)', row.url.split('/')[-1]))

        # this tries opening the local html and throws an error if it doesn't exist
        try:
            f = open(path, 'rb')
            html = f.read()
        except:
            raise Exception('No HTML')

        # this checks to see if there is an article object in the html
        # if there isn't it labels morristown as 0
        try:
            soup = BeautifulSoup(html, "lxml")
            article = soup.find('article').attrs['class']
        except:
            article = []
            data.loc[index, 'morristown'] = 0
        
        # this checks to see if the article in the html is labeled 
        # with 'category-the-morristown-news' 
        if 'category-the-morristown-news' in article or 'category-morristown-digital-edition':
            data.loc[index, 'morristown'] = 1
        else:
            data.loc[index, 'morristown'] = 0


In [88]:
label_morris(mpo, 'mypaper_full')
label_morris(mpo_alldates, 'mypaper_full')

In [89]:
# remove morristown column
morristown = mpo.loc[(mpo.morristown == True)].copy().drop('morristown', axis = 1)
morristown_alldates = mpo_alldates.loc[(mpo_alldates.morristown == True)].copy().drop('morristown', axis = 1)

In [90]:
display(morristown)
display(morristown_alldates)

Unnamed: 0,date,domain,url,text
0,2019-09-30,www.mypaperonline.com,https://www.mypaperonline.com/category/the-randolph-news,The Randolph News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on Fac...
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-news?paper=the-randolph-news,Local News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on Facebook F...
2,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/local-art-exhibit-and-sale-at-ccm.html,Local Art Exhibit And Sale At CCM My Paper Online Home Disclaimer Privacy Sitemap Contact Frien...
3,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.mypaperonline.com%2...,{version:1.0provider_name:My Paper Onlineprovider_url:https://www.mypaperonline.comtitle:Local A...
4,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/category/the-hackettstown-news,The Hackettstown News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on...
...,...,...,...,...
4332,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/hauntings-around-the-area-for-halloween.html,Hauntings Around the Area for Halloween My Paper Online Home Disclaimer Privacy Sitemap Contact...
4333,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/west-orange-resident-centenary-student-studies-shakespeare-studies...,West Orange Resident Centenary Student Studies Shakespeare Studies Abroad My Paper Online Home ...
4334,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/18-year-old-west-orange-teen-battles-cancer-and-is-featured-in-sto...,18-Year-Old West Orange Teen Battles Cancer and is Featured in Stop & Shop Fundraiser My Paper ...
4335,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/millburn-resident-wins-award-for-activism.html,Millburn Resident Wins Award for Activism My Paper Online Home Disclaimer Privacy Sitemap Cont...


Unnamed: 0,date,domain,url,text
0,2019-09-30,www.mypaperonline.com,https://www.mypaperonline.com/category/the-randolph-news,The Randolph News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on Fac...
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-news?paper=the-randolph-news,Local News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on Facebook F...
2,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/local-art-exhibit-and-sale-at-ccm.html,Local Art Exhibit And Sale At CCM My Paper Online Home Disclaimer Privacy Sitemap Contact Frien...
3,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.mypaperonline.com%2...,{version:1.0provider_name:My Paper Onlineprovider_url:https://www.mypaperonline.comtitle:Local A...
4,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/category/the-hackettstown-news,The Hackettstown News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on...
...,...,...,...,...
5636,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/millburn-resident-wins-award-for-activism.html,Millburn Resident Wins Award for Activism My Paper Online Home Disclaimer Privacy Sitemap Cont...
5637,2019-11-01,www.mypaperonline.com,https://www.mypaperonline.com/category/the-mt-olive-news,The Mt. Olive News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on Fa...
5638,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/category/local-news?paper=the-mt-olive-news,Local News Archive My Paper Online Home Disclaimer Privacy Sitemap Contact Friend on Facebook F...
5639,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/four-tips-for-a-safe-holiday-season.html,Four Tips For A Safe Holiday Season My Paper Online Home Disclaimer Privacy Sitemap Contact Fri...


In [91]:
# recombine mypaponline.com webpages with the rest
complete = nonmpo.append(morristown)
complete_alldates = nonmpo_alldates.append(morristown_alldates)

In [92]:
complete.to_csv('final_chosen.csv')
complete_alldates.to_csv('final_chosen_alldates.csv')

In [93]:
display(mpo[(mpo.url.str.contains('morristown'))].sample(10).drop('text', axis = 1))

Unnamed: 0,date,domain,url,morristown
3876,2019-10-01,www.mypaperonline.com,http://www.mypaperonline.com/moms-helping-moms-partners-with-homeless-solutions-of-morristown-2....,1
727,2019-10-20,www.mypaperonline.com,https://www.mypaperonline.com/the-morristown-festival-of-books-is-back.html,1
2648,2019-10-08,www.mypaperonline.com,https://www.mypaperonline.com/category/digital-editions/morristown-digital-edition,1
3332,2019-10-24,www.mypaperonline.com,http://www.mypaperonline.com/moms-helping-moms-partners-with-homeless-solutions-of-morristown-2....,1
3754,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.mypaperonline.com%2...,1
1130,2019-11-11,www.mypaperonline.com,https://www.mypaperonline.com/teams-for-kids-foundation-donates-to-morristown-medical-center.html,1
3753,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/peplowski-to-pay-tribute-to-benny-goodman-in-morristown.html,1
2850,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-news?paper=the-morristown-news,1
1024,2019-10-08,www.mypaperonline.com,https://www.mypaperonline.com/peplowski-to-pay-tribute-to-benny-goodman-in-morristown.html/feed,1
3236,2019-10-24,www.mypaperonline.com,https://www.mypaperonline.com/category/digital-editions/morristown-digital-edition,1


In [94]:
pd.set_option('display.max_colwidth', None)

In [95]:
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 100)