In [39]:
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt
import time as tm

In [40]:
# Dates to extract for the chosen sample, written as YYYYMMDD digit strings so
# they can be compared directly with the date field once its non-digit
# characters have been stripped.
# NOTE(review): the trailing "none"/"yes" tags are carried over unchanged from
# the original notebook; their meaning is not shown in this file -- confirm.
dates = ['20191013', # none
         '20191104', # none
         '20191001', # yes
         '20191030', # yes
         '20191024', # yes
         '20190927', # none
         '20191012', # yes
         '20190930', # yes
         '20191002', # none
         '20191003', # none
         '20191004'] # none

# Domain names for extraction. Pages from www.mypaperonline.com are narrowed
# down to URLs containing "/category/the-morristown-news" after the initial
# extraction.
doms = ['www.mypaperonline.com',
        'www.newbrunswicktoday.com',
        'www.newarkblack.com']

# Characters deleted from the article text: , ^ \ ( ) | "
# The backslash is written as "\\" because "\(" is an invalid escape sequence
# (a warning on current Python versions); the resulting string is unchanged.
chars_to_remove = ",^\\()|\""

In [46]:
# save start time for timing extraction
start = tm.time()

# Column layout shared by both result frames.
columns = ['date', 'domain', 'url', 'text']

# Matching rows are collected in plain Python lists and each DataFrame is
# built once at the end. Growing a frame with DataFrame.append copies the
# whole frame on every row (quadratic time) and the method no longer exists
# in pandas >= 2.0.
sample_rows = []
all_rows = []

# open 12706-fulltext.txt file for reading
with open('12706-fulltext.txt', 'rb') as raw:

    # iterate over the file line by line; only one line is held at a time
    for line in raw:
        # split line by first 3 commas -> date, domain, url, text
        key = line.decode('UTF-8').split(',', 3)

        # a line without all four fields cannot be turned into a record
        if len(key) < 4:
            continue

        # keep only lines whose domain is one of the requested ones
        if key[1] not in doms:
            continue

        # remove non-numeric characters from the date
        key[0] = re.sub('[^0-9]', '', key[0])

        record = dict(zip(columns, key))
        all_rows.append(record)
        # the chosen sample is the subset falling on one of the listed dates
        if key[0] in dates:
            sample_rows.append(record)

# build the dataframes in one step (empty lists still give the four columns)
sample = pd.DataFrame(sample_rows, columns=columns)
all_dates = pd.DataFrame(all_rows, columns=columns)

# take time difference
total = tm.time() - start

In [47]:
# Deletion table: every character listed in chars_to_remove maps to the empty
# string. It is the same for every row, so it is built a single time.
removal_table = {ord(ch): "" for ch in chars_to_remove}

# Strip the problematic characters from each article text.
text = [article.translate(removal_table) for article in sample['text']]
text_ad = [article.translate(removal_table) for article in all_dates['text']]

# Swap in the cleaned text, then convert the date column to pandas datetimes.
sample = (
    sample
    .assign(text=text)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
all_dates = (
    all_dates
    .assign(text=text_ad)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [48]:
# Both frames come out of the same extraction pass, so the same elapsed time
# is reported for each of them.
elapsed = round(total)
print('for chosen sample, extracted ', len(sample), ' articles in ', elapsed, ' seconds', sep='')
print('for all dates, extracted ', len(all_dates), ' articles in ', elapsed, ' seconds', sep='')

# At this point the frames have only been selected by domain (and date), so
# they still hold every www.mypaperonline.com page.
for heading, frame in (("chosen sample:", sample), ("all dates:", all_dates)):
    print(heading)
    display(frame)

for chosen sample, extracted 2457 articles in 68 seconds
for all dates, extracted 5213 articles in 68 seconds
chosen sample:


Unnamed: 0,date,domain,url,text
0,2019-09-30,www.mypaperonline.com,https://www.mypaperonline.com/category/the-ran...,The Randolph News Archive My Paper Online Hom...
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
2,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/local-art-exhibi...,Local Art Exhibit And Sale At CCM My Paper On...
3,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/wp-json/oembed/1...,{version:1.0provider_name:My Paper Onlineprovi...
4,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/category/the-hac...,The Hackettstown News Archive My Paper Online...
...,...,...,...,...
2452,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
2453,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/hauntings-around...,Hauntings Around the Area for Halloween My Pa...
2454,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/west-orange-resi...,West Orange Resident Centenary Student Studies...
2455,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/18-year-old-west...,18-Year-Old West Orange Teen Battles Cancer an...


all dates:


Unnamed: 0,date,domain,url,text
0,2019-09-30,www.mypaperonline.com,https://www.mypaperonline.com/category/the-ran...,The Randolph News Archive My Paper Online Hom...
1,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
2,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/local-art-exhibi...,Local Art Exhibit And Sale At CCM My Paper On...
3,2019-10-01,www.mypaperonline.com,https://www.mypaperonline.com/wp-json/oembed/1...,{version:1.0provider_name:My Paper Onlineprovi...
4,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/category/the-hac...,The Hackettstown News Archive My Paper Online...
...,...,...,...,...
5208,2019-10-12,www.mypaperonline.com,https://www.mypaperonline.com/millburn-residen...,Millburn Resident Wins Award for Activism My...
5209,2019-11-01,www.mypaperonline.com,https://www.mypaperonline.com/category/the-mt-...,The Mt. Olive News Archive My Paper Online Ho...
5210,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/category/local-n...,Local News Archive My Paper Online Home Discl...
5211,2019-11-02,www.mypaperonline.com,https://www.mypaperonline.com/four-tips-for-a-...,Four Tips For A Safe Holiday Season My Paper ...


In [51]:
# Filter out www.mypaperonline.com pages that do not have
# "/category/the-morristown-news" in their URL; rows from the other domains
# are kept untouched.
def _keep_morristown_only(frame):
    """Return the rows of `frame` that pass the Morristown URL filter.

    A row is kept when its `url` does not contain 'www.mypaperonline.com',
    or when it contains '/category/the-morristown-news'.
    """
    url = frame['url']
    # regex=False: both needles are literal text. Interpreted as a regular
    # expression, every "." in the domain would match any character.
    # na=True / na=False: a missing URL satisfies neither keep-condition, so
    # such a row is dropped (same outcome as comparing NaN with == before).
    from_mypaper = url.str.contains('www.mypaperonline.com', regex=False, na=True)
    in_morristown = url.str.contains('/category/the-morristown-news', regex=False, na=False)
    return frame[~from_mypaper | in_morristown]


filtered_sample = _keep_morristown_only(sample)
filtered_all_dates = _keep_morristown_only(all_dates)
print(str(len(filtered_sample)) + ' articles left after filtering in chosen sample')
print(str(len(filtered_all_dates)) + ' articles left after filtering in all dates')

846 articles left after filtering in chosen sample
1448 articles left after filtering in all dates


In [52]:
# Write each filtered frame to its CSV file, leaving out the index column.
for frame, csv_path in (
    (filtered_sample, 'chosensamp.csv'),
    (filtered_all_dates, 'all_dates.csv'),
):
    frame.to_csv(csv_path, index=False)