# Scraping Editor & Publisher

Scraping articles from the trade journal editorandpublisher.com to analyze the issues faced by a the struggling newspaper industry.

In [3]:
# Necessary imports
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import seaborn as sns
import matplotlib.pyplot as plt
import dateutil.parser
import re

## Recent articles 
Post-Aug 10, 2018

### Getting links to articles

In [32]:
# grabbing links to article pages for each issue in last 2 years

industry_news_links = []

for page_num in range(373): # 373 pages 
    link = "http://www.editorandpublisher.com/browse.html?page_size=20&search_filter_mode=and&sub_type=stories%2Cphotos%2Cvideos%2Cspecialsections%2Cprintissues%2Ceeditions%2Cpackages%2Cmagazines%2Cmaps%2Cfeeds%2Cpolls&page={}".format(page_num)    # making list of pages with industry news
    industry_news_links.append(link)                                               


In [41]:
# getting soup for each list page and parsing out soups for tables with links for each page

rows = []    # will have all article page links

for url in industry_news_links:
    response = requests.get(url)                                                   # reading each list page (161 total)
    if response.status_code != 200:
        print(url, response.status_code)    # checkign response is in 200s for each URL
    page = response.text
    soup = BeautifulSoup(page, "html5lib")                                         # getting soup for each list page
    row = soup.find_all('div', class_='story_list')  # narrowing soups down to only sections with article page links 
    rows.append(row)                                                               # adding each soup to list, makes list of lists, each list featuring all links from a single year

In [44]:
type(rows[0]) # list of lists

bs4.element.ResultSet

In [104]:
# taking each soup of tables and pulling 

recent_article_links = {}    # setting empty dict for key=name val=link

for soup in rows:
    for element in soup:
        for article in element.find_all(class_='headline'):
            link = article.find('a')   
            title, url = link.text, link['href']
            title_clean = title.replace('\n','').replace('\t','')    # clean up html formatting in title strings
            recent_article_links[title_clean] = url

In [105]:
len(recent_article_links)

7024

In [106]:
recent_article_links['How LION Publishers is Becoming the Destination for News Entrepreneurs']

'/stories/how-lion-publishers-is-becoming-the-destination-for-news-entrepreneurs,170466?'

### Getting text from article pages
#### Testing on single article page

In [112]:
response = requests.get('http://www.editorandpublisher.com/stories/how-lion-publishers-is-becoming-the-destination-for-news-entrepreneurs,170466?')      

In [113]:
response.status_code

200

In [114]:
page = response.text
soup = BeautifulSoup(page, "html5lib") 

In [117]:
# title

title = soup.find('h1').text
title

'How LION Publishers is Becoming the Destination for News Entrepreneurs'

In [133]:
# pub date

pub_date = soup.find('time').text
pub_date = pub_date.replace('\n','').replace('\t','') 
pub_date = pub_date.split()[1:4]
pub_date = ' '.join(pub_date)
pub_date

'August 7, 2020'

In [140]:
# author/source

byline = soup.find('div', class_='byline').text
author, source = byline.split(sep='|')
author = author.strip()
source = source.strip()
print(author)
print(source)

Anika Anand
LION Publishers


In [153]:
# body text

body_text = ''

for paragraph in soup.find('div', class_='body main-body clearfix').find_all('p'):
    body_text += paragraph.text
    
body_text = body_text.replace('Click here to read more.','')
body_text

'If we want to help preserve the impact of good journalism, we must support journalism entrepreneurship that leads to financially sustainable news organizations. And this is most important for communities that have been historically underrepresented or mischaracterized by existing legacy news publications.Fortunately, an ecosystem is beginning to emerge among those of us who are trying to create and support digital news startups. While each of us may have different theories of change, I’m confident we’re all after the same goal: To ensure that the future of independent media is equitable, impactful and sustainable.'

#### Functions to clean up strings

In [15]:
def to_date(datestring):
    if datestring:
        datestring = datestring.replace('\n','').replace('\t','') 
        datestring = datestring.split()[1:4]
        datestring = ' '.join(datestring)
        datestring = dateutil.parser.parse(datestring)
        return datestring
    else:
        return None
    

def clean_text_string(text_string):
    if text_string:
        text_string = (text_string
                       .strip()
                       .replace('\n', '')
                       .replace('\t', '')
                       .replace('Click here to read more.','')
                       .replace('Click here to read','')
                       .replace('\xa0 more.', ''))
        
        return text_string
    else:
        return None
    
    
def separate_byline(text_string):
    if text_string is None:
        return None
        
    else:
        author, source = text_string.split(sep='|')
        author = author.strip()
        source = source.strip()
        return author, source

#### Functions to grab page elements

In [3]:
def get_title(soup):
    
    '''Takes a soup and returns article title or None if nothing is found.'''
    
    obj = soup.find('h1')

    if not obj: 
        return None
    
    if obj:
        return obj.text 
    
    else:
        return None
    
    
def get_pub_date(soup):
    
    '''Takes a soup returns the article publish date or None if nothing is found.'''
    
    obj = soup.find('time')

    if not obj: 
        return None
    
    if obj:
        return obj.text 
    
    else:
        return None
    
    
def get_author(soup):
    
    '''Takes a soup returns the article source or None if nothing is found.'''
    
    obj = soup.find('div', class_='byline')

    if not obj: 
        return None
    
    if obj:
        byline = obj.text
        author = byline.split(sep='|')[0]
        return author 
    
    else:
        return None
    

    
def get_source(soup):
    
    '''Takes a soup returns the article author or None if nothing is found.'''
    
    obj = soup.find('div', class_='byline')

    if not obj: 
        return None
    
    byline = obj.text
    
    if len(byline.split(sep='|')) > 1:
        
        author = byline.split(sep='|')[0]
        source = byline.split(sep='|')[1]
        
        if source != author:
            return source
        
        else: 
            return None
    
    else:
        return None
    
def get_body_text(soup):
    
    '''Takes a soup returns the article body text, or None if nothing is found.'''
    
    obj = soup.find('div', class_='body main-body clearfix').find_all('p')

    if not obj: 
        return None
    
    if obj:
        body_text = ''
        
        if len(obj) > 1:
            for paragraph in obj:
                    body_text += ' '
                    body_text += paragraph.text

                
        else:
            obj2 = soup.find('div', class_='body main-body clearfix').find('p')
            body_text += obj2.text   # handles cases with only one paragraph
            
        return body_text
    
    else:
        return None


#### Extending to all article pages

In [4]:
def get_article_dict(link):
    '''
    From Editor and Publisher link stub, request movie html, parse with BeautifulSoup, and
    collect 
        'title', 
        'pub_date', 
        'author',
        'source',
        'body_text',
        'url'
    Return information as a dictionary.
    '''
    
    base_url = 'http://www.editorandpublisher.com/'
    
    #Create full url to scrape
    url = base_url + link
    
    # Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "html5lib")

    
    headers = ['title',
               'pub_date',
               'author',
               'source',
               'body_text',
               'url']
    try:
        # Get title
        title_string = get_title(soup)
        title = clean_text_string(title_string)

        # Get publish date
        pub_date_string = get_pub_date(soup)
        pub_date = to_date(pub_date_string)

        # Get author 
        author_string = get_author(soup)
        author = clean_text_string(author_string)

        # Get source 
        source_string = get_source(soup)
        source = clean_text_string(source_string)

        # Get body text
        body_text_string = get_body_text(soup)
        body_text = clean_text_string(body_text_string)
    
    except AttributeError:
        print('Oops! There was an error with an article, so we skilled ', url)
        pass
    
    #Create movie dictionary and return
    article_dict = dict(zip(headers, [title,
                                      pub_date,
                                      author,
                                      source,
                                      body_text,
                                      url
                                     ]))

    return article_dict

#### Testing on pages with slightly different attributes

In [345]:
# Testing on one page (all elements present)

test_link = 'http://www.editorandpublisher.com/stories/how-lion-publishers-is-becoming-the-destination-for-news-entrepreneurs,170466?'
get_article_dict(test_link)

{'title': 'How LION Publishers is Becoming the Destination for News Entrepreneurs',
 'pub_date': datetime.datetime(2020, 8, 7, 0, 0),
 'author': 'Anika Anand',
 'source': 'LION Publishers',
 'body_text': 'If we want to help preserve the impact of good journalism, we must support journalism entrepreneurship that leads to financially sustainable news organizations. And this is most important for communities that have been historically underrepresented or mischaracterized by existing legacy news publications. Fortunately, an ecosystem is beginning to emerge among those of us who are trying to create and support digital news startups. While each of us may have different theories of change, I’m confident we’re all after the same goal: To ensure that the future of independent media is equitable, impactful and sustainable. ',
 'url': 'http://www.editorandpublisher.com/http://www.editorandpublisher.com/stories/how-lion-publishers-is-becoming-the-destination-for-news-entrepreneurs,170466?'}

In [346]:
# Testing on one page (no byline)

test_link = 'http://www.editorandpublisher.com/stories/saving-the-youngstown-vindicator,169850?'
get_article_dict(test_link)

{'title': 'Saving the Youngstown Vindicator',
 'pub_date': datetime.datetime(2020, 8, 5, 0, 0),
 'author': None,
 'source': None,
 'body_text': 'Listen to the audio Last year, the Maag-Brown family, owners of The Youngstown (Ohio) Vindicator, announced it would shut down after serving its local community for 150 years. As a result, 144 employees and 250 carriers were laid off. Not finding a buyer, the family made an announcement to their 34,000+ subscribers in June 2019 that their last edition would be published Aug. 31. Hearing the news, Ogden Newspapers put together a plan to publish a daily product for Youngstown and the Mahoning County, Ohio residents from their operation 16 miles north at the Warren Tribune Chronicle. In just three months, the staff at the Tribune Chronicle approached the Maag-Brown family and negotiated the purchase of the masthead, subscription list and Vindicator website (Vindy.com), and began the task of generating a quality local news product for the resident

In [347]:
# Testing on one page (only single paragraph of body text)

test_link = 'http://www.editorandpublisher.com/stories/mike-clark-longtime-usa-today-film-critic-dies-at-73,170192?'
get_article_dict(test_link)

{'title': 'Mike Clark, Longtime USA Today Film Critic, Dies at 73',
 'pub_date': datetime.datetime(2020, 8, 5, 0, 0),
 'author': 'Greg Evans',
 'source': 'Deadline',
 'body_text': 'Mike Clark, the film critic for USA Today from 1985 until 2009, died July 31 at a Reston, Virginia, hospital from a head injury sustained in a fall at his Virginia home on July 27. He was 73, and had been battling liver disease for several years.',
 'url': 'http://www.editorandpublisher.com/http://www.editorandpublisher.com/stories/mike-clark-longtime-usa-today-film-critic-dies-at-73,170192?'}

In [348]:
# Testing on one page (author but no source)

test_link = 'http://www.editorandpublisher.com/stories/northwestern-local-news-initiative-launches-research-and-development-project,88094?'
get_article_dict(test_link)



{'title': 'Northwestern Local News Initiative Launches Research and Development Project',
 'pub_date': datetime.datetime(2018, 8, 9, 0, 0),
 'author': 'Rachael Garcia',
 'source': None,
 'body_text': 'Facilitator Tran Ha gives a presentation to news executives, thought leaders, philanthropists and representatives of the Learning Lab news organizations during the Medill Local News Summit.    There’s no question consumers are rapidly changing how and where they access news and information. How newspapers should reinvent themselves to remain a relevant news source is a different story. The Northwestern University’s Local News Initiative is a two-year research and development project hoping to tell that story by providing a greater understanding of how individuals engage with local news and to find new approaches to bolster business models.    In addition, Northwestern (home to the Medill School of Journalism, Media, Integrated Marketing Communication in Evanston, Ill.) has also partnered 

In [349]:
# Testing on one page (author but no source)

test_link = 'http://www.editorandpublisher.com/stories/wall-street-journal-staff-members-push-for-big-changes-in-news-coverage,167688?'
get_article_dict(test_link)



{'title': 'Wall Street Journal Staff Members Push for Big Changes in News Coverage',
 'pub_date': datetime.datetime(2020, 7, 10, 0, 0),
 'author': 'Marc Tracy and Ben Smith',
 'source': 'New York Times',
 'body_text': 'Staff members of The Wall Street Journal have been pressing newsroom leaders to make fundamental changes in how the newspaper covers race, policing, and its primary focus, the business world, along with other matters. Click here to read\xa0 more.',
 'url': 'http://www.editorandpublisher.com/http://www.editorandpublisher.com/stories/wall-street-journal-staff-members-push-for-big-changes-in-news-coverage,167688?'}

#### Scraping all recent articles

In [350]:
# function to feed link list to scraping function

def scrape_articles_list(links, empty_list):
    if len(links) > 1:
        for link in links:
            empty_list.append(get_article_dict(link))
    else:
        empty_list.append(get_article_dict(links))


In [384]:
### ONLY RUN THIS LINE THE FIRST TIME USING ###
### DO NOT USE IN SUBSEQUENT ATTEMPS WHEN PICKIN UP HALFWAY DOWN LIST ###

article_info_list_1 = []      

In [385]:
# Converting link stubs dict to a list

recent_article_links_list = list(recent_article_links.values())

In [386]:
# RUNNING SCRAPER ON ARTICLE PAGES POST AUG 10 2018 - FIRST ATTEMPT

scrape_articles_list(recent_article_links_list, article_info_list_1)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [387]:
len(article_info_list_1)

300

In [392]:
scrape_articles_list(recent_article_links_list[301:], article_info_list_1)

In [393]:
len(article_info_list_1)

7023

In [395]:
# CONVERTING RECENT ARTICLE DATA TO DF AND SAVING AS CSV

recent_articles_df = pd.DataFrame(article_info_list_1)

recent_articles_df.to_csv('data/recent_articles_raw.csv')

In [402]:
# most common sources

recent_articles_df.groupby('source').count().sort_values(by='title', ascending=False).head(20)

Unnamed: 0_level_0,title,pub_date,author,body_text,url
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Digiday,555,555,555,555,555
CJR,466,466,466,466,466
Nieman Lab,435,435,435,435,435
Poynter,374,374,374,374,374
INMA,285,285,285,285,285
Washington Post,270,270,270,270,270
New York Times,255,255,255,255,255
Guardian,184,184,184,184,184
Associated Press,162,162,162,162,162
journalism.co.uk,143,143,143,143,143


## Archived articles
Pre-Aug 10, 2018

### Getting links to articles

In [409]:
# grabbing links to article pages for each issue in last 2 years

archive_links = []

for page_num in range(7706): # 7706 pages 
    link = "http://www.editorandpublisher.com/browse.html?content_source=archive&page_size=20&search_filter_mode=and&sub_type=stories%2Cphotos%2Cvideos%2Cspecialsections%2Cprintissues%2Ceeditions%2Cpackages%2Cmagazines%2Cmaps%2Cfeeds%2Cpolls&page={}".format(page_num)    # making list of pages with industry news
    archive_links.append(link)                                               


In [419]:
len(archive_links)

7706

In [421]:
# getting soup for each list page and parsing out soups for tables with links for each page

rows2 = []    # will have all article page links

for url in archive_links:
    response = requests.get(url)                                                   # reading each list page (161 total)
    if response.status_code != 200:
        print(url, response.status_code)    # checkign response is in 200s for each URL
        pass
    page = response.text
    soup = BeautifulSoup(page, "html5lib")                                         # getting soup for each list page
    row = soup.find_all('div', class_='story_list')  # narrowing soups down to only sections with article page links 
    rows2.append(row)                                                               # adding each soup to list, makes list of lists, each list featuring all links from a single year

In [422]:
len(rows2)

7706

In [423]:
# CONVERTING ARCHIVE ARTICLE SOUPS TO DF AND SAVING AS CSV

archive_article_soups_df = pd.DataFrame(rows2)

archive_article_soups_df.to_csv('data/archive_article_soups.csv')

In [424]:
# taking each soup of tables and pulling 

archive_article_links = {}    # setting empty dict for key=name val=link

for soup in rows2:
    for element in soup:
        for article in element.find_all(class_='headline'):
            link = article.find('a')   
            title, url = link.text, link['href']
            title_clean = title.replace('\n','').replace('\t','')    # clean up html formatting in title strings
            archive_article_links[title_clean] = url

In [427]:
# Converting link stubs dict to a list
archive_article_links_list = list(archive_article_links.values())

In [428]:
len(archive_article_links_list)

74955

In [429]:
# CONVERTING ARCHIVE ARTICLE LINK STUBS TO DF AND SAVING AS CSV

archive_article_links_df = pd.DataFrame(archive_article_links_list)

archive_article_links_df.to_csv('data/archive_article_links.csv')

In [434]:
### ONLY RUN THIS LINE THE FIRST TIME USING ###
### DO NOT USE IN SUBSEQUENT ATTEMPS WHEN PICKIN UP HALFWAY DOWN LIST ###

article_info_list_2 = []

In [435]:
# RUNNING SCRAPER ON ARTICLE PAGES POST AUG 10 2018 - FIRST ATTEMPT

scrape_articles_list(archive_article_links_list, article_info_list_2)

ConnectionError: HTTPConnectionPool(host='www.editorandpublisher.com', port=80): Max retries exceeded with url: //stories/newsonomics-will-facebooks-troubles-finally-cure-publishers-of-platformitis,7417 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x2eff63c70>: Failed to establish a new connection: [Errno 60] Operation timed out'))

In [436]:
len(article_info_list_2)

1120

In [437]:
# RUNNING SCRAPER ON ARTICLE PAGES POST AUG 10 2018 - SECOND ATTEMPT

scrape_articles_list(archive_article_links_list[1121:], article_info_list_2)

KeyboardInterrupt: 

In [438]:
len(article_info_list_2)

7991

In [576]:
# CONVERTING PARTIAL ARCHIVE ARTICLE INFO TO DF AND SAVING AS CSV

archive_article_info_df = pd.DataFrame(article_info_list_2)

archive_article_info_df.to_csv('data/archive_articles_raw.csv')

In [490]:
# RUNNING SCRAPER ON ARTICLE PAGES POST AUG 10 2018 - SECOND thru Nth ATTEMPT

scrape_articles_list(archive_article_links_list[74957:], article_info_list_2)

In [485]:
len(article_info_list_2)

74955

In [489]:
len(archive_article_links_list)

74955

In [535]:
# CONVERTING PARTIAL ARCHIVE ARTICLE INFO TO DF AND SAVING AS CSV

archive_article_info_df = pd.DataFrame(article_info_list_2)

archive_article_info_df.to_csv('data/archive_articles_raw.csv')

In [24]:
archive_article_info_df = pd.read_csv('data/archive_articles_raw.csv')

In [25]:
archive_article_info_df.describe()

Unnamed: 0.1,Unnamed: 0
count,74955.0
mean,37477.0
std,21637.789051
min,0.0
25%,18738.5
50%,37477.0
75%,56215.5
max,74954.0


In [27]:
import datetime

In [28]:
# adding pub_year col to make navigating by date easier

archive_article_info_df['pub_year'] = pd.DatetimeIndex(archive_article_info_df['pub_date']).year

In [43]:
# text stoped scraping in 2010, all the way back to 1993. need to investigate this and rescrape

archive_article_info_df[archive_article_info_df.pub_year==2010]

Unnamed: 0.1,Unnamed: 0,title,pub_date,author,source,body_text,url,pub_year
27488,27488,Pulitzer Prize-Winning Author to Contribute fo...,2010-12-29,E&P Staff,,"Judith Miller, an author and a Pulitzer Prize-...",http://www.editorandpublisher.com//stories/pul...,2010
27489,27489,Philadelphia Inquirer Names New Editor,2010-12-29,E&P Staff,,"The Philadelphia Inquirer, under new managemen...",http://www.editorandpublisher.com//stories/phi...,2010
27490,27490,Gregg Birnbaum Joins POLITICO,2010-12-29,E&P Staff,,"Gregg Birnbaum, the former New York Post polit...",http://www.editorandpublisher.com//stories/gre...,2010
27491,27491,Matthew Ipsan Named Chief Digital Officer for ...,2010-12-29,E&P Staff,,Community Newspaper Holdings Inc. has named Ma...,http://www.editorandpublisher.com//stories/mat...,2010
27492,27492,Don Melvin Named Brussels News Editor for AP,2010-12-29,E&P Staff,,"Don Melvin, a veteran London-based editor for ...",http://www.editorandpublisher.com//stories/don...,2010
...,...,...,...,...,...,...,...,...
30286,30286,Hearst Sails this Skiff Into the E-future,2010-01-01,Jennifer Saba,,,http://www.editorandpublisher.com//stories/hea...,2010
30287,30287,Chicago Bullish on Local News Co-op,2010-01-01,E&P Staff,,,http://www.editorandpublisher.com//stories/chi...,2010
30288,30288,"Buffalo, Buffett-Style",2010-01-01,Joe Strupp,,,http://www.editorandpublisher.com//stories/buf...,2010
30289,30289,California Angel Still Reading at 102,2010-01-01,E&P Staff,,,http://www.editorandpublisher.com//stories/cal...,2010


In [572]:
# last record with body_text

archive_article_info_df.iloc[29671]

title        Unification Church Putting 'Washington Times' ...
pub_date                                   2010-05-01 00:00:00
author                                               E&P Staff
source                                                    None
body_text    Executives at The Washington Times are negotia...
url          http://www.editorandpublisher.com//stories/uni...
Name: 29671, dtype: object

In [573]:
archive_article_info_df.iloc[29671].url

'http://www.editorandpublisher.com//stories/unification-church-putting-washington-times-up-for-sale,138203?'

In [574]:
#first record w/o body_text. everything beyond this point must be scraped again

archive_article_info_df.iloc[29672]

title        What Does Philadelphia Newspaper Auction Say A...
pub_date                                   2010-04-30 00:00:00
author                                               E&P Staff
source                                                    None
body_text                                                 None
url          http://www.editorandpublisher.com//stories/wha...
Name: 29672, dtype: object

In [575]:
archive_article_info_df.iloc[29672].url

'http://www.editorandpublisher.com//stories/what-does-philadelphia-newspaper-auction-say-about-the-value-of-newspapers,56824?'

## Modifying scraper to capture body_text from older articles
articles from 2010 and before have different HTLM formatting


In [6]:
response = requests.get('http://www.editorandpublisher.com//stories/what-does-philadelphia-newspaper-auction-say-about-the-value-of-newspapers,56824?')      

In [7]:
response.status_code

200

In [8]:
page = response.text
soup = BeautifulSoup(page, "html5lib") 

In [18]:
get_author(soup)

'E&P Staff'

In [20]:
# misses source, not a huge deal

get_source(soup)

In [21]:
get_pub_date(soup)

'\n\t\t\tFriday, \n\t\t\tApril 30, 2010 \n\t\t\t12:00 am\n\t\t'

In [22]:
# missing body text is a critical error

get_body_text(soup)

In [37]:
soup.find('div', class_='byline').text

'E&P Staff'

In [46]:
# new method to find body text

clean_text_string(soup.find('div', class_='body main-body clearfix').text)[:200]

'E&P StaffBy: Mark Fitzgerald    The furious last-minute bidding for Philadelphia Newspapers -- which saw the price bumping up $10 million in cash each round, according to participants  -- thrilled som'

New method unavoidably captures author info, but I might be able to clean out later with regex if it turns up in top topic words. Next I'll make an updated get_body fcn and rescrape archived articles before the layout change date. 

In [47]:

def get_body_text2(soup):
    
    '''Takes a soup returns the article body text, or None if nothing is found.'''
    
    obj = soup.find('div', class_='body main-body clearfix')

    if not obj: 
        return None
    
    if obj:
            
        return obj.text
    
    else:
        return None

In [49]:
def get_article_dict2(link):
    '''
    From Editor and Publisher link stub, request movie html, parse with BeautifulSoup, and
    collect 
        'title', 
        'pub_date', 
        'author',
        'source',
        'body_text',
        'url'
    Return information as a dictionary.
    '''
    
    base_url = 'http://www.editorandpublisher.com/'
    
    #Create full url to scrape
    url = base_url + link
    
    # Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "html5lib")

    
    headers = ['title',
               'pub_date',
               'author',
               'source',
               'body_text',
               'url']
    try:
        # Get title
        title_string = get_title(soup)
        title = clean_text_string(title_string)

        # Get publish date
        pub_date_string = get_pub_date(soup)
        pub_date = to_date(pub_date_string)

        # Get author 
        author_string = get_author(soup)
        author = clean_text_string(author_string)

        # Get source 
        source_string = get_source(soup)
        source = clean_text_string(source_string)

        # Get body text
        body_text_string = get_body_text2(soup)          # updated this function
        body_text = clean_text_string(body_text_string)
    
    except AttributeError:
        print('Oops! There was an error with an article, so we skilled ', url)
        pass
    
    #Create movie dictionary and return
    article_dict = dict(zip(headers, [title,
                                      pub_date,
                                      author,
                                      source,
                                      body_text,
                                      url
                                     ]))

    return article_dict

In [67]:
# function to feed link list to scraping function, REVISED

def scrape_articles_list2(links, empty_list):
    if len(links) > 1:
        for link in links:
            empty_list.append(get_article_dict2(link))
    else:
        empty_list.append(get_article_dict2(links))


In [95]:
# stopping point of body text from last round of archive scraping. pickup here. 

archive_article_links_df = pd.read_csv('data/archive_article_links.csv').drop(['Unnamed: 0'], axis=1)
archive_article_links2 = archive_article_links_df.iloc[29674:,0].tolist()

In [71]:
# INITIALIZING NEW (3rd) LIST FOR RESCRAPED ARCHIVED ARTICLES

article_info_list_3 = []

In [106]:
# RUNNING SCRAPER ON ARTICLE PAGES BEFORE FORMAT CHAGE - FIRST ATTEMPT

scrape_articles_list2(archive_article_links2[29665:], article_info_list_3)

In [107]:
len(article_info_list_3)

45280

In [109]:
# CONVERTING PARTIAL ARCHIVE ARTICLE INFO TO DF AND SAVING AS CSV

archive_article_info_df2 = pd.DataFrame(article_info_list_3)

archive_article_info_df2.to_csv('data/archive_articles_raw2.csv')