# Metis Final Project Part III. Using Beautiful Soup to scrape article text-content and store the content in a Pandas dataframe. 

In [1]:
import pickle
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
doc = pickle.load( open('completelist.pickle','rb'))
url_list = list(doc)

In [2]:
len(url_list)

19702

In [4]:
url_list0to1000 = url_list[0:1000]

In [5]:
len(url_list0to1000)

1000

### The below code writes a function to scrape NYT article URLs for text and date content. The article tags identified for both types of content were determined through a trial and error process searching through articles from different sections (i.e. Business, Economics, DealBook, etc.). The data is then appended to a blank dataframe. 

In [8]:
url_test = 'https://www.nytimes.com/2001/01/16/nyregion/c-corrections-459399.html'

In [10]:
text_dfs = [get_text1(url) for url in url_list0to1000]

In [12]:
text_dfs3 = pd.concat(text_dfs).reset_index()

In [13]:
text_dfs3.shape

(1000, 4)

In [14]:
text_dfs3

Unnamed: 0,index,0,1,2
0,0,,https://www.nytimes.com/2001/01/16/nyregion/c-...,A chart of financial indicators at the bottom ...
1,0,"July 9, 2018",https://www.nytimes.com/2018/07/09/opinion/tru...,"According to early indications, recent U.S. ec..."
2,0,"JAN. 22, 2008JAN. 21, 2008JAN. 22, 2008JAN. 22...",https://www.nytimes.com/2008/01/22/business/wo...,"The Federal Reserve, responding to an internat..."
3,0,"August 25, 2015August 25, 2015 7:28 am",https://news.blogs.nytimes.com/2015/08/25/morn...,CHINA CUTS RATES AFTER SHANGHAI STOCKS PLUNGE ...
4,0,"DEC. 8, 2010",https://www.nytimes.com/2010/12/09/opinion/09t...,There are compelling reasons why Congress shou...
5,0,"June 29, 2018",https://www.nytimes.com/2018/06/29/us/politics...,"— Larry Kudlow, the White House economic advi..."
6,0,,https://www.nytimes.com/2003/06/10/business/fo...,By signaling that he is prepared to cut intere...
7,0,,https://www.nytimes.com/2000/11/05/weekinrevie...,A chart last Sunday illustrating definitions o...
8,0,"Sept. 15, 2015",https://www.nytimes.com/2015/09/16/business/ec...,WASHINGTON — Data released on Tuesday revealed...
9,0,"Nov. 15, 2016",https://www.nytimes.com/2016/11/16/business/ec...,The harvest had just begun when agents from th...


In [15]:
text_dfs3.columns = ['index','date', 'url','article']

In [16]:
text_dfs3.head(1)

Unnamed: 0,index,date,url,article
0,0,,https://www.nytimes.com/2001/01/16/nyregion/c-...,A chart of financial indicators at the bottom ...


In [17]:
text_dfs8 = text_dfs3

### The below code removes na' values from the data frames and checks to ensure that the code worked properly. The code then proceeds to the regex_date function, which parses through the 'url' column in the pandas data frame to access the date in a clean fashion. The results from the scraper did not generate sensible date content, thus this alternative method was employed. 

In [18]:
text_dfs8 = text_dfs8.dropna(subset = ['url','article'])


In [19]:
text_dfs8.isnull().values.sum()

0

In [20]:
import re, datetime
def regex_date(string):
    try:
        match = re.search('\d{4}/\d{2}/\d{2}',string)
        date = datetime.datetime.strptime(match.group(), '%Y/%m/%d').date()
        return date
    except:
        return None

In [21]:
text_dfs8['url_date'] = text_dfs8['url'].apply(regex_date)

In [22]:
text_dfs8.head()

Unnamed: 0,index,date,url,article,url_date
0,0,,https://www.nytimes.com/2001/01/16/nyregion/c-...,A chart of financial indicators at the bottom ...,2001-01-16
1,0,"July 9, 2018",https://www.nytimes.com/2018/07/09/opinion/tru...,"According to early indications, recent U.S. ec...",2018-07-09
2,0,"JAN. 22, 2008JAN. 21, 2008JAN. 22, 2008JAN. 22...",https://www.nytimes.com/2008/01/22/business/wo...,"The Federal Reserve, responding to an internat...",2008-01-22
3,0,"August 25, 2015August 25, 2015 7:28 am",https://news.blogs.nytimes.com/2015/08/25/morn...,CHINA CUTS RATES AFTER SHANGHAI STOCKS PLUNGE ...,2015-08-25
4,0,"DEC. 8, 2010",https://www.nytimes.com/2010/12/09/opinion/09t...,There are compelling reasons why Congress shou...,2010-12-09


In [23]:
text_dfs8.shape

(1000, 5)

In [24]:
text_dfs8.dtypes

index        int64
date        object
url         object
article     object
url_date    object
dtype: object

In [25]:
pickle.dump( text_dfs8, open( "text_dfs8_0to1000", "wb" ) )

In [28]:
text_dfs8_0to1000 = text_dfs8

In [30]:
url_list1001to2000 = url_list[1001:2000]

### The below code creates the same data frame tables for sections of every 1000 articles in the first list (date range 2000-2018). The dataframes are stored in pickle files as well. 

In [31]:
text_dfs8_1001to2000 = [get_text1(url) for url in url_list1001to2000]

In [32]:
text_dfs8_1001to2000 = pd.concat(text_dfs).reset_index()

In [33]:
text_dfs8_1001to2000.shape

(1000, 4)

In [34]:
text_dfs8_1001to2000.head(10)

Unnamed: 0,index,0,1,2
0,0,,https://www.nytimes.com/2001/01/16/nyregion/c-...,A chart of financial indicators at the bottom ...
1,0,"July 9, 2018",https://www.nytimes.com/2018/07/09/opinion/tru...,"According to early indications, recent U.S. ec..."
2,0,"JAN. 22, 2008JAN. 21, 2008JAN. 22, 2008JAN. 22...",https://www.nytimes.com/2008/01/22/business/wo...,"The Federal Reserve, responding to an internat..."
3,0,"August 25, 2015August 25, 2015 7:28 am",https://news.blogs.nytimes.com/2015/08/25/morn...,CHINA CUTS RATES AFTER SHANGHAI STOCKS PLUNGE ...
4,0,"DEC. 8, 2010",https://www.nytimes.com/2010/12/09/opinion/09t...,There are compelling reasons why Congress shou...
5,0,"June 29, 2018",https://www.nytimes.com/2018/06/29/us/politics...,"— Larry Kudlow, the White House economic advi..."
6,0,,https://www.nytimes.com/2003/06/10/business/fo...,By signaling that he is prepared to cut intere...
7,0,,https://www.nytimes.com/2000/11/05/weekinrevie...,A chart last Sunday illustrating definitions o...
8,0,"Sept. 15, 2015",https://www.nytimes.com/2015/09/16/business/ec...,WASHINGTON — Data released on Tuesday revealed...
9,0,"Nov. 15, 2016",https://www.nytimes.com/2016/11/16/business/ec...,The harvest had just begun when agents from th...


In [36]:
text_dfs8_1001to2000.columns = ['index','date', 'url','article']

In [37]:
text_dfs8_1001to2000.head(10)

Unnamed: 0,index,date,url,article
0,0,,https://www.nytimes.com/2001/01/16/nyregion/c-...,A chart of financial indicators at the bottom ...
1,0,"July 9, 2018",https://www.nytimes.com/2018/07/09/opinion/tru...,"According to early indications, recent U.S. ec..."
2,0,"JAN. 22, 2008JAN. 21, 2008JAN. 22, 2008JAN. 22...",https://www.nytimes.com/2008/01/22/business/wo...,"The Federal Reserve, responding to an internat..."
3,0,"August 25, 2015August 25, 2015 7:28 am",https://news.blogs.nytimes.com/2015/08/25/morn...,CHINA CUTS RATES AFTER SHANGHAI STOCKS PLUNGE ...
4,0,"DEC. 8, 2010",https://www.nytimes.com/2010/12/09/opinion/09t...,There are compelling reasons why Congress shou...
5,0,"June 29, 2018",https://www.nytimes.com/2018/06/29/us/politics...,"— Larry Kudlow, the White House economic advi..."
6,0,,https://www.nytimes.com/2003/06/10/business/fo...,By signaling that he is prepared to cut intere...
7,0,,https://www.nytimes.com/2000/11/05/weekinrevie...,A chart last Sunday illustrating definitions o...
8,0,"Sept. 15, 2015",https://www.nytimes.com/2015/09/16/business/ec...,WASHINGTON — Data released on Tuesday revealed...
9,0,"Nov. 15, 2016",https://www.nytimes.com/2016/11/16/business/ec...,The harvest had just begun when agents from th...


### The below code applies the previously defined 'regex_date' function to the 'url' column of the dataframe to extract the date in an appropriate format. 

### The remainder of this notebook consists of the creation and storage via pickling of different dataframes containing article text, url, and date content. This process is repeated until ultimately the ~19,000 articles are all stored in memory.

In [38]:
text_dfs8_1001to2000['url_date'] = text_dfs8_1001to2000['url'].apply(regex_date)

In [39]:
text_dfs8_1001to2000.head()

Unnamed: 0,index,date,url,article,url_date
0,0,,https://www.nytimes.com/2001/01/16/nyregion/c-...,A chart of financial indicators at the bottom ...,2001-01-16
1,0,"July 9, 2018",https://www.nytimes.com/2018/07/09/opinion/tru...,"According to early indications, recent U.S. ec...",2018-07-09
2,0,"JAN. 22, 2008JAN. 21, 2008JAN. 22, 2008JAN. 22...",https://www.nytimes.com/2008/01/22/business/wo...,"The Federal Reserve, responding to an internat...",2008-01-22
3,0,"August 25, 2015August 25, 2015 7:28 am",https://news.blogs.nytimes.com/2015/08/25/morn...,CHINA CUTS RATES AFTER SHANGHAI STOCKS PLUNGE ...,2015-08-25
4,0,"DEC. 8, 2010",https://www.nytimes.com/2010/12/09/opinion/09t...,There are compelling reasons why Congress shou...,2010-12-09


In [40]:
pickle.dump( text_dfs8_1001to2000, open( "text_dfs8_1001to2000", "wb" ) )

In [41]:
url_list2001to3000 = url_list[2001:3000]

In [42]:
len(url_list2001to3000)

999

In [43]:
text_dfs8_2001to3000 = [get_text1(url) for url in url_list2001to3000]

In [44]:
text_dfs8_2001to3000 = pd.concat(text_dfs8_2001to3000).reset_index()

In [45]:
text_dfs8_2001to3000.shape

(999, 4)

In [46]:
text_dfs8_2001to3000.columns = ['index','date', 'url','article']

In [47]:
text_dfs8_2001to3000['url_date'] = text_dfs8_2001to3000['url'].apply(regex_date)

In [48]:
text_dfs8_2001to3000.head(10)

Unnamed: 0,index,date,url,article,url_date
0,0,"JULY 10, 2011JULY 10, 2011SEP 5SEP 1AUG 28AUG ...",https://www.nytimes.com/2011/07/11/opinion/11d...,The Republican Party’s strategy in the debt-ce...,2011-07-11
1,0,"JAN. 5, 2009",https://www.nytimes.com/2009/01/06/business/06...,The Securities and Exchange Commission needs t...,2009-01-06
2,0,"August 6, 2013 2:56 pm",https://douthat.blogs.nytimes.com/2013/08/06/r...,"This National Journal piece, the latest to mis...",2013-08-06
3,0,"OCT. 8, 2008MAY 11MAY 11MAY 4APR 27APR 16",https://www.nytimes.com/2008/10/09/business/yo...,It’s a question we’ve all asked in our darker ...,2008-10-09
4,0,,https://www.nytimes.com/interactive/2009/03/06...,,2009-03-06
5,0,"JUNE 20, 2013",https://www.nytimes.com/2013/06/21/us/21iht-le...,NEW YORK — The rich are getting richer. That’...,2013-06-21
6,0,"APRIL 4, 2009",https://www.nytimes.com/2009/04/05/weekinrevie...,WASHINGTON — As he cashiered the head of one o...,2009-04-05
7,0,"July 19, 2011 10:00 am",https://economix.blogs.nytimes.com/2011/07/19/...,"Yet more on the consumer bust, this time from ...",2011-07-19
8,0,"NOV. 13, 2006",https://www.nytimes.com/2006/11/13/opinion/13h...,“America moved me all over again — it was an a...,2006-11-13
9,0,"APRIL 8, 2008APRIL 8, 2008APRIL 8, 2008APRIL 8...",https://www.nytimes.com/2008/04/08/us/politics...,The resignation of Senator Hillary Rodham Clin...,2008-04-08


In [49]:
pickle.dump( text_dfs8_2001to3000, open( "text_dfs8_2001to3000", "wb" ) )

In [50]:
#text_dfs8 = text_dfs8.dropna(subset = ['url','article'])


In [59]:
text_dfs8_2001to3000_test = text_dfs8_2001to3000

In [63]:
text_dfs8_2001to3000_test =  text_dfs8_2001to3000_test.dropna()

In [64]:
text_dfs8_2001to3000_test

Unnamed: 0,index,date,url,article,url_date
0,0,"JULY 10, 2011JULY 10, 2011SEP 5SEP 1AUG 28AUG ...",https://www.nytimes.com/2011/07/11/opinion/11d...,The Republican Party’s strategy in the debt-ce...,2011-07-11
1,0,"JAN. 5, 2009",https://www.nytimes.com/2009/01/06/business/06...,The Securities and Exchange Commission needs t...,2009-01-06
2,0,"August 6, 2013 2:56 pm",https://douthat.blogs.nytimes.com/2013/08/06/r...,"This National Journal piece, the latest to mis...",2013-08-06
3,0,"OCT. 8, 2008MAY 11MAY 11MAY 4APR 27APR 16",https://www.nytimes.com/2008/10/09/business/yo...,It’s a question we’ve all asked in our darker ...,2008-10-09
4,0,,https://www.nytimes.com/interactive/2009/03/06...,,2009-03-06
5,0,"JUNE 20, 2013",https://www.nytimes.com/2013/06/21/us/21iht-le...,NEW YORK — The rich are getting richer. That’...,2013-06-21
6,0,"APRIL 4, 2009",https://www.nytimes.com/2009/04/05/weekinrevie...,WASHINGTON — As he cashiered the head of one o...,2009-04-05
7,0,"July 19, 2011 10:00 am",https://economix.blogs.nytimes.com/2011/07/19/...,"Yet more on the consumer bust, this time from ...",2011-07-19
8,0,"NOV. 13, 2006",https://www.nytimes.com/2006/11/13/opinion/13h...,“America moved me all over again — it was an a...,2006-11-13
9,0,"APRIL 8, 2008APRIL 8, 2008APRIL 8, 2008APRIL 8...",https://www.nytimes.com/2008/04/08/us/politics...,The resignation of Senator Hillary Rodham Clin...,2008-04-08


In [71]:
text_dfs8_2001to3000_test.isnull().values.sum()

0

In [74]:
text_dfs8_2001to3000_test.shape

(966, 5)

In [76]:
text_dfs8_1001to2000.head(10)

Unnamed: 0,index,date,url,article,url_date
0,0,,https://www.nytimes.com/2001/01/16/nyregion/c-...,A chart of financial indicators at the bottom ...,2001-01-16
1,0,"July 9, 2018",https://www.nytimes.com/2018/07/09/opinion/tru...,"According to early indications, recent U.S. ec...",2018-07-09
2,0,"JAN. 22, 2008JAN. 21, 2008JAN. 22, 2008JAN. 22...",https://www.nytimes.com/2008/01/22/business/wo...,"The Federal Reserve, responding to an internat...",2008-01-22
3,0,"August 25, 2015August 25, 2015 7:28 am",https://news.blogs.nytimes.com/2015/08/25/morn...,CHINA CUTS RATES AFTER SHANGHAI STOCKS PLUNGE ...,2015-08-25
4,0,"DEC. 8, 2010",https://www.nytimes.com/2010/12/09/opinion/09t...,There are compelling reasons why Congress shou...,2010-12-09
5,0,"June 29, 2018",https://www.nytimes.com/2018/06/29/us/politics...,"— Larry Kudlow, the White House economic advi...",2018-06-29
6,0,,https://www.nytimes.com/2003/06/10/business/fo...,By signaling that he is prepared to cut intere...,2003-06-10
7,0,,https://www.nytimes.com/2000/11/05/weekinrevie...,A chart last Sunday illustrating definitions o...,2000-11-05
8,0,"Sept. 15, 2015",https://www.nytimes.com/2015/09/16/business/ec...,WASHINGTON — Data released on Tuesday revealed...,2015-09-16
9,0,"Nov. 15, 2016",https://www.nytimes.com/2016/11/16/business/ec...,The harvest had just begun when agents from th...,2016-11-16


In [80]:
abc = text_dfs8_1001to2000

In [81]:
abc.dropna()

Unnamed: 0,index,date,url,article,url_date
0,0,,https://www.nytimes.com/2001/01/16/nyregion/c-...,A chart of financial indicators at the bottom ...,2001-01-16
1,0,"July 9, 2018",https://www.nytimes.com/2018/07/09/opinion/tru...,"According to early indications, recent U.S. ec...",2018-07-09
2,0,"JAN. 22, 2008JAN. 21, 2008JAN. 22, 2008JAN. 22...",https://www.nytimes.com/2008/01/22/business/wo...,"The Federal Reserve, responding to an internat...",2008-01-22
3,0,"August 25, 2015August 25, 2015 7:28 am",https://news.blogs.nytimes.com/2015/08/25/morn...,CHINA CUTS RATES AFTER SHANGHAI STOCKS PLUNGE ...,2015-08-25
4,0,"DEC. 8, 2010",https://www.nytimes.com/2010/12/09/opinion/09t...,There are compelling reasons why Congress shou...,2010-12-09
5,0,"June 29, 2018",https://www.nytimes.com/2018/06/29/us/politics...,"— Larry Kudlow, the White House economic advi...",2018-06-29
6,0,,https://www.nytimes.com/2003/06/10/business/fo...,By signaling that he is prepared to cut intere...,2003-06-10
7,0,,https://www.nytimes.com/2000/11/05/weekinrevie...,A chart last Sunday illustrating definitions o...,2000-11-05
8,0,"Sept. 15, 2015",https://www.nytimes.com/2015/09/16/business/ec...,WASHINGTON — Data released on Tuesday revealed...,2015-09-16
9,0,"Nov. 15, 2016",https://www.nytimes.com/2016/11/16/business/ec...,The harvest had just begun when agents from th...,2016-11-16


In [82]:
url_list3001to4000 = url_list[3001:4000]

In [83]:
text_dfs8_3001to4000 = [get_text1(url) for url in url_list3001to4000]

In [84]:
text_dfs8_3001to4000 = pd.concat(text_dfs8_3001to4000).reset_index()

In [86]:
text_dfs8_3001to4000.columns = ['index','date', 'url','article']

In [88]:
text_dfs8_3001to4000_testing = text_dfs8_3001to4000
text_dfs8_3001to4000.head(4)

Unnamed: 0,index,date,url,article
0,0,"May 2, 2017",https://www.nytimes.com/2017/05/02/business/au...,"For seven years, the steadily expanding auto i..."
1,0,"Feb. 1, 2015",https://www.nytimes.com/2015/02/02/business/ec...,Here are some of the top business stories to p...
2,0,"SEPT. 14, 2012",https://www.nytimes.com/2012/09/15/business/ec...,An increase in the cost of gasoline pushed con...
3,0,"FEB. 26, 2013FEB. 26, 2013FEB. 26, 2013FEB. 26...",https://www.nytimes.com/2013/02/27/business/ec...,"WASHINGTON — The Federal Reserve chairman, Ben..."


In [90]:
text_dfs8_3001to4000_testing = text_dfs8_3001to4000_testing.dropna()

In [91]:
text_dfs8_3001to4000_testing.isnull().values.sum()

0

In [92]:
text_dfs8_3001to4000_testing['url_date'] = text_dfs8_3001to4000_testing['url'].apply(regex_date)

In [106]:
text_dfs8_3001to4000_testing.head(5)

Unnamed: 0,index,date,url,article,url_date
0,0,"May 2, 2017",https://www.nytimes.com/2017/05/02/business/au...,"For seven years, the steadily expanding auto i...",2017-05-02
1,0,"Feb. 1, 2015",https://www.nytimes.com/2015/02/02/business/ec...,Here are some of the top business stories to p...,2015-02-02
2,0,"SEPT. 14, 2012",https://www.nytimes.com/2012/09/15/business/ec...,An increase in the cost of gasoline pushed con...,2012-09-15
3,0,"FEB. 26, 2013FEB. 26, 2013FEB. 26, 2013FEB. 26...",https://www.nytimes.com/2013/02/27/business/ec...,"WASHINGTON — The Federal Reserve chairman, Ben...",2013-02-27
4,0,"Aug. 30, 2016",https://www.nytimes.com/2016/08/31/business/ec...,What’s ahead for the American economy?That que...,2016-08-31


In [94]:
pickle.dump( text_dfs8_3001to4000_testing, open( "text_dfs8_3001to4000_testing", "wb" ) )

In [95]:
url_list4001to5000 = url_list[4001:5000]

In [96]:
text_dfs8_4001to5000 = [get_text1(url) for url in url_list4001to5000]

In [98]:
text_dfs8_4001to5000

Unnamed: 0,index,0,1,2
0,0,"MAY 4, 2009",https://www.nytimes.com/2009/05/05/opinion/05t...,There is something wrong with a system where s...
1,0,,https://www.nytimes.com/2001/11/07/opinion/whi...,Yesterday the Federal Reserve cut interest rat...
2,0,"Dec. 4, 2014",https://www.nytimes.com/2014/12/04/opinion/rog...,New York has always been a pretty good baromet...
3,0,"APRIL 21, 2008",https://www.nytimes.com/2008/04/21/business/re...,THE dream of retiring to a Tuscan villa or a b...
4,0,"June 20, 2013 12:01 am",https://economix.blogs.nytimes.com/2013/06/20/...,"Simon Johnson, former chief economist of the I..."
5,0,"MARCH 29, 2011MARCH 29, 2011MAY 29MAY 22MAY 15...",https://www.nytimes.com/2011/03/30/business/ec...,WASHINGTONWhenever officials at the Federal Re...
6,0,,https://www.nytimes.com/2003/12/11/business/ec...,"THE City of Indianapolis, it has been reported..."
7,0,"JULY 8, 2009JULY 8, 2009JULY 8, 2009",https://www.nytimes.com/2009/07/09/us/politics...,"WASHINGTON — At his inauguration in January, P..."
8,0,"SEPT. 5, 8800",https://www.nytimes.com/2005/05/19/business/19...,Inflation slowed in April from its fast pace i...
9,0,"May 24, 2015",https://www.nytimes.com/2015/05/25/business/pu...,"MIAMI — For the Ingram clan, working for the M..."


In [100]:

pickle.dump( text_dfs8_4001to5000, open( "text_dfs8_4001to5000_testing", "wb" ) )


In [101]:
text_dfs8_4001to5000.columns = ['index','date', 'url','article']

In [108]:
text_dfs8_4001to5000.head(2)

Unnamed: 0,index,date,url,article
0,0,"MAY 4, 2009",https://www.nytimes.com/2009/05/05/opinion/05t...,There is something wrong with a system where s...
1,0,,https://www.nytimes.com/2001/11/07/opinion/whi...,Yesterday the Federal Reserve cut interest rat...


In [110]:
text_dfs8_4001to5000_testing = text_dfs8_4001to5000
text_dfs8_4001to5000_testing.dropna(subset = ['url','article'])

Unnamed: 0,index,date,url,article
0,0,"MAY 4, 2009",https://www.nytimes.com/2009/05/05/opinion/05t...,There is something wrong with a system where s...
1,0,,https://www.nytimes.com/2001/11/07/opinion/whi...,Yesterday the Federal Reserve cut interest rat...
2,0,"Dec. 4, 2014",https://www.nytimes.com/2014/12/04/opinion/rog...,New York has always been a pretty good baromet...
3,0,"APRIL 21, 2008",https://www.nytimes.com/2008/04/21/business/re...,THE dream of retiring to a Tuscan villa or a b...
4,0,"June 20, 2013 12:01 am",https://economix.blogs.nytimes.com/2013/06/20/...,"Simon Johnson, former chief economist of the I..."
5,0,"MARCH 29, 2011MARCH 29, 2011MAY 29MAY 22MAY 15...",https://www.nytimes.com/2011/03/30/business/ec...,WASHINGTONWhenever officials at the Federal Re...
6,0,,https://www.nytimes.com/2003/12/11/business/ec...,"THE City of Indianapolis, it has been reported..."
7,0,"JULY 8, 2009JULY 8, 2009JULY 8, 2009",https://www.nytimes.com/2009/07/09/us/politics...,"WASHINGTON — At his inauguration in January, P..."
8,0,"SEPT. 5, 8800",https://www.nytimes.com/2005/05/19/business/19...,Inflation slowed in April from its fast pace i...
9,0,"May 24, 2015",https://www.nytimes.com/2015/05/25/business/pu...,"MIAMI — For the Ingram clan, working for the M..."


In [111]:
text_dfs8_4001to5000_testing.shape

(999, 4)

In [112]:
text_dfs8_4001to5000_testing['url_date'] = text_dfs8_4001to5000_testing['url'].apply(regex_date)

In [115]:
(text_dfs8_4001to5000_testing.head(2))


Unnamed: 0,index,date,url,article,url_date
0,0,"MAY 4, 2009",https://www.nytimes.com/2009/05/05/opinion/05t...,There is something wrong with a system where s...,2009-05-05
1,0,,https://www.nytimes.com/2001/11/07/opinion/whi...,Yesterday the Federal Reserve cut interest rat...,2001-11-07


In [116]:
print(text_dfs8_4001to5000_testing.shape)

(999, 5)


In [117]:
url_list5001to7000 = url_list[5001:7000]

In [118]:
text_dfs8_5001to7000 = [get_text1(url) for url in url_list5001to7000]

In [119]:
text_dfs8_5001to7000 = pd.concat(text_dfs8_5001to7000).reset_index()

In [120]:
text_dfs8_5001to7000.columns = ['index','date', 'url','article']

In [121]:
text_dfs8_5001to7000.shape

(1999, 4)

In [122]:
text_dfs8_5001to7000.head(2)

Unnamed: 0,index,date,url,article
0,0,"JUNE 21, 2006",https://www.nytimes.com/2006/06/21/business/21...,Wall Street pared an earlier advance to end mi...
1,0,"Oct. 29, 2014",https://www.nytimes.com/2014/10/30/upshot/the-...,The most important thing to understand about t...


In [123]:
pickle.dump( text_dfs8_5001to7000, open( "text_dfs8_5001to7000_testing", "wb" ) )


In [125]:
text_dfs8_5001to7000_testing = text_dfs8_5001to7000

In [126]:
text_dfs8_5001to7000_testing.dropna()

Unnamed: 0,index,date,url,article
0,0,"JUNE 21, 2006",https://www.nytimes.com/2006/06/21/business/21...,Wall Street pared an earlier advance to end mi...
1,0,"Oct. 29, 2014",https://www.nytimes.com/2014/10/30/upshot/the-...,The most important thing to understand about t...
2,0,"JULY 16, 2010MAY 11MAY 11MAY 4APR 27APR 16",https://www.nytimes.com/2010/07/17/your-money/...,"Sometime next week, President Obama will final..."
3,0,"MAY 11, 2007MAY 7, 2007MAY 7, 2007",https://www.nytimes.com/2007/05/11/opinion/l11...,To the Editor:Lawrence Downes (Editorial Obse...
4,0,"SEPT. 6, 6800",https://www.nytimes.com/2002/12/21/us/democrat...,Faced with the certainty that President Bush c...
5,0,"Sept. 22, 2017",https://www.nytimes.com/2017/09/22/upshot/who-...,The most important job in global economics is ...
6,0,"DEC. 17, 2012",https://www.nytimes.com/2012/12/18/us/politics...,"ZEELAND, Mich. — The people who live here on t..."
7,0,,https://www.nytimes.com/2014/06/15/magazine/si...,Like any supersecret lab that’s supposedly try...
8,0,,https://www.nytimes.com/2002/01/27/weekinrevie...,"ENRON did it. So did Kmart. But they are far, ..."
9,0,"Nov. 29, 2016",https://www.nytimes.com/2016/11/29/business/ec...,The United States economy in the third quarter...


In [127]:
text_dfs8_5001to7000_testing['url_date'] = text_dfs8_5001to7000_testing['url'].apply(regex_date)

In [128]:
text_dfs8_5001to7000_testing.head(3)

Unnamed: 0,index,date,url,article,url_date
0,0,"JUNE 21, 2006",https://www.nytimes.com/2006/06/21/business/21...,Wall Street pared an earlier advance to end mi...,2006-06-21
1,0,"Oct. 29, 2014",https://www.nytimes.com/2014/10/30/upshot/the-...,The most important thing to understand about t...,2014-10-30
2,0,"JULY 16, 2010MAY 11MAY 11MAY 4APR 27APR 16",https://www.nytimes.com/2010/07/17/your-money/...,"Sometime next week, President Obama will final...",2010-07-17


In [129]:
url_list7001to10000 = url_list[7001:10000]

In [130]:
text_dfs8_7001to10000 = [get_text1(url) for url in url_list7001to10000]

In [131]:
text_dfs8_7001to10000 = pd.concat(text_dfs8_7001to10000).reset_index()

In [132]:
text_dfs8_7001to10000.columns = ['index','date', 'url','article']

In [133]:
text_dfs8_7001to10000.head(2)

Unnamed: 0,index,date,url,article
0,0,"NOV. 1, 2006MARCH 9, 2015MARCH 9, 2015",https://www.nytimes.com/2006/11/01/business/01...,"A couple of years ago, Robert E. Rubin — émine..."
1,0,"OCT. 7, 2008",https://www.nytimes.com/2008/10/08/business/08...,DETROIT — The few consumers who are buying new...


In [134]:
pickle.dump( text_dfs8_7001to10000, open( "text_dfs8_7001to10000_testing", "wb" ) )

In [135]:
text_dfs8_7001to10000.shape

(2999, 4)

In [136]:
text_dfs8_7001to10000_testing = text_dfs8_7001to10000
text_dfs8_7001to10000_testing.dropna(subset = ['url','article'])

Unnamed: 0,index,date,url,article
0,0,"NOV. 1, 2006MARCH 9, 2015MARCH 9, 2015",https://www.nytimes.com/2006/11/01/business/01...,"A couple of years ago, Robert E. Rubin — émine..."
1,0,"OCT. 7, 2008",https://www.nytimes.com/2008/10/08/business/08...,DETROIT — The few consumers who are buying new...
2,0,,https://www.nytimes.com/2002/07/17/business/ma...,Prices of blue-chip stocks fell for the sevent...
3,0,"March 17, 2016March 17, 2016 6:06 am",https://news.blogs.nytimes.com/2016/03/17/morn...,ACKMAN CUTS HIS MONDELEZ STAKE | William A. ...
4,0,"Dec. 14, 2015",https://www.nytimes.com/2015/12/15/upshot/why-...,The Federal Reserve will most likely raise int...
5,0,,https://www.nytimes.com/2002/08/21/business/th...,The nation's trade deficit has narrowed slight...
6,0,"Oct. 3, 2016",https://www.nytimes.com/2016/10/04/upshot/dona...,"When you buy an “American-made” car, you are p..."
7,0,,https://www.nytimes.com/2000/10/02/opinion/l-p...,To the Editor:Re ''Forget Washington. The Poor...
8,0,"JUNE 17, 2009FEB. 17, 2015MAY 29MAY 22MAY 15MA...",https://www.nytimes.com/2009/06/17/business/ec...,Rationing.More to the point: Rationing!As in: ...
9,0,"Nov. 25, 2014",https://www.nytimes.com/2014/11/26/opinion/foo...,There are four basic ways to change the food s...


In [139]:
text_dfs8_7001to10000_testing.iloc[2998]

index                                                      0
date                                                        
url        https://www.nytimes.com/video/multimedia/10000...
article                                                     
Name: 2998, dtype: object

In [140]:
text_dfs8_7001to10000_testing['url_date'] = text_dfs8_7001to10000_testing['url'].apply(regex_date)

In [141]:
text_dfs8_7001to10000_testing.head()

Unnamed: 0,index,date,url,article,url_date
0,0,"NOV. 1, 2006MARCH 9, 2015MARCH 9, 2015",https://www.nytimes.com/2006/11/01/business/01...,"A couple of years ago, Robert E. Rubin — émine...",2006-11-01
1,0,"OCT. 7, 2008",https://www.nytimes.com/2008/10/08/business/08...,DETROIT — The few consumers who are buying new...,2008-10-08
2,0,,https://www.nytimes.com/2002/07/17/business/ma...,Prices of blue-chip stocks fell for the sevent...,2002-07-17
3,0,"March 17, 2016March 17, 2016 6:06 am",https://news.blogs.nytimes.com/2016/03/17/morn...,ACKMAN CUTS HIS MONDELEZ STAKE | William A. ...,2016-03-17
4,0,"Dec. 14, 2015",https://www.nytimes.com/2015/12/15/upshot/why-...,The Federal Reserve will most likely raise int...,2015-12-15


In [142]:
url_list10001to13000 = url_list[10001:13000]

In [143]:
text_dfs8_10001to13000 = [get_text1(url) for url in url_list10001to13000]

In [144]:
text_dfs8_10001to13000 = pd.concat(text_dfs8_10001to13000).reset_index()

In [145]:
text_dfs8_10001to13000.columns = ['index','date', 'url','article']

In [146]:
text_dfs8_10001to13000.shape

(2999, 4)

In [147]:
text_dfs8_10001to13000.head(4)

Unnamed: 0,index,date,url,article
0,0,"DEC. 15, 2012AUG 29AUG 19JUL 26JUL 24JUL 15",https://www.nytimes.com/2012/12/16/opinion/sun...,When President-elect Barack Obama selected Tim...
1,0,"OCT. 2, 2011",https://www.nytimes.com/2011/10/03/business/ec...,ECONOMIC REPORTS This week’s data will include...
2,0,"JUNE 13, 2013",https://www.nytimes.com/2013/06/14/business/da...,Good news about hiring and spending at retail ...
3,0,"May 21, 2015May 21, 2015 7:27 am",https://news.blogs.nytimes.com/2015/05/21/morn...,HEAVY FINES FOR FOREIGN EXCHANGE COLLUSION | ...


In [148]:
pickle.dump( text_dfs8_10001to13000, open( "text_dfs8_10001to13000_testing", "wb" ) )

In [150]:
text_dfs8_10001to13000_testing = text_dfs8_10001to13000

In [151]:
text_dfs8_10001to13000_testing['url_date'] = text_dfs8_10001to13000_testing['url'].apply(regex_date)

In [152]:
text_dfs8_10001to13000_testing.sample(10,random_state=42)

Unnamed: 0,index,date,url,article,url_date
1376,0,,https://www.nytimes.com/2000/10/14/business/pr...,American manufacturers were able to charge sur...,2000-10-14
932,0,"JUNE 11, 2010",https://www.nytimes.com/2010/06/12/business/12...,WASHINGTON — The Federal Reserve’s vice chairm...,2010-06-12
144,0,"April 6, 2015",https://www.nytimes.com/2015/04/07/business/ec...,WASHINGTON — Service companies in the United S...,2015-04-07
1752,0,"Aug. 11, 2016",https://www.nytimes.com/2016/08/12/us/politics...,"WARREN, Mich. — In a full-throttled rejection ...",2016-08-12
51,0,,https://www.nytimes.com/2003/09/07/business/on...,"ANOTHER Labor Day has passed, with the usual c...",2003-09-07
414,0,"Sept. 14, 2016",https://www.nytimes.com/2016/09/15/business/ec...,WASHINGTON — The eye-popping improvement in ec...,2016-09-15
2306,0,,https://www.nytimes.com/2003/09/17/business/fe...,The Federal Reserve retained its cautious outl...,2003-09-17
266,0,"FEB. 28, 2009",https://www.nytimes.com/2009/03/01/business/01...,"IN the world of technology, inventors are hail...",2009-03-01
2705,0,"Feb. 23, 2015",https://www.nytimes.com/2015/02/24/us/with-con...,WASHINGTON — President Obama on Monday moved t...,2015-02-24
2424,0,,https://www.nytimes.com/2001/09/17/business/af...,From the giant Ford Motor Company to the modes...,2001-09-17


In [159]:
url_list13001to14000 = url_list[13001:14000]

In [160]:
text_dfs8_13001to14000 = [get_text1(url) for url in url_list13001to14000]

In [161]:
text_dfs8_13001to14000 = pd.concat(text_dfs8_13001to14000).reset_index()

In [162]:
text_dfs8_13001to14000.columns = ['index','date', 'url','article']

In [164]:
text_dfs8_13001to14000.head(4)

Unnamed: 0,index,date,url,article
0,0,"JUNE 16, 2013",https://www.nytimes.com/2013/06/17/business/ec...,ECONOMIC REPORTS Data to be released this week...
1,0,"NOV. 23, 2009",https://www.nytimes.com/2009/11/23/us/politics...,President Obama’s scramble for a health care o...
2,0,"FEB. 13, 2009FEB. 13, 2009FEB. 13, 2009FEB. 13...",https://www.nytimes.com/2009/02/14/business/ec...,WASHINGTON — A provision buried deep inside th...
3,0,"NOV. 18, 2008",https://www.nytimes.com/2008/11/19/nyregion/19...,"Up and down Fifth Avenue, it is hard not to se..."


In [165]:
pickle.dump( text_dfs8_13001to14000, open( "text_dfs8_13001to14000_testing", "wb" ) )

In [166]:
text_dfs8_13001to14000_testing = text_dfs8_13001to14000

In [167]:
text_dfs8_13001to14000_testing['url_date'] = text_dfs8_13001to14000_testing['url'].apply(regex_date)

In [168]:
text_dfs8_13001to14000_testing.head(4)

Unnamed: 0,index,date,url,article,url_date
0,0,"JUNE 16, 2013",https://www.nytimes.com/2013/06/17/business/ec...,ECONOMIC REPORTS Data to be released this week...,2013-06-17
1,0,"NOV. 23, 2009",https://www.nytimes.com/2009/11/23/us/politics...,President Obama’s scramble for a health care o...,2009-11-23
2,0,"FEB. 13, 2009FEB. 13, 2009FEB. 13, 2009FEB. 13...",https://www.nytimes.com/2009/02/14/business/ec...,WASHINGTON — A provision buried deep inside th...,2009-02-14
3,0,"NOV. 18, 2008",https://www.nytimes.com/2008/11/19/nyregion/19...,"Up and down Fifth Avenue, it is hard not to se...",2008-11-19


In [170]:
text_dfs8_13001to14000_testing.shape

(999, 5)

In [171]:
url_list14001to15000 = url_list[14001:15000]

In [172]:
text_dfs8_14001to15000 = [get_text1(url) for url in url_list14001to15000]

In [173]:
text_dfs8_14001to15000 = pd.concat(text_dfs8_14001to15000).reset_index()

In [175]:
text_dfs8_14001to15000.columns = ['index','date', 'url','article']

In [176]:
text_dfs8_14001to15000.head(2)

Unnamed: 0,index,date,url,article
0,0,"JAN. 7, 2011",https://www.nytimes.com/2011/01/08/business/ec...,WASHINGTON — President Obama went to a busy wi...
1,0,"November 25, 2012 10:43 pm",https://economix.blogs.nytimes.com/2012/11/25/...,"Every year, the government spends more than $1..."


In [177]:
pickle.dump( text_dfs8_14001to15000, open( "text_dfs8_14001to15000_testing", "wb" ) )

In [178]:
text_dfs8_14001to15000_testing = text_dfs8_14001to15000

In [179]:
text_dfs8_14001to15000_testing['url_date'] = text_dfs8_14001to15000_testing['url'].apply(regex_date)

In [180]:
text_dfs8_14001to15000_testing.head(2)

Unnamed: 0,index,date,url,article,url_date
0,0,"JAN. 7, 2011",https://www.nytimes.com/2011/01/08/business/ec...,WASHINGTON — President Obama went to a busy wi...,2011-01-08
1,0,"November 25, 2012 10:43 pm",https://economix.blogs.nytimes.com/2012/11/25/...,"Every year, the government spends more than $1...",2012-11-25


In [183]:
url_list15001to16000 = url_list[15001:16000]

In [184]:
text_dfs8_15001to16000 = [get_text1(url) for url in url_list15001to16000]

In [185]:
text_dfs8_15001to16000 = pd.concat(text_dfs8_15001to16000).reset_index()

In [186]:
text_dfs8_15001to16000.columns = ['index','date', 'url','article']

In [187]:
pickle.dump( text_dfs8_15001to16000, open( "text_dfs8_15001to16000_testing", "wb" ) )

In [188]:
text_dfs8_15001to16000_testing = text_dfs8_15001to16000

In [189]:
text_dfs8_15001to16000_testing['url_date'] = text_dfs8_15001to16000_testing['url'].apply(regex_date)

In [190]:
text_dfs8_15001to16000_testing.head(2)

Unnamed: 0,index,date,url,article,url_date
0,0,"March 7, 2013 4:33 pm",https://query.nytimes.com/gst/fullpage.html?re...,9:00 p.m. | UpdatedFour years after the financ...,
1,0,"January 28, 2014 11:00 am",https://economix.blogs.nytimes.com/2014/01/28/...,Phillip Swagel is a professor at the School of...,2014-01-28


In [191]:
df = pd.DataFrame()

def get_text2(url):
    try:
        stringone = ''
        content_string2 = ''
        content_string4 = ''
        string_date2 = ''
        string_date3 = ''
        page = requests.get(url).text
        soup = BeautifulSoup(page, "lxml")
        #url2 = pd.Series(url)

        article_content = soup.find_all('p', attrs={'class' : 'story-body-text story-content'})
            #print(article_content[0])
            #print(len(article_content))

        for i in article_content:
            stringone += i.text.strip()
        #print(stringone)
        article = pd.Series(stringone)

        if stringone == '':

            article_content1 = soup.find_all('p', attrs={'class' : 'css-1i0edl6 e2kc3sl0'})

            #string2 = '' ... moved to the top
            for i in article_content1:
                content_string2 += (i.text.strip())
            article = pd.Series(content_string2)

            if content_string2 == '':


                article_content_4 = soup.find_all('p', attrs={'class' : 'story-body-text'})
                #string4 = ''
                for i in article_content_4:
                    content_string4 += (i.text.strip())
                article = pd.Series(content_string4)



        date2 = soup.find_all('time', attrs={'class' : 'dateline'})
        #string_date2 = ''
        for i in date2:
            string_date2 += i.text.strip()
        df_date = pd.Series(string_date2) #to be appended to dataframe lower down

        if string_date2 == '':


            date3 = soup.find_all('time', attrs={'class' : 'css-pnci9c eqgapgq0'})
            #string_date3 = ''
            for i in date3:
                string_date3 += i.text.strip()
            df_date = pd.Series(string_date3)

            #if string_date3 == '':
                #masthead = soup.find_all(class_='sharetools theme-classic sharetools-masthead ')
                #a = str(masthead[0])
                #a = a.split("=")
                #a = [v.split("\" ")[0][1:] for v in a]
                #df_date = pd.Series(a[6])


        url2 = pd.Series(url) ##these are to be appended to dataframe lower down
        frames = [df_date,url2,article] #authorz,#descrip,
        result = pd.concat(frames,axis=1)

        return(result)
    
    except:
        return(None)


In [193]:
url_list16001to17000 = url_list[16001:17000]
text_dfs8_16001to17000 = [get_text2(url) for url in url_list16001to17000]

In [195]:
text_dfs8_16001to17000.columns = ['index','date', 'url','article']

In [196]:
text_dfs8_16001to17000.shape

(998, 4)

In [197]:
text_dfs8_16001to17000.head(2)

Unnamed: 0,index,date,url,article
0,0,,https://www.nytimes.com/2002/01/12/business/wh...,Wholesale prices in the United States declined...
1,0,"FEB. 7, 2013",https://www.nytimes.com/2013/02/08/business/ec...,WASHINGTON (Reuters) — The number of Americans...


In [198]:
pickle.dump( text_dfs8_16001to17000, open( "text_dfs8_16001to17000_testing", "wb" ) )

In [209]:
text_dfs8_16001to17000_testing = text_dfs8_16001to17000

In [200]:
text_dfs8_16001to17000_testing['url_date'] = text_dfs8_16001to17000_testing['url'].apply(regex_date)

In [201]:
text_dfs8_16001to17000_testing.head(2)

Unnamed: 0,index,date,url,article,url_date
0,0,,https://www.nytimes.com/2002/01/12/business/wh...,Wholesale prices in the United States declined...,2002-01-12
1,0,"FEB. 7, 2013",https://www.nytimes.com/2013/02/08/business/ec...,WASHINGTON (Reuters) — The number of Americans...,2013-02-08


In [202]:
len(url_list)

19702

In [203]:
url_list17001to19702 = url_list[17001:19702]
text_dfs8_17001to19702 = [get_text2(url) for url in url_list17001to19702]

In [204]:
text_dfs8_17001to19702 = pd.concat(text_dfs8_17001to19702).reset_index()

In [205]:
text_dfs8_17001to19702.columns = ['index','date', 'url','article']

In [206]:
text_dfs8_17001to19702.shape

(2701, 4)

In [207]:
pickle.dump( text_dfs8_17001to19702, open( "text_dfs8_17001to19702_testing", "wb" ) )

In [210]:
text_dfs8_17001to19702_testing = text_dfs8_17001to19702

In [211]:
text_dfs8_17001to19702_testing['url_date'] = text_dfs8_17001to19702_testing['url'].apply(regex_date)

In [212]:
text_dfs8_17001to19702_testing.shape

(2701, 5)

## As described above, there now exist dataframes for ~19,000 articles. 

## Subsequent notebooks will replicate this process for the remaining articles in the corpus. 