### Merge NY Times Data
The data was collected over separate timeframes, which resulted in multiple .json files. This notebook fetches the content of each article and merges all of these files into one file.

In [4]:
import json
import datetime
import requests
from bs4 import BeautifulSoup as bs

#### 1 Get Content from URL
Uses BeautifulSoup to extract the content of the article from the page at the given URL.

In [6]:
def getContentfromURL(sourceUrl, timeout=30):
    ''' Gets just the content for the article from the NY Times

    Downloads the page and returns the text of the section found under
    body -> article. This is best-effort: a failure is reported with a
    printed message and an empty string is returned instead of raising.

    Accepts : sourceUrl (string) URL to the NY Times article
              timeout (number) seconds to wait on the server before giving
                      up; passed straight through to requests

    Returns : (string) content from the URL, or "" when the request fails,
              the response status is not OK, or the page has no
              body/article/section element
    '''

    content = ""

    try:
        #- Without a timeout, requests waits indefinitely on a stalled server
        response = requests.get(sourceUrl, timeout=timeout)

        if response.status_code == requests.codes.ok:
            soup = bs(response.content, 'html.parser')
            #- Raises AttributeError when body/article/section is missing,
            #- which is reported by the handler below
            content = soup.body.article.section.text

    except Exception:
        #- Exception rather than a bare except, so KeyboardInterrupt and
        #- SystemExit can still stop a long scraping run
        print(f"Failure getting content: {sourceUrl}")

    return content

#### 2 Merge Files
For each file, gets the content of every article, saves a `Content-` copy of the file, and then merges all of the articles together into one master file.

In [12]:
def convertDateToInt(searchDate):
    ''' Turns a date into a single integer laid out as YYYYMMDD

    Accepts : searchDate (datetime) the date to convert

    Returns : (int) the date as an integer, e.g. 20190630
    '''

    #- Format as an eight-character digit string first, then parse it as an int
    dateDigits = searchDate.strftime('%Y%m%d')
    return int(dateDigits)

In [7]:
#- List of Files
#- One JSON file per collection timeframe; each holds an 'articles' list
sourceFiles = [
    'articles_Jan1_2018_June30_2018.json',
    'articles_Jan1_2019_June30_2019.json',
    'articles_July1_Dec31_2018.json',
    'articles_July1_Dec31_2017.json'
    ]


#- Merge Files
#- Accumulates the articles of every source file, in sourceFiles order
articles = []


for sourceFile in sourceFiles:

    #- Open File
    #- Explicit UTF-8 so decoding does not depend on the platform default
    with open(sourceFile, encoding='utf-8') as jsonFile:
        sourceData = json.load(jsonFile)


    #- Get Content for Articles
    #- Progress is reported roughly every 10% of the file
    counter = 0
    totalArticles = len(sourceData['articles'])
    reportCount = totalArticles / 10
    nextReportCount = reportCount

    print(f'--> Getting content for file. Count: {totalArticles} File:{sourceFile}')


    for article in sourceData['articles']:

        #- Report Progress
        counter += 1

        if counter >= nextReportCount:
            print(f"{counter} of {totalArticles}")
            nextReportCount = nextReportCount + reportCount


        #- Get Content
        sourceUrl = article['sourceurl']

        content = getContentfromURL(sourceUrl)

        if content != '':
            #- Found Content
            article['content'] = content

        else:
            #- The article is kept, but without a 'content' key
            print(f'Content Failure : {sourceUrl}')


    #- Add to List of Articles
    #- extend() appends in place instead of copying the whole list per file
    articles.extend(sourceData['articles'])


    #- Save File
    #- Per-file copy with content added, written before moving to the next file
    saveSourceFile = "Content-" + sourceFile

    with open(saveSourceFile, 'w', encoding='utf-8') as fp:
        json.dump(sourceData, fp)

    print(f'Completed save for file: {saveSourceFile}')
        

#- Build the master container: creation date (YYYYMMDD int) plus every merged article
container = dict(
    createdate=convertDateToInt(datetime.datetime.now()),
    articles=articles,
)

#- Write the container to disk as a single JSON document
with open('articles_all.json', 'w') as fp:
    json.dump(container, fp)

print("Completed")

--> Getting content for file. Count: 1545 File:articles_Jan1_2018_June30_2018.json
Content Failure : https://static01.nyt.com/slideshow/2018/01/05/business/from-mexico-to-the-us-a-nafta-tale-of-two-truckers.html
155 of 1545
Content Failure : https://static01.nyt.com/slideshow/2018/01/19/business/the-747-had-a-great-run-but-farewell-doesnt-mean-the-end.html
309 of 1545
Content Failure : https://static01.nyt.com/slideshow/2018/02/06/business/what-wall-streets-ups-and-downs-look-like.html
464 of 1545
Content Failure : https://static01.nyt.com/slideshow/2018/03/08/business/tk.html
618 of 1545
773 of 1545
927 of 1545
1082 of 1545
Failure getting content: https://www.nytimes.com/interactive/2018/05/18/technology/your-weekly-edition-newsletter-signup.html
Content Failure : https://www.nytimes.com/interactive/2018/05/18/technology/your-weekly-edition-newsletter-signup.html
1236 of 1545
1391 of 1545
Content Failure : https://static01.nyt.com/slideshow/2018/06/15/business/ford-aims-to-revive-a-d

NameError: name 'convertDateToInt' is not defined