################# THIS SCRIPT REQUIRES AN API KEY FROM NEW YORK TIMES<br><br>

This API has daily limits for free users, so the definitions need to be run one per day.<br><br>

Do note that the articles returned will most likely not be relevant to the keyword used to perform the query search.

In [15]:
import pandas, json, numpy, requests, os, datetime, pytz, tweepy, sqlite3, time, re

# 3. Extracting news data from New York Times (NYT) (https://developer.nytimes.com/)
## General tutorial: (https://medium.com/@danalindquist/using-new-york-times-api-and-jq-to-collect-news-data-a5f386c7237b)

### Goal for this script is to gather all historical data about the company and align each article with the price of that day.
### Constraints are whether or not NYT reported on it, and whether the history goes back beyond 1985.
### However, since the Alpha Vantage API does not provide daily price data beyond 20 years, the 1985 constraint is ignored.


#### Same opening steps as the Alpha Vantage script: read the NYSE and AMEX listings and
#### merge them into a single DataFrame of ticker symbols and names.
#### The difference here is that fewer stocks/ticks are kept per variable further down.

# Each listing is a tab-separated file whose first row is the header; it is keyed by
# 'Symbol' and a CSV copy is saved next to it.
NYSE_csv = pandas.read_table('NYSE.txt', header=0).set_index('Symbol')
NYSE_csv.to_csv('NYSE_csv.csv')

AMEX_csv = pandas.read_table('AMEX.txt', header=0).set_index('Symbol')
AMEX_csv.to_csv('AMEX_csv.csv')

# Outer merge on the shared columns, so rows from both listings are kept.
stock_exchange_ticks_and_names = NYSE_csv.reset_index().merge(AMEX_csv.reset_index(), how='outer')
stock_exchange_ticks_and_names.to_csv('merged_NYSE_AMEX.csv')

# Separate copy with every row that contains a missing value removed.
stock_exchange_ticks_and_names_copy = stock_exchange_ticks_and_names.dropna()

# regex1 matches any one of a set of special characters; regex2 matches the substring 'Cl '.
# A raw string keeps the backslashes in the character class from being read as
# (invalid) Python string escapes.
regex1 = re.compile(r'[@_!#$%^&*()<>?/\|}{~:[\].]')
regex2 = re.compile('Cl ')

# Keep a row only when neither its Description nor its Symbol matches regex1 and its
# Description does not match regex2. The mask is built in one pass instead of growing
# a DataFrame row by row, and an all-False mask simply yields an empty frame.
_keep_row = [
    not (regex1.search(description) or regex2.search(description) or regex1.search(symbol))
    for symbol, description in zip(
        stock_exchange_ticks_and_names_copy['Symbol'],
        stock_exchange_ticks_and_names_copy['Description'],
    )
]

# Only the two columns used downstream are kept, with a fresh 0..n-1 index.
stock_exchange_ticks_and_names_removed = (
    stock_exchange_ticks_and_names_copy
    .loc[_keep_row, ['Symbol', 'Description']]
    .reset_index(drop=True)
)
stock_exchange_ticks_and_names_removed.to_csv('merged_NYSE_AMEX_removed_times.csv')

### To account for the Times API daily limit and the time it takes to run the script,
### the filtered listing is cut into consecutive batches of at most 400 rows each.
### The last batch takes everything from row 4800 onwards.
(
    stock_exchange_ticks_and_names_0_399,
    stock_exchange_ticks_and_names_400_799,
    stock_exchange_ticks_and_names_800_1199,
    stock_exchange_ticks_and_names_1200_1599,
    stock_exchange_ticks_and_names_1600_1999,
    stock_exchange_ticks_and_names_2000_2399,
    stock_exchange_ticks_and_names_2400_2799,
    stock_exchange_ticks_and_names_2800_3199,
    stock_exchange_ticks_and_names_3200_3599,
    stock_exchange_ticks_and_names_3600_3999,
    stock_exchange_ticks_and_names_4000_4399,
    stock_exchange_ticks_and_names_4400_4799,
    stock_exchange_ticks_and_names_4800_4880,
) = (
    stock_exchange_ticks_and_names_removed.iloc[first_row:end_row]
    # Start rows 0, 400, ..., 4800 paired with end rows 400, 800, ..., 4800, None (open-ended).
    for first_row, end_row in zip(range(0, 5200, 400), [*range(400, 5200, 400), None])
)


### Search terms that are market indices rather than company names
### (https://www.investing.com/indices/major-indices).
### "S&P" is written as "S%26P" because & is a reserved character in a URL.
stock_indices_df = pandas.DataFrame(
    [
        ('S%26P', '.INX'),
        ('Dow', '.DJI'),
        ("Nasdaq", ".IXIC"),
    ],
    columns=['Description', 'Symbol'],
)

#### Used the DataFrame.Symbol and .Description as the keywords to look for article searching:

## The "pagination_list" variable is required since there is a 10 request per minute restriction. See https://developer.nytimes.com/docs/articlesearch-product/1/overview
## in the "Pagination" section for more info. The number of pages that I will be setting is 2 to save time. In theory,
## the range should be up to 100 pages (even though there might not be 1000 articles on the company/keyword).

# Second words that are a company type rather than part of the name; when one of these
# follows the first word, only the first word is used as the search term.
_COMPANY_TYPE_WORDS = ('LP', 'Llc', 'Ltd', 'Corp', 'Inc', 'Company', 'ETF')


def _search_term(description):
    """Return the URL search term for a name: its first word, or first two joined by '+'.

    Returns None when the description contains no words at all.
    """
    words = description.split()
    if not words:
        return None
    if len(words) > 1 and words[1] not in _COMPANY_TYPE_WORDS:
        return words[0] + '+' + words[1]
    return words[0]


def _fetch_article_docs(search_term, company, pages, begin_date, api_key):
    """Request result pages 0..pages-1 and return the article dicts collected so far.

    Stops at the first page that fails (request error, non-JSON reply, or a status other
    than "OK"); whatever earlier pages returned is still handed back.
    """
    docs = []
    for page in range(pages):
        get_URL = (
            "https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=body:('" + search_term
            + "')ANDnews_desk:('Business')&begin_date=" + begin_date
            + "&sort=newest&page=" + str(page) + "&api-key=" + api_key
        )
        time.sleep(6)  # 10 requests per minute are allowed, i.e. one every 6 seconds
        try:
            reply = requests.get(get_URL, timeout=30).json()
        except (requests.RequestException, ValueError) as err:
            print("Error: " + str(err) + " ; Company: " + company)
            break

        # Replies without a 'status' key are treated as failures and printed in full.
        if reply.get('status') != "OK":
            print("Error: " + str(reply.get('status', reply)) + " ; Company: " + company)
            break

        docs.extend(reply['response']['docs'])
    return docs


def article_search(stock_list, pages=2, begin_date='20000101', api_key=None):
    """Download NYT Business-desk articles per company and save them as CSV files.

    For every value in ``stock_list.Description`` a search term is built from the first
    one or two words of the name, the Article Search API is queried page by page, and
    the extracted fields are written to ``<cwd>/Historical Articles/<Description>.csv``.
    When that file already exists, the new rows are merged into it, de-duplicated on
    'headline' and sorted by 'pub_date', newest first.

    A company is skipped (nothing is written) when its description is blank, when the
    first page fails, or when the search returns no articles.

    Parameters
    ----------
    stock_list : pandas.DataFrame
        Must provide a ``Description`` column of strings.
    pages : int
        Number of result pages requested per company.
    begin_date : str
        Value of the ``begin_date`` query parameter, e.g. '20000101'.
    api_key : str, optional
        NYT API key appended to each request. When omitted, the ``NYT_API_KEY``
        environment variable is used, falling back to an empty key.

    Returns
    -------
    None
    """
    if api_key is None:
        api_key = os.environ.get('NYT_API_KEY', '')

    folder = os.path.join(os.getcwd(), 'Historical Articles')

    for y in stock_list.Description:
        print(y)  ### Shows which company was being processed if an error occurs.

        y_modified = _search_term(y)
        if y_modified is None:
            print("Error: empty description ; Company: " + repr(y))
            continue
        print(y_modified)

        docs = _fetch_article_docs(y_modified, y, pages, begin_date, api_key)
        if not docs:
            # Nothing usable for this company: never reuse another company's results.
            continue

        ### Extract the desired fields from each nested article dictionary.
        NYT_df = pandas.DataFrame([
            {
                'request_result': doc,
                'company': y,
                'headline': (doc.get('headline') or {}).get('main'),
                'abstract': doc.get('abstract'),
                'snippet': doc.get('snippet'),
                'lead_paragraph': doc.get('lead_paragraph'),
                'pub_date': doc.get('pub_date'),
                'source': 'The New York Times',
                'web_url': doc.get('web_url'),
            }
            for doc in docs
        ])

        os.makedirs(folder, exist_ok=True)
        csv_path = os.path.join(folder, y + '.csv')

        ### Merge into the existing .csv file, if there is one.
        if os.path.exists(csv_path):
            # The first CSV column is the saved row index; reading it as the index (and
            # dropping leftovers from earlier runs) stops 'Unnamed: N' columns piling up.
            previous = pandas.read_csv(csv_path, index_col=0)
            previous = previous.loc[:, ~previous.columns.astype(str).str.startswith('Unnamed:')]
            NYT_df = (
                pandas.concat([previous, NYT_df], ignore_index=True)
                .drop_duplicates('headline')
                .sort_values(by='pub_date', ascending=False)
            )

        NYT_df.to_csv(csv_path)

In [None]:
### Always run the indices first; 100 articles is typically 1-2 days for S&P.
article_search(stock_indices_df, pages=10)

### Then work through the company batches in order, with the default number of pages.
for stock_batch in (
    stock_exchange_ticks_and_names_0_399,
    stock_exchange_ticks_and_names_400_799,
    stock_exchange_ticks_and_names_800_1199,
    stock_exchange_ticks_and_names_1200_1599,
    stock_exchange_ticks_and_names_1600_1999,
    stock_exchange_ticks_and_names_2000_2399,
    stock_exchange_ticks_and_names_2400_2799,
    stock_exchange_ticks_and_names_2800_3199,
    stock_exchange_ticks_and_names_3200_3599,
    stock_exchange_ticks_and_names_3600_3999,
    stock_exchange_ticks_and_names_4000_4399,
    stock_exchange_ticks_and_names_4400_4799,  ### no stocks in df
    stock_exchange_ticks_and_names_4800_4880,  ### no stocks in df
):
    article_search(stock_batch)