# Capstone Project - Predicting Stock Prices Using Machine Learning 

Here, we gather business/finance news articles from the New York Times published between 1990 and 2019 and apply sentiment analysis to gauge the economic sentiment for each day:
<br>
<br>
- __Positive__ (Investors likely to __BUY__)
- __Neutral__ (Investors likely to take __NO ACTION__)
- __Negative__ (Investors likely to __SELL__)

## 1. Pre-Process the Data - Part 1. Gathering Data

### 1.1 Import All Required Libraries

In [3]:
# !pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 4.8 MB/s 
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [7]:
## Mount Google Drive at /content/drive so the files copied from MyDrive below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
## Pandas and Numpy
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

## API
import requests
import json

## Regex & Miscellaneous
import re as re
import time as timer
from datetime import datetime, date, time
from tqdm import tqdm
from collections import Counter
import pickle

## Sentiment Analysis
import spacy
import nltk
from spacy.lang.en.stop_words import STOP_WORDS
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [12]:
## Copy the Loughran-McDonald loader module and dictionary CSV from Drive into the
## current working directory so the loader can be imported and the CSV opened by bare name.
## NOTE(review): the source paths are relative, so this assumes the working directory is
## /content (where Drive was mounted) - confirm.
!cp drive/MyDrive/Load_MasterDictionary.py .
!cp drive/MyDrive/LoughranMcDonald_MasterDictionary_2021.csv .

In [13]:
import Load_MasterDictionary as MD ## LM Dictionary

## Bare file name: the CSV is expected in the current working directory.
file_path = ('LoughranMcDonald_MasterDictionary_2021.csv')

## Load the dictionary. With get_other=True the loader's result is unpacked into six
## objects; their exact contents are defined in Load_MasterDictionary.py
## (presumably the word dictionary, CSV header, sentiment categories and per-category
## word lists, stop words and a document count - TODO confirm against the loader).
md, header, sent_cat, sent_dict, stopwords, total_doc = MD.load_masterdictionary(file_path, get_other = True)




### 1.2. Helper Functions for Extracting Data

In [14]:
## Helper Function for Creating A Dictionary
def create_dict(temp_dict, key, value):
    """Append ``value`` to the list stored under ``key`` in ``temp_dict``.

    A fresh list is started the first time ``key`` is seen. The dictionary is
    modified in place and the same object is returned for convenience.
    """
    # Guard clause: make sure the bucket exists, then append unconditionally.
    if key not in temp_dict.keys():
        temp_dict[key] = []
    temp_dict[key].append(value)
    return temp_dict

In [15]:
## Helper Function for Extracting Publication Dates and Abstract from the JSON Data.
def extract_data(r):
    """Collect the text of business/finance articles from one API response, grouped by day.

    Parameters
    ----------
    r : response object
        Must expose ``.json()`` returning a payload whose article list lives
        under ``["response"]["docs"]``.

    Returns
    -------
    dict
        Maps the first 10 characters of each article's ``pub_date``
        (presumably the ``YYYY-MM-DD`` part of a timestamp) to the list of
        texts published that day. The text is the article's ``abstract`` when
        it is shorter than 700 characters, otherwise its ``lead_paragraph``.

    Raises
    ------
    KeyError
        If the payload has no ``["response"]["docs"]`` or an article has no
        ``pub_date``.
    """
    # Compiled once instead of once per article.
    desk_pattern = re.compile(r"[bB]usiness|[fF]inan|[nN]one|[mM]oney")
    articles = r.json()["response"]["docs"]

    r_dict = {}
    for article in tqdm(articles, desc="Loading…"):
        # str() turns a null/missing desk into "None", which the pattern already
        # admits, instead of letting re.search() fail on a non-string.
        news_desk = str(article.get("news_desk"))
        if not desk_pattern.search(news_desk):
            continue

        pub_day = article["pub_date"][:10]
        # Null or missing text fields are treated as empty strings so that
        # len() and the downstream keyword search never see None.
        abstract = article.get("abstract") or ""
        lead_paragraph = article.get("lead_paragraph") or ""

        # Very long "abstracts" are replaced by the lead paragraph.
        value = abstract if len(abstract) < 700 else lead_paragraph

        r_dict = create_dict(r_dict, pub_day, value)

    return r_dict

In [16]:
### Helper Function for Filtering the Data Based on Keywords
keywords = ["Standard & Poor", "S&P 500", "stock market", "index", ".INX", "Wall Street"]

def filter_keywords(r_dict, keywords):
    """Keep only the texts that mention at least one market keyword.

    Parameters
    ----------
    r_dict : dict
        Maps a date key to a list of article texts.
    keywords : iterable of str
        Case-sensitive substrings to look for.

    Returns
    -------
    dict
        Same layout as ``r_dict``, restricted to texts that contain any of
        ``keywords`` and do not contain ``"COMPANY REPORT"``. Dates with no
        surviving text are omitted. Each qualifying text appears exactly once
        per occurrence in the input, even when it matches several keywords.
    """
    f_dict = {}

    for day, texts in r_dict.items():
        for text in texts:
            # Nothing to search in an empty/missing text.
            if not text:
                continue
            if "COMPANY REPORT" in text:
                continue
            # any() stops at the first hit, so a text that matches several
            # keywords is stored once rather than once per keyword.
            if any(keyword in text for keyword in keywords):
                f_dict.setdefault(day, []).append(text)

    return f_dict

In [17]:
### Helper Function for Gathering Data from API
def get_data(start_year, end_year):
    """Download and filter NYT Archive API articles for every month in a year range.

    Reads the module-level ``key`` (API key) and ``keywords`` (filter list).

    Parameters
    ----------
    start_year, end_year : int
        First and last year to fetch, both inclusive.

    Returns
    -------
    dict
        Maps ``"<year>-<month>"`` (month not zero-padded, e.g. ``"1990-1"``) to
        a one-element list holding that month's filtered
        ``{date: [text, ...]}`` dictionary.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (bad key, rate limit, ...).
    requests.Timeout
        If the server does not respond within the timeout.
    """
    data_dict = {}

    for year in tqdm(range(start_year, end_year + 1), desc="Loading…"):
        for month in range(1, 13):
            month_key = f"{year}-{month}"
            print(f"Gathering Data for Year: {year}, Month: {month}")

            url = f"https://api.nytimes.com/svc/archive/v1/{year}/{month}.json"
            # Let requests build the query string; strip stray whitespace from the key.
            # The timeout is (connect, read) seconds so a stalled connection cannot
            # hang the whole multi-minute run.
            r = requests.get(url, params={"api-key": key.strip()}, timeout=(10, 120))
            # Fail loudly on an error response instead of hitting an obscure
            # KeyError when the error payload lacks ["response"]["docs"].
            r.raise_for_status()

            r_dict = extract_data(r)
            f_dict = filter_keywords(r_dict, keywords)

            data_dict = create_dict(data_dict, month_key, f_dict)

            timer.sleep(5) # Pause between calls to stay under the API rate limit

    return data_dict

### 1.3. API Calls & Gather Data From New York Times

In [18]:
import os

## The NYT API key is a credential and must not be stored in the notebook.
## Set NYT_API_KEY in the environment before running this cell
## (e.g. `%env NYT_API_KEY=...` in a cell that is not saved or shared).
key = os.environ.get("NYT_API_KEY", "").strip()
if not key:
    print("NYT_API_KEY is not set - requests to the NYT Archive API will be rejected.")

In [19]:
## One API request per month (120 in total); the recorded run took about 12.5 minutes.
dict_9099 = get_data(1990, 1999) ## Gather Data from 1990 to 1999

Loading…:   0%|          | 0/10 [00:00<?, ?it/s]

Gathering Data for Year: 1990, Month: 1



Loading…: 100%|██████████| 8879/8879 [00:00<00:00, 493411.57it/s]


Gathering Data for Year: 1990, Month: 2



Loading…: 100%|██████████| 8468/8468 [00:00<00:00, 279356.35it/s]


Gathering Data for Year: 1990, Month: 3



Loading…: 100%|██████████| 8856/8856 [00:00<00:00, 458531.95it/s]


Gathering Data for Year: 1990, Month: 4



Loading…: 100%|██████████| 9694/9694 [00:00<00:00, 416312.57it/s]


Gathering Data for Year: 1990, Month: 5



Loading…: 100%|██████████| 8963/8963 [00:00<00:00, 477396.56it/s]


Gathering Data for Year: 1990, Month: 6



Loading…: 100%|██████████| 7071/7071 [00:00<00:00, 455826.93it/s]


Gathering Data for Year: 1990, Month: 7



Loading…: 100%|██████████| 8219/8219 [00:00<00:00, 429254.31it/s]


Gathering Data for Year: 1990, Month: 8



Loading…: 100%|██████████| 7987/7987 [00:00<00:00, 302228.43it/s]


Gathering Data for Year: 1990, Month: 9



Loading…: 100%|██████████| 7405/7405 [00:00<00:00, 408078.06it/s]


Gathering Data for Year: 1990, Month: 10



Loading…: 100%|██████████| 8687/8687 [00:00<00:00, 373855.11it/s]


Gathering Data for Year: 1990, Month: 11



Loading…: 100%|██████████| 7782/7782 [00:00<00:00, 431467.35it/s]


Gathering Data for Year: 1990, Month: 12



Loading…: 100%|██████████| 6894/6894 [00:00<00:00, 406293.92it/s]
Loading…:  10%|█         | 1/10 [01:14<11:14, 74.92s/it]

Gathering Data for Year: 1991, Month: 1



Loading…: 100%|██████████| 6866/6866 [00:00<00:00, 435332.13it/s]


Gathering Data for Year: 1991, Month: 2



Loading…: 100%|██████████| 7323/7323 [00:00<00:00, 299604.83it/s]


Gathering Data for Year: 1991, Month: 3



Loading…: 100%|██████████| 7312/7312 [00:00<00:00, 408993.02it/s]


Gathering Data for Year: 1991, Month: 4



Loading…: 100%|██████████| 8005/8005 [00:00<00:00, 450809.68it/s]


Gathering Data for Year: 1991, Month: 5



Loading…: 100%|██████████| 7274/7274 [00:00<00:00, 499179.75it/s]


Gathering Data for Year: 1991, Month: 6



Loading…: 100%|██████████| 7439/7439 [00:00<00:00, 444699.16it/s]


Gathering Data for Year: 1991, Month: 7



Loading…: 100%|██████████| 7506/7506 [00:00<00:00, 476148.25it/s]


Gathering Data for Year: 1991, Month: 8



Loading…: 100%|██████████| 6297/6297 [00:00<00:00, 435338.18it/s]


Gathering Data for Year: 1991, Month: 9



Loading…: 100%|██████████| 7213/7213 [00:00<00:00, 310738.65it/s]


Gathering Data for Year: 1991, Month: 10



Loading…: 100%|██████████| 6536/6536 [00:00<00:00, 379442.63it/s]


Gathering Data for Year: 1991, Month: 11



Loading…: 100%|██████████| 6739/6739 [00:00<00:00, 348766.28it/s]


Gathering Data for Year: 1991, Month: 12



Loading…: 100%|██████████| 7078/7078 [00:00<00:00, 432343.28it/s]
Loading…:  20%|██        | 2/10 [02:29<09:57, 74.74s/it]

Gathering Data for Year: 1992, Month: 1



Loading…: 100%|██████████| 7618/7618 [00:00<00:00, 388070.93it/s]


Gathering Data for Year: 1992, Month: 2



Loading…: 100%|██████████| 6032/6032 [00:00<00:00, 448089.72it/s]


Gathering Data for Year: 1992, Month: 3



Loading…: 100%|██████████| 7753/7753 [00:00<00:00, 402600.42it/s]


Gathering Data for Year: 1992, Month: 4



Loading…: 100%|██████████| 7182/7182 [00:00<00:00, 334237.53it/s]


Gathering Data for Year: 1992, Month: 5



Loading…: 100%|██████████| 8082/8082 [00:00<00:00, 399999.59it/s]


Gathering Data for Year: 1992, Month: 6



Loading…: 100%|██████████| 7123/7123 [00:00<00:00, 272484.59it/s]


Gathering Data for Year: 1992, Month: 7



Loading…: 100%|██████████| 7235/7235 [00:00<00:00, 396164.30it/s]


Gathering Data for Year: 1992, Month: 8



Loading…: 100%|██████████| 7341/7341 [00:00<00:00, 327839.79it/s]


Gathering Data for Year: 1992, Month: 9



Loading…: 100%|██████████| 6529/6529 [00:00<00:00, 399227.50it/s]


Gathering Data for Year: 1992, Month: 10



Loading…: 100%|██████████| 7289/7289 [00:00<00:00, 359036.09it/s]


Gathering Data for Year: 1992, Month: 11



Loading…: 100%|██████████| 7240/7240 [00:00<00:00, 375621.70it/s]


Gathering Data for Year: 1992, Month: 12



Loading…: 100%|██████████| 6614/6614 [00:00<00:00, 395358.60it/s]
Loading…:  30%|███       | 3/10 [03:44<08:42, 74.66s/it]

Gathering Data for Year: 1993, Month: 1



Loading…: 100%|██████████| 7340/7340 [00:00<00:00, 242300.30it/s]


Gathering Data for Year: 1993, Month: 2



Loading…: 100%|██████████| 6575/6575 [00:00<00:00, 258468.44it/s]


Gathering Data for Year: 1993, Month: 3



Loading…: 100%|██████████| 6062/6062 [00:00<00:00, 439301.13it/s]


Gathering Data for Year: 1993, Month: 4



Loading…: 100%|██████████| 6748/6748 [00:00<00:00, 351836.85it/s]


Gathering Data for Year: 1993, Month: 5



Loading…: 100%|██████████| 7153/7153 [00:00<00:00, 390080.31it/s]


Gathering Data for Year: 1993, Month: 6



Loading…: 100%|██████████| 6530/6530 [00:00<00:00, 369534.72it/s]


Gathering Data for Year: 1993, Month: 7



Loading…: 100%|██████████| 6805/6805 [00:00<00:00, 372589.76it/s]


Gathering Data for Year: 1993, Month: 8



Loading…: 100%|██████████| 7115/7115 [00:00<00:00, 424169.89it/s]


Gathering Data for Year: 1993, Month: 9



Loading…: 100%|██████████| 6056/6056 [00:00<00:00, 354268.61it/s]


Gathering Data for Year: 1993, Month: 10



Loading…: 100%|██████████| 8068/8068 [00:00<00:00, 357214.51it/s]


Gathering Data for Year: 1993, Month: 11



Loading…: 100%|██████████| 7182/7182 [00:00<00:00, 324694.06it/s]


Gathering Data for Year: 1993, Month: 12



Loading…: 100%|██████████| 6386/6386 [00:00<00:00, 386043.06it/s]
Loading…:  40%|████      | 4/10 [04:58<07:26, 74.37s/it]

Gathering Data for Year: 1994, Month: 1



Loading…: 100%|██████████| 6856/6856 [00:00<00:00, 328908.58it/s]


Gathering Data for Year: 1994, Month: 2



Loading…: 100%|██████████| 5934/5934 [00:00<00:00, 370951.63it/s]


Gathering Data for Year: 1994, Month: 3



Loading…: 100%|██████████| 6512/6512 [00:00<00:00, 398251.86it/s]


Gathering Data for Year: 1994, Month: 4



Loading…: 100%|██████████| 6642/6642 [00:00<00:00, 351691.86it/s]


Gathering Data for Year: 1994, Month: 5



Loading…: 100%|██████████| 7177/7177 [00:00<00:00, 331997.22it/s]


Gathering Data for Year: 1994, Month: 6



Loading…: 100%|██████████| 6249/6249 [00:00<00:00, 373822.71it/s]


Gathering Data for Year: 1994, Month: 7



Loading…: 100%|██████████| 6925/6925 [00:00<00:00, 371911.64it/s]


Gathering Data for Year: 1994, Month: 8



Loading…: 100%|██████████| 6662/6662 [00:00<00:00, 299692.75it/s]


Gathering Data for Year: 1994, Month: 9



Loading…: 100%|██████████| 6149/6149 [00:00<00:00, 420888.35it/s]


Gathering Data for Year: 1994, Month: 10



Loading…: 100%|██████████| 7352/7352 [00:00<00:00, 434225.49it/s]


Gathering Data for Year: 1994, Month: 11



Loading…: 100%|██████████| 6892/6892 [00:00<00:00, 376385.29it/s]


Gathering Data for Year: 1994, Month: 12



Loading…: 100%|██████████| 6106/6106 [00:00<00:00, 373989.40it/s]
Loading…:  50%|█████     | 5/10 [06:12<06:11, 74.32s/it]

Gathering Data for Year: 1995, Month: 1



Loading…: 100%|██████████| 7038/7038 [00:00<00:00, 336830.77it/s]


Gathering Data for Year: 1995, Month: 2



Loading…: 100%|██████████| 5695/5695 [00:00<00:00, 432210.79it/s]


Gathering Data for Year: 1995, Month: 3



Loading…: 100%|██████████| 6695/6695 [00:00<00:00, 342220.04it/s]


Gathering Data for Year: 1995, Month: 4



Loading…: 100%|██████████| 7527/7527 [00:00<00:00, 278549.54it/s]


Gathering Data for Year: 1995, Month: 5



Loading…: 100%|██████████| 7062/7062 [00:00<00:00, 391987.92it/s]


Gathering Data for Year: 1995, Month: 6



Loading…: 100%|██████████| 6581/6581 [00:00<00:00, 395267.49it/s]


Gathering Data for Year: 1995, Month: 7



Loading…: 100%|██████████| 7556/7556 [00:00<00:00, 372113.48it/s]


Gathering Data for Year: 1995, Month: 8



Loading…: 100%|██████████| 7061/7061 [00:00<00:00, 404744.72it/s]


Gathering Data for Year: 1995, Month: 9



Loading…: 100%|██████████| 6351/6351 [00:00<00:00, 319745.83it/s]


Gathering Data for Year: 1995, Month: 10



Loading…: 100%|██████████| 9715/9715 [00:00<00:00, 382370.21it/s]


Gathering Data for Year: 1995, Month: 11



Loading…: 100%|██████████| 6803/6803 [00:00<00:00, 442632.32it/s]


Gathering Data for Year: 1995, Month: 12



Loading…: 100%|██████████| 7151/7151 [00:00<00:00, 345149.23it/s]
Loading…:  60%|██████    | 6/10 [07:26<04:57, 74.42s/it]

Gathering Data for Year: 1996, Month: 1



Loading…: 100%|██████████| 6829/6829 [00:00<00:00, 423736.64it/s]


Gathering Data for Year: 1996, Month: 2



Loading…: 100%|██████████| 6605/6605 [00:00<00:00, 431073.63it/s]


Gathering Data for Year: 1996, Month: 3



Loading…: 100%|██████████| 7178/7178 [00:00<00:00, 367877.35it/s]


Gathering Data for Year: 1996, Month: 4



Loading…: 100%|██████████| 6705/6705 [00:00<00:00, 324627.54it/s]


Gathering Data for Year: 1996, Month: 5



Loading…: 100%|██████████| 6902/6902 [00:00<00:00, 276871.08it/s]


Gathering Data for Year: 1996, Month: 6



Loading…: 100%|██████████| 7034/7034 [00:00<00:00, 347909.60it/s]


Gathering Data for Year: 1996, Month: 7



Loading…: 100%|██████████| 6992/6992 [00:00<00:00, 390640.76it/s]


Gathering Data for Year: 1996, Month: 8



Loading…: 100%|██████████| 6010/6010 [00:00<00:00, 316243.47it/s]


Gathering Data for Year: 1996, Month: 9



Loading…: 100%|██████████| 6387/6387 [00:00<00:00, 429104.91it/s]


Gathering Data for Year: 1996, Month: 10



Loading…: 100%|██████████| 6675/6675 [00:00<00:00, 389003.62it/s]


Gathering Data for Year: 1996, Month: 11



Loading…: 100%|██████████| 6214/6214 [00:00<00:00, 350804.96it/s]


Gathering Data for Year: 1996, Month: 12



Loading…: 100%|██████████| 6258/6258 [00:00<00:00, 368314.80it/s]
Loading…:  70%|███████   | 7/10 [08:40<03:42, 74.21s/it]

Gathering Data for Year: 1997, Month: 1



Loading…: 100%|██████████| 7282/7282 [00:00<00:00, 359155.25it/s]


Gathering Data for Year: 1997, Month: 2



Loading…: 100%|██████████| 6967/6967 [00:00<00:00, 311881.27it/s]


Gathering Data for Year: 1997, Month: 3



Loading…: 100%|██████████| 7647/7647 [00:00<00:00, 392547.06it/s]


Gathering Data for Year: 1997, Month: 4



Loading…: 100%|██████████| 7386/7386 [00:00<00:00, 415994.75it/s]


Gathering Data for Year: 1997, Month: 5



Loading…: 100%|██████████| 7398/7398 [00:00<00:00, 395158.94it/s]


Gathering Data for Year: 1997, Month: 6



Loading…: 100%|██████████| 7427/7427 [00:00<00:00, 384178.28it/s]


Gathering Data for Year: 1997, Month: 7



Loading…: 100%|██████████| 7120/7120 [00:00<00:00, 349877.50it/s]


Gathering Data for Year: 1997, Month: 8



Loading…: 100%|██████████| 7296/7296 [00:00<00:00, 337781.38it/s]


Gathering Data for Year: 1997, Month: 9



Loading…: 100%|██████████| 7580/7580 [00:00<00:00, 351885.16it/s]


Gathering Data for Year: 1997, Month: 10



Loading…: 100%|██████████| 8280/8280 [00:00<00:00, 375946.80it/s]


Gathering Data for Year: 1997, Month: 11



Loading…: 100%|██████████| 7893/7893 [00:00<00:00, 369874.77it/s]


Gathering Data for Year: 1997, Month: 12



Loading…: 100%|██████████| 7742/7742 [00:00<00:00, 344749.52it/s]
Loading…:  80%|████████  | 8/10 [09:55<02:28, 74.30s/it]

Gathering Data for Year: 1998, Month: 1



Loading…: 100%|██████████| 7825/7825 [00:00<00:00, 378739.50it/s]


Gathering Data for Year: 1998, Month: 2



Loading…: 100%|██████████| 7147/7147 [00:00<00:00, 295993.00it/s]


Gathering Data for Year: 1998, Month: 3



Loading…: 100%|██████████| 8137/8137 [00:00<00:00, 393999.81it/s]


Gathering Data for Year: 1998, Month: 4



Loading…: 100%|██████████| 7669/7669 [00:00<00:00, 350065.49it/s]


Gathering Data for Year: 1998, Month: 5



Loading…: 100%|██████████| 8089/8089 [00:00<00:00, 391251.04it/s]


Gathering Data for Year: 1998, Month: 6



Loading…: 100%|██████████| 7628/7628 [00:00<00:00, 375069.18it/s]


Gathering Data for Year: 1998, Month: 7



Loading…: 100%|██████████| 7579/7579 [00:00<00:00, 355362.87it/s]


Gathering Data for Year: 1998, Month: 8



Loading…: 100%|██████████| 7730/7730 [00:00<00:00, 399802.33it/s]


Gathering Data for Year: 1998, Month: 9



Loading…: 100%|██████████| 7805/7805 [00:00<00:00, 368256.65it/s]


Gathering Data for Year: 1998, Month: 10



Loading…: 100%|██████████| 8314/8314 [00:00<00:00, 369890.68it/s]


Gathering Data for Year: 1998, Month: 11



Loading…: 100%|██████████| 8422/8422 [00:00<00:00, 279030.53it/s]


Gathering Data for Year: 1998, Month: 12



Loading…: 100%|██████████| 7899/7899 [00:00<00:00, 374643.03it/s]
Loading…:  90%|█████████ | 9/10 [11:10<01:14, 74.54s/it]

Gathering Data for Year: 1999, Month: 1



Loading…: 100%|██████████| 8588/8588 [00:00<00:00, 310756.19it/s]


Gathering Data for Year: 1999, Month: 2



Loading…: 100%|██████████| 7439/7439 [00:00<00:00, 369913.07it/s]


Gathering Data for Year: 1999, Month: 3



Loading…: 100%|██████████| 8372/8372 [00:00<00:00, 381602.85it/s]


Gathering Data for Year: 1999, Month: 4



Loading…: 100%|██████████| 7879/7879 [00:00<00:00, 376144.43it/s]


Gathering Data for Year: 1999, Month: 5



Loading…: 100%|██████████| 8031/8031 [00:00<00:00, 364416.29it/s]


Gathering Data for Year: 1999, Month: 6



Loading…: 100%|██████████| 8382/8382 [00:00<00:00, 267468.97it/s]


Gathering Data for Year: 1999, Month: 7



Loading…: 100%|██████████| 7821/7821 [00:00<00:00, 335738.35it/s]


Gathering Data for Year: 1999, Month: 8



Loading…: 100%|██████████| 7755/7755 [00:00<00:00, 402216.27it/s]


Gathering Data for Year: 1999, Month: 9



Loading…: 100%|██████████| 7575/7575 [00:00<00:00, 349091.37it/s]


Gathering Data for Year: 1999, Month: 10



Loading…: 100%|██████████| 8511/8511 [00:00<00:00, 377414.19it/s]


Gathering Data for Year: 1999, Month: 11



Loading…: 100%|██████████| 7874/7874 [00:00<00:00, 391739.02it/s]


Gathering Data for Year: 1999, Month: 12



Loading…: 100%|██████████| 8034/8034 [00:00<00:00, 371571.08it/s]
Loading…: 100%|██████████| 10/10 [12:25<00:00, 74.56s/it]


In [20]:
dict_0009 = get_data(2000, 2009) ## Gather Data from 2000 to 2009

Loading…:   0%|          | 0/10 [00:00<?, ?it/s]

Gathering Data for Year: 2000, Month: 1



Loading…: 100%|██████████| 8502/8502 [00:00<00:00, 379067.03it/s]


Gathering Data for Year: 2000, Month: 2



Loading…: 100%|██████████| 7953/7953 [00:00<00:00, 363226.84it/s]


Gathering Data for Year: 2000, Month: 3



Loading…: 100%|██████████| 8482/8482 [00:00<00:00, 225149.43it/s]


Gathering Data for Year: 2000, Month: 4



Loading…: 100%|██████████| 8231/8231 [00:00<00:00, 357561.90it/s]


Gathering Data for Year: 2000, Month: 5



Loading…: 100%|██████████| 8331/8331 [00:00<00:00, 428876.91it/s]


Gathering Data for Year: 2000, Month: 6



Loading…: 100%|██████████| 8193/8193 [00:00<00:00, 395733.71it/s]


Gathering Data for Year: 2000, Month: 7



Loading…: 100%|██████████| 8215/8215 [00:00<00:00, 388676.90it/s]


Gathering Data for Year: 2000, Month: 8



Loading…: 100%|██████████| 8131/8131 [00:00<00:00, 371789.57it/s]


Gathering Data for Year: 2000, Month: 9



Loading…: 100%|██████████| 9736/9736 [00:00<00:00, 382590.00it/s]


Gathering Data for Year: 2000, Month: 10



Loading…: 100%|██████████| 9966/9966 [00:00<00:00, 407300.48it/s]


Gathering Data for Year: 2000, Month: 11



Loading…: 100%|██████████| 8635/8635 [00:00<00:00, 383980.57it/s]


Gathering Data for Year: 2000, Month: 12



Loading…: 100%|██████████| 8425/8425 [00:00<00:00, 373774.46it/s]
Loading…:  10%|█         | 1/10 [01:16<11:25, 76.19s/it]

Gathering Data for Year: 2001, Month: 1



Loading…: 100%|██████████| 9180/9180 [00:00<00:00, 374345.79it/s]


Gathering Data for Year: 2001, Month: 2



Loading…: 100%|██████████| 8230/8230 [00:00<00:00, 349528.87it/s]


Gathering Data for Year: 2001, Month: 3



Loading…: 100%|██████████| 8978/8978 [00:00<00:00, 397459.03it/s]


Gathering Data for Year: 2001, Month: 4



Loading…: 100%|██████████| 8788/8788 [00:00<00:00, 349101.60it/s]


Gathering Data for Year: 2001, Month: 5



Loading…: 100%|██████████| 9649/9649 [00:00<00:00, 383028.95it/s]


Gathering Data for Year: 2001, Month: 6



Loading…: 100%|██████████| 9215/9215 [00:00<00:00, 282607.35it/s]


Gathering Data for Year: 2001, Month: 7



Loading…: 100%|██████████| 9058/9058 [00:00<00:00, 389877.53it/s]


Gathering Data for Year: 2001, Month: 8



Loading…: 100%|██████████| 9214/9214 [00:00<00:00, 342357.28it/s]


Gathering Data for Year: 2001, Month: 9



Loading…: 100%|██████████| 9482/9482 [00:00<00:00, 389260.84it/s]


Gathering Data for Year: 2001, Month: 10



Loading…: 100%|██████████| 10168/10168 [00:00<00:00, 422664.40it/s]


Gathering Data for Year: 2001, Month: 11



Loading…: 100%|██████████| 9667/9667 [00:00<00:00, 419695.23it/s]


Gathering Data for Year: 2001, Month: 12



Loading…: 100%|██████████| 9683/9683 [00:00<00:00, 381640.75it/s]
Loading…:  20%|██        | 2/10 [02:33<10:14, 76.83s/it]

Gathering Data for Year: 2002, Month: 1



Loading…: 100%|██████████| 9147/9147 [00:00<00:00, 355725.02it/s]


Gathering Data for Year: 2002, Month: 2



Loading…: 100%|██████████| 8535/8535 [00:00<00:00, 310128.00it/s]


Gathering Data for Year: 2002, Month: 3



Loading…: 100%|██████████| 8902/8902 [00:00<00:00, 355983.58it/s]


Gathering Data for Year: 2002, Month: 4



Loading…: 100%|██████████| 10616/10616 [00:00<00:00, 407239.31it/s]


Gathering Data for Year: 2002, Month: 5



Loading…: 100%|██████████| 9260/9260 [00:00<00:00, 417882.52it/s]


Gathering Data for Year: 2002, Month: 6



Loading…: 100%|██████████| 9268/9268 [00:00<00:00, 333431.77it/s]


Gathering Data for Year: 2002, Month: 7



Loading…: 100%|██████████| 8580/8580 [00:00<00:00, 386912.61it/s]


Gathering Data for Year: 2002, Month: 8



Loading…: 100%|██████████| 8487/8487 [00:00<00:00, 234029.51it/s]


Gathering Data for Year: 2002, Month: 9



Loading…: 100%|██████████| 9272/9272 [00:00<00:00, 367056.03it/s]


Gathering Data for Year: 2002, Month: 10



Loading…: 100%|██████████| 9713/9713 [00:00<00:00, 408918.01it/s]


Gathering Data for Year: 2002, Month: 11



Loading…: 100%|██████████| 9115/9115 [00:00<00:00, 392089.52it/s]


Gathering Data for Year: 2002, Month: 12



Loading…: 100%|██████████| 9155/9155 [00:00<00:00, 400864.95it/s]
Loading…:  30%|███       | 3/10 [03:50<08:58, 76.93s/it]

Gathering Data for Year: 2003, Month: 1



Loading…: 100%|██████████| 9213/9213 [00:00<00:00, 383780.82it/s]


Gathering Data for Year: 2003, Month: 2



Loading…: 100%|██████████| 8603/8603 [00:00<00:00, 351120.47it/s]


Gathering Data for Year: 2003, Month: 3



Loading…: 100%|██████████| 9766/9766 [00:00<00:00, 351284.88it/s]


Gathering Data for Year: 2003, Month: 4



Loading…: 100%|██████████| 9351/9351 [00:00<00:00, 303581.72it/s]


Gathering Data for Year: 2003, Month: 5



Loading…: 100%|██████████| 9157/9157 [00:00<00:00, 312923.09it/s]


Gathering Data for Year: 2003, Month: 6



Loading…: 100%|██████████| 9173/9173 [00:00<00:00, 403134.50it/s]


Gathering Data for Year: 2003, Month: 7



Loading…: 100%|██████████| 8568/8568 [00:00<00:00, 356611.36it/s]


Gathering Data for Year: 2003, Month: 8



Loading…: 100%|██████████| 8701/8701 [00:00<00:00, 296965.13it/s]


Gathering Data for Year: 2003, Month: 9



Loading…: 100%|██████████| 8555/8555 [00:00<00:00, 433806.09it/s]


Gathering Data for Year: 2003, Month: 10



Loading…: 100%|██████████| 9204/9204 [00:00<00:00, 387189.82it/s]


Gathering Data for Year: 2003, Month: 11



Loading…: 100%|██████████| 9034/9034 [00:00<00:00, 450540.32it/s]


Gathering Data for Year: 2003, Month: 12



Loading…: 100%|██████████| 8747/8747 [00:00<00:00, 414567.63it/s]
Loading…:  40%|████      | 4/10 [05:07<07:41, 76.94s/it]

Gathering Data for Year: 2004, Month: 1



Loading…: 100%|██████████| 8810/8810 [00:00<00:00, 359285.72it/s]


Gathering Data for Year: 2004, Month: 2



Loading…: 100%|██████████| 8750/8750 [00:00<00:00, 404877.93it/s]


Gathering Data for Year: 2004, Month: 3



Loading…: 100%|██████████| 8986/8986 [00:00<00:00, 398368.22it/s]


Gathering Data for Year: 2004, Month: 4



Loading…: 100%|██████████| 8519/8519 [00:00<00:00, 415122.75it/s]


Gathering Data for Year: 2004, Month: 5



Loading…: 100%|██████████| 8899/8899 [00:00<00:00, 359479.46it/s]


Gathering Data for Year: 2004, Month: 6



Loading…: 100%|██████████| 8436/8436 [00:00<00:00, 440598.56it/s]


Gathering Data for Year: 2004, Month: 7



Loading…: 100%|██████████| 8696/8696 [00:00<00:00, 462435.40it/s]


Gathering Data for Year: 2004, Month: 8



Loading…: 100%|██████████| 8682/8682 [00:00<00:00, 358732.61it/s]


Gathering Data for Year: 2004, Month: 9



Loading…: 100%|██████████| 9131/9131 [00:00<00:00, 321005.39it/s]


Gathering Data for Year: 2004, Month: 10



Loading…: 100%|██████████| 10876/10876 [00:00<00:00, 340046.59it/s]


Gathering Data for Year: 2004, Month: 11



Loading…: 100%|██████████| 8997/8997 [00:00<00:00, 306347.19it/s]


Gathering Data for Year: 2004, Month: 12



Loading…: 100%|██████████| 9165/9165 [00:00<00:00, 372217.83it/s]
Loading…:  50%|█████     | 5/10 [06:24<06:24, 76.95s/it]

Gathering Data for Year: 2005, Month: 1



Loading…: 100%|██████████| 10567/10567 [00:00<00:00, 358513.33it/s]


Gathering Data for Year: 2005, Month: 2



Loading…: 100%|██████████| 9311/9311 [00:00<00:00, 265906.56it/s]


Gathering Data for Year: 2005, Month: 3



Loading…: 100%|██████████| 10262/10262 [00:00<00:00, 337192.50it/s]


Gathering Data for Year: 2005, Month: 4



Loading…: 100%|██████████| 10180/10180 [00:00<00:00, 407645.52it/s]


Gathering Data for Year: 2005, Month: 5



Loading…: 100%|██████████| 10526/10526 [00:00<00:00, 326646.72it/s]


Gathering Data for Year: 2005, Month: 6



Loading…: 100%|██████████| 10032/10032 [00:00<00:00, 348760.51it/s]


Gathering Data for Year: 2005, Month: 7



Loading…: 100%|██████████| 10113/10113 [00:00<00:00, 342846.72it/s]


Gathering Data for Year: 2005, Month: 8



Loading…: 100%|██████████| 9698/9698 [00:00<00:00, 401020.98it/s]


Gathering Data for Year: 2005, Month: 9



Loading…: 100%|██████████| 10256/10256 [00:00<00:00, 375781.03it/s]


Gathering Data for Year: 2005, Month: 10



Loading…: 100%|██████████| 10464/10464 [00:00<00:00, 399930.72it/s]


Gathering Data for Year: 2005, Month: 11



Loading…: 100%|██████████| 10130/10130 [00:00<00:00, 325441.19it/s]


Gathering Data for Year: 2005, Month: 12



Loading…: 100%|██████████| 9946/9946 [00:00<00:00, 392792.69it/s]
Loading…:  60%|██████    | 6/10 [07:41<05:08, 77.13s/it]

Gathering Data for Year: 2006, Month: 1



Loading…: 100%|██████████| 10671/10671 [00:00<00:00, 370655.71it/s]


Gathering Data for Year: 2006, Month: 2



Loading…: 100%|██████████| 10030/10030 [00:00<00:00, 454307.44it/s]


Gathering Data for Year: 2006, Month: 3



Loading…: 100%|██████████| 12357/12357 [00:00<00:00, 382982.45it/s]


Gathering Data for Year: 2006, Month: 4



Loading…: 100%|██████████| 12130/12130 [00:00<00:00, 363536.32it/s]


Gathering Data for Year: 2006, Month: 5



Loading…: 100%|██████████| 15001/15001 [00:00<00:00, 422929.20it/s]


Gathering Data for Year: 2006, Month: 6



Loading…: 100%|██████████| 15109/15109 [00:00<00:00, 424424.96it/s]


Gathering Data for Year: 2006, Month: 7



Loading…: 100%|██████████| 14294/14294 [00:00<00:00, 405792.33it/s]


Gathering Data for Year: 2006, Month: 8



Loading…: 100%|██████████| 14075/14075 [00:00<00:00, 326597.75it/s]


Gathering Data for Year: 2006, Month: 9



Loading…: 100%|██████████| 15036/15036 [00:00<00:00, 423236.03it/s]


Gathering Data for Year: 2006, Month: 10



Loading…: 100%|██████████| 14845/14845 [00:00<00:00, 384267.76it/s]


Gathering Data for Year: 2006, Month: 11



Loading…: 100%|██████████| 15028/15028 [00:00<00:00, 420502.08it/s]


Gathering Data for Year: 2006, Month: 12



Loading…: 100%|██████████| 14904/14904 [00:00<00:00, 367747.38it/s]
Loading…:  70%|███████   | 7/10 [09:04<03:56, 78.78s/it]

Gathering Data for Year: 2007, Month: 1



Loading…: 100%|██████████| 7905/7905 [00:00<00:00, 445237.86it/s]


Gathering Data for Year: 2007, Month: 2



Loading…: 100%|██████████| 7186/7186 [00:00<00:00, 409781.77it/s]


Gathering Data for Year: 2007, Month: 3



Loading…: 100%|██████████| 8428/8428 [00:00<00:00, 396811.93it/s]


Gathering Data for Year: 2007, Month: 4



Loading…: 100%|██████████| 8068/8068 [00:00<00:00, 377186.28it/s]


Gathering Data for Year: 2007, Month: 5



Loading…: 100%|██████████| 8254/8254 [00:00<00:00, 429398.01it/s]


Gathering Data for Year: 2007, Month: 6



Loading…: 100%|██████████| 8230/8230 [00:00<00:00, 440722.15it/s]


Gathering Data for Year: 2007, Month: 7



Loading…: 100%|██████████| 8558/8558 [00:00<00:00, 372210.39it/s]


Gathering Data for Year: 2007, Month: 8



Loading…: 100%|██████████| 8061/8061 [00:00<00:00, 438781.19it/s]


Gathering Data for Year: 2007, Month: 9



Loading…: 100%|██████████| 9063/9063 [00:00<00:00, 414229.11it/s]


Gathering Data for Year: 2007, Month: 10



Loading…: 100%|██████████| 9561/9561 [00:00<00:00, 430983.70it/s]


Gathering Data for Year: 2007, Month: 11



Loading…: 100%|██████████| 8687/8687 [00:00<00:00, 482818.77it/s]


Gathering Data for Year: 2007, Month: 12



Loading…: 100%|██████████| 8962/8962 [00:00<00:00, 359575.97it/s]
Loading…:  80%|████████  | 8/10 [10:19<02:35, 77.70s/it]

Gathering Data for Year: 2008, Month: 1



Loading…: 100%|██████████| 9857/9857 [00:00<00:00, 360550.59it/s]


Gathering Data for Year: 2008, Month: 2



Loading…: 100%|██████████| 9083/9083 [00:00<00:00, 466598.85it/s]


Gathering Data for Year: 2008, Month: 3



Loading…: 100%|██████████| 9561/9561 [00:00<00:00, 412039.46it/s]


Gathering Data for Year: 2008, Month: 4



Loading…: 100%|██████████| 9449/9449 [00:00<00:00, 423395.96it/s]


Gathering Data for Year: 2008, Month: 5



Loading…: 100%|██████████| 9468/9468 [00:00<00:00, 381898.06it/s]


Gathering Data for Year: 2008, Month: 6



Loading…: 100%|██████████| 9368/9368 [00:00<00:00, 423837.07it/s]


Gathering Data for Year: 2008, Month: 7



Loading…: 100%|██████████| 9323/9323 [00:00<00:00, 377377.66it/s]


Gathering Data for Year: 2008, Month: 8



Loading…: 100%|██████████| 10281/10281 [00:00<00:00, 500314.88it/s]


Gathering Data for Year: 2008, Month: 9



Loading…: 100%|██████████| 10769/10769 [00:00<00:00, 410405.97it/s]


Gathering Data for Year: 2008, Month: 10



Loading…: 100%|██████████| 11545/11545 [00:00<00:00, 445635.87it/s]


Gathering Data for Year: 2008, Month: 11



Loading…: 100%|██████████| 10054/10054 [00:00<00:00, 463462.57it/s]


Gathering Data for Year: 2008, Month: 12



Loading…: 100%|██████████| 10137/10137 [00:00<00:00, 415816.57it/s]
Loading…:  90%|█████████ | 9/10 [11:36<01:17, 77.51s/it]

Gathering Data for Year: 2009, Month: 1



Loading…: 100%|██████████| 10691/10691 [00:00<00:00, 413531.65it/s]


Gathering Data for Year: 2009, Month: 2



Loading…: 100%|██████████| 10072/10072 [00:00<00:00, 385943.87it/s]


Gathering Data for Year: 2009, Month: 3



Loading…: 100%|██████████| 11334/11334 [00:00<00:00, 411697.01it/s]


Gathering Data for Year: 2009, Month: 4



Loading…: 100%|██████████| 11042/11042 [00:00<00:00, 422838.54it/s]


Gathering Data for Year: 2009, Month: 5



Loading…: 100%|██████████| 13632/13632 [00:00<00:00, 410301.55it/s]


Gathering Data for Year: 2009, Month: 6



Loading…: 100%|██████████| 14413/14413 [00:00<00:00, 407383.85it/s]


Gathering Data for Year: 2009, Month: 7



Loading…: 100%|██████████| 13687/13687 [00:00<00:00, 409760.45it/s]


Gathering Data for Year: 2009, Month: 8



Loading…: 100%|██████████| 13140/13140 [00:00<00:00, 348365.44it/s]


Gathering Data for Year: 2009, Month: 9



Loading…: 100%|██████████| 14558/14558 [00:00<00:00, 309897.62it/s]


Gathering Data for Year: 2009, Month: 10



Loading…: 100%|██████████| 15463/15463 [00:00<00:00, 396237.33it/s]


Gathering Data for Year: 2009, Month: 11



Loading…: 100%|██████████| 9432/9432 [00:00<00:00, 429596.42it/s]


Gathering Data for Year: 2009, Month: 12



Loading…: 100%|██████████| 9475/9475 [00:00<00:00, 379541.49it/s]
Loading…: 100%|██████████| 10/10 [12:55<00:00, 77.55s/it]


In [21]:
dict_1019 = get_data(2010, 2019) ## Gather Data from 2010 to 2019

Loading…:   0%|          | 0/10 [00:00<?, ?it/s]

Gathering Data for Year: 2010, Month: 1



Loading…: 100%|██████████| 9820/9820 [00:00<00:00, 425628.45it/s]


Gathering Data for Year: 2010, Month: 2



Loading…: 100%|██████████| 9298/9298 [00:00<00:00, 459272.19it/s]


Gathering Data for Year: 2010, Month: 3



Loading…: 100%|██████████| 10452/10452 [00:00<00:00, 393452.45it/s]


Gathering Data for Year: 2010, Month: 4



Loading…: 100%|██████████| 9770/9770 [00:00<00:00, 486499.63it/s]


Gathering Data for Year: 2010, Month: 5



Loading…: 100%|██████████| 9537/9537 [00:00<00:00, 478333.02it/s]


Gathering Data for Year: 2010, Month: 6



Loading…: 100%|██████████| 9609/9609 [00:00<00:00, 430183.88it/s]


Gathering Data for Year: 2010, Month: 7



Loading…: 100%|██████████| 9085/9085 [00:00<00:00, 425073.09it/s]


Gathering Data for Year: 2010, Month: 8



Loading…: 100%|██████████| 8793/8793 [00:00<00:00, 450091.71it/s]


Gathering Data for Year: 2010, Month: 9



Loading…: 100%|██████████| 9404/9404 [00:00<00:00, 431073.60it/s]


Gathering Data for Year: 2010, Month: 10



Loading…: 100%|██████████| 9691/9691 [00:00<00:00, 434476.35it/s]


Gathering Data for Year: 2010, Month: 11



Loading…: 100%|██████████| 8679/8679 [00:00<00:00, 467080.23it/s]


Gathering Data for Year: 2010, Month: 12



Loading…: 100%|██████████| 8310/8310 [00:00<00:00, 452469.96it/s]
Loading…:  10%|█         | 1/10 [01:16<11:26, 76.29s/it]

Gathering Data for Year: 2011, Month: 1



Loading…: 100%|██████████| 8478/8478 [00:00<00:00, 504716.68it/s]


Gathering Data for Year: 2011, Month: 2



Loading…: 100%|██████████| 7917/7917 [00:00<00:00, 467003.79it/s]


Gathering Data for Year: 2011, Month: 3



Loading…: 100%|██████████| 9026/9026 [00:00<00:00, 459361.13it/s]


Gathering Data for Year: 2011, Month: 4



Loading…: 100%|██████████| 8751/8751 [00:00<00:00, 523375.93it/s]


Gathering Data for Year: 2011, Month: 5



Loading…: 100%|██████████| 8732/8732 [00:00<00:00, 538724.74it/s]


Gathering Data for Year: 2011, Month: 6



Loading…: 100%|██████████| 8602/8602 [00:00<00:00, 478767.01it/s]


Gathering Data for Year: 2011, Month: 7



Loading…: 100%|██████████| 7987/7987 [00:00<00:00, 465852.32it/s]


Gathering Data for Year: 2011, Month: 8



Loading…: 100%|██████████| 8138/8138 [00:00<00:00, 448295.85it/s]


Gathering Data for Year: 2011, Month: 9



Loading…: 100%|██████████| 9200/9200 [00:00<00:00, 367053.47it/s]


Gathering Data for Year: 2011, Month: 10



Loading…: 100%|██████████| 8912/8912 [00:00<00:00, 490404.83it/s]


Gathering Data for Year: 2011, Month: 11



Loading…: 100%|██████████| 8722/8722 [00:00<00:00, 490437.57it/s]


Gathering Data for Year: 2011, Month: 12



Loading…: 100%|██████████| 8504/8504 [00:00<00:00, 388129.90it/s]
Loading…:  20%|██        | 2/10 [02:32<10:11, 76.46s/it]

Gathering Data for Year: 2012, Month: 1



Loading…: 100%|██████████| 9168/9168 [00:00<00:00, 417817.10it/s]


Gathering Data for Year: 2012, Month: 2



Loading…: 100%|██████████| 8796/8796 [00:00<00:00, 503350.81it/s]


Gathering Data for Year: 2012, Month: 3



Loading…: 100%|██████████| 9280/9280 [00:00<00:00, 423551.81it/s]


Gathering Data for Year: 2012, Month: 4



Loading…: 100%|██████████| 8240/8240 [00:00<00:00, 473432.76it/s]


Gathering Data for Year: 2012, Month: 5



Loading…: 100%|██████████| 8862/8862 [00:00<00:00, 509267.71it/s]


Gathering Data for Year: 2012, Month: 6



Loading…: 100%|██████████| 8704/8704 [00:00<00:00, 531408.35it/s]


Gathering Data for Year: 2012, Month: 7



Loading…: 100%|██████████| 7795/7795 [00:00<00:00, 394290.88it/s]


Gathering Data for Year: 2012, Month: 8



Loading…: 100%|██████████| 8202/8202 [00:00<00:00, 514156.26it/s]


Gathering Data for Year: 2012, Month: 9



Loading…: 100%|██████████| 8306/8306 [00:00<00:00, 438571.02it/s]


Gathering Data for Year: 2012, Month: 10



Loading…: 100%|██████████| 8731/8731 [00:00<00:00, 541971.44it/s]


Gathering Data for Year: 2012, Month: 11



Loading…: 100%|██████████| 7959/7959 [00:00<00:00, 410623.58it/s]


Gathering Data for Year: 2012, Month: 12



Loading…: 100%|██████████| 7212/7212 [00:00<00:00, 507734.87it/s]
Loading…:  30%|███       | 3/10 [03:51<09:01, 77.39s/it]

Gathering Data for Year: 2013, Month: 1



Loading…: 100%|██████████| 7961/7961 [00:00<00:00, 432787.50it/s]


Gathering Data for Year: 2013, Month: 2



Loading…: 100%|██████████| 7504/7504 [00:00<00:00, 429258.03it/s]


Gathering Data for Year: 2013, Month: 3



Loading…: 100%|██████████| 8181/8181 [00:00<00:00, 244740.53it/s]


Gathering Data for Year: 2013, Month: 4



Loading…: 100%|██████████| 7616/7616 [00:00<00:00, 465171.90it/s]


Gathering Data for Year: 2013, Month: 5



Loading…: 100%|██████████| 7968/7968 [00:00<00:00, 479355.90it/s]


Gathering Data for Year: 2013, Month: 6



Loading…: 100%|██████████| 7401/7401 [00:00<00:00, 419424.73it/s]


Gathering Data for Year: 2013, Month: 7



Loading…: 100%|██████████| 6870/6870 [00:00<00:00, 422865.02it/s]


Gathering Data for Year: 2013, Month: 8



Loading…: 100%|██████████| 6634/6634 [00:00<00:00, 460915.58it/s]


Gathering Data for Year: 2013, Month: 9



Loading…: 100%|██████████| 7348/7348 [00:00<00:00, 372259.62it/s]


Gathering Data for Year: 2013, Month: 10



Loading…: 100%|██████████| 7711/7711 [00:00<00:00, 526284.35it/s]


Gathering Data for Year: 2013, Month: 11



Loading…: 100%|██████████| 7015/7015 [00:00<00:00, 496097.43it/s]


Gathering Data for Year: 2013, Month: 12



Loading…: 100%|██████████| 6775/6775 [00:00<00:00, 541378.38it/s]
Loading…:  40%|████      | 4/10 [05:08<07:44, 77.47s/it]

Gathering Data for Year: 2014, Month: 1



Loading…: 100%|██████████| 7337/7337 [00:00<00:00, 419928.34it/s]


Gathering Data for Year: 2014, Month: 2



Loading…: 100%|██████████| 6941/6941 [00:00<00:00, 437317.51it/s]


Gathering Data for Year: 2014, Month: 3



Loading…: 100%|██████████| 7187/7187 [00:00<00:00, 352826.79it/s]


Gathering Data for Year: 2014, Month: 4



Loading…: 100%|██████████| 7344/7344 [00:00<00:00, 409249.32it/s]


Gathering Data for Year: 2014, Month: 5



Loading…: 100%|██████████| 7549/7549 [00:00<00:00, 434761.37it/s]


Gathering Data for Year: 2014, Month: 6



Loading…: 100%|██████████| 7345/7345 [00:00<00:00, 403816.53it/s]


Gathering Data for Year: 2014, Month: 7



Loading…: 100%|██████████| 6814/6814 [00:00<00:00, 414244.74it/s]


Gathering Data for Year: 2014, Month: 8



Loading…: 100%|██████████| 6577/6577 [00:00<00:00, 393057.26it/s]


Gathering Data for Year: 2014, Month: 9



Loading…: 100%|██████████| 7585/7585 [00:00<00:00, 481334.38it/s]


Gathering Data for Year: 2014, Month: 10



Loading…: 100%|██████████| 7862/7862 [00:00<00:00, 417788.37it/s]


Gathering Data for Year: 2014, Month: 11



Loading…: 100%|██████████| 6709/6709 [00:00<00:00, 379936.08it/s]


Gathering Data for Year: 2014, Month: 12



Loading…: 100%|██████████| 6701/6701 [00:00<00:00, 498705.26it/s]
Loading…:  50%|█████     | 5/10 [06:26<06:28, 77.63s/it]

Gathering Data for Year: 2015, Month: 1



Loading…: 100%|██████████| 6906/6906 [00:00<00:00, 494373.94it/s]


Gathering Data for Year: 2015, Month: 2



Loading…: 100%|██████████| 6437/6437 [00:00<00:00, 477287.73it/s]


Gathering Data for Year: 2015, Month: 3



Loading…: 100%|██████████| 6975/6975 [00:00<00:00, 422020.00it/s]


Gathering Data for Year: 2015, Month: 4



Loading…: 100%|██████████| 6542/6542 [00:00<00:00, 362759.61it/s]


Gathering Data for Year: 2015, Month: 5



Loading…: 100%|██████████| 6634/6634 [00:00<00:00, 370762.88it/s]


Gathering Data for Year: 2015, Month: 6



Loading…: 100%|██████████| 6947/6947 [00:00<00:00, 444872.74it/s]


Gathering Data for Year: 2015, Month: 7



Loading…: 100%|██████████| 6758/6758 [00:00<00:00, 342518.35it/s]


Gathering Data for Year: 2015, Month: 8



Loading…: 100%|██████████| 6086/6086 [00:00<00:00, 353220.43it/s]


Gathering Data for Year: 2015, Month: 9



Loading…: 100%|██████████| 6990/6990 [00:00<00:00, 419508.42it/s]


Gathering Data for Year: 2015, Month: 10



Loading…: 100%|██████████| 7365/7365 [00:00<00:00, 419681.13it/s]


Gathering Data for Year: 2015, Month: 11



Loading…: 100%|██████████| 6241/6241 [00:00<00:00, 427046.21it/s]


Gathering Data for Year: 2015, Month: 12



Loading…: 100%|██████████| 6371/6371 [00:00<00:00, 355604.64it/s]
Loading…:  60%|██████    | 6/10 [07:44<05:11, 77.77s/it]

Gathering Data for Year: 2016, Month: 1



Loading…: 100%|██████████| 6265/6265 [00:00<00:00, 359180.89it/s]


Gathering Data for Year: 2016, Month: 2



Loading…: 100%|██████████| 6448/6448 [00:00<00:00, 421200.64it/s]


Gathering Data for Year: 2016, Month: 3



Loading…: 100%|██████████| 6487/6487 [00:00<00:00, 460286.41it/s]


Gathering Data for Year: 2016, Month: 4



Loading…: 100%|██████████| 5927/5927 [00:00<00:00, 388352.99it/s]


Gathering Data for Year: 2016, Month: 5



Loading…: 100%|██████████| 5921/5921 [00:00<00:00, 403242.14it/s]


Gathering Data for Year: 2016, Month: 6



Loading…: 100%|██████████| 5920/5920 [00:00<00:00, 277194.81it/s]


Gathering Data for Year: 2016, Month: 7



Loading…: 100%|██████████| 5594/5594 [00:00<00:00, 412941.74it/s]


Gathering Data for Year: 2016, Month: 8



Loading…: 100%|██████████| 5224/5224 [00:00<00:00, 284925.35it/s]


Gathering Data for Year: 2016, Month: 9



Loading…: 100%|██████████| 5894/5894 [00:00<00:00, 473686.56it/s]


Gathering Data for Year: 2016, Month: 10



Loading…: 100%|██████████| 5511/5511 [00:00<00:00, 328952.15it/s]


Gathering Data for Year: 2016, Month: 11



Loading…: 100%|██████████| 5331/5331 [00:00<00:00, 423321.37it/s]


Gathering Data for Year: 2016, Month: 12



Loading…: 100%|██████████| 4967/4967 [00:00<00:00, 522434.18it/s]
Loading…:  70%|███████   | 7/10 [09:02<03:52, 77.63s/it]

Gathering Data for Year: 2017, Month: 1



Loading…: 100%|██████████| 5129/5129 [00:00<00:00, 409506.12it/s]


Gathering Data for Year: 2017, Month: 2



Loading…: 100%|██████████| 4935/4935 [00:00<00:00, 514949.01it/s]


Gathering Data for Year: 2017, Month: 3



Loading…: 100%|██████████| 5546/5546 [00:00<00:00, 375805.52it/s]


Gathering Data for Year: 2017, Month: 4



Loading…: 100%|██████████| 4878/4878 [00:00<00:00, 366040.16it/s]


Gathering Data for Year: 2017, Month: 5



Loading…: 100%|██████████| 5384/5384 [00:00<00:00, 507201.51it/s]


Gathering Data for Year: 2017, Month: 6



Loading…: 100%|██████████| 5300/5300 [00:00<00:00, 343678.48it/s]


Gathering Data for Year: 2017, Month: 7



Loading…: 100%|██████████| 4757/4757 [00:00<00:00, 452371.65it/s]


Gathering Data for Year: 2017, Month: 8



Loading…: 100%|██████████| 4854/4854 [00:00<00:00, 452084.02it/s]


Gathering Data for Year: 2017, Month: 9



Loading…: 100%|██████████| 5033/5033 [00:00<00:00, 404722.71it/s]


Gathering Data for Year: 2017, Month: 10



Loading…: 100%|██████████| 5214/5214 [00:00<00:00, 414729.50it/s]


Gathering Data for Year: 2017, Month: 11



Loading…: 100%|██████████| 4834/4834 [00:00<00:00, 395476.04it/s]


Gathering Data for Year: 2017, Month: 12



Loading…: 100%|██████████| 4560/4560 [00:00<00:00, 338417.90it/s]
Loading…:  80%|████████  | 8/10 [10:18<02:34, 77.05s/it]

Gathering Data for Year: 2018, Month: 1



Loading…: 100%|██████████| 4757/4757 [00:00<00:00, 431056.33it/s]


Gathering Data for Year: 2018, Month: 2



Loading…: 100%|██████████| 4573/4573 [00:00<00:00, 298506.77it/s]


Gathering Data for Year: 2018, Month: 3



Loading…: 100%|██████████| 5034/5034 [00:00<00:00, 450569.27it/s]


Gathering Data for Year: 2018, Month: 4



Loading…: 100%|██████████| 4655/4655 [00:00<00:00, 392325.79it/s]


Gathering Data for Year: 2018, Month: 5



Loading…: 100%|██████████| 5127/5127 [00:00<00:00, 527167.01it/s]


Gathering Data for Year: 2018, Month: 6



Loading…: 100%|██████████| 4988/4988 [00:00<00:00, 301002.64it/s]


Gathering Data for Year: 2018, Month: 7



Loading…: 100%|██████████| 4444/4444 [00:00<00:00, 390896.04it/s]


Gathering Data for Year: 2018, Month: 8



Loading…: 100%|██████████| 6381/6381 [00:00<00:00, 514729.09it/s]


Gathering Data for Year: 2018, Month: 9



Loading…: 100%|██████████| 4722/4722 [00:00<00:00, 367564.97it/s]


Gathering Data for Year: 2018, Month: 10



Loading…: 100%|██████████| 5146/5146 [00:00<00:00, 351993.48it/s]


Gathering Data for Year: 2018, Month: 11



Loading…: 100%|██████████| 4829/4829 [00:00<00:00, 342868.89it/s]


Gathering Data for Year: 2018, Month: 12



Loading…: 100%|██████████| 4193/4193 [00:00<00:00, 408993.41it/s]
Loading…:  90%|█████████ | 9/10 [11:33<01:16, 76.69s/it]

Gathering Data for Year: 2019, Month: 1



Loading…: 100%|██████████| 4482/4482 [00:00<00:00, 364269.78it/s]


Gathering Data for Year: 2019, Month: 2



Loading…: 100%|██████████| 4122/4122 [00:00<00:00, 379051.57it/s]


Gathering Data for Year: 2019, Month: 3



Loading…: 100%|██████████| 4690/4690 [00:00<00:00, 346844.50it/s]


Gathering Data for Year: 2019, Month: 4



Loading…: 100%|██████████| 4523/4523 [00:00<00:00, 382785.25it/s]


Gathering Data for Year: 2019, Month: 5



Loading…: 100%|██████████| 4737/4737 [00:00<00:00, 406764.62it/s]


Gathering Data for Year: 2019, Month: 6



Loading…: 100%|██████████| 4496/4496 [00:00<00:00, 390393.98it/s]


Gathering Data for Year: 2019, Month: 7



Loading…: 100%|██████████| 4287/4287 [00:00<00:00, 325076.95it/s]


Gathering Data for Year: 2019, Month: 8



Loading…: 100%|██████████| 4109/4109 [00:00<00:00, 415787.58it/s]


Gathering Data for Year: 2019, Month: 9



Loading…: 100%|██████████| 4418/4418 [00:00<00:00, 255627.47it/s]


Gathering Data for Year: 2019, Month: 10



Loading…: 100%|██████████| 4989/4989 [00:00<00:00, 418976.91it/s]


Gathering Data for Year: 2019, Month: 11



Loading…: 100%|██████████| 4420/4420 [00:00<00:00, 302225.65it/s]


Gathering Data for Year: 2019, Month: 12



Loading…: 100%|██████████| 3985/3985 [00:00<00:00, 253658.23it/s]
Loading…: 100%|██████████| 10/10 [12:49<00:00, 76.95s/it]


## 2. Pre-Process the Data - Part 2. NLP: Tokenisation & Lemmatisation

### 2.1 Load spaCy and the English Model

In [23]:
# !python -m spacy download en_core_web_md

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.0/en_core_web_md-3.4.0-py3-none-any.whl (42.8 MB)
[K     |████████████████████████████████| 42.8 MB 417 kB/s 
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [24]:
# Load the medium English spaCy pipeline. The "en_core_web_md" package must
# already be installed (see the download command in the cell above).
nlp = spacy.load("en_core_web_md")

### 2.2 Define Helper Functions for Filtering Stop Words

In [25]:
def create_doc(text):
    """Normalise a raw text snippet and run it through the spaCy pipeline.

    Cleaning steps, in order:
      1. lower-case the text;
      2. remove parenthesised asides, e.g. "(reuters)";
      3. remove apostrophes, commas, full stops, colons and semicolons;
      4. collapse every run of whitespace (spaces, tabs, newlines) into a
         single space and trim both ends.

    Step 4 runs last on purpose: removing parentheses and punctuation can
    leave double spaces behind, so collapsing earlier would miss them.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    The object returned by the module-level ``nlp`` pipeline for the cleaned
    text (a tokenised spaCy document).
    """
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r"[',.:;]", "", text)
    # The previous version substituted each whitespace run with itself
    # (a no-op); here the run is genuinely replaced by one space.
    text = re.sub(r"\s+", " ", text).strip()

    return nlp(text)  ## Tokenisation

In [26]:
def filter_text(doc):
    """Reduce a tokenised document to a list of alphabetic lemmas.

    A token is kept when it is not a stop word, or when its dependency label
    is in ``rescued_deps`` and its part-of-speech tag is not "DET". The lemma
    of every kept token is then looked up in ``nlp.vocab`` and retained only
    if it is not blank, not punctuation and purely alphabetic.

    Parameters
    ----------
    doc : iterable of spaCy tokens
        Typically the value returned by ``create_doc``.

    Returns
    -------
    list of str
        The filtered lemmas, in document order (only this list is returned).
    """
    # NOTE(review): "npadvmod:" carries a trailing colon, so it presumably never
    # equals a real dependency label ("npadvmod") -- confirm and fix upstream.
    rescued_deps = ["det", "neg", "npadvmod:"]

    ## Remove Stop Words and take the lemma of each surviving token
    lemmas = [
        tok.lemma_
        for tok in doc
        if not tok.is_stop or (tok.dep_ in rescued_deps and tok.pos_ != "DET")
    ]

    ## Remove Punctuations and Non-alphabetical texts
    filtered_doc = []
    for lemma in lemmas:
        lexeme = nlp.vocab[lemma]
        # `lemma in " "` is a substring test: true only for "" and " ".
        if lemma in " " or lexeme.is_punct or not lexeme.is_alpha:
            continue
        filtered_doc.append(lemma)

    return filtered_doc

In [27]:
## Helper Function for Creating a Dataframe
def create_df(data_dict, max_chars=1000):
    """Turn a gathered-articles dictionary into a cleaned DataFrame.

    Every text found in ``data_dict`` is tokenised with ``create_doc`` and
    filtered with ``filter_text``. Duplicate texts and texts longer than
    ``max_chars`` characters are then dropped.

    Parameters
    ----------
    data_dict : dict
        Each value must be indexable, with element ``0`` being a mapping of
        ``date -> sequence of text strings``. The outer keys are ignored.
    max_chars : int, default 1000
        Texts whose length exceeds this many characters are discarded.

    Returns
    -------
    pandas.DataFrame
        One row per retained text, indexed 0..n-1, with the columns
        ``date``, ``original`` (raw text), ``texts`` (list of filtered
        lemmas) and ``shorts`` (the lemmas joined by single spaces).
    """
    ## Collect one (date, raw text, filtered lemmas) record per text
    records = []
    for value in data_dict.values():
        for article_date, texts in value[0].items():
            for txt in texts:
                records.append((article_date, txt, filter_text(create_doc(txt))))

    df = pd.DataFrame(records, columns=["date", "original", "texts"])

    ## Keep only the first occurrence of each raw text
    df = df[~df["original"].duplicated(keep="first")].reset_index(drop=True)

    ## Drop articles that are longer than max_chars characters for simplicity
    df = df[df["original"].map(len) <= max_chars].reset_index(drop=True)

    ## Strip a leading "lead" token. Documents with no tokens at all are left
    ## untouched instead of raising IndexError.
    df["texts"] = df["texts"].map(
        lambda tokens: tokens[1:] if tokens and tokens[0] == "lead" else tokens
    )

    ## Whole-column assignment avoids element-wise chained assignment
    df["shorts"] = df["texts"].map(" ".join)

    return df

In [28]:
# Build one cleaned DataFrame per gathered dictionary; df_list keeps the same
# order as dict_list (index 0 -> dict_9099, 1 -> dict_0009, 2 -> dict_1019).
dict_list = [dict_9099, dict_0009, dict_1019]
df_list = []

for d in dict_list:
    df = create_df(d)
    df_list.append(df)

In [31]:
# Preview the DataFrame built from dict_1019 (third entry of dict_list)
df_list[2]

Unnamed: 0,date,original,texts,shorts
0,2010-01-01,With the possibility of hefty profits and bonu...,"[possibility, hefty, profit, bonus, bank, rare...",possibility hefty profit bonus bank rarely div...
1,2010-01-04,Veteran value investor Martin J. Whitman will ...,"[veteran, value, investor, martin, j, whitman,...",veteran value investor martin j whitman relinq...
2,2010-01-05,China’s securities regulator may introduce fut...,"[china, security, regulator, introduce, future...",china security regulator introduce future cont...
3,2010-01-05,Wall Street ushered in 2010 on Monday with its...,"[wall, street, usher, monday, big, stock, mark...",wall street usher monday big stock market rall...
4,2010-01-05,Is the return of the Wall Street bonus helping...,"[return, wall, street, bonus, helping, spur, c...",return wall street bonus helping spur comeback...
...,...,...,...,...
2188,2019-11-01,The S&P 500 set a new high for the third time ...,"[set, new, high, time, week, nasdaq, composite...",set new high time week nasdaq composite rise r...
2189,2019-12-06,Saudi Arabia’s state-owned oil company priced ...,"[saudi, arabia, state, own, oil, company, pric...",saudi arabia state own oil company price offer...
2190,2019-12-20,The Wall Street bank would pay a fine of as mu...,"[wall, street, bank, pay, fine, billion, subsi...",wall street bank pay fine billion subsidiary p...
2191,2019-12-23,Wall Street strategists are issuing prediction...,"[wall, street, strategist, issue, prediction, ...",wall street strategist issue prediction ignore...


In [32]:
# df.to_pickle("df_9099.pkl") 

## 3. Pre-Process the Data - Part 3. Labelling Positive, Neutral and Negative News

In [33]:
## Convert the LM Dictionary into positive/negative word lists.
## sent_dict (loaded with the master dictionary) maps a sentiment category
## name to a dict keyed by word; only three categories are used here.
positive = []
negative = []

for sent, value in sent_dict.items():
    # "uncertainty" words are deliberately pooled with the negative ones
    if sent == "negative" or sent == "uncertainty":
        for word in value.keys():        
            negative.append(word.lower())
    
    elif sent == "positive" :
        for word in value.keys():        
            positive.append(word.lower())

In [34]:
# Copy the two GI word-list workbooks from the mounted Drive into the working directory
!cp drive/MyDrive/positive.xlsx .
!cp drive/MyDrive/negative.xlsx .

In [35]:
# NOTE: despite the "_csv" suffix in the names, both paths point to Excel workbooks.
GI_post_csv = "positive.xlsx" ## GI Dictionary: positive words
GI_neg_csv = "negative.xlsx" ## GI Dictionary: negative words

In [36]:
# The workbooks are read without a header row, so the first column is labelled 0.
GI_post = pd.read_excel(GI_post_csv, header = None)
GI_neg = pd.read_excel(GI_neg_csv, header = None)

# Use the first column of each sheet as the word list.
# NOTE(review): words are taken as-is (no lower-casing), unlike the LM lists --
# confirm the workbooks already store lower-case words.
GI_post_list = list(GI_post[0])
GI_neg_list = list(GI_neg[0])

In [37]:
## Extra words defined by the user to be relevant in the analysis

# These lists are only ever used for exact string membership tests against
# lower-cased lemmas, so entries must be plain lower-case words: a regex
# pattern such as "[bB]ull" would never equal any token.
positive_extra = ["bull", "positive", "rise", "stock", "high", "up", "climb", "buy", "surge", "recover",
                  "soar", "blue", "chip"]

negative_extra = ["bear", "negative", "drop", "fall", "low",
                  "burst", "tight", "bubble", "crash", "down", "slide", "sell", "plunge", "dive",
                  "stagflation", "recession", "slow", "inflation"]

In [38]:
## Combine all lists into a single list.
## NOTE: `+=` extends `positive` / `negative` in place, so re-running this cell
## appends the extras again; the set() below removes the repeats.
positive += (positive_extra + GI_post_list)
negative += (negative_extra + GI_neg_list)

## Remove duplicates and convert to a final list (element order is not preserved)
positive_words = list(set(positive))
negative_words = list(set(negative))

In [39]:
## Helper Function for Sentiment Labelling Using Custom Made Lexicon Dictionary
def get_lexicon_score(df, positive_lexicon=None, negative_lexicon=None):
    """Score every row of ``df`` against the positive/negative word lists.

    For each row, the tokens in ``df["texts"]`` are classified as positive,
    negative or neutral. A word found in both lexicons counts as positive,
    because the positive list is checked first. The row score is the larger
    of the positive and negative token fractions, negated when the negative
    fraction is strictly larger; a tie therefore yields the positive fraction.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``texts`` column holding a list of tokens per row.
    positive_lexicon, negative_lexicon : iterable of str, optional
        Word lists to score against. When omitted, the module-level
        ``positive_words`` / ``negative_words`` are used.

    Returns
    -------
    dict
        ``{index label: score}`` with one entry for every row of ``df``.
        Rows without any tokens score ``0.0``; previously they were left out
        of the dict, which broke positional look-ups by the caller.
    """
    # Sets make each membership test O(1); the lexicons hold thousands of words.
    pos_vocab = set(positive_words if positive_lexicon is None else positive_lexicon)
    neg_vocab = set(negative_words if negative_lexicon is None else negative_lexicon)

    scores = {}
    for label, tokens in df["texts"].items():
        total = len(tokens)
        if total == 0:
            scores[label] = 0.0
            continue

        plus = sum(1 for word in tokens if word in pos_vocab)
        minus = sum(1 for word in tokens
                    if word not in pos_vocab and word in neg_vocab)

        pos_score = plus / total
        neg_score = minus / total
        scores[label] = -neg_score if neg_score > pos_score else pos_score

    return scores

In [40]:
## Helper Function for Getting Polarity Score using TextBlob
def get_polar_score(df):
    """Return one TextBlob-based score per row of ``df.shorts``.

    Each score is the TextBlob polarity of the text multiplied by its
    subjectivity. The result is a list in row order.
    """
    blobs = (TextBlob(short) for short in df.shorts)
    return [blob.polarity * blob.subjectivity for blob in blobs]

In [41]:
## Function to print sentiments of the sentence.
def vader_score(text, analyzer=None):
    """Return the VADER compound sentiment score of ``text``.

    Parameters
    ----------
    text : str
        Text to score.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text) -> dict`` with a
        ``'compound'`` key. When omitted, a single
        ``SentimentIntensityAnalyzer`` is created on first use and reused by
        later calls, instead of being rebuilt for every text.

    Returns
    -------
    The ``'compound'`` entry of the analyzer's score dictionary.
    """
    if analyzer is None:
        # Cache the default analyzer on the function object so that scoring a
        # whole DataFrame does not construct one analyzer per row.
        analyzer = getattr(vader_score, "_shared_analyzer", None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            vader_score._shared_analyzer = analyzer

    sentiment_dict = analyzer.polarity_scores(text)
    return sentiment_dict['compound']

In [42]:
## Helper Function for Getting Vader Scores for the Entire Dataframe
def get_vader_score(df):
    """Return the ``vader_score`` of every text in ``df.shorts``, in row order."""
    return [vader_score(short) for short in df.shorts]

In [43]:
## Helper Function for Getting the Final Aggregated Score
def get_agg_scores(df, weights=(0.1, 0.05, 0.85)):
    """Blend the three sentiment measures into one score per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``get_lexicon_score``, ``get_polar_score`` and
        ``get_vader_score`` (it needs the ``texts`` and ``shorts`` columns).
    weights : sequence of three floats, default (0.1, 0.05, 0.85)
        Weights applied, in this order, to the TextBlob score, the VADER
        score and the lexicon score.

    Returns
    -------
    list
        One weighted sum (a NumPy float) per row of ``df``, in row order.
    """
    lexicon = get_lexicon_score(df)
    polar = get_polar_score(df)
    vader = get_vader_score(df)

    w_polar, w_vader, w_lexicon = weights

    agg_scores = []
    # The lexicon scores are keyed by index label, while the other two are
    # positional lists. Walk both together so a non-default index, or a row
    # missing from the lexicon dict, no longer raises KeyError.
    for position, label in enumerate(df.index):
        lexicon_score = lexicon.get(label, 0.0)
        value = np.sum([w_polar * polar[position],
                        w_vader * vader[position],
                        w_lexicon * lexicon_score])
        agg_scores.append(value)

    return agg_scores

In [44]:
# For every article frame: attach the aggregated sentiment score, then build a
# per-date total of those scores.
new_df_list = []

for df in df_list:
    agg_scores = get_agg_scores(df)
    # Adds the column to the frame held in df_list itself (in place), so the
    # frames pickled later include "sentiment_score".
    df["sentiment_score"] = agg_scores
    
    df_temp = df.copy()
    # Passing a list of aggregations yields two-level column labels:
    # ("sentiment_score", "sum").
    df_group = df_temp.groupby(["date"]).agg({'sentiment_score': ["sum"]})
    new_df_list.append(df_group)

In [47]:
# Preview the per-date sentiment totals for the third frame (built from dict_1019)
new_df_list[2]

Unnamed: 0_level_0,sentiment_score
Unnamed: 0_level_1,sum
date,Unnamed: 1_level_2
2010-01-01,0.258499
2010-01-04,0.064222
2010-01-05,0.721483
2010-01-06,0.235975
2010-01-07,0.186924
...,...
2019-11-01,0.196258
2019-12-06,0.135332
2019-12-20,0.166365
2019-12-23,-0.186214


In [48]:
# Persist the article-level frames (including the sentiment_score column added above)
df_list[0].to_pickle("df_9099.pkl") 
df_list[1].to_pickle("df_0009.pkl") 
df_list[2].to_pickle("df_1019.pkl") 

In [49]:
# Persist the per-date sentiment totals, one file per frame in new_df_list
new_df_list[0].to_pickle("df_9099_score.pkl") 
new_df_list[1].to_pickle("df_0009_score.pkl") 
new_df_list[2].to_pickle("df_1019_score.pkl") 

In [None]:
# Flush pending writes and unmount the Google Drive mounted at the top of the notebook
from google.colab import drive
drive.flush_and_unmount()