In [14]:
import pandas as pd
import numpy as np
import yfinance as yf
from yahoofinancials import YahooFinancials
from tqdm import tqdm
from torch.nn.functional import softmax
import torch
from datetime import datetime, timedelta

## Collect and process stock data from Yahoo Finance

In [15]:
#get stock data from yahoo finance
def _label_vs_neighbour_day(close, day_offset):
    '''Label each row 0/1 against the close `day_offset` calendar days away.

    close      : Series of closing prices indexed by a unique DatetimeIndex.
    day_offset : -1 compares with the previous calendar day, +1 with the next.

    Returns a float array aligned with `close`: 0.0 when the earlier of the
    two closes is strictly greater than the later one (price fell), 1.0
    otherwise (price rose or was unchanged), and NaN when the neighbouring
    calendar day is not present in the index.
    '''
    neighbour_days = close.index + pd.Timedelta(days=day_offset)
    # Index membership (not NaN-ness of the price) decides whether a label is
    # produced, mirroring an "is that date in the index" check.
    has_neighbour = neighbour_days.isin(close.index)
    neighbour_close = close.reindex(neighbour_days).to_numpy()
    today_close = close.to_numpy()
    if day_offset < 0:
        fell = neighbour_close > today_close
    else:
        fell = today_close > neighbour_close
    return np.where(has_neighbour, np.where(fell, 0.0, 1.0), np.nan)


def stock_dataset(ticker,  start_date="2017-01-01", end_date="2022-01-01") :
    '''Download daily prices from Yahoo Finance and add price-change labels.

    ticker     : symbol passed to `yf.download`.
    start_date : start of the download window (passed as `start`).
    end_date   : end of the download window (passed as `end`).

    Returns the downloaded frame with three extra columns:
      Weekday               - day name of each index date.
      Price_change          - 1.0 if the close is >= the close of the previous
                              calendar day, 0.0 if it is lower.
      Tomorrow_price_change - 1.0 if the next calendar day's close is >= this
                              close, 0.0 if it is lower.
    Both labels are NaN when the neighbouring calendar day is not in the
    index, so e.g. a Monday has no Price_change and a Friday has no
    Tomorrow_price_change.

    Raises ValueError when the download returns no rows.
    '''
    stock_df = yf.download(ticker,
                      start=start_date,
                      end=end_date,
                      progress=False,
            )
    if stock_df is None or stock_df.empty:
        raise ValueError('no price data returned for ' + str(ticker)
                         + ' between ' + str(start_date) + ' and ' + str(end_date))

    # NOTE(review): some yfinance releases appear to return (field, ticker)
    # MultiIndex columns even for a single ticker - confirm against the
    # installed version. When there is exactly one ticker, flatten to plain
    # field names so 'Close' selects a Series.
    columns = stock_df.columns
    if (isinstance(columns, pd.MultiIndex) and columns.nlevels == 2
            and columns.get_level_values(1).nunique() == 1):
        stock_df = stock_df.droplevel(1, axis=1)

    #get day of week column
    stock_df['Weekday'] = stock_df.index.day_name()
    #price-change labels: 1 indicates increase (or no change); 0 indicates decrease
    close = stock_df['Close']
    stock_df['Price_change'] = _label_vs_neighbour_day(close, -1)
    stock_df['Tomorrow_price_change'] = _label_vs_neighbour_day(close, 1)

    return stock_df

In [16]:
# Build the labelled AMZN price table for the 2017-01-01 .. 2022-01-01 window.
stock_df = stock_dataset('AMZN', "2017-01-01", "2022-01-01")
# Bare last expression so the notebook renders the frame.
stock_df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Weekday,Price_change,Tomorrow_price_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-03,37.896000,37.938000,37.384998,37.683498,37.683498,70422000,Tuesday,,1.0
2017-01-04,37.919498,37.984001,37.709999,37.859001,37.859001,50210000,Wednesday,1.0,1.0
2017-01-05,38.077499,39.119999,38.013000,39.022499,39.022499,116602000,Thursday,1.0,1.0
2017-01-06,39.118000,39.972000,38.924000,39.799500,39.799500,119724000,Friday,1.0,
2017-01-09,39.900002,40.088501,39.588501,39.846001,39.846001,68922000,Monday,,0.0
...,...,...,...,...,...,...,...,...,...
2021-12-27,171.037003,172.942993,169.215500,169.669495,169.669495,58688000,Monday,,1.0
2021-12-28,170.182495,172.175995,169.135498,170.660995,170.660995,54638000,Tuesday,1.0,0.0
2021-12-29,170.839996,171.212006,168.600494,169.201004,169.201004,35754000,Wednesday,0.0,0.0
2021-12-30,169.699997,170.888000,168.524002,168.644501,168.644501,37584000,Thursday,0.0,0.0


## Load and process news data from web scraping

In [17]:
# Read the scraped news data.
# newsarticles_amazon.csv is produced by running the web-scraping script query_google_newsdata.py
amazon = './newsarticles_amazon.csv'
df_amazon = pd.read_csv(amazon)
# Preview the first rows only.
df_amazon.head()

Unnamed: 0,Date,Summary,Title,Article,Link
0,01-01-2017,HAPPY 2017 -- 19 days until Inauguration Day -...,HAPPY 2017 -- 19 days until Inauguration Day -...,HAPPY 2017 -- 19 days until Inauguration Day -...,https://www.politico.com/tipsheets/playbook/20...
1,01-02-2017,"However, like 2016, 2017 is expected to witnes...",Transports Week In Review - 2016 Year-End Edit...,Source: Google Images\n\nNow that 2016 has com...,https://seekingalpha.com/article/4033674-trans...
2,01-02-2017,Seattle-based radio frequency identification t...,Seattle’s top tech stocks of 2016: Newcomer Im...,Seattle-based radio frequency identification t...,https://www.geekwire.com/2017/seattle-top-tech...
3,01-03-2017,Amazon has already started using drones to del...,Amazon Files Patent for Airborne Warehouses,Amazon has already started using drones to del...,https://www.inc.com/business-insider/amazon-fi...
4,01-03-2017,You think $5 gas is bad?\nCheck out the most e...,"Retail Trends for 2017: Walmart, Macy's, Targe...",You think $5 gas is bad? Check out the most ex...,https://fortune.com/2017/01/03/why-2017-will-s...


In [18]:
# Show column dtypes and non-null counts to see how many rows have missing text.
df_amazon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7962 entries, 0 to 7961
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     7962 non-null   object
 1   Summary  7471 non-null   object
 2   Title    7504 non-null   object
 3   Article  7472 non-null   object
 4   Link     7962 non-null   object
dtypes: object(5)
memory usage: 311.1+ KB


In [19]:
# A small portion of rows contain null values (attributed to subscription issues);
# remove those rows.
df = df_amazon.dropna()
# Keep only rows where 'amazon' or 'amzn' (case-insensitive) appears in the headline or summary.
mentions_amazon = (
    df['Summary'].str.lower().str.contains('amazon|amzn')
    | df['Title'].str.lower().str.contains('amazon|amzn')
)
# Combine title and summary into one text column and keep only Date + text.
# Built with .assign on the selected rows (a new frame) rather than assigning a
# column into a filtered slice, which triggers pandas' SettingWithCopyWarning.
# reset_index(drop=True) renumbers rows from 0 so later df.loc[0, ...] lookups work.
df = (
    df.loc[mentions_amazon, ['Date']]
    .assign(Title_summary=df['Title'] + '.\n' + df['Summary'])
    .reset_index(drop=True)
)
df

Unnamed: 0,Date,Title_summary
0,01-03-2017,Amazon Files Patent for Airborne Warehouses.\n...
1,01-03-2017,"Retail Trends for 2017: Walmart, Macy's, Targe..."
2,01-04-2017,Amazon: 2016 Marketplace Sales Broke Records.\...
3,01-05-2017,Amazon Sets Sights on Home Domination With Ale...
4,01-05-2017,How to Use Amazon Affiliate Marketing: A Step ...
...,...,...
3495,12-30-2021,Is Amazon a Good Stock to Buy For 2022?.\nIn t...
3496,12-30-2021,Amazon’s Machine Bosses Are Targeted in Califo...
3497,12-30-2021,2022 Could Be Banner Year for Amazon.com.\n202...
3498,12-30-2021,Will 2022 Be the Year for Amazon Stock to Hit ...


In [20]:
# Inspect one combined text: the title, then the summary sentences, separated by '.\n'.
df.loc[0, 'Title_summary']

'Amazon Files Patent for Airborne Warehouses.\nAmazon has already started using drones to deliver products to its customers, albeit on a very small scale.\nAmazon also writes in the patent: "Described is an airborne fulfillment center ("AFC") and the use of unmanned aerial vehicles (\'UAV\') to deliver items from the AFC to users.\nI just unearthed the Death Star of #ecommerce via @cbinsights... AMZN patent for airborne warehouses at 45K ft spitting out delivery drones pic.twitter.com/qEz2ilUtJP -; Zoe Leavitt (@zoe_leavitt) December 28, 2016A number of smaller airships could be used to float Amazon products and drones up to the larger AFC.\n"Shuttles (smaller airships) may be used to replenish the AFC with inventory, UAVs, supplies, fuel, etc," Amazon writes.\nAmazon would likely have to overcome a number of regulatory hurdles before governments would permit it to deliver items in the way that it envisions.'

In [21]:
#initialize the pre-trained transformer model FinBERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def sentim_analyzer_finbert(df, tokenizer, model, column_name):
    '''Score each row of df[column_name] with a sequence classifier (FinBERT).

    Each text is split on '.\n' into pieces, every piece is classified, and
    the softmax probabilities are averaged over the pieces of that row.

    df          : DataFrame holding the text column; modified in place.
    tokenizer   : callable turning a list of strings into model inputs
                  (called with padding=True, truncation=True, return_tensors='pt').
    model       : callable accepting those inputs as keyword arguments and
                  returning an object with a `.logits` tensor of at least
                  three columns.
    column_name : name of the text column to score.

    Returns the same `df` with 'Positive', 'Negative' and 'Neutral' columns
    taken from logit columns 0, 1 and 2 respectively.
    NOTE(review): this assumes the checkpoint orders its labels
    positive/negative/neutral - confirm against model.config.id2label.

    Raises KeyError if `column_name` is not a column of `df`. A non-string
    cell (e.g. NaN) raises AttributeError from `.split` instead of being
    reported as a missing column.
    '''
    if column_name not in df.columns:
        raise KeyError(column_name +' column might be missing from dataframe')

    score_columns = ['Positive', 'Negative', 'Neutral']
    row_scores = []
    # Inference only: skip autograd bookkeeping so no graph is kept per row.
    with torch.no_grad():
        for text in tqdm(df[column_name]):
            pieces = text.split('.\n')
            # Pre-process input phrases
            encoded = tokenizer(pieces, padding = True, truncation = True, return_tensors='pt')
            # Estimate output
            output = model(**encoded)
            # Pass model output logits through a softmax layer.
            predictions = softmax(output.logits, dim=-1)
            # Average each class probability over the pieces of this row.
            row_scores.append([predictions[:, k].mean().item() for k in range(3)])

    # Assign by position so the result does not depend on index alignment.
    score_table = np.asarray(row_scores, dtype=float).reshape(len(row_scores), 3)
    for position, label in enumerate(score_columns):
        df[label] = score_table[:, position]
    return df

# Hugging Face checkpoint id shared by the tokenizer and the classifier.
finbert_checkpoint = "ProsusAI/finbert"
# Pre-processing tokenizer object from the Hugging Face library.
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
# Hugging Face transformer model with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)


In [22]:
# Score every article's title + summary with FinBERT. This is the slow step:
# the recorded run took about 50 minutes for 3500 rows.
trained_df = sentim_analyzer_finbert(df, tokenizer, model, 'Title_summary')
# Optional: save trained_df into a csv file so the scores can be reloaded
# without re-running the model.
#trained_df.to_csv('trained.csv', index = False)

100%|███████████████████████████████████████████████████████████████████████████████| 3500/3500 [50:27<00:00,  1.16it/s]


In [23]:
# Optional: uncomment to reload previously saved scores instead of re-running the model.
#trained_df = pd.read_csv('trained.csv')
trained_df

Unnamed: 0,Date,Title_summary,Positive,Negative,Neutral
0,01-03-2017,Amazon Files Patent for Airborne Warehouses.\n...,0.193379,0.023666,0.782955
1,01-03-2017,"Retail Trends for 2017: Walmart, Macy's, Targe...",0.045807,0.061819,0.892373
2,01-04-2017,Amazon: 2016 Marketplace Sales Broke Records.\...,0.416015,0.054345,0.529640
3,01-05-2017,Amazon Sets Sights on Home Domination With Ale...,0.136615,0.048071,0.815314
4,01-05-2017,How to Use Amazon Affiliate Marketing: A Step ...,0.120665,0.025450,0.853885
...,...,...,...,...,...
3495,12-30-2021,Is Amazon a Good Stock to Buy For 2022?.\nIn t...,0.195238,0.015275,0.789487
3496,12-30-2021,Amazon’s Machine Bosses Are Targeted in Califo...,0.068186,0.177515,0.754299
3497,12-30-2021,2022 Could Be Banner Year for Amazon.com.\n202...,0.037047,0.061059,0.901894
3498,12-30-2021,Will 2022 Be the Year for Amazon Stock to Hit ...,0.209584,0.038815,0.751601


In [24]:
def news_dataset(df) :
    '''Aggregate per-article sentiment scores into one row per calendar date.

    df : DataFrame with a 'Date' column (anything `pd.to_datetime` accepts)
         and numeric 'Positive', 'Negative' and 'Neutral' columns. It is not
         modified.

    Returns a frame sorted by 'Date' with the per-date mean of the three
    scores plus 'Tomorrow_pos', 'Tomorrow_neg' and 'Tomorrow_neu': the mean
    scores of the following calendar day, or NaN when that day has no rows.
    '''
    score_columns = ['Positive', 'Negative', 'Neutral']
    tomorrow_columns = ['Tomorrow_pos', 'Tomorrow_neg', 'Tomorrow_neu']

    # Work on the function argument (not a notebook global) and parse dates on
    # a new frame so the caller's 'Date' column keeps its original dtype.
    s_df = (
        df[['Date'] + score_columns]
        .assign(Date=pd.to_datetime(df['Date']))
        .groupby('Date')
        .mean()
    )

    # Look up each date + 1 day in the daily means; dates with no articles on
    # the following day come back as NaN.
    next_day = s_df[score_columns].reindex(s_df.index + pd.Timedelta(days=1))
    for source, target in zip(score_columns, tomorrow_columns):
        s_df[target] = next_day[source].to_numpy()

    return s_df.reset_index().sort_values('Date')

# Collapse the per-article scores into one row per date, with next-day columns.
sentiment_df = news_dataset(trained_df)
sentiment_df

Unnamed: 0,Date,Positive,Negative,Neutral,Tomorrow_pos,Tomorrow_neg,Tomorrow_neu
0,2017-01-03,0.119593,0.042743,0.837664,0.416015,0.054345,0.529640
1,2017-01-04,0.416015,0.054345,0.529640,0.128640,0.036761,0.834599
2,2017-01-05,0.128640,0.036761,0.834599,0.222731,0.061406,0.715863
3,2017-01-06,0.222731,0.061406,0.715863,0.076935,0.361986,0.561079
4,2017-01-07,0.076935,0.361986,0.561079,0.154535,0.055718,0.789748
...,...,...,...,...,...,...,...
1505,2021-12-27,0.414998,0.149957,0.435044,0.270271,0.078007,0.651721
1506,2021-12-28,0.270271,0.078007,0.651721,0.225209,0.232646,0.542145
1507,2021-12-29,0.225209,0.232646,0.542145,0.134685,0.163041,0.702274
1508,2021-12-30,0.134685,0.163041,0.702274,0.244497,0.179205,0.576298


In [25]:
#merge stock data with news sentiment data
def merge_stock_news(df_stock, df_news, how='inner') :
    '''Join price data with daily news sentiment and re-date rows to the next day.

    The two frames are merged on 'Date' (join type `how`). Every date is then
    moved forward by one day, 'Weekday' is recomputed from the moved date, and
    columns are renamed to match: the original same-day values become
    'Yesterday_*' and the 'Tomorrow_*' values become the current-day columns.
    '''
    ordered_columns = [
        'Date', 'Weekday',
        'Positive', 'Negative', 'Neutral',
        'Tomorrow_pos', 'Tomorrow_neg', 'Tomorrow_neu',
        'Price_change', 'Tomorrow_price_change',
        'Open', 'Close', 'Volume', 'High', 'Low', 'Adj Close',
    ]
    shifted_names = {
        'Positive': 'Yesterday_pos',
        'Negative': 'Yesterday_neg',
        'Neutral': 'Yesterday_neu',
        'Tomorrow_pos': 'Positive',
        'Tomorrow_neg': 'Negative',
        'Tomorrow_neu': 'Neutral',
        'Price_change': 'Yesterday_price_change',
        'Tomorrow_price_change': 'Price_change',
        'Open': 'Yesterday_open',
        'Close': 'Yesterday_close',
        'Volume': 'Yesterday_volume',
        'High': 'Yesterday_high',
        'Low': 'Yesterday_low',
        'Adj Close': 'Yesterday_Adj_close',
    }

    # Join on the date column, then fix the column order.
    joined = df_stock.merge(df_news, on='Date', how=how)[ordered_columns]
    # Move every row one day forward; the weekday follows the moved date.
    next_day = pd.to_datetime(joined['Date']) + pd.Timedelta('1 day')
    relabelled = joined.assign(Date=next_day, Weekday=next_day.dt.day_name())
    return relabelled.rename(columns=shifted_names)
# Inner-join prices with sentiment (the default how='inner') and display the result.
merged_df = merge_stock_news(stock_df, sentiment_df)
merged_df

Unnamed: 0,Date,Weekday,Yesterday_pos,Yesterday_neg,Yesterday_neu,Positive,Negative,Neutral,Yesterday_price_change,Price_change,Yesterday_open,Yesterday_close,Yesterday_volume,Yesterday_high,Yesterday_low,Yesterday_Adj_close
0,2017-01-04,Wednesday,0.119593,0.042743,0.837664,0.416015,0.054345,0.529640,,1.0,37.896000,37.683498,70422000,37.938000,37.384998,37.683498
1,2017-01-05,Thursday,0.416015,0.054345,0.529640,0.128640,0.036761,0.834599,1.0,1.0,37.919498,37.859001,50210000,37.984001,37.709999,37.859001
2,2017-01-06,Friday,0.128640,0.036761,0.834599,0.222731,0.061406,0.715863,1.0,1.0,38.077499,39.022499,116602000,39.119999,38.013000,39.022499
3,2017-01-07,Saturday,0.222731,0.061406,0.715863,0.076935,0.361986,0.561079,1.0,,39.118000,39.799500,119724000,39.972000,38.924000,39.799500
4,2017-01-10,Tuesday,0.164341,0.085015,0.750644,0.460805,0.134797,0.404399,,0.0,39.900002,39.846001,68922000,40.088501,39.588501,39.846001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1155,2021-12-28,Tuesday,0.414998,0.149957,0.435044,0.270271,0.078007,0.651721,,1.0,171.037003,169.669495,58688000,172.942993,169.215500,169.669495
1156,2021-12-29,Wednesday,0.270271,0.078007,0.651721,0.225209,0.232646,0.542145,1.0,0.0,170.182495,170.660995,54638000,172.175995,169.135498,170.660995
1157,2021-12-30,Thursday,0.225209,0.232646,0.542145,0.134685,0.163041,0.702274,0.0,0.0,170.839996,169.201004,35754000,171.212006,168.600494,169.201004
1158,2021-12-31,Friday,0.134685,0.163041,0.702274,0.244497,0.179205,0.576298,0.0,0.0,169.699997,168.644501,37584000,170.888000,168.524002,168.644501


In [26]:
# Save the merged stock + news frame (merged_df) to a csv file, without the row index.
merged_df.to_csv('merge_stock_news_amazon.csv', index = False)