https://medium.com/mlearning-ai/predict-sp500-stock-price-with-python-machine-learning-sentiment-analysis-a296dc276353
https://github.com/Poulinakis-Konstantinos/Stock_prediction_with_News_Sentiment_Analysis/blob/master/DF_creation_example.ipynb

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from yahoofinancials import YahooFinancials
from tqdm import tqdm
from torch.nn.functional import softmax

In [31]:
#get stock data from yahoo finance
def stock_dataset(ticker,  start_date="2019-01-01", end_date="2022-01-01") :
    '''download and process stock data from yahoo finance'''
    stock_df = yf.download(ticker, 
                      start=start_date, 
                      end=end_date, 
                      progress=False,
            )
    #initialize price_change and yesterday_price_change columns.
    #1 indicates increase; -1 indicates decrease
    stock_df['Price_change'] = np.nan
    #stock_df['Yesterday_price_change'] = np.nan
    dates_index = stock_df.index
    yesterday = str(dates_index[0].date())  
    for date in dates_index[1:]:
        today = str(date.date())
        if stock_df.loc[yesterday, 'Close'] > stock_df.loc[today, 'Close']:
            price_change = -1
        else:
            price_change = +1
        yesterday = today
        stock_df.loc[today, 'Price_change'] = price_change
    #stock_df['Yesterday_price_change'] = stock_df['Price_change'].shift(1)
    return stock_df

In [32]:
stock_df = stock_dataset('AMZN', "2019-01-01", "2022-01-01")
stock_df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Price_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-02,73.260002,77.667999,73.046501,76.956497,76.956497,159662000,
2019-01-03,76.000504,76.900002,74.855499,75.014000,75.014000,139512000,-1.0
2019-01-04,76.500000,79.699997,75.915497,78.769501,78.769501,183652000,1.0
2019-01-07,80.115501,81.727997,79.459503,81.475502,81.475502,159864000,1.0
2019-01-08,83.234497,83.830498,80.830498,82.829002,82.829002,177628000,1.0
...,...,...,...,...,...,...,...
2021-12-27,171.037003,172.942993,169.215500,169.669495,169.669495,58688000,-1.0
2021-12-28,170.182495,172.175995,169.135498,170.660995,170.660995,54638000,1.0
2021-12-29,170.839996,171.212006,168.600494,169.201004,169.201004,35754000,-1.0
2021-12-30,169.699997,170.888000,168.524002,168.644501,168.644501,37584000,-1.0


In [39]:
#add tomorrow_price_change as the target feature
stock_df['Tomorrow_price_change'] = stock_df['Price_change'].shift(-1)
stock_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Price_change,Tomorrow_price_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-02,73.260002,77.667999,73.046501,76.956497,76.956497,159662000,,-1.0
2019-01-03,76.000504,76.900002,74.855499,75.014,75.014,139512000,-1.0,1.0
2019-01-04,76.5,79.699997,75.915497,78.769501,78.769501,183652000,1.0,1.0
2019-01-07,80.115501,81.727997,79.459503,81.475502,81.475502,159864000,1.0,1.0
2019-01-08,83.234497,83.830498,80.830498,82.829002,82.829002,177628000,1.0,1.0


In [9]:
#read news data
amazon = './newsarticles_amazon.csv'
df_amazon = pd.read_csv(amazon)
df_amazon.head()

Unnamed: 0,Date,Summary,Title,Article,Link
0,01-02-2019,Soon after Amazon (AMZN -5.60%) acquired Whole...,Amazon Will Take the Next Step of Its Whole Fo...,Check out the latest Amazon earnings call tran...,https://www.fool.com/investing/2019/01/02/amaz...
1,01-02-2019,Please Sign In and use this article's on page ...,Pepco alum Debbi Jarvis looks to boost Howard ...,Please Sign In and use this article's on page ...,https://www.bizjournals.com/washington/news/20...
2,01-02-2019,,,,https://www.nasdaq.com/articles/why-prana-biot...
3,01-02-2019,,,,https://www.nasdaq.com/articles/3-dividend-ari...
4,01-02-2019,"Over the past year, the shares of Starbucks Co...",Starbucks At Historic Low (NASDAQ:SBUX),"Over the past year, the shares of Starbucks Co...",https://seekingalpha.com/article/4231086-starb...


In [10]:
df_amazon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4911 entries, 0 to 4910
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     4911 non-null   object
 1   Summary  4655 non-null   object
 2   Title    4662 non-null   object
 3   Article  4656 non-null   object
 4   Link     4911 non-null   object
dtypes: object(5)
memory usage: 192.0+ KB


In [50]:
#there are small portion of null values due to subscription issue. 
#remove rows with null values
df = df_amazon.dropna()
#remove rows with invalid data by check if 'amazon' or 'AMZN' appears in headline or summary
df = df[(df['Summary'].str.lower().str.contains('amazon'))|(df['Summary'].str.lower().str.contains('amzn'))|\
    (df['Title'].str.lower().str.contains('amazon'))|(df['Title'].str.lower().str.contains('amzn'))]
#combine title and article, drop link
df['Title_summary'] = df['Title'] + '.\n' + df['Summary']
df = df.drop(columns=['Link', 'Summary', 'Title', 'Article'], axis=1).set_index(['Date'])
df.reset_index(inplace=True)
df

Unnamed: 0,Date,Title_summary
0,01-02-2019,Amazon Will Take the Next Step of Its Whole Fo...
1,01-03-2019,Why Amazon Stock Lost 11% Last Month.\nWhat ha...
2,01-04-2019,Top 5 Tech Trends For 2019.\nAs we approach th...
3,01-04-2019,Whole Foods online search tool filters food ch...
4,01-05-2019,Jeff Bezos and Jamie Dimon: Best of Frenemies....
...,...,...
2319,12-30-2021,Is Amazon a Good Stock to Buy For 2022?.\nIn t...
2320,12-30-2021,Amazon’s Machine Bosses Are Targeted in Califo...
2321,12-30-2021,2022 Could Be Banner Year for Amazon.com.\n202...
2322,12-30-2021,Will 2022 Be the Year for Amazon Stock to Hit ...


In [52]:
df.loc[0, 'Title_summary'].split('.\n')

['Amazon Will Take the Next Step of Its Whole Foods Strategy in 2019',
 'Soon after Amazon (AMZN -5.60%) acquired Whole Foods Market in 2017, it started offering discounts to Amazon Prime members and selling Amazon devices in stores',
 '2019 might be the year Amazon starts building new Whole Foods locations',
 'All about Prime NowAmazon has over 100 million Prime members around the world, but member growth has stagnated in the United States',
 'Amazon continues to invest in benefits for its Prime members, including those offered at Whole Foods',
 'Amazon can invest in those areas because it can gain benefits from scale and has proven capable of maximizing the value of its infrastructure (see Amazon Web Services, Fulfillment by Amazon, Prime Video, et cetera).']

In [53]:
#initilize pre-trained transformer model finBERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def sentim_analyzer_finbert(df, tokenizer, model, column_name):
    ''' apply pre-trained model finBERT to df[column_name] and generate sentiment scores'''
    for i in tqdm(df.index) :
        try:
            col = df.loc[i, column_name].split('.\n')
        except:
            return print(column_name +' column might be missing from dataframe')
        # Pre-process input phrase
        input = tokenizer(col, padding = True, truncation = True, return_tensors='pt')
        # Estimate output
        output = model(**input)
        # Pass model output logits through a softmax layer.
        predictions = softmax(output.logits, dim=-1)
        df.loc[i, 'Positive'] = predictions[0][0].tolist()
        df.loc[i, 'Negative'] = predictions[0][1].tolist()
        df.loc[i, 'Neutral']  = predictions[0][2].tolist()
    return df

#A pre-processing tokenizer object from Hugging Face lib.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert") 
#A hugging face transformer model
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")


In [55]:
trained_df = sentim_analyzer_finbert(df, tokenizer, model, 'Title_summary')
#save trained_df into csv file
#trained_df.to_csv('trained.csv', index = False)

In [19]:
#trained_df = pd.read_csv('trained.csv')
trained_df

Unnamed: 0,Date,Title_summary,Positive,Negative,Neutral
0,01-02-2019,Amazon Will Take the Next Step of Its Whole Fo...,0.105194,0.008503,0.886303
1,01-03-2019,Why Amazon Stock Lost 11% Last Month.\nWhat ha...,0.008750,0.955784,0.035466
2,01-04-2019,Top 5 Tech Trends For 2019.\nAs we approach th...,0.075322,0.020919,0.903759
3,01-04-2019,Whole Foods online search tool filters food ch...,0.035183,0.024803,0.940014
4,01-05-2019,Jeff Bezos and Jamie Dimon: Best of Frenemies....,0.048974,0.026076,0.924950
...,...,...,...,...,...
2319,12-30-2021,Is Amazon a Good Stock to Buy For 2022?.\nIn t...,0.083103,0.015508,0.901389
2320,12-30-2021,Amazon’s Machine Bosses Are Targeted in Califo...,0.017710,0.533394,0.448896
2321,12-30-2021,2022 Could Be Banner Year for Amazon.com.\n202...,0.043635,0.032697,0.923668
2322,12-30-2021,Will 2022 Be the Year for Amazon Stock to Hit ...,0.209327,0.016585,0.774088


In [20]:
# take average sentiment scores for each date
sentiment_df = trained_df[['Date','Positive','Negative','Neutral']].groupby('Date').mean().\
              reset_index().sort_values('Date')
sentiment_df['Date'] =pd.to_datetime(sentiment_df.Date)
sentiment_df.sort_values('Date')

Unnamed: 0,Date,Positive,Negative,Neutral
1,2019-01-02,0.105194,0.008503,0.886303
3,2019-01-03,0.008750,0.955784,0.035466
5,2019-01-04,0.055253,0.022861,0.921886
8,2019-01-05,0.048974,0.026076,0.924950
11,2019-01-06,0.113569,0.029981,0.856450
...,...,...,...,...
919,2021-12-27,0.389570,0.208017,0.402413
922,2021-12-28,0.034733,0.030352,0.934915
925,2021-12-29,0.173104,0.098132,0.728764
928,2021-12-30,0.093781,0.122158,0.784062


In [42]:
#merge stock data with news sentiment data
def merge_stock_news(df_stock, df_news, how='inner') :
    ''' Merges the financial data dataframe with the news sentiment dataframe'''
    # merge on date column and only for their intersection
    merged_df = df_stock.merge(df_news, on='Date', how=how)
    # rearrange column order
    merged_df = merged_df[['Date', 'Positive','Negative','Neutral','Open',\
                           'Close', 'Volume','High','Low', 'Adj Close', 'Price_change','Tomorrow_price_change']]
    return merged_df
merged_df = merge_stock_news(stock_df, sentiment_df)

In [43]:
#save merge_stock_news into csv file
merged_df.to_csv('merge_stock_news.csv', index = False)