# Sentiment Analysis: Compares headlines and stock data and the correlation between them

### Sample data will be used pulled from the NewsApiClient and the yfinance API for my information

### Code is commented out as it has been used for the sample data

In [4]:
import vaderSentiment

import pandas as pd
import numpy as np
from textblob import TextBlob
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import os
from newsapi import NewsApiClient
from dotenv import load_dotenv
load_dotenv()
from pathlib import Path

In [5]:
#Load datasets
api_key = os.getenv("NEWS_API_KEY")

In [7]:
newsapi = NewsApiClient(api_key=api_key)

In [9]:
djia_headlines = newsapi.get_everything(
    q="DJIA AND Dow Jones",
    language="en",
    page_size=100,
    sort_by="relevancy"
)


# Print total articles
print(f"Total articles about the dow jones: {djia_headlines['totalResults']}")

# Show sample article
djia_headlines["articles"][3]

Total articles about the dow jones: 1239


{'source': {'id': None, 'name': 'MarketWatch'},
 'author': 'Jeffry Bartash',
 'title': 'Economic Report: Coming up: PCE inflation and consumer spending',
 'description': 'The PCE inflation gauge preferred by the Federal Reserve is forecast to rise 0.7% in May. The core rate, which excludes food and gas, is seen increasing 0.4%.',
 'url': 'https://www.marketwatch.com/story/coming-up-pce-inflation-and-consumer-spending-11656591128',
 'urlToImage': 'https://images.mktw.net/im-568043/social',
 'publishedAt': '2022-06-30T12:12:00Z',
 'content': 'The PCE inflation gauge preferred by the Federal Reserve is forecast to rise 0.7% in May. The core rate, which excludes food and gas, is seen increasing 0.4%. The 12-month rate of increase in the PCE… [+445 chars]'}

In [10]:
# Download/Update the VADER Lexicon
import nltk
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/cliffordcharles/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [11]:
djia_headline_df= pd.DataFrame(djia_headlines)
djia_headline_df

Unnamed: 0,status,totalResults,articles
0,ok,1239,"{'source': {'id': None, 'name': 'Entrepreneur'..."
1,ok,1239,"{'source': {'id': None, 'name': 'MarketWatch'}..."
2,ok,1239,"{'source': {'id': None, 'name': 'MarketWatch'}..."
3,ok,1239,"{'source': {'id': None, 'name': 'MarketWatch'}..."
4,ok,1239,"{'source': {'id': None, 'name': 'MarketWatch'}..."
...,...,...,...
95,ok,1239,"{'source': {'id': None, 'name': 'CoinDesk'}, '..."
96,ok,1239,"{'source': {'id': None, 'name': 'CoinDesk'}, '..."
97,ok,1239,"{'source': {'id': None, 'name': 'MarketWatch'}..."
98,ok,1239,"{'source': {'id': None, 'name': 'MarketWatch'}..."


In [None]:
djia_headlines

In [18]:
# Create the Dow Jones Industrial Average sentiment scores DataFrame
djia_sentiment = []

for article in djia_headlines["articles"]:
    try:
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        djia_sentiment.append({
            "Text": text,
            "Date": date,
            "Compound": compound,
            "Positive": pos,
            "Negative": neg,
            "Neutral": neu
            
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
djia_df = pd.DataFrame(djia_sentiment)

# Reorder DataFrame columns
cols = ["Date", "Text", "Compound", "Positive", "Negative", "Neutral"]
djia_df = djia_df[cols]

djia_df=djia_df.sort_values(by="Date")
djia_df=djia_df.reset_index(drop=True)


In [19]:
# Use groupby in pandas to fix df so that you do not use the csv. 
#Use reset_index to turn the series back into df
#rename columns if need be

In [20]:
djia_df.to_csv("DJIA_headline.csv")
djia_data = Path("DJIA_headline.csv")
djia_df = pd.read_csv(djia_data, index_col="Date")
djia_df.head()

Unnamed: 0_level_0,Unnamed: 0,Text,Compound,Positive,Negative,Neutral
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-06-02,0,"Don't miss CoinDesk's Consensus 2022, the must...",0.8682,0.285,0.0,0.715
2022-06-02,1,Good morning. Heres whats happening:\r\nPrices...,0.2382,0.059,0.0,0.941
2022-06-03,2,"Shares of Caterpillar Inc. \r\n CAT,\r\n +0.80...",0.296,0.062,0.0,0.937
2022-06-03,3,"The U.S. is forecast to add 328,000 new jobs i...",-0.4404,0.0,0.073,0.927
2022-06-03,4,Stocks closed lower on Friday to end the week ...,0.34,0.146,0.137,0.718


In [21]:
#stock data

import yfinance as yf
from datetime import date

today = date.today()


data = yf.download("DIA", start="2022-05-24", end=today)
df_stock=pd.DataFrame(data)

[*********************100%***********************]  1 of 1 completed


In [22]:
#merge the dataframes
merged_df = pd.concat([djia_df, df_stock], axis=0, join='outer', ignore_index=True, keys="Date", levels=None, names=None, verify_integrity=False, sort=False, copy=True)
merged_df.to_csv("Merged_df.csv")
merged_data = Path("../csv/Merged_df.csv")
merged_df = pd.read_csv(merged_data)

#add column to dataframe to show if the stock went up or down
columns = ["Text","Compound","Positive","Negative","Neutral","Open","High","Low","Close","Adj Close","Volume", "Label"]
df = merged_df[columns]
df


Unnamed: 0,Text,Compound,Positive,Negative,Neutral,Open,High,Low,Close,Adj Close,Volume,Label
0,Chinas central bank urged the nations lenders ...,0.7096,0.185,0.038,0.777,316.570007,320.149994,313.619995,319.390015,318.653046,5260500.0,1.0
1,SINGAPORE Singapores economy expanded at a fas...,0.4939,0.127,0.000,0.873,317.980011,322.540008,317.450012,321.279999,320.538666,4612800.0,1.0
2,"SHANGHAI, May 26 (Reuters) - Asian share marke...",0.3182,0.096,0.000,0.904,322.950012,327.769989,322.899994,326.450012,325.696747,3270200.0,1.0
3,US stocks rose Friday and headed toward their ...,0.0000,0.000,0.000,1.000,327.049988,332.130005,326.820007,332.070007,331.303772,2773100.0,0.0
4,A look at the day ahead in markets from Julien...,-0.4389,0.000,0.071,0.929,330.170013,332.549988,327.660004,330.200012,329.438110,4354000.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
94,,,,,,,,,,,,
95,,,,,,,,,,,,
96,,,,,,,,,,,,
97,,,,,,,,,,,,


In [11]:
df.dropna()

Unnamed: 0,Text,Compound,Positive,Negative,Neutral,Open,High,Low,Close,Adj Close,Volume,Label
0,Chinas central bank urged the nations lenders ...,0.7096,0.185,0.038,0.777,316.570007,320.149994,313.619995,319.390015,318.653046,5260500.0,1.0
1,SINGAPORE Singapores economy expanded at a fas...,0.4939,0.127,0.0,0.873,317.980011,322.540008,317.450012,321.279999,320.538666,4612800.0,1.0
2,"SHANGHAI, May 26 (Reuters) - Asian share marke...",0.3182,0.096,0.0,0.904,322.950012,327.769989,322.899994,326.450012,325.696747,3270200.0,1.0
3,US stocks rose Friday and headed toward their ...,0.0,0.0,0.0,1.0,327.049988,332.130005,326.820007,332.070007,331.303772,2773100.0,0.0
4,A look at the day ahead in markets from Julien...,-0.4389,0.0,0.071,0.929,330.170013,332.549988,327.660004,330.200012,329.43811,4354000.0,0.0
5,The eurozones annual rate of inflation acceler...,0.2023,0.049,0.0,0.951,332.410004,332.950012,326.100006,328.359985,327.602325,3721400.0,0.0
6,NEW YORK (AP) A swift jump in Treasury yields ...,0.3818,0.091,0.0,0.909,328.549988,332.779999,325.359985,332.769989,332.002136,3718800.0,1.0
7,Bank of Japan policy board member Seiji Adachi...,-0.6705,0.0,0.143,0.857,329.869995,331.730011,328.730011,329.350006,328.590057,4149700.0,0.0
8,"The U.S. is forecast to add 328,000 new jobs i...",-0.4404,0.0,0.073,0.927,331.589996,332.73999,328.529999,329.450012,328.689819,1957600.0,0.0
9,US futures and global stocks rallied on Monday...,0.128,0.089,0.075,0.836,326.820007,332.470001,326.820007,332.170013,331.403565,2424000.0,1.0


In [23]:
#a function for subjectivity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

#a function to get polarity

def getPolarity(text):
    return TextBlob(text).sentiment.polarity

In [24]:
# add two columns 
df['Subjectivity'] = df['Text'].apply(getSubjectivity)
df['Polarity'] = df['Text'].apply(getPolarity)

df.head(3)


TypeError: The `text` argument passed to `__init__(text)` must be a string, not <class 'float'>

In [25]:
#ML
# Create the feature data set
X= df
x= np.array(X.drop(["Label"], 1))

# Create the target data set
y = np.array(df["Label"])


  x= np.array(X.drop(["Label"], 1))


In [26]:
# split the data 80/20 between training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 78)

In [27]:
#Create and train
model= LinearDiscriminantAnalysis().fit(X_train, y_train)

ValueError: could not convert string to float: 'SINGAPORE Singapores economy expanded at a faster pace in the first quarter of 2022 than previously estimated, mainly due to solid growth in the manufacturing, construction and services-producing ind'

In [17]:
#Show model predictions

predictions = model.predict(x_test)
predictions

NameError: name 'model' is not defined

In [18]:
y_test

array([ 1., nan,  0., nan,  0.,  1., nan, nan,  0., nan,  1., nan, nan,
       nan, nan, nan, nan,  0., nan, nan])

In [None]:
#Show the model metrics
print(classification_report(y_test, predictions))

In [None]:
#visualize compound score in relation to stock price