### Objective Two - Main

#### Predict stock market volatility using ESG-related news

In [431]:
# Import libraries
import pandas as pd
import numpy as np
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from datetime import datetime
import re

# Granger's casuality test library
from statsmodels.tsa.stattools import grangercausalitytests

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


from statsmodels.tsa.api import VAR


# Import VADER for sentiment analysis
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import CountVectorizer

import gensim
from gensim import corpora
from gensim.models import CoherenceModel

from wordcloud import WordCloud
from gensim.models import LdaModel

import spacy
nlp = spacy.load("en_core_web_sm")
import en_core_web_sm
nlp = en_core_web_sm.load()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, roc_curve, auc

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>
[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>
[nltk_data] Error loading wordnet: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>
[nltk_data] Error loading vader_lexicon: <urlopen error [Errno 8]
[nltk_data]     nodename nor servname provided, or not known>


### Step 1: Calculate the daily market returns time series and volatility

In [432]:
# Load price data
prices_file_path = '../Data/Input/Eikon/refinitiv_prices_raw.csv'


# Read the CSV file into a DataFrame
prices_df = pd.read_csv(prices_file_path)

prices_df

Unnamed: 0,company,symbol,date,close,open,high,low
0,Ford,F,2024-04-01,13.29,13.33,13.380,13.140
1,Ford,F,2024-04-02,13.28,13.16,13.370,13.090
2,Ford,F,2024-04-03,13.65,13.25,13.680,13.230
3,Ford,F,2024-04-04,13.21,13.90,13.950,13.170
4,Ford,F,2024-04-05,13.28,13.27,13.395,13.090
...,...,...,...,...,...,...,...
955,Tesco,TSCO.L,2024-08-09,330.50,328.50,331.100,326.200
956,Tesco,TSCO.L,2024-08-12,333.50,331.70,333.900,330.880
957,Tesco,TSCO.L,2024-08-13,335.00,333.80,338.000,333.400
958,Tesco,TSCO.L,2024-08-14,340.60,336.50,340.600,336.400


In [433]:
# Let's filter the dataset to match the dates of the available news stories 

# Ensure 'date' column is in datetime format
prices_df['date'] = pd.to_datetime(prices_df['date'])

# Define the date range for filtering
start_date = '2024-05-01'
end_date = '2024-07-31'

# Convert string to datetime object
date_format = "%Y-%m-%d"
start_date = datetime.strptime(start_date, date_format)
end_date = datetime.strptime(end_date, date_format)

delta = (end_date-start_date)
N = delta.days + 1
print(N)

# Filter the DataFrame to keep rows between 1st May 2024 and 31st July 2024
stock_performance_df = prices_df[(prices_df['date'] >= start_date) & (prices_df['date'] <= end_date)]

# Display the filtered data
print(stock_performance_df.head())


92
   company symbol       date  close   open   high      low
22    Ford      F 2024-05-01  12.20  12.16  12.43  12.1500
23    Ford      F 2024-05-02  12.49  12.40  12.55  12.3600
24    Ford      F 2024-05-03  12.43  12.64  12.76  12.3900
25    Ford      F 2024-05-06  12.50  12.54  12.62  12.4575
26    Ford      F 2024-05-07  12.17  12.40  12.45  12.0850


In [434]:
stock_performance_df

Unnamed: 0,company,symbol,date,close,open,high,low
22,Ford,F,2024-05-01,12.20,12.16,12.4300,12.1500
23,Ford,F,2024-05-02,12.49,12.40,12.5500,12.3600
24,Ford,F,2024-05-03,12.43,12.64,12.7600,12.3900
25,Ford,F,2024-05-06,12.50,12.54,12.6200,12.4575
26,Ford,F,2024-05-07,12.17,12.40,12.4500,12.0850
...,...,...,...,...,...,...,...
944,Tesco,TSCO.L,2024-07-25,325.60,325.60,327.1000,323.4000
945,Tesco,TSCO.L,2024-07-26,327.10,325.10,328.6000,323.3490
946,Tesco,TSCO.L,2024-07-29,330.90,330.00,333.7000,328.1630
947,Tesco,TSCO.L,2024-07-30,331.70,330.00,332.8000,328.4000


The daily market return on day t is calculated as:

\begin{equation}
r_{t} = ln \left(\frac{CLOSE_{t}}{CLOSE_{t-1}}\right)
\end{equation}

where $CLOSE_{t}$ is the closing price on day t and $CLOSE_{t-1}$ is the previous day closing price.

In [435]:
# Sort by company and date to maintain the proper order for each company
stock_performance_df = stock_performance_df.sort_values(by=['company', 'date'])


In [436]:
# Calculate daily market return using the formula: r_t = log(CLOSE_t / CLOSE_t-1)
stock_performance_df['daily_return'] = stock_performance_df.groupby('company', group_keys=False)['close'].apply(
    lambda x: np.log(x / x.shift(1))
)

# Display the result
print(stock_performance_df.head())

    company symbol       date  close   open    high    low  daily_return
502    Asda    WMT 2024-05-01  58.85  59.31  59.410  58.72           NaN
503    Asda    WMT 2024-05-02  59.71  58.94  59.885  58.58      0.014508
504    Asda    WMT 2024-05-03  59.82  59.62  59.980  59.14      0.001841
505    Asda    WMT 2024-05-06  59.87  60.00  60.000  59.39      0.000835
506    Asda    WMT 2024-05-07  60.62  60.17  60.800  60.05      0.012449


In [437]:
stock_performance_df

Unnamed: 0,company,symbol,date,close,open,high,low,daily_return
502,Asda,WMT,2024-05-01,58.85,59.31,59.410,58.7200,
503,Asda,WMT,2024-05-02,59.71,58.94,59.885,58.5800,0.014508
504,Asda,WMT,2024-05-03,59.82,59.62,59.980,59.1400,0.001841
505,Asda,WMT,2024-05-06,59.87,60.00,60.000,59.3900,0.000835
506,Asda,WMT,2024-05-07,60.62,60.17,60.800,60.0500,0.012449
...,...,...,...,...,...,...,...,...
464,Toyota,TM,2024-07-25,195.25,197.43,197.430,193.7300,-0.009938
465,Toyota,TM,2024-07-26,192.52,190.74,192.840,190.5100,-0.014081
466,Toyota,TM,2024-07-29,192.48,193.00,193.200,191.8067,-0.000208
467,Toyota,TM,2024-07-30,193.11,194.96,195.480,192.2650,0.003268


Calculate Volatility

The volatility of the stock market index is calculated within a defined time window (e.g., previous 90 days):

\begin{equation}
Vol = \sqrt{\frac{1}{N}\sum_{t=1}^{N}(r_{t}-\bar{r})^2} \cdot \sqrt{252}
\end{equation}

where N is the total number of days during a window time of observations (eg, 30 days), and 252 is the total number of trading days in a single year;

In [460]:
def calculate_volatility(df):
    # Group by company to calculate volatility for each stock
    volatility_df = df.groupby('company').apply(lambda x: calculate_stock_volatility(x))
    
    # Reset the index 
    volatility_df = volatility_df.reset_index(drop=True)
    
    # Calculate quartiles
    quartiles = np.percentile(volatility_df['volatility'], [0, 25, 50, 75, 100])
    
    # Add quartile information to the DataFrame
    volatility_df['quartile'] = pd.cut(volatility_df['volatility'], bins=quartiles, labels=[1, 2, 3, 4], include_lowest=True)

    return volatility_df

def calculate_stock_volatility(stock_df):
    # Number of trading days (rows)
    N = len(stock_df)
    
    # Mean of daily returns (r̄)
    mean_return = stock_df['daily_return'].mean()
    
    # Variance calculation: 
    variance = np.sum((stock_df['daily_return'] - mean_return) ** 2) / N
    
    # Daily volatility: sqrt(variance)
    daily_volatility = np.sqrt(variance)
    
    # Annual volatility: daily_volatility * sqrt(252)
    annual_volatility = daily_volatility * np.sqrt(252)
    
    # Return the company and its calculated volatility
    return pd.Series({'company': stock_df['company'].iloc[0], 'volatility': annual_volatility})


volatility_results = calculate_volatility(stock_performance_df)

# Display the calculated volatility and quartile for each company
print(volatility_results)


           company  volatility quartile
0             Asda    0.187419        1
1             Ford    0.477228        3
2  Marks & Spencer    0.250589        2
3            Ocado    0.745336        4
4         Polestar    1.047160        4
5       Sainsburys    0.195476        1
6       Stellantis    0.294795        3
7            Tesco    0.120855        1
8            Tesla    0.566548        4
9           Toyota    0.224128        2


In [461]:
# Sort the DataFrame by 'volatility' in descending order
volatility_results = volatility_results.sort_values(by='volatility', ascending=False)
volatility_results

Unnamed: 0,company,volatility,quartile
4,Polestar,1.04716,4
3,Ocado,0.745336,4
8,Tesla,0.566548,4
1,Ford,0.477228,3
6,Stellantis,0.294795,3
2,Marks & Spencer,0.250589,2
9,Toyota,0.224128,2
5,Sainsburys,0.195476,1
0,Asda,0.187419,1
7,Tesco,0.120855,1


### Step 2: Run sentiment Analysis of the ESG news stories using VADER and set up the daily sentiment score time series

In [440]:
# Load Pre-Processed ESG stories data from Objective One
stories_file_path = '../Data/Output/news_df.csv'


# Read the CSV file into a DataFrame
news_df = pd.read_csv(stories_file_path)

news_df

Unnamed: 0,story,date,company,ticker
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,Ford,esg
1,new york city ny accesswire july 29 2024 br...,2024-07-29,Ford,esg
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,Ford,esg
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,Ford,esg
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,Ford,esg
...,...,...,...,...
985,tesco the uks largest supermarket chain has sp...,2024-05-15,Tesco,governance
986,tesco has been accused of giving struggling wo...,2024-05-14,Tesco,governance
987,tesco boss ken murphy has seen his pay deal mo...,2024-05-14,Tesco,governance
988,tesco has apologised after a black publisher s...,2024-05-20,Tesco,sustainability


In [441]:
# Let's initialise the 'stop words' function for English 

stop = stopwords.words('english')
stop[:5]

['i', 'me', 'my', 'myself', 'we']

In [442]:
# Let's remove 'stop words' from the stories

news_df['story'] = news_df['story'].apply(lambda x: ' '.join([w for w in x.split() if w not in stop]))
news_df.head()

Unnamed: 0,story,date,company,ticker
0,los angeles ca accesswire july 29 2024 schall ...,2024-07-29,Ford,esg
1,new york city ny accesswire july 29 2024 brons...,2024-07-29,Ford,esg
2,ford alert bragar eagel amp squire pc investig...,2024-07-29,Ford,esg
3,first atlantic nickel corp fanv alaska energy ...,2024-07-29,Ford,esg
4,palm beach fla july 29 2024 globe newswire fin...,2024-07-29,Ford,esg


In [443]:
# Let's lemmatise the content of the 'story' column

for index, row in news_df.iterrows():
    words = nltk.word_tokenize(row['story'])

    # Lemmatise each word
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_text = ' '.join(lemmatized_words)

    # Update the 'story' column with the lemmatised text
    news_df.at[index, 'story'] = lemmatized_text


In [444]:
news_df.head()

Unnamed: 0,story,date,company,ticker
0,los angeles ca accesswire july 29 2024 schall ...,2024-07-29,Ford,esg
1,new york city ny accesswire july 29 2024 brons...,2024-07-29,Ford,esg
2,ford alert bragar eagel amp squire pc investig...,2024-07-29,Ford,esg
3,first atlantic nickel corp fanv alaska energy ...,2024-07-29,Ford,esg
4,palm beach fla july 29 2024 globe newswire fin...,2024-07-29,Ford,esg


In [445]:
# Let's remove numbers from the 'story' column using regular expressions

for index, row in news_df.iterrows():
    row_text = re.sub(r'\d+', '', row['story'])
    news_df.at[index, 'story'] = row_text

In [446]:
news_df.head()

Unnamed: 0,story,date,company,ticker
0,los angeles ca accesswire july schall law fi...,2024-07-29,Ford,esg
1,new york city ny accesswire july bronstein g...,2024-07-29,Ford,esg
2,ford alert bragar eagel amp squire pc investig...,2024-07-29,Ford,esg
3,first atlantic nickel corp fanv alaska energy ...,2024-07-29,Ford,esg
4,palm beach fla july globe newswire financial...,2024-07-29,Ford,esg


In [447]:
# Let's remove non-alphanumeric characters from the column 'story'
def remove_symbols(text):
    cleaned_text = re.sub(r'[^\w\s]','', text)
    return cleaned_text

news_df['story'] = news_df['story'].apply(lambda x: remove_symbols(x))

In [448]:
news_df.head()

Unnamed: 0,story,date,company,ticker
0,los angeles ca accesswire july schall law fi...,2024-07-29,Ford,esg
1,new york city ny accesswire july bronstein g...,2024-07-29,Ford,esg
2,ford alert bragar eagel amp squire pc investig...,2024-07-29,Ford,esg
3,first atlantic nickel corp fanv alaska energy ...,2024-07-29,Ford,esg
4,palm beach fla july globe newswire financial...,2024-07-29,Ford,esg


In [449]:
# Let's remove first names of ppl from the news articles

def remove_first_names(text):
    doc = nlp(text)
    filtered_text = ' '.join([token.text if token.ent_type_ != 'person' else '' for token in doc])
    return ' '.join(filtered_text.split())

for index, row in news_df.iterrows():
    # Let's remove first names of ppl using spaCy NER
    news_df.at[index, 'story'] = remove_first_names(row['story'])



In [450]:
news_df.head()

Unnamed: 0,story,date,company,ticker
0,los angeles ca accesswire july schall law firm...,2024-07-29,Ford,esg
1,new york city ny accesswire july bronstein gew...,2024-07-29,Ford,esg
2,ford alert bragar eagel amp squire pc investig...,2024-07-29,Ford,esg
3,first atlantic nickel corp fanv alaska energy ...,2024-07-29,Ford,esg
4,palm beach fla july globe newswire financialne...,2024-07-29,Ford,esg


In [451]:
# Save the clean ESG-related news 
news_df.to_csv('../Data/Output/obj2_ESG_news_clean.csv', index=False)

In [452]:
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to apply VADER and get sentiment scores
def apply_vader_sentiment(text):
    # Get the sentiment scores from VADER
    sentiment_scores = analyzer.polarity_scores(text)
    return sentiment_scores

# Apply VADER sentiment analysis to the "story" column
news_df['vader_sentiment'] = news_df['story'].apply(apply_vader_sentiment)

# Split the sentiment scores into separate columns (optional)
news_df = pd.concat([news_df.drop(['vader_sentiment'], axis=1), news_df['vader_sentiment'].apply(pd.Series)], axis=1)

# Define a function to classify sentiment based on the 'compound' score
def classify_sentiment(compound_score):
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply the classify_sentiment function to create a new 'sentiment' column
news_df['sentiment'] = news_df['compound'].apply(classify_sentiment)

news_df

Unnamed: 0,story,date,company,ticker,neg,neu,pos,compound,sentiment
0,los angeles ca accesswire july schall law firm...,2024-07-29,Ford,esg,0.106,0.764,0.130,0.5719,positive
1,new york city ny accesswire july bronstein gew...,2024-07-29,Ford,esg,0.067,0.854,0.079,0.1027,positive
2,ford alert bragar eagel amp squire pc investig...,2024-07-29,Ford,esg,0.066,0.783,0.150,0.9552,positive
3,first atlantic nickel corp fanv alaska energy ...,2024-07-29,Ford,esg,0.037,0.808,0.155,0.9995,positive
4,palm beach fla july globe newswire financialne...,2024-07-29,Ford,esg,0.035,0.811,0.154,0.9994,positive
...,...,...,...,...,...,...,...,...,...
985,tesco uk largest supermarket chain sparked con...,2024-05-15,Tesco,governance,0.127,0.685,0.187,0.9451,positive
986,tesco accused giving struggling worker slap fa...,2024-05-14,Tesco,governance,0.116,0.688,0.196,0.9896,positive
987,tesco bos ken murphy seen pay deal double almo...,2024-05-14,Tesco,governance,0.060,0.733,0.207,0.9886,positive
988,tesco apologised black publisher say racially ...,2024-05-20,Tesco,sustainability,0.134,0.728,0.138,-0.6794,negative


In [453]:
# Save the VADER sentiment df for ESG-related news
news_df.to_csv('../Data/Output/obj2_ESG_news_vader.csv', index=False)


In [455]:
# Function to normalise sentiment proportions and calculate Sent_d
def calculate_sentiment_score(df):
    # Calculate total count of all sentiment categories
    df['total'] = df['pos'] + df['neu'] + df['neg']
    
    # Normalize to get probabilities (frequencies) of positive, neutral, and negative
    df['pos_prob'] = df['pos'] / df['total']
    df['neut_prob'] = df['neu'] / df['total']
    df['neg_prob'] = df['neg'] / df['total']
    
    # Confirm the probabilities sum to 1
    df['sum_probs'] = df['pos_prob'] + df['neut_prob'] + df['neg_prob']
    
    # Calculate Sent_d using the normalized probabilities
    df['Sent_d'] = (df['pos'] - df['neg']) / (df['pos'] + df['neu'] + df['neg'] + 3)
    
    return df


# Group by company and date for daily aggregation
grouped_df = news_df.groupby(['company', 'date']).sum().reset_index()

# Apply the sentiment calculation
Daily_Sentiment_Compound_df = calculate_sentiment_score(grouped_df)


Daily_Sentiment_Compound_df = Daily_Sentiment_Compound_df[['company', 'date', 'pos_prob', 'neut_prob', 'neg_prob', 'Sent_d']]

Daily_Sentiment_Compound_df

Unnamed: 0,company,date,pos_prob,neut_prob,neg_prob,Sent_d
0,Asda,2024-06-07,0.138000,0.765000,0.097000,0.010250
1,Asda,2024-07-08,0.173000,0.818000,0.009000,0.065600
2,Asda,2024-07-23,0.097000,0.624000,0.279000,-0.045500
3,Asda,2024-07-24,0.163000,0.794000,0.043000,0.048000
4,Asda,2024-07-26,0.097000,0.608000,0.295000,-0.049500
...,...,...,...,...,...,...
211,Toyota,2024-07-25,0.135904,0.755808,0.108288,0.025644
212,Toyota,2024-07-26,0.138128,0.766626,0.095246,0.040455
213,Toyota,2024-07-27,0.167167,0.754064,0.078770,0.064284
214,Toyota,2024-07-28,0.154341,0.733763,0.111896,0.035742


In [459]:
# Let's save Daily_Sentiment_Compound_df for ESG-related news

Daily_Sentiment_Compound_df.to_csv('../Data/Output/obj2_ESG_daily_sentiment_compound.csv', index=False)

### Let's perform the Granger’s causality testing using the dedicated Python library “grangercausalitytests.” 

In [456]:
# Ensure 'date' column is in datetime format in both dataframes
Daily_Sentiment_Compound_df['date'] = pd.to_datetime(Daily_Sentiment_Compound_df['date'])
stock_performance_df['date'] = pd.to_datetime(stock_performance_df['date'])


In [457]:
# Merge the sentiment dataframe with the stock performance dataframe on 'company' and 'date'
merged_df = pd.merge(Daily_Sentiment_Compound_df[['company', 'date', 'Sent_d']], 
                     stock_performance_df[['company', 'date', 'daily_return']], 
                     on=['company', 'date'], how='inner')


In [458]:
merged_df

Unnamed: 0,company,date,Sent_d,daily_return
0,Asda,2024-06-07,0.010250,-0.019094
1,Asda,2024-07-08,0.065600,-0.005153
2,Asda,2024-07-23,-0.045500,0.003829
3,Asda,2024-07-24,0.048000,-0.000708
4,Asda,2024-07-26,-0.049500,-0.003433
...,...,...,...,...
177,Toyota,2024-07-23,0.034458,0.003242
178,Toyota,2024-07-24,0.019665,-0.018041
179,Toyota,2024-07-25,0.025644,-0.009938
180,Toyota,2024-07-26,0.040455,-0.014081


In [259]:
# Drop any rows with missing data, as Granger causality tests require complete cases
merged_df = merged_df.dropna()
merged_df

Unnamed: 0,company,date,Sent_d,daily_return
0,Asda,2024-06-07,0.010250,-0.019094
1,Asda,2024-07-08,0.065600,-0.005153
2,Asda,2024-07-23,-0.045500,0.003829
3,Asda,2024-07-24,0.048000,-0.000708
4,Asda,2024-07-26,-0.049500,-0.003433
...,...,...,...,...
177,Toyota,2024-07-23,0.034458,0.003242
178,Toyota,2024-07-24,0.019665,-0.018041
179,Toyota,2024-07-25,0.025644,-0.009938
180,Toyota,2024-07-26,0.040455,-0.014081


In [430]:
# Save merged_df
merged_df.to_csv('../Data/Output/obj2_ESGnews_stock_merged.csv', index=False)

In [260]:
# Filter the dataset for rows where 'company' is 'Asda'
Asda_merged_df = merged_df[merged_df['company'] == 'Asda']
Asda_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
0,Asda,2024-06-07,0.01025,-0.019094
1,Asda,2024-07-08,0.0656,-0.005153
2,Asda,2024-07-23,-0.0455,0.003829
3,Asda,2024-07-24,0.048,-0.000708
4,Asda,2024-07-26,-0.0495,-0.003433


In [261]:
# Perform Granger causality test for ASDA
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Asda_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.2503  , p=0.7047  , df_denom=1, df_num=1
ssr based chi2 test:   chi2=1.0011  , p=0.3171  , df=1
likelihood ratio test: chi2=0.8934  , p=0.3445  , df=1
parameter F test:         F=0.2503  , p=0.7047  , df_denom=1, df_num=1
Lag 1: p-value = 0.7046968509482529
At lag 1, we fail to reject the null hypothesis.



In [262]:
# Filter the dataset for rows where 'company' is 'Ford'
Ford_merged_df = merged_df[merged_df['company'] == 'Ford']
Ford_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
5,Ford,2024-05-02,0.064774,0.023492
6,Ford,2024-06-20,0.03325,0.013491
7,Ford,2024-06-21,0.059697,-0.00841
8,Ford,2024-06-24,0.05942,0.032408
9,Ford,2024-06-25,0.059571,-0.011513


In [263]:
# Perform Granger causality test for FORD
# Set the maximum number of lags to test
max_lag = 5

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Ford_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.2008  , p=0.2869  , df_denom=19, df_num=1
ssr based chi2 test:   chi2=1.3904  , p=0.2383  , df=1
likelihood ratio test: chi2=1.3482  , p=0.2456  , df=1
parameter F test:         F=1.2008  , p=0.2869  , df_denom=19, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.0739  , p=0.3651  , df_denom=16, df_num=2
ssr based chi2 test:   chi2=2.8189  , p=0.2443  , df=2
likelihood ratio test: chi2=2.6451  , p=0.2665  , df=2
parameter F test:         F=1.0739  , p=0.3651  , df_denom=16, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.4889  , p=0.6960  , df_denom=13, df_num=3
ssr based chi2 test:   chi2=2.2562  , p=0.5210  , df=3
likelihood ratio test: chi2=2.1378  , p=0.5443  , df=3
parameter F test:         F=0.4889  , p=0.6960  , df_denom=13, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.7570  , p=0.5761  , df_d

In [264]:
# Filter the dataset for rows where 'company' is 'Marks & Spencer'
MarksSpencer_merged_df = merged_df[merged_df['company'] == 'Marks & Spencer']
MarksSpencer_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
28,Marks & Spencer,2024-05-22,0.062016,0.050563
29,Marks & Spencer,2024-05-29,0.047012,0.002646
30,Marks & Spencer,2024-05-30,0.0422,0.003626
31,Marks & Spencer,2024-06-03,0.055667,0.013485
32,Marks & Spencer,2024-06-06,0.037,0.00323


In [265]:
# Perform Granger causality test for M&S
# Set the maximum number of lags to test
max_lag = 2

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(MarksSpencer_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=6.3092  , p=0.0203  , df_denom=21, df_num=1
ssr based chi2 test:   chi2=7.2105  , p=0.0072  , df=1
likelihood ratio test: chi2=6.3048  , p=0.0120  , df=1
parameter F test:         F=6.3092  , p=0.0203  , df_denom=21, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=7.8530  , p=0.0035  , df_denom=18, df_num=2
ssr based chi2 test:   chi2=20.0688 , p=0.0000  , df=2
likelihood ratio test: chi2=14.4280 , p=0.0007  , df=2
parameter F test:         F=7.8530  , p=0.0035  , df_denom=18, df_num=2
Lag 1: p-value = 0.020254700175103175
At lag 1, we reject the null hypothesis. Sentiment influences stock market performance.

Lag 2: p-value = 0.0035325440251965604
At lag 2, we reject the null hypothesis. Sentiment influences stock market performance.



In [266]:
# Filter the dataset for rows where 'company' is 'Ocado'
Ocado_merged_df = merged_df[merged_df['company'] == 'Ocado']
Ocado_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
53,Ocado,2024-05-09,0.037,0.022582
54,Ocado,2024-07-08,0.00225,0.052527
55,Ocado,2024-07-16,0.074819,0.057371
56,Ocado,2024-07-17,0.0635,-0.008357
57,Ocado,2024-07-19,-0.002001,-0.037378


In [267]:
# Perform Granger causality test for OCADO
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Ocado_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=2.7205  , p=0.1600  , df_denom=5, df_num=1
ssr based chi2 test:   chi2=4.3527  , p=0.0369  , df=1
likelihood ratio test: chi2=3.4755  , p=0.0623  , df=1
parameter F test:         F=2.7205  , p=0.1600  , df_denom=5, df_num=1
Lag 1: p-value = 0.1599826891583226
At lag 1, we fail to reject the null hypothesis.



In [268]:
# Filter the dataset for rows where 'company' is 'Polestar'
Polestar_merged_df = merged_df[merged_df['company'] == 'Polestar']
Polestar_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
62,Polestar,2024-06-05,0.028571,0.025477
63,Polestar,2024-06-28,-0.007145,-0.048391


In [269]:
# Perform Granger causality test for POLESTAR
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Polestar_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")


ValueError: Insufficient observations. Maximum allowable lag is -1

In [270]:
# Filter the dataset for rows where 'company' is 'Sainsburys'
Sainsburys_merged_df = merged_df[merged_df['company'] == 'Sainsburys']
Sainsburys_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
64,Sainsburys,2024-06-12,0.0405,-0.002337
65,Sainsburys,2024-07-01,0.0935,0.010921
66,Sainsburys,2024-07-02,-0.0068,-0.029124
67,Sainsburys,2024-07-16,0.04375,0.018962


In [271]:
# Perform Granger causality test for SAINSBURY'S
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Sainsburys_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")

ValueError: Insufficient observations. Maximum allowable lag is 0

In [272]:
# Filter the dataset for rows where 'company' is 'Stellantis'
Stellantis_merged_df = merged_df[merged_df['company'] == 'Stellantis']
Stellantis_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
68,Stellantis,2024-05-03,0.02325,0.017366
69,Stellantis,2024-05-06,-0.0175,-0.004931
70,Stellantis,2024-05-07,0.033807,-0.001484
71,Stellantis,2024-05-13,0.00475,0.036315
72,Stellantis,2024-05-14,0.0808,0.020707


In [273]:
# Perform Granger causality test for STELLANTIS
# Set the maximum number of lags to test
max_lag = 5

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Stellantis_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.0633  , p=0.8053  , df_denom=13, df_num=1
ssr based chi2 test:   chi2=0.0779  , p=0.7801  , df=1
likelihood ratio test: chi2=0.0777  , p=0.7804  , df=1
parameter F test:         F=0.0633  , p=0.8053  , df_denom=13, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.0408  , p=0.9602  , df_denom=10, df_num=2
ssr based chi2 test:   chi2=0.1223  , p=0.9407  , df=2
likelihood ratio test: chi2=0.1218  , p=0.9409  , df=2
parameter F test:         F=0.0408  , p=0.9602  , df_denom=10, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.0452  , p=0.9862  , df_denom=7, df_num=3
ssr based chi2 test:   chi2=0.2711  , p=0.9654  , df=3
likelihood ratio test: chi2=0.2685  , p=0.9658  , df=3
parameter F test:         F=0.0452  , p=0.9862  , df_denom=7, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.1878  , p=0.9329  , df_den

In [274]:
# Filter the dataset for rows where 'company' is 'Tesco'
Tesco_merged_df = merged_df[merged_df['company'] == 'Tesco']
Tesco_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
85,Tesco,2024-05-03,0.0495,-0.005016
86,Tesco,2024-05-09,-0.011,0.007767
87,Tesco,2024-05-14,0.064857,0.00415
88,Tesco,2024-05-15,0.02401,-0.010247
89,Tesco,2024-05-20,0.001,-0.00161


In [275]:
# Perform Granger causality test for TESCO
# Set the maximum number of lags to test
max_lag = 5

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Tesco_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.2004  , p=0.6581  , df_denom=26, df_num=1
ssr based chi2 test:   chi2=0.2236  , p=0.6363  , df=1
likelihood ratio test: chi2=0.2227  , p=0.6370  , df=1
parameter F test:         F=0.2004  , p=0.6581  , df_denom=26, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.5867  , p=0.5643  , df_denom=23, df_num=2
ssr based chi2 test:   chi2=1.4285  , p=0.4896  , df=2
likelihood ratio test: chi2=1.3933  , p=0.4983  , df=2
parameter F test:         F=0.5867  , p=0.5643  , df_denom=23, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.3581  , p=0.7838  , df_denom=20, df_num=3
ssr based chi2 test:   chi2=1.4504  , p=0.6938  , df=3
likelihood ratio test: chi2=1.4127  , p=0.7026  , df=3
parameter F test:         F=0.3581  , p=0.7838  , df_denom=20, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.1728  , p=0.9493  , df_d

In [276]:
# Filter the dataset for rows where 'company' is 'Tesla'
Tesla_merged_df = merged_df[merged_df['company'] == 'Tesla']
Tesla_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
115,Tesla,2024-05-08,0.00175,-0.017531
116,Tesla,2024-05-10,0.003834,-0.020562
117,Tesla,2024-05-14,-0.002999,0.032398
118,Tesla,2024-05-17,-0.013748,0.014874
119,Tesla,2024-05-20,0.092,-0.014245


In [277]:
# Perform Granger causality test for TESLA
# Set the maximum number of lags to test
max_lag = 3

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Tesla_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.2243  , p=0.6390  , df_denom=32, df_num=1
ssr based chi2 test:   chi2=0.2453  , p=0.6204  , df=1
likelihood ratio test: chi2=0.2445  , p=0.6210  , df=1
parameter F test:         F=0.2243  , p=0.6390  , df_denom=32, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=2.6719  , p=0.0861  , df_denom=29, df_num=2
ssr based chi2 test:   chi2=6.2650  , p=0.0436  , df=2
likelihood ratio test: chi2=5.7502  , p=0.0564  , df=2
parameter F test:         F=2.6719  , p=0.0861  , df_denom=29, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=3.8542  , p=0.0209  , df_denom=26, df_num=3
ssr based chi2 test:   chi2=14.6756 , p=0.0021  , df=3
likelihood ratio test: chi2=12.1411 , p=0.0069  , df=3
parameter F test:         F=3.8542  , p=0.0209  , df_denom=26, df_num=3
Lag 1: p-value = 0.63900636811279
At lag 1, we fail to reject the null hypothesis.

Lag 2: p-value =

In [278]:
# Filter the dataset for rows where 'company' is 'Toyota'
Toyota_merged_df = merged_df[merged_df['company'] == 'Toyota']
Toyota_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
152,Toyota,2024-05-02,0.104901,0.017351
153,Toyota,2024-05-14,0.0894,0.00684
154,Toyota,2024-05-15,0.04875,0.01113
155,Toyota,2024-05-28,0.042604,0.008633
156,Toyota,2024-05-29,0.0755,-0.02205


In [279]:
# Perform Granger causality test for TOYOTA
# Set the maximum number of lags to test
max_lag = 7

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Toyota_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.3807  , p=0.5426  , df_denom=26, df_num=1
ssr based chi2 test:   chi2=0.4246  , p=0.5146  , df=1
likelihood ratio test: chi2=0.4216  , p=0.5162  , df=1
parameter F test:         F=0.3807  , p=0.5426  , df_denom=26, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.1237  , p=0.8842  , df_denom=23, df_num=2
ssr based chi2 test:   chi2=0.3012  , p=0.8602  , df=2
likelihood ratio test: chi2=0.2996  , p=0.8609  , df=2
parameter F test:         F=0.1237  , p=0.8842  , df_denom=23, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.6975  , p=0.5645  , df_denom=20, df_num=3
ssr based chi2 test:   chi2=2.8250  , p=0.4194  , df=3
likelihood ratio test: chi2=2.6867  , p=0.4425  , df=3
parameter F test:         F=0.6975  , p=0.5645  , df_denom=20, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.8584  , p=0.5084  , df_d

In [280]:
# Convert 'date' column to datetime format
merged_df['date'] = pd.to_datetime(merged_df['date'])

# Count the number of available data points per company
available_data_counts = merged_df.groupby('company').size().reset_index(name='available_data_points')

# Add the total_days column with the same value for all companies
available_data_counts['total_days'] = N

# Calculate the percentage of available data
available_data_counts['percentage_available'] = (available_data_counts['available_data_points'] / available_data_counts['total_days']) * 100

# Display the result
print(available_data_counts[['company', 'total_days', 'available_data_points', 'percentage_available']])


           company  total_days  available_data_points  percentage_available
0             Asda          92                      5              5.434783
1             Ford          92                     23             25.000000
2  Marks & Spencer          92                     25             27.173913
3            Ocado          92                      9              9.782609
4         Polestar          92                      2              2.173913
5       Sainsburys          92                      4              4.347826
6       Stellantis          92                     17             18.478261
7            Tesco          92                     30             32.608696
8            Tesla          92                     36             39.130435
9           Toyota          92                     30             32.608696


### Conclusion: M&S and Tesla show correlation between sentiment and returns 

## End of main