### Objective Two

#### Predict stock market volatility using ESG-related news

In [114]:
# Import libraries
import pandas as pd
import numpy as np
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from datetime import datetime

# Granger's casuality test library
from statsmodels.tsa.stattools import grangercausalitytests

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


from statsmodels.tsa.api import VAR


# Import VADER for sentiment analysis
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import CountVectorizer


import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /Users/luca/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/luca/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/luca/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/luca/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [117]:
import gensim

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

### Step 1: Calculate the daily market returns time series and volatility

In [2]:
# Load price data
prices_file_path = '../Data/Input/Eikon/refinitiv_prices_raw.csv'


# Read the CSV file into a DataFrame
prices_df = pd.read_csv(prices_file_path)

prices_df

Unnamed: 0,company,symbol,date,close,open,high,low
0,Ford,F,2024-04-01,13.29,13.33,13.380,13.140
1,Ford,F,2024-04-02,13.28,13.16,13.370,13.090
2,Ford,F,2024-04-03,13.65,13.25,13.680,13.230
3,Ford,F,2024-04-04,13.21,13.90,13.950,13.170
4,Ford,F,2024-04-05,13.28,13.27,13.395,13.090
...,...,...,...,...,...,...,...
955,Tesco,TSCO.L,2024-08-09,330.50,328.50,331.100,326.200
956,Tesco,TSCO.L,2024-08-12,333.50,331.70,333.900,330.880
957,Tesco,TSCO.L,2024-08-13,335.00,333.80,338.000,333.400
958,Tesco,TSCO.L,2024-08-14,340.60,336.50,340.600,336.400


In [3]:
# Let's filter the dataset to match the dates of the available news stories 

# Ensure 'date' column is in datetime format
prices_df['date'] = pd.to_datetime(prices_df['date'])

# Define the date range for filtering
start_date = '2024-05-01'
end_date = '2024-07-31'

# Convert string to datetime object
date_format = "%Y-%m-%d"
start_date = datetime.strptime(start_date, date_format)
end_date = datetime.strptime(end_date, date_format)

delta = (end_date-start_date)
N = delta.days + 1
print(N)

# Filter the DataFrame to keep rows between 1st May 2024 and 31st July 2024
stock_performance_df = prices_df[(prices_df['date'] >= start_date) & (prices_df['date'] <= end_date)]

# Display the filtered data
print(stock_performance_df.head())


92
   company symbol       date  close   open   high      low
22    Ford      F 2024-05-01  12.20  12.16  12.43  12.1500
23    Ford      F 2024-05-02  12.49  12.40  12.55  12.3600
24    Ford      F 2024-05-03  12.43  12.64  12.76  12.3900
25    Ford      F 2024-05-06  12.50  12.54  12.62  12.4575
26    Ford      F 2024-05-07  12.17  12.40  12.45  12.0850


In [4]:
stock_performance_df

Unnamed: 0,company,symbol,date,close,open,high,low
22,Ford,F,2024-05-01,12.20,12.16,12.4300,12.1500
23,Ford,F,2024-05-02,12.49,12.40,12.5500,12.3600
24,Ford,F,2024-05-03,12.43,12.64,12.7600,12.3900
25,Ford,F,2024-05-06,12.50,12.54,12.6200,12.4575
26,Ford,F,2024-05-07,12.17,12.40,12.4500,12.0850
...,...,...,...,...,...,...,...
944,Tesco,TSCO.L,2024-07-25,325.60,325.60,327.1000,323.4000
945,Tesco,TSCO.L,2024-07-26,327.10,325.10,328.6000,323.3490
946,Tesco,TSCO.L,2024-07-29,330.90,330.00,333.7000,328.1630
947,Tesco,TSCO.L,2024-07-30,331.70,330.00,332.8000,328.4000


The daily market return on day t is calculated as:

\begin{equation}
r_{t} = ln \left(\frac{CLOSE_{t}}{CLOSE_{t-1}}\right)
\end{equation}

where $CLOSE_{t}$ is the closing price on day t and $CLOSE_{t-1}$ is the previous day closing price.

In [5]:
# Sort by company and date to maintain the proper order for each company
stock_performance_df = stock_performance_df.sort_values(by=['company', 'date'])


In [6]:
# Calculate daily market return using the formula: r_t = log(CLOSE_t / CLOSE_t-1)
stock_performance_df['daily_return'] = stock_performance_df.groupby('company', group_keys=False)['close'].apply(
    lambda x: np.log(x / x.shift(1))
)

# Display the result
print(stock_performance_df.head())

    company symbol       date  close   open    high    low  daily_return
502    Asda    WMT 2024-05-01  58.85  59.31  59.410  58.72           NaN
503    Asda    WMT 2024-05-02  59.71  58.94  59.885  58.58      0.014508
504    Asda    WMT 2024-05-03  59.82  59.62  59.980  59.14      0.001841
505    Asda    WMT 2024-05-06  59.87  60.00  60.000  59.39      0.000835
506    Asda    WMT 2024-05-07  60.62  60.17  60.800  60.05      0.012449


In [7]:
stock_performance_df

Unnamed: 0,company,symbol,date,close,open,high,low,daily_return
502,Asda,WMT,2024-05-01,58.85,59.31,59.410,58.7200,
503,Asda,WMT,2024-05-02,59.71,58.94,59.885,58.5800,0.014508
504,Asda,WMT,2024-05-03,59.82,59.62,59.980,59.1400,0.001841
505,Asda,WMT,2024-05-06,59.87,60.00,60.000,59.3900,0.000835
506,Asda,WMT,2024-05-07,60.62,60.17,60.800,60.0500,0.012449
...,...,...,...,...,...,...,...,...
464,Toyota,TM,2024-07-25,195.25,197.43,197.430,193.7300,-0.009938
465,Toyota,TM,2024-07-26,192.52,190.74,192.840,190.5100,-0.014081
466,Toyota,TM,2024-07-29,192.48,193.00,193.200,191.8067,-0.000208
467,Toyota,TM,2024-07-30,193.11,194.96,195.480,192.2650,0.003268


Calculate Volatility

The volatility of the stock market index is calculated within a defined time window (e.g., previous 90 days):

\begin{equation}
Vol = \sqrt{\frac{1}{N}\sum_{t=1}^{N}(r_{t}-\bar{r})^2} \cdot \sqrt{252}
\end{equation}

where N is the total number of days during a window time of observations (eg, 30 days), and 252 is the total number of trading days in a single year;

In [8]:
def calculate_volatility(df):
    # Group by company to calculate volatility for each stock
    volatility_df = df.groupby('company').apply(lambda x: calculate_stock_volatility(x))
    
    # Reset the index 
    volatility_df = volatility_df.reset_index(drop=True)
    
    return volatility_df

def calculate_stock_volatility(stock_df):
    # Number of trading days (rows)
    N = len(stock_df)
    
    # Mean of daily returns (r̄)
    mean_return = stock_df['daily_return'].mean()
    
    # Variance calculation: (r_t - r̄)^2
    variance = np.sum((stock_df['daily_return'] - mean_return) ** 2) / N
    
    # Daily volatility: sqrt(variance)
    daily_volatility = np.sqrt(variance)
    
    # Annual volatility: daily_volatility * sqrt(252)
    annual_volatility = daily_volatility * np.sqrt(252)
    
    # Return the company and its calculated volatility
    return pd.Series({'company': stock_df['company'].iloc[0], 'volatility': annual_volatility})


# Assuming filtered_df is already loaded and contains 'company' and 'daily_return' columns
volatility_results = calculate_volatility(stock_performance_df)

# Display the calculated volatility for each company
print(volatility_results)


           company  volatility
0             Asda    0.187419
1             Ford    0.477228
2  Marks & Spencer    0.250589
3            Ocado    0.745336
4         Polestar    1.047160
5       Sainsburys    0.195476
6       Stellantis    0.294795
7            Tesco    0.120855
8            Tesla    0.566548
9           Toyota    0.224128


In [9]:
# Sort the DataFrame by 'volatility' in descending order
volatility_results = volatility_results.sort_values(by='volatility', ascending=False)
volatility_results

Unnamed: 0,company,volatility
4,Polestar,1.04716
3,Ocado,0.745336
8,Tesla,0.566548
1,Ford,0.477228
6,Stellantis,0.294795
2,Marks & Spencer,0.250589
9,Toyota,0.224128
5,Sainsburys,0.195476
0,Asda,0.187419
7,Tesco,0.120855


### Step 2: Run sentiment Analysis of the ESG news stories using VADER and set up the daily sentiment score time series

In [10]:
# Load Pre-Processed ESG stories data from Objective One
stories_file_path = '../Data/Output/news_df.csv'


# Read the CSV file into a DataFrame
news_df = pd.read_csv(stories_file_path)

news_df

Unnamed: 0,story,date,company,ticker
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,Ford,esg
1,new york city ny accesswire july 29 2024 br...,2024-07-29,Ford,esg
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,Ford,esg
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,Ford,esg
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,Ford,esg
...,...,...,...,...
985,tesco the uks largest supermarket chain has sp...,2024-05-15,Tesco,governance
986,tesco has been accused of giving struggling wo...,2024-05-14,Tesco,governance
987,tesco boss ken murphy has seen his pay deal mo...,2024-05-14,Tesco,governance
988,tesco has apologised after a black publisher s...,2024-05-20,Tesco,sustainability


In [11]:
# Let's initialise the 'stop words' function for English 

stop = stopwords.words('english')
stop[:5]

['i', 'me', 'my', 'myself', 'we']

In [12]:
# Let's remove 'stop words' from the stories

news_df['story'] = news_df['story'].apply(lambda x: ' '.join([w for w in x.split() if w not in stop]))
news_df.head()

Unnamed: 0,story,date,company,ticker
0,los angeles ca accesswire july 29 2024 schall ...,2024-07-29,Ford,esg
1,new york city ny accesswire july 29 2024 brons...,2024-07-29,Ford,esg
2,ford alert bragar eagel amp squire pc investig...,2024-07-29,Ford,esg
3,first atlantic nickel corp fanv alaska energy ...,2024-07-29,Ford,esg
4,palm beach fla july 29 2024 globe newswire fin...,2024-07-29,Ford,esg


In [13]:
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to apply VADER and get sentiment scores
def apply_vader_sentiment(text):
    # Get the sentiment scores from VADER
    sentiment_scores = analyzer.polarity_scores(text)
    return sentiment_scores

# Apply VADER sentiment analysis to the "story" column
news_df['vader_sentiment'] = news_df['story'].apply(apply_vader_sentiment)

# Split the sentiment scores into separate columns (optional)
news_df = pd.concat([news_df.drop(['vader_sentiment'], axis=1), news_df['vader_sentiment'].apply(pd.Series)], axis=1)

# Define a function to classify sentiment based on the 'compound' score
def classify_sentiment(compound_score):
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply the classify_sentiment function to create a new 'sentiment' column
news_df['sentiment'] = news_df['compound'].apply(classify_sentiment)

news_df

Unnamed: 0,story,date,company,ticker,neg,neu,pos,compound,sentiment
0,los angeles ca accesswire july 29 2024 schall ...,2024-07-29,Ford,esg,0.100,0.779,0.121,0.5267,positive
1,new york city ny accesswire july 29 2024 brons...,2024-07-29,Ford,esg,0.065,0.864,0.071,-0.1531,negative
2,ford alert bragar eagel amp squire pc investig...,2024-07-29,Ford,esg,0.063,0.800,0.137,0.9442,positive
3,first atlantic nickel corp fanv alaska energy ...,2024-07-29,Ford,esg,0.033,0.818,0.149,0.9995,positive
4,palm beach fla july 29 2024 globe newswire fin...,2024-07-29,Ford,esg,0.033,0.819,0.148,0.9994,positive
...,...,...,...,...,...,...,...,...,...
985,tesco uks largest supermarket chain sparked co...,2024-05-15,Tesco,governance,0.123,0.696,0.181,0.9442,positive
986,tesco accused giving struggling workers slap f...,2024-05-14,Tesco,governance,0.116,0.704,0.180,0.9863,positive
987,tesco boss ken murphy seen pay deal double alm...,2024-05-14,Tesco,governance,0.057,0.747,0.196,0.9882,positive
988,tesco apologised black publisher says racially...,2024-05-20,Tesco,sustainability,0.142,0.732,0.125,-0.8141,negative


In [14]:
news_df

Unnamed: 0,story,date,company,ticker,neg,neu,pos,compound,sentiment
0,los angeles ca accesswire july 29 2024 schall ...,2024-07-29,Ford,esg,0.100,0.779,0.121,0.5267,positive
1,new york city ny accesswire july 29 2024 brons...,2024-07-29,Ford,esg,0.065,0.864,0.071,-0.1531,negative
2,ford alert bragar eagel amp squire pc investig...,2024-07-29,Ford,esg,0.063,0.800,0.137,0.9442,positive
3,first atlantic nickel corp fanv alaska energy ...,2024-07-29,Ford,esg,0.033,0.818,0.149,0.9995,positive
4,palm beach fla july 29 2024 globe newswire fin...,2024-07-29,Ford,esg,0.033,0.819,0.148,0.9994,positive
...,...,...,...,...,...,...,...,...,...
985,tesco uks largest supermarket chain sparked co...,2024-05-15,Tesco,governance,0.123,0.696,0.181,0.9442,positive
986,tesco accused giving struggling workers slap f...,2024-05-14,Tesco,governance,0.116,0.704,0.180,0.9863,positive
987,tesco boss ken murphy seen pay deal double alm...,2024-05-14,Tesco,governance,0.057,0.747,0.196,0.9882,positive
988,tesco apologised black publisher says racially...,2024-05-20,Tesco,sustainability,0.142,0.732,0.125,-0.8141,negative


In [15]:
# Function to normalise sentiment proportions and calculate Sent_d
def calculate_sentiment_score(df):
    # Calculate total count of all sentiment categories
    df['total'] = df['pos'] + df['neu'] + df['neg']
    
    # Normalize to get probabilities (frequencies) of positive, neutral, and negative
    df['pos_prob'] = df['pos'] / df['total']
    df['neut_prob'] = df['neu'] / df['total']
    df['neg_prob'] = df['neg'] / df['total']
    
    # Confirm the probabilities sum to 1
    df['sum_probs'] = df['pos_prob'] + df['neut_prob'] + df['neg_prob']
    
    # Calculate Sent_d using the normalized probabilities
    df['Sent_d'] = (df['pos'] - df['neg']) / (df['pos'] + df['neu'] + df['neg'] + 3)
    
    return df


# Group by company and date to ensure daily aggregation
grouped_df = news_df.groupby(['company', 'date']).sum().reset_index()

# Apply the sentiment calculation
Daily_Sentiment_Compound_df = calculate_sentiment_score(grouped_df)


Daily_Sentiment_Compound_df = Daily_Sentiment_Compound_df[['company', 'date', 'pos_prob', 'neut_prob', 'neg_prob', 'Sent_d']]

Daily_Sentiment_Compound_df


Unnamed: 0,company,date,pos_prob,neut_prob,neg_prob,Sent_d
0,Asda,2024-06-07,0.135000,0.770000,0.095000,0.010000
1,Asda,2024-07-08,0.139000,0.852000,0.009000,0.052000
2,Asda,2024-07-23,0.093000,0.640000,0.267000,-0.043500
3,Asda,2024-07-24,0.154845,0.804196,0.040959,0.045582
4,Asda,2024-07-26,0.094000,0.621000,0.285000,-0.047750
...,...,...,...,...,...,...
211,Toyota,2024-07-25,0.126452,0.774171,0.099377,0.025141
212,Toyota,2024-07-26,0.128203,0.786056,0.085742,0.040057
213,Toyota,2024-07-27,0.154125,0.768875,0.077000,0.056091
214,Toyota,2024-07-28,0.142884,0.749781,0.107335,0.029938


Step 3: perform the Granger’s causality testing using the dedicated Python library “grangercausalitytests.” 

In [16]:
# Ensure 'date' column is in datetime format in both dataframes
Daily_Sentiment_Compound_df['date'] = pd.to_datetime(Daily_Sentiment_Compound_df['date'])
stock_performance_df['date'] = pd.to_datetime(stock_performance_df['date'])


In [17]:

# Merge the sentiment dataframe with the stock performance dataframe on 'company' and 'date'
merged_df = pd.merge(Daily_Sentiment_Compound_df[['company', 'date', 'Sent_d']], 
                     stock_performance_df[['company', 'date', 'daily_return']], 
                     on=['company', 'date'], how='inner')


In [18]:
merged_df

Unnamed: 0,company,date,Sent_d,daily_return
0,Asda,2024-06-07,0.010000,-0.019094
1,Asda,2024-07-08,0.052000,-0.005153
2,Asda,2024-07-23,-0.043500,0.003829
3,Asda,2024-07-24,0.045582,-0.000708
4,Asda,2024-07-26,-0.047750,-0.003433
...,...,...,...,...
177,Toyota,2024-07-23,0.028831,0.003242
178,Toyota,2024-07-24,0.017616,-0.018041
179,Toyota,2024-07-25,0.025141,-0.009938
180,Toyota,2024-07-26,0.040057,-0.014081


In [19]:
# Drop any rows with missing data, as Granger causality tests require complete cases
merged_df = merged_df.dropna()
merged_df

Unnamed: 0,company,date,Sent_d,daily_return
0,Asda,2024-06-07,0.010000,-0.019094
1,Asda,2024-07-08,0.052000,-0.005153
2,Asda,2024-07-23,-0.043500,0.003829
3,Asda,2024-07-24,0.045582,-0.000708
4,Asda,2024-07-26,-0.047750,-0.003433
...,...,...,...,...
177,Toyota,2024-07-23,0.028831,0.003242
178,Toyota,2024-07-24,0.017616,-0.018041
179,Toyota,2024-07-25,0.025141,-0.009938
180,Toyota,2024-07-26,0.040057,-0.014081


In [42]:
# Filter the dataset for rows where 'company' is 'Asda'
Toyota_merged_df = merged_df[merged_df['company'] == 'Asda']
Toyota_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
0,Asda,2024-06-07,0.01,-0.019094
1,Asda,2024-07-08,0.052,-0.005153
2,Asda,2024-07-23,-0.0435,0.003829
3,Asda,2024-07-24,0.045582,-0.000708
4,Asda,2024-07-26,-0.04775,-0.003433


In [47]:
# Perform Granger causality test for ASDA
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Asda_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.1501  , p=0.7647  , df_denom=1, df_num=1
ssr based chi2 test:   chi2=0.6005  , p=0.4384  , df=1
likelihood ratio test: chi2=0.5595  , p=0.4545  , df=1
parameter F test:         F=0.1501  , p=0.7647  , df_denom=1, df_num=1
Lag 1: p-value = 0.7646763462978328
At lag 1, we fail to reject the null hypothesis.



In [50]:
# Filter the dataset for rows where 'company' is 'Ford'
Ford_merged_df = merged_df[merged_df['company'] == 'Ford']
Ford_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
5,Ford,2024-05-02,0.061575,0.023492
6,Ford,2024-06-20,0.03225,0.013491
7,Ford,2024-06-21,0.054579,-0.00841
8,Ford,2024-06-24,0.054556,0.032408
9,Ford,2024-06-25,0.05473,-0.011513


In [53]:
# Perform Granger causality test for FORD
# Set the maximum number of lags to test
max_lag = 5

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Ford_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.8919  , p=0.3568  , df_denom=19, df_num=1
ssr based chi2 test:   chi2=1.0327  , p=0.3095  , df=1
likelihood ratio test: chi2=1.0092  , p=0.3151  , df=1
parameter F test:         F=0.8919  , p=0.3568  , df_denom=19, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.8749  , p=0.4359  , df_denom=16, df_num=2
ssr based chi2 test:   chi2=2.2966  , p=0.3172  , df=2
likelihood ratio test: chi2=2.1795  , p=0.3363  , df=2
parameter F test:         F=0.8749  , p=0.4359  , df_denom=16, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.4632  , p=0.7128  , df_denom=13, df_num=3
ssr based chi2 test:   chi2=2.1380  , p=0.5443  , df=3
likelihood ratio test: chi2=2.0312  , p=0.5660  , df=3
parameter F test:         F=0.4632  , p=0.7128  , df_denom=13, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.7413  , p=0.5851  , df_d

In [56]:
# Filter the dataset for rows where 'company' is 'Marks & Spencer'
MarksSpencer_merged_df = merged_df[merged_df['company'] == 'Marks & Spencer']
MarksSpencer_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
28,Marks & Spencer,2024-05-22,0.05725,0.050563
29,Marks & Spencer,2024-05-29,0.0425,0.002646
30,Marks & Spencer,2024-05-30,0.0396,0.003626
31,Marks & Spencer,2024-06-03,0.052667,0.013485
32,Marks & Spencer,2024-06-06,0.0365,0.00323


In [58]:
# Perform Granger causality test for M&S
# Set the maximum number of lags to test
max_lag = 2

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(MarksSpencer_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=4.0383  , p=0.0575  , df_denom=21, df_num=1
ssr based chi2 test:   chi2=4.6152  , p=0.0317  , df=1
likelihood ratio test: chi2=4.2212  , p=0.0399  , df=1
parameter F test:         F=4.0383  , p=0.0575  , df_denom=21, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=5.6899  , p=0.0122  , df_denom=18, df_num=2
ssr based chi2 test:   chi2=14.5409 , p=0.0007  , df=2
likelihood ratio test: chi2=11.2685 , p=0.0036  , df=2
parameter F test:         F=5.6899  , p=0.0122  , df_denom=18, df_num=2
Lag 1: p-value = 0.057502848851336406
At lag 1, we fail to reject the null hypothesis.

Lag 2: p-value = 0.01216209340353528
At lag 2, we reject the null hypothesis. Sentiment influences stock market performance.



In [59]:
# Filter the dataset for rows where 'company' is 'Ocado'
Ocado_merged_df = merged_df[merged_df['company'] == 'Ocado']
Ocado_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
53,Ocado,2024-05-09,0.03475,0.022582
54,Ocado,2024-07-08,0.004,0.052527
55,Ocado,2024-07-16,0.078706,0.057371
56,Ocado,2024-07-17,0.059,-0.008357
57,Ocado,2024-07-19,-0.00175,-0.037378


In [61]:
# Perform Granger causality test for OCADO
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Ocado_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=4.5661  , p=0.0857  , df_denom=5, df_num=1
ssr based chi2 test:   chi2=7.3058  , p=0.0069  , df=1
likelihood ratio test: chi2=5.1903  , p=0.0227  , df=1
parameter F test:         F=4.5661  , p=0.0857  , df_denom=5, df_num=1
Lag 1: p-value = 0.08565825178491664
At lag 1, we fail to reject the null hypothesis.



In [62]:
# Filter the dataset for rows where 'company' is 'Polestar'
Polestar_merged_df = merged_df[merged_df['company'] == 'Polestar']
Polestar_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
62,Polestar,2024-06-05,0.027143,0.025477
63,Polestar,2024-06-28,-0.009143,-0.048391


In [67]:
# Perform Granger causality test for POLESTAR
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Polestar_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")


ValueError: Insufficient observations. Maximum allowable lag is -1

In [68]:
# Filter the dataset for rows where 'company' is 'Sainsburys'
Sainsburys_merged_df = merged_df[merged_df['company'] == 'Sainsburys']
Sainsburys_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
64,Sainsburys,2024-06-12,0.0335,-0.002337
65,Sainsburys,2024-07-01,0.0905,0.010921
66,Sainsburys,2024-07-02,-0.014406,-0.029124
67,Sainsburys,2024-07-16,0.0415,0.018962


In [69]:
# Perform Granger causality test for SAINSBURY'S
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Sainsburys_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")

ValueError: Insufficient observations. Maximum allowable lag is 0

In [70]:
# Filter the dataset for rows where 'company' is 'Stellantis'
Stellantis_merged_df = merged_df[merged_df['company'] == 'Stellantis']
Stellantis_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
68,Stellantis,2024-05-03,0.021,0.017366
69,Stellantis,2024-05-06,-0.01675,-0.004931
70,Stellantis,2024-05-07,0.033193,-0.001484
71,Stellantis,2024-05-13,0.0045,0.036315
72,Stellantis,2024-05-14,0.0776,0.020707


In [74]:
# Perform Granger causality test for STELLANTIS
# Set the maximum number of lags to test
max_lag = 5

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Stellantis_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.1345  , p=0.7197  , df_denom=13, df_num=1
ssr based chi2 test:   chi2=0.1656  , p=0.6841  , df=1
likelihood ratio test: chi2=0.1647  , p=0.6848  , df=1
parameter F test:         F=0.1345  , p=0.7197  , df_denom=13, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.0514  , p=0.9502  , df_denom=10, df_num=2
ssr based chi2 test:   chi2=0.1541  , p=0.9258  , df=2
likelihood ratio test: chi2=0.1534  , p=0.9262  , df=2
parameter F test:         F=0.0514  , p=0.9502  , df_denom=10, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.0655  , p=0.9765  , df_denom=7, df_num=3
ssr based chi2 test:   chi2=0.3930  , p=0.9417  , df=3
likelihood ratio test: chi2=0.3876  , p=0.9428  , df=3
parameter F test:         F=0.0655  , p=0.9765  , df_denom=7, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.2377  , p=0.9035  , df_den

In [75]:
# Filter the dataset for rows where 'company' is 'Tesco'
Tesco_merged_df = merged_df[merged_df['company'] == 'Tesco']
Tesco_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
85,Tesco,2024-05-03,0.04625,-0.005016
86,Tesco,2024-05-09,-0.01075,0.007767
87,Tesco,2024-05-14,0.058,0.00415
88,Tesco,2024-05-15,0.0232,-0.010247
89,Tesco,2024-05-20,-0.004251,-0.00161


In [77]:
# Perform Granger causality test for TESCO
# Set the maximum number of lags to test
max_lag = 5

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Tesco_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.1936  , p=0.6636  , df_denom=26, df_num=1
ssr based chi2 test:   chi2=0.2159  , p=0.6421  , df=1
likelihood ratio test: chi2=0.2151  , p=0.6428  , df=1
parameter F test:         F=0.1936  , p=0.6636  , df_denom=26, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.5928  , p=0.5610  , df_denom=23, df_num=2
ssr based chi2 test:   chi2=1.4433  , p=0.4859  , df=2
likelihood ratio test: chi2=1.4073  , p=0.4948  , df=2
parameter F test:         F=0.5928  , p=0.5610  , df_denom=23, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.3637  , p=0.7799  , df_denom=20, df_num=3
ssr based chi2 test:   chi2=1.4729  , p=0.6885  , df=3
likelihood ratio test: chi2=1.4341  , p=0.6976  , df=3
parameter F test:         F=0.3637  , p=0.7799  , df_denom=20, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.1790  , p=0.9462  , df_d

In [80]:
# Filter the dataset for rows where 'company' is 'Tesla'
Tesla_merged_df = merged_df[merged_df['company'] == 'Tesla']
Tesla_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
115,Tesla,2024-05-08,-0.00025,-0.017531
116,Tesla,2024-05-10,0.001667,-0.020562
117,Tesla,2024-05-14,-0.00325,0.032398
118,Tesla,2024-05-17,-0.02275,0.014874
119,Tesla,2024-05-20,0.088167,-0.014245


In [82]:
# Perform Granger causality test for TESLA
# Set the maximum number of lags to test
max_lag = 3

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Tesla_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.3368  , p=0.5657  , df_denom=32, df_num=1
ssr based chi2 test:   chi2=0.3684  , p=0.5439  , df=1
likelihood ratio test: chi2=0.3665  , p=0.5449  , df=1
parameter F test:         F=0.3368  , p=0.5657  , df_denom=32, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=2.4126  , p=0.1073  , df_denom=29, df_num=2
ssr based chi2 test:   chi2=5.6571  , p=0.0591  , df=2
likelihood ratio test: chi2=5.2329  , p=0.0731  , df=2
parameter F test:         F=2.4126  , p=0.1073  , df_denom=29, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=3.5877  , p=0.0271  , df_denom=26, df_num=3
ssr based chi2 test:   chi2=13.6610 , p=0.0034  , df=3
likelihood ratio test: chi2=11.4313 , p=0.0096  , df=3
parameter F test:         F=3.5877  , p=0.0271  , df_denom=26, df_num=3
Lag 1: p-value = 0.5657438251775035
At lag 1, we fail to reject the null hypothesis.

Lag 2: p-value

In [41]:
# Filter the dataset for rows where 'company' is 'Toyota'
Toyota_merged_df = merged_df[merged_df['company'] == 'Toyota']
Toyota_merged_df.head()

Unnamed: 0,company,date,Sent_d,daily_return
152,Toyota,2024-05-02,0.095889,0.017351
153,Toyota,2024-05-14,0.0714,0.00684
154,Toyota,2024-05-15,0.046738,0.01113
155,Toyota,2024-05-28,0.0459,0.008633
156,Toyota,2024-05-29,0.074167,-0.02205


In [79]:
# Perform Granger causality test for TOYOTA
# Set the maximum number of lags to test
max_lag = 7

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Toyota_merged_df[['daily_return','Sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.8919  , p=0.3568  , df_denom=19, df_num=1
ssr based chi2 test:   chi2=1.0327  , p=0.3095  , df=1
likelihood ratio test: chi2=1.0092  , p=0.3151  , df=1
parameter F test:         F=0.8919  , p=0.3568  , df_denom=19, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.8749  , p=0.4359  , df_denom=16, df_num=2
ssr based chi2 test:   chi2=2.2966  , p=0.3172  , df=2
likelihood ratio test: chi2=2.1795  , p=0.3363  , df=2
parameter F test:         F=0.8749  , p=0.4359  , df_denom=16, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.4632  , p=0.7128  , df_denom=13, df_num=3
ssr based chi2 test:   chi2=2.1380  , p=0.5443  , df=3
likelihood ratio test: chi2=2.0312  , p=0.5660  , df=3
parameter F test:         F=0.4632  , p=0.7128  , df_denom=13, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.7413  , p=0.5851  , df_d

In [None]:
# Count the number of available data points per company
available_data_counts = merged_df.groupby('company').size().reset_index(name='available_data_points')

# Merge the total days with available data counts
result_df = pd.merge(total_days_df[['company', 'total_days']], available_data_counts, on='company')

# Calculate the percentage of available data
result_df['percentage_available'] = (result_df['available_data_points'] / result_df['total_days']) * 100

# Display the result
print(result_df[['company', 'total_days', 'available_data_points', 'percentage_available']])

In [21]:
# Convert 'date' column to datetime format if not already
merged_df['date'] = pd.to_datetime(merged_df['date'])

# Count the number of available data points per company
available_data_counts = merged_df.groupby('company').size().reset_index(name='available_data_points')

# Add the total_days column with the same value for all companies
available_data_counts['total_days'] = N

# Calculate the percentage of available data
available_data_counts['percentage_available'] = (available_data_counts['available_data_points'] / available_data_counts['total_days']) * 100

# Display the result
print(available_data_counts[['company', 'total_days', 'available_data_points', 'percentage_available']])


           company  total_days  available_data_points  percentage_available
0             Asda          92                      5              5.434783
1             Ford          92                     23             25.000000
2  Marks & Spencer          92                     25             27.173913
3            Ocado          92                      9              9.782609
4         Polestar          92                      2              2.173913
5       Sainsburys          92                      4              4.347826
6       Stellantis          92                     17             18.478261
7            Tesco          92                     30             32.608696
8            Tesla          92                     36             39.130435
9           Toyota          92                     30             32.608696


### Conclusion: M&S and Tesla show correlation between sentiment and returns 

### Step 4: Latent Dirichlet Allocation (LDA)

In [92]:
# Let's extrapolate the pre-processed news articles for M&S only
Marks_Spencer_news_df = news_df[news_df['company'] == 'Marks & Spencer'].reset_index(drop=True)
Marks_Spencer_news_df.head()

Unnamed: 0,story,date,company,ticker,neg,neu,pos,compound,sentiment
0,new insight mamps revealing 10 population conf...,2024-07-04,Marks & Spencer,esg,0.039,0.809,0.152,0.9947,positive
1,working repair alterations specialist sojo uk ...,2024-07-02,Marks & Spencer,esg,0.009,0.864,0.127,0.9853,positive
2,uk retailer marks spencer tells style currentl...,2024-06-21,Marks & Spencer,esg,0.048,0.846,0.106,0.9274,positive
3,new insight mamps revealing 10 population conf...,2024-07-04,Marks & Spencer,environment,0.039,0.809,0.152,0.9947,positive
4,working repair alterations specialist sojo uk ...,2024-07-02,Marks & Spencer,environment,0.009,0.864,0.127,0.9853,positive


In [93]:
Marks_Spencer_news_df = Marks_Spencer_news_df[['story','date','company']]
Marks_Spencer_news_df.head()

Unnamed: 0,story,date,company
0,new insight mamps revealing 10 population conf...,2024-07-04,Marks & Spencer
1,working repair alterations specialist sojo uk ...,2024-07-02,Marks & Spencer
2,uk retailer marks spencer tells style currentl...,2024-06-21,Marks & Spencer
3,new insight mamps revealing 10 population conf...,2024-07-04,Marks & Spencer
4,working repair alterations specialist sojo uk ...,2024-07-02,Marks & Spencer


### Convert Preprocessed Data into Document-Term Matrix:

In [None]:
# Let's use the CountVectorizer from scikit-learn to transform 
# the preprocessed news articles into a document-term matrix.
# Vectoriser expects a list. Let's convert the content of the column
# 'story' as a list of strings

news_list = Marks_Spencer_news_df['story'].tolist()

In [101]:
s = pd.Series(news_list)
s.describe()

count                                                    56
unique                                                   46
top       new insight mamps revealing 10 population conf...
freq                                                      4
dtype: object

In [96]:
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(news_list)

In [97]:
dtm

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 10905 stored elements and shape (56, 4204)>

### Train the LDA Model

In [107]:
# Let's use gensim to create an LDA model. 
# Specify the number of topics (n_topics), and fit the model to the document-term matrix.

import gensim
from gensim import corpora

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

Step 4: The Latent Dirichlet Allocation (LDA) model will be implemented
using dedicated Python libraries such as Gensim which is a library for topic modeling and natural language processing. Below are the indicative steps that will be
followed to build an LDA model:
- Create a dataset containing news stories;
- Perform text preprocessing by tokenising and cleaning the text;
- Generate a few LDA models using different topic values, then verify how
these models perform in the supervised classification model training
(Kelechava 2019);
- Visualise the topics generated with the words associated with each topic,
and
- Transform the original texts (the headlines) to the topic vectors.

Step 5: Finally, the topic vectors will be fed into a classifier and the process validated
by splitting the topic dataframe into train and test to simulate how the model
would perform with a new set of data. The classification report will be generated
to evaluate the model.

## End of Objctive 2

In [113]:
import gensim


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:


print(gensim.__version__)
