## Objective Three

#### Determine the correlation between corporate ESG and greenwashing from news stories

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from datetime import datetime
import re

# Granger's casuality test library
from statsmodels.tsa.stattools import grangercausalitytests

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


from statsmodels.tsa.api import VAR


# Import VADER for sentiment analysis
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import CountVectorizer

import gensim
from gensim import corpora
from gensim.models import CoherenceModel

from wordcloud import WordCloud
from gensim.models import LdaModel

import spacy
nlp = spacy.load("en_core_web_sm")
import en_core_web_sm
nlp = en_core_web_sm.load()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, roc_curve, auc

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>
[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>
[nltk_data] Error loading wordnet: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>
[nltk_data] Error loading vader_lexicon: <urlopen error [Errno 8]
[nltk_data]     nodename nor servname provided, or not known>


#### Step 1: Load the greenwashing stories and perform sentiment analysis

In [2]:
# Set up folders
greenwashing_news_raw_file_path = '../Data/Input/Eikon/refinitiv_greenwashing_stories_raw.csv'
greenwashing_news_clean_file_path = '../Data/Output/greenwashing_news_df.csv'
greenwashing_news_summary_file_path = '../Data/Output/greenwashing_news_summary_df.csv'
greenwashing_news_sentiment_file_path = '../Data/Output/greenwashing_news_sentiment_df.csv'

# Read the CSV file into a DataFrame
greenwashing_news_df = pd.read_csv(greenwashing_news_raw_file_path)

greenwashing_news_df

Unnamed: 0,story,date,storyId,company,ticker
0,"<div class=""storyContent"" lang=""en""><p><a href...",2024-07-03 14:11:06.165000+00:00,urn:newsml:newswire.refinitiv.com:20240703:nGL...,Ford,alternative energy
1,"<div class=""storyContent"" lang=""en""><p><a href...",2024-05-20 13:26:40.057000+00:00,urn:newsml:newswire.refinitiv.com:20240520:nGL...,Ford,alternative energy
2,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 20:13:28+00:00,urn:newsml:newswire.refinitiv.com:20240729:nL1...,Ford,fair
3,"<div class=""storyContent"" lang=""en""></div>",2024-07-29 19:50:26.009000+00:00,urn:newsml:newswire.refinitiv.com:20240729:nAQ...,Ford,fair
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 19:01:36.228000+00:00,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,fair
...,...,...,...,...,...
2644,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 11:00:00+00:00,urn:newsml:newswire.refinitiv.com:20240628:nL8...,Tesco,clean energy
2645,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 06:02:34.048000+00:00,urn:newsml:newsroom.refinitiv.com:20240628:nRS...,Tesco,clean energy
2646,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 06:00:03+00:00,urn:newsml:newswire.refinitiv.com:20240628:nRS...,Tesco,clean energy
2647,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-27 16:34:38.473000+00:00,urn:newsml:newswire.refinitiv.com:20240627:nRT...,Tesco,clean energy


In [3]:
# list of companies
companies = greenwashing_news_df['company'].unique().tolist()
companies

['Ford',
 'Polestar',
 'Stellantis',
 'Tesla',
 'Toyota',
 'Marks & Spencer',
 'Ocado',
 'Tesco']

In [97]:
# Observation: Eikon API did not return any of the greenwashing-related news for Asda and Sainsbury's during the period of observation. 

In [4]:
# Drop storyId column

greenwashing_news_df = greenwashing_news_df.drop(['storyId'], axis= 1)
greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,"<div class=""storyContent"" lang=""en""><p><a href...",2024-07-03 14:11:06.165000+00:00,Ford,alternative energy
1,"<div class=""storyContent"" lang=""en""><p><a href...",2024-05-20 13:26:40.057000+00:00,Ford,alternative energy
2,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 20:13:28+00:00,Ford,fair
3,"<div class=""storyContent"" lang=""en""></div>",2024-07-29 19:50:26.009000+00:00,Ford,fair
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 19:01:36.228000+00:00,Ford,fair
...,...,...,...,...
2644,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 11:00:00+00:00,Tesco,clean energy
2645,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 06:02:34.048000+00:00,Tesco,clean energy
2646,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 06:00:03+00:00,Tesco,clean energy
2647,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-27 16:34:38.473000+00:00,Tesco,clean energy


In [5]:
# Convert multiple columns to string type
columns_to_convert = ['story', 'company', 'ticker']
greenwashing_news_df[columns_to_convert] = greenwashing_news_df[columns_to_convert].astype(str)
greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,"<div class=""storyContent"" lang=""en""><p><a href...",2024-07-03 14:11:06.165000+00:00,Ford,alternative energy
1,"<div class=""storyContent"" lang=""en""><p><a href...",2024-05-20 13:26:40.057000+00:00,Ford,alternative energy
2,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 20:13:28+00:00,Ford,fair
3,"<div class=""storyContent"" lang=""en""></div>",2024-07-29 19:50:26.009000+00:00,Ford,fair
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 19:01:36.228000+00:00,Ford,fair
...,...,...,...,...
2644,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 11:00:00+00:00,Tesco,clean energy
2645,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 06:02:34.048000+00:00,Tesco,clean energy
2646,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 06:00:03+00:00,Tesco,clean energy
2647,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-27 16:34:38.473000+00:00,Tesco,clean energy


In [6]:
# Convert the 'date' column to datetime
greenwashing_news_df['date'] = pd.to_datetime(greenwashing_news_df['date'], errors='coerce')

# Format the datetime to 'YYYY-MM-DD HH:MM'
greenwashing_news_df['date'] = greenwashing_news_df['date'].dt.strftime('%Y-%m-%d')

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,"<div class=""storyContent"" lang=""en""><p><a href...",2024-07-03,Ford,alternative energy
1,"<div class=""storyContent"" lang=""en""><p><a href...",2024-05-20,Ford,alternative energy
2,"<div class=""storyContent"" lang=""en""><style typ...",,Ford,fair
3,"<div class=""storyContent"" lang=""en""></div>",2024-07-29,Ford,fair
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,fair
...,...,...,...,...
2644,"<div class=""storyContent"" lang=""en""><style typ...",,Tesco,clean energy
2645,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28,Tesco,clean energy
2646,"<div class=""storyContent"" lang=""en""><style typ...",,Tesco,clean energy
2647,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-27,Tesco,clean energy


In [7]:
# list of tickers
tickers = greenwashing_news_df['ticker'].unique().tolist()
tickers

['alternative energy',
 'fair',
 'low carbon',
 'transition',
 'renew',
 'organic',
 'fossil free',
 'clean energy',
 'zero waste',
 'green',
 'climate',
 'waste',
 'natural',
 'carbon-neutral']

In [8]:
# After inspecting the dataset, there are a few rows with compromised data. 
# Keep every rows unless date column contains a timestamp.

# Define a regular expression pattern to detect timestamps (format: YYYY-MM-DD HH:MM:SS)
timestamp_pattern = r'\d{4}-\d{2}-\d{2}'

# Filter the rows that contain timestamps in the 'date' column
greenwashing_news_df = greenwashing_news_df[greenwashing_news_df['date'].apply(lambda x: bool(re.match(timestamp_pattern, str(x))))]

# Reset the index of the filtered DataFrame
greenwashing_news_df = greenwashing_news_df.reset_index(drop=True)

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,"<div class=""storyContent"" lang=""en""><p><a href...",2024-07-03,Ford,alternative energy
1,"<div class=""storyContent"" lang=""en""><p><a href...",2024-05-20,Ford,alternative energy
2,"<div class=""storyContent"" lang=""en""></div>",2024-07-29,Ford,fair
3,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,fair
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,fair
...,...,...,...,...
1468,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-01,Tesco,clean energy
1469,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28,Tesco,clean energy
1470,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28,Tesco,clean energy
1471,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28,Tesco,clean energy


In [9]:
# Functions to remove text within <> and {}:

def remove_text_within_angle_brackets(text):
    if pd.notnull(text):  # Check if the text is not null
        return re.sub(r'<.*?>', '', text)
    return text  

# Apply the function to the 'story' column
greenwashing_news_df['story'] = greenwashing_news_df['story'].apply(remove_text_within_angle_brackets)


def remove_text_within_curly_brackets(text):
    if pd.notnull(text):  # Check if the text is not null
        return re.sub(r'{.*?}', '', text)
    return text  

# Apply the function to the 'story' column
greenwashing_news_df['story'] = greenwashing_news_df['story'].apply(remove_text_within_curly_brackets)

# Reset the index of the filtered DataFrame
greenwashing_news_df = greenwashing_news_df.reset_index(drop=True)

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,https://filings.ica.int.thomsonreuters.com/fil...,2024-07-03,Ford,alternative energy
1,https://filings.ica.int.thomsonreuters.com/fil...,2024-05-20,Ford,alternative energy
2,,2024-07-29,Ford,fair
3,".storyContent * NEW YORK, NY / ACCESSWIRE / Ju...",2024-07-29,Ford,fair
4,".storyContent * LOS ANGELES, CA / ACCESSWIRE /...",2024-07-29,Ford,fair
...,...,...,...,...
1468,.storyContent * RNS Number : 4659UTesco PLC01 ...,2024-07-01,Tesco,clean energy
1469,.storyContent * Tesco and Asda are being sued ...,2024-06-28,Tesco,clean energy
1470,.storyContent * Lawyers acting for two people ...,2024-06-28,Tesco,clean energy
1471,.storyContent * RNS Number : 2479UTesco PLC28 ...,2024-06-28,Tesco,clean energy


In [10]:
# Remove '.storyContent * ' substring from the 'story' column
greenwashing_news_df['story'] = greenwashing_news_df['story'].str.replace('.storyContent * ', '', regex=False)

# Reset the index of the filtered DataFrame
greenwashing_news_df = greenwashing_news_df.reset_index(drop=True)

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,https://filings.ica.int.thomsonreuters.com/fil...,2024-07-03,Ford,alternative energy
1,https://filings.ica.int.thomsonreuters.com/fil...,2024-05-20,Ford,alternative energy
2,,2024-07-29,Ford,fair
3,"NEW YORK, NY / ACCESSWIRE / July 29, 2024 / Le...",2024-07-29,Ford,fair
4,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,Ford,fair
...,...,...,...,...
1468,RNS Number : 4659UTesco PLC01 July 2024 Tesco...,2024-07-01,Tesco,clean energy
1469,"Tesco and Asda are being sued by customers, in...",2024-06-28,Tesco,clean energy
1470,Lawyers acting for two people who fell ill in ...,2024-06-28,Tesco,clean energy
1471,RNS Number : 2479UTesco PLC28 June 2024 Tesco...,2024-06-28,Tesco,clean energy


In [11]:
# Remove rows with empty cells
greenwashing_news_df = greenwashing_news_df.dropna(how='any')

# Reset the index of the filtered DataFrame
greenwashing_news_df = greenwashing_news_df.reset_index(drop=True)

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,https://filings.ica.int.thomsonreuters.com/fil...,2024-07-03,Ford,alternative energy
1,https://filings.ica.int.thomsonreuters.com/fil...,2024-05-20,Ford,alternative energy
2,,2024-07-29,Ford,fair
3,"NEW YORK, NY / ACCESSWIRE / July 29, 2024 / Le...",2024-07-29,Ford,fair
4,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,Ford,fair
...,...,...,...,...
1468,RNS Number : 4659UTesco PLC01 July 2024 Tesco...,2024-07-01,Tesco,clean energy
1469,"Tesco and Asda are being sued by customers, in...",2024-06-28,Tesco,clean energy
1470,Lawyers acting for two people who fell ill in ...,2024-06-28,Tesco,clean energy
1471,RNS Number : 2479UTesco PLC28 June 2024 Tesco...,2024-06-28,Tesco,clean energy


In [12]:
# Function to count the number of words in a string
def word_count(text):
    if pd.notnull(text):  # Check if the text is not null
        return len(text.split())
    return 0  # Return 0 if the text is null

# Filter out rows where the word count in 'story' column is less than 15
greenwashing_news_df = greenwashing_news_df[greenwashing_news_df['story'].apply(word_count) >= 15]

# Filter out rows where any column is empty or contains only whitespace
greenwashing_news_df = greenwashing_news_df[(greenwashing_news_df != '') & (greenwashing_news_df != ' ')].dropna()

# Reset the index of the filtered DataFrame
greenwashing_news_df = greenwashing_news_df.reset_index(drop=True)

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,"NEW YORK, NY / ACCESSWIRE / July 29, 2024 / Le...",2024-07-29,Ford,fair
1,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,Ford,fair
2,"Jul 29, 2024Ford Dealership by JeepersMedia / ...",2024-07-29,Ford,fair
3,"NEW YORK, NY / ACCESSWIRE / July 29, 2024 / Le...",2024-07-29,Ford,fair
4,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,Ford,fair
...,...,...,...,...
1290,RNS Number : 4659UTesco PLC01 July 2024 Tesco...,2024-07-01,Tesco,clean energy
1291,"Tesco and Asda are being sued by customers, in...",2024-06-28,Tesco,clean energy
1292,Lawyers acting for two people who fell ill in ...,2024-06-28,Tesco,clean energy
1293,RNS Number : 2479UTesco PLC28 June 2024 Tesco...,2024-06-28,Tesco,clean energy


In [13]:
# Strip whitespace from all columns and replace empty strings with NaN
greenwashing_news_df = greenwashing_news_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
greenwashing_news_df.replace('', pd.NA, inplace=True)

# Drop rows with any NaN values
greenwashing_news_df.dropna(inplace=True)

# Reset the index of the filtered DataFrame
greenwashing_news_df = greenwashing_news_df.reset_index(drop=True)

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,"NEW YORK, NY / ACCESSWIRE / July 29, 2024 / Le...",2024-07-29,Ford,fair
1,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,Ford,fair
2,"Jul 29, 2024Ford Dealership by JeepersMedia / ...",2024-07-29,Ford,fair
3,"NEW YORK, NY / ACCESSWIRE / July 29, 2024 / Le...",2024-07-29,Ford,fair
4,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,Ford,fair
...,...,...,...,...
1290,RNS Number : 4659UTesco PLC01 July 2024 Tesco...,2024-07-01,Tesco,clean energy
1291,"Tesco and Asda are being sued by customers, in...",2024-06-28,Tesco,clean energy
1292,Lawyers acting for two people who fell ill in ...,2024-06-28,Tesco,clean energy
1293,RNS Number : 2479UTesco PLC28 June 2024 Tesco...,2024-06-28,Tesco,clean energy


In [15]:
# Convert the 'story' column to lowercase to optimise polarity
greenwashing_news_df['story'] = greenwashing_news_df['story'].str.lower()

# Display the updated DataFrame
print(greenwashing_news_df[['story']])

                                                  story
0     new york, ny / accesswire / july 29, 2024 / le...
1     los angeles, ca / accesswire / july 29, 2024 /...
2     jul 29, 2024ford dealership by jeepersmedia / ...
3     new york, ny / accesswire / july 29, 2024 / le...
4     new york city, ny / accesswire / july 29, 2024...
...                                                 ...
1290  rns number : 4659utesco plc01 july 2024  tesco...
1291  tesco and asda are being sued by customers, in...
1292  lawyers acting for two people who fell ill in ...
1293  rns number : 2479utesco plc28 june 2024  tesco...
1294  click the following link to watch video: https...

[1295 rows x 1 columns]


In [16]:
# Let's initialise the 'stop words' function for English 

stop = stopwords.words('english')
stop[:5]

['i', 'me', 'my', 'myself', 'we']

In [17]:
# Let's remove 'stop words' from the stories

greenwashing_news_df['story'] = greenwashing_news_df['story'].apply(lambda x: ' '.join([w for w in x.split() if w not in stop]))
greenwashing_news_df.head()

Unnamed: 0,story,date,company,ticker
0,"new york, ny / accesswire / july 29, 2024 / le...",2024-07-29,Ford,fair
1,"los angeles, ca / accesswire / july 29, 2024 /...",2024-07-29,Ford,fair
2,"jul 29, 2024ford dealership jeepersmedia / 2.0...",2024-07-29,Ford,fair
3,"new york, ny / accesswire / july 29, 2024 / le...",2024-07-29,Ford,fair
4,"new york city, ny / accesswire / july 29, 2024...",2024-07-29,Ford,fair


In [18]:
# Let's lemmatise the content of the 'story' column

for index, row in greenwashing_news_df.iterrows():
    words = nltk.word_tokenize(row['story'])

    # Lemmatise each word
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_text = ' '.join(lemmatized_words)

    # Update the 'story' column with the lemmatised text
    greenwashing_news_df.at[index, 'story'] = lemmatized_text


In [19]:
greenwashing_news_df.head()

Unnamed: 0,story,date,company,ticker
0,"new york , ny / accesswire / july 29 , 2024 / ...",2024-07-29,Ford,fair
1,"los angeles , ca / accesswire / july 29 , 2024...",2024-07-29,Ford,fair
2,"jul 29 , 2024ford dealership jeepersmedia / 2....",2024-07-29,Ford,fair
3,"new york , ny / accesswire / july 29 , 2024 / ...",2024-07-29,Ford,fair
4,"new york city , ny / accesswire / july 29 , 20...",2024-07-29,Ford,fair


In [20]:
# Let's remove numbers from the 'story' column using regular expressions

for index, row in greenwashing_news_df.iterrows():
    row_text = re.sub(r'\d+', '', row['story'])
    greenwashing_news_df.at[index, 'story'] = row_text

In [21]:
greenwashing_news_df.head()

Unnamed: 0,story,date,company,ticker
0,"new york , ny / accesswire / july , / levi &...",2024-07-29,Ford,fair
1,"los angeles , ca / accesswire / july , / sch...",2024-07-29,Ford,fair
2,"jul , ford dealership jeepersmedia / . ( http...",2024-07-29,Ford,fair
3,"new york , ny / accesswire / july , / levi &...",2024-07-29,Ford,fair
4,"new york city , ny / accesswire / july , / b...",2024-07-29,Ford,fair


In [22]:
# Let's remove non-alphanumeric characters from the column 'story'
def remove_symbols(text):
    cleaned_text = re.sub(r'[^\w\s]','', text)
    return cleaned_text

greenwashing_news_df['story'] = greenwashing_news_df['story'].apply(lambda x: remove_symbols(x))

In [23]:
greenwashing_news_df.head()

Unnamed: 0,story,date,company,ticker
0,new york ny accesswire july levi amp ...,2024-07-29,Ford,fair
1,los angeles ca accesswire july schall l...,2024-07-29,Ford,fair
2,jul ford dealership jeepersmedia http cr...,2024-07-29,Ford,fair
3,new york ny accesswire july levi amp ...,2024-07-29,Ford,fair
4,new york city ny accesswire july bronst...,2024-07-29,Ford,fair


In [25]:
# Let's remove first names of ppl from the news articles

def remove_first_names(text):
    doc = nlp(text)
    filtered_text = ' '.join([token.text if token.ent_type_ != 'person' else '' for token in doc])
    return ' '.join(filtered_text.split())

for index, row in greenwashing_news_df.iterrows():
 
    greenwashing_news_df.at[index, 'story'] = remove_first_names(row['story'])



In [26]:
greenwashing_news_df.head()

Unnamed: 0,story,date,company,ticker
0,new york ny accesswire july levi amp korsinsky...,2024-07-29,Ford,fair
1,los angeles ca accesswire july schall law firm...,2024-07-29,Ford,fair
2,jul ford dealership jeepersmedia http creative...,2024-07-29,Ford,fair
3,new york ny accesswire july levi amp korsinsky...,2024-07-29,Ford,fair
4,new york city ny accesswire july bronstein gew...,2024-07-29,Ford,fair


In [27]:
# Save the clean greenwashing-related news 
greenwashing_news_df.to_csv('../Data/Output/obj3_greenwashing_news_clean.csv', index=False)

In [28]:
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to apply VADER and get sentiment scores
def apply_vader_sentiment(text):
    # Get the sentiment scores from VADER
    sentiment_scores = analyzer.polarity_scores(text)
    return sentiment_scores

# Apply VADER sentiment analysis to the "story" column
greenwashing_news_df['vader_sentiment'] = greenwashing_news_df['story'].apply(apply_vader_sentiment)

# Split the sentiment scores into separate columns 
greenwashing_news_df = pd.concat([greenwashing_news_df.drop(['vader_sentiment'], axis=1), greenwashing_news_df['vader_sentiment'].apply(pd.Series)], axis=1)

# Define a function to classify sentiment based on the 'compound' score
def classify_sentiment(compound_score):
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply the classify_sentiment function to create a new 'sentiment' column
greenwashing_news_df['sentiment'] = greenwashing_news_df['compound'].apply(classify_sentiment)

greenwashing_news_df

Unnamed: 0,story,date,company,ticker,neg,neu,pos,compound,sentiment
0,new york ny accesswire july levi amp korsinsky...,2024-07-29,Ford,fair,0.049,0.774,0.177,0.9764,positive
1,los angeles ca accesswire july schall law firm...,2024-07-29,Ford,fair,0.104,0.769,0.127,0.5719,positive
2,jul ford dealership jeepersmedia http creative...,2024-07-29,Ford,fair,0.051,0.796,0.153,0.9935,positive
3,new york ny accesswire july levi amp korsinsky...,2024-07-29,Ford,fair,0.049,0.774,0.177,0.9764,positive
4,new york city ny accesswire july bronstein gew...,2024-07-29,Ford,fair,0.066,0.857,0.077,0.1027,positive
...,...,...,...,...,...,...,...,...,...
1290,rn number utesco plc july tesco plc july tesco...,2024-07-01,Tesco,clean energy,0.047,0.718,0.234,0.9879,positive
1291,tesco asda sued customer including family year...,2024-06-28,Tesco,clean energy,0.150,0.780,0.070,-0.9849,negative
1292,lawyer acting two people fell ill e coli outbr...,2024-06-28,Tesco,clean energy,0.151,0.771,0.079,-0.9904,negative
1293,rn number utesco plc june tesco plc june tesco...,2024-06-28,Tesco,clean energy,0.047,0.718,0.234,0.9879,positive


In [29]:
# Save the VADER sentiment df on greenwashing-related news 
greenwashing_news_df.to_csv('../Data/Output/obj3_greenwashing_news_vader.csv', index=False)


In [30]:
# Function to normalise sentiment proportions and calculate Sent_d
def calculate_sentiment_score(df):
    # Calculate total count of all sentiment categories
    df['total'] = df['pos'] + df['neu'] + df['neg']
    
    # Normalize to get probabilities (frequencies) of positive, neutral, and negative
    df['pos_prob'] = df['pos'] / df['total']
    df['neut_prob'] = df['neu'] / df['total']
    df['neg_prob'] = df['neg'] / df['total']
    
    # Confirm the probabilities sum to 1
    df['sum_probs'] = df['pos_prob'] + df['neut_prob'] + df['neg_prob']
    
    # Calculate Sent_d using the normalized probabilities
    df['Sent_d'] = (df['pos'] - df['neg']) / (df['pos'] + df['neu'] + df['neg'] + 3)
    
    return df


# Group by company and date to ensure daily aggregation
grouped_df = greenwashing_news_df.groupby(['company', 'date']).sum().reset_index()

# Apply the sentiment calculation
Daily_Sentiment_Compound_df = calculate_sentiment_score(grouped_df)


Daily_Sentiment_Compound_df = Daily_Sentiment_Compound_df[['company', 'date', 'pos_prob', 'neut_prob', 'neg_prob', 'Sent_d']]

Daily_Sentiment_Compound_df


Unnamed: 0,company,date,pos_prob,neut_prob,neg_prob,Sent_d
0,Ford,2024-05-10,0.108333,0.873333,0.018333,0.045000
1,Ford,2024-05-21,0.203796,0.777223,0.018981,0.046238
2,Ford,2024-05-31,0.073000,0.927000,0.000000,0.018250
3,Ford,2024-07-08,0.232000,0.734000,0.034000,0.049500
4,Ford,2024-07-24,0.196500,0.695000,0.108500,0.050286
...,...,...,...,...,...,...
124,Toyota,2024-07-25,0.135241,0.762521,0.102238,0.031717
125,Toyota,2024-07-26,0.122011,0.799390,0.078599,0.042309
126,Toyota,2024-07-27,0.157500,0.756000,0.086500,0.059789
127,Toyota,2024-07-28,0.148309,0.740710,0.110982,0.033180


In [33]:
# Let's save Daily_Sentiment_Compound_df for greenwashing-related news

Daily_Sentiment_Compound_df.to_csv('../Data/Output/obj3_greenwashing_daily_sentiment_compound.csv', index=False)

### Let's perform the Granger’s causality testing using the dedicated Python library “grangercausalitytests” between sentiment of ESG-related news and greenwashing-related news  

In [35]:
# Let's load the ESG Daily_Sentiment_Compound_df for ESG-related news from Objective Two

esg_daily_sentiment_compound_df = pd.read_csv('../Data/Output/obj2_ESG_daily_sentiment_compound.csv')
esg_daily_sentiment_compound_df

Unnamed: 0,company,date,pos_prob,neut_prob,neg_prob,Sent_d
0,Asda,2024-06-07,0.138000,0.765000,0.097000,0.010250
1,Asda,2024-07-08,0.173000,0.818000,0.009000,0.065600
2,Asda,2024-07-23,0.097000,0.624000,0.279000,-0.045500
3,Asda,2024-07-24,0.163000,0.794000,0.043000,0.048000
4,Asda,2024-07-26,0.097000,0.608000,0.295000,-0.049500
...,...,...,...,...,...,...
211,Toyota,2024-07-25,0.135904,0.755808,0.108288,0.025644
212,Toyota,2024-07-26,0.138128,0.766626,0.095246,0.040455
213,Toyota,2024-07-27,0.167167,0.754064,0.078770,0.064284
214,Toyota,2024-07-28,0.154341,0.733763,0.111896,0.035742


In [36]:
# Let's load the Greenwashing Daily_Sentiment_Compound_df for ESG-related news from Objective Three

greenwashing_daily_sentiment_compound_df = pd.read_csv('../Data/Output/obj3_greenwashing_daily_sentiment_compound.csv')
greenwashing_daily_sentiment_compound_df

Unnamed: 0,company,date,pos_prob,neut_prob,neg_prob,Sent_d
0,Ford,2024-05-10,0.108333,0.873333,0.018333,0.045000
1,Ford,2024-05-21,0.203796,0.777223,0.018981,0.046238
2,Ford,2024-05-31,0.073000,0.927000,0.000000,0.018250
3,Ford,2024-07-08,0.232000,0.734000,0.034000,0.049500
4,Ford,2024-07-24,0.196500,0.695000,0.108500,0.050286
...,...,...,...,...,...,...
124,Toyota,2024-07-25,0.135241,0.762521,0.102238,0.031717
125,Toyota,2024-07-26,0.122011,0.799390,0.078599,0.042309
126,Toyota,2024-07-27,0.157500,0.756000,0.086500,0.059789
127,Toyota,2024-07-28,0.148309,0.740710,0.110982,0.033180


In [37]:
# Ensure 'date' column is in datetime format in both dataframes
esg_daily_sentiment_compound_df['date'] = pd.to_datetime(esg_daily_sentiment_compound_df['date'])
greenwashing_daily_sentiment_compound_df['date'] = pd.to_datetime(greenwashing_daily_sentiment_compound_df['date'])

In [41]:
# Let's rename the Sen_d column on both df for clarity
esg_daily_sentiment_compound_df = esg_daily_sentiment_compound_df.rename(columns={'Sent_d':'esg_sent_d'})
greenwashing_daily_sentiment_compound_df = greenwashing_daily_sentiment_compound_df.rename(columns={'Sent_d':'greenwashing_sent_d'})

In [43]:
# Merge the sentiment dataframe with the stock performance dataframe on 'company' and 'date'
merged_df = pd.merge(esg_daily_sentiment_compound_df[['company', 'date', 'esg_sent_d']], 
                     greenwashing_daily_sentiment_compound_df[['company', 'date', 'greenwashing_sent_d']], 
                     on=['company', 'date'], how='inner')


In [44]:
merged_df

Unnamed: 0,company,date,esg_sent_d,greenwashing_sent_d
0,Ford,2024-07-08,0.085470,0.049500
1,Ford,2024-07-24,0.088584,0.050286
2,Ford,2024-07-25,0.059306,0.042424
3,Ford,2024-07-26,0.040587,0.043287
4,Ford,2024-07-27,0.033500,0.068571
...,...,...,...,...
104,Toyota,2024-07-25,0.025644,0.031717
105,Toyota,2024-07-26,0.040455,0.042309
106,Toyota,2024-07-27,0.064284,0.059789
107,Toyota,2024-07-28,0.035742,0.033180


In [45]:
# Drop any rows with missing data, as Granger causality tests require complete cases
merged_df = merged_df.dropna()
merged_df

Unnamed: 0,company,date,esg_sent_d,greenwashing_sent_d
0,Ford,2024-07-08,0.085470,0.049500
1,Ford,2024-07-24,0.088584,0.050286
2,Ford,2024-07-25,0.059306,0.042424
3,Ford,2024-07-26,0.040587,0.043287
4,Ford,2024-07-27,0.033500,0.068571
...,...,...,...,...
104,Toyota,2024-07-25,0.025644,0.031717
105,Toyota,2024-07-26,0.040455,0.042309
106,Toyota,2024-07-27,0.064284,0.059789
107,Toyota,2024-07-28,0.035742,0.033180


In [46]:
# Save merged_df
merged_df.to_csv('../Data/Output/obj3_esg_greenwashing_sentiment_merged.csv', index=False)

In [77]:
# Filter the dataset for rows where 'company' is 'Asda'
Sainsburys_merged_df = merged_df[merged_df['company'] == 'Asda']
Sainsburys_merged_df.head()

Unnamed: 0,company,date,esg_sent_d,greenwashing_sent_d


In [52]:
# Filter the dataset for rows where 'company' is 'Ford'
Ford_merged_df = merged_df[merged_df['company'] == 'Ford']
Ford_merged_df.head()

Unnamed: 0,company,date,esg_sent_d,greenwashing_sent_d
0,Ford,2024-07-08,0.08547,0.0495
1,Ford,2024-07-24,0.088584,0.050286
2,Ford,2024-07-25,0.059306,0.042424
3,Ford,2024-07-26,0.040587,0.043287
4,Ford,2024-07-27,0.0335,0.068571


In [53]:
# Perform Granger causality test for FORD
# To determine if Sentiment of greenwashing-related news 
# influences sentiment of esg-related news.
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Ford_merged_df[['esg_sent_d','greenwashing_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of greenwashing-related news influences sentiment of esg-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.7454  , p=0.4514  , df_denom=3, df_num=1
ssr based chi2 test:   chi2=1.4908  , p=0.2221  , df=1
likelihood ratio test: chi2=1.3315  , p=0.2485  , df=1
parameter F test:         F=0.7454  , p=0.4514  , df_denom=3, df_num=1
Lag 1: p-value = 0.4514332350814374
At lag 1, we fail to reject the null hypothesis.



In [56]:
# Perform Granger causality test for FORD
# To determine if sentiment of esg-related news 
# influences sentiment of greenwashing-related news.
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Ford_merged_df[['greenwashing_sent_d', 'esg_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of esg-related news influences sentiment of greenwashing-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=13.4587 , p=0.0350  , df_denom=3, df_num=1
ssr based chi2 test:   chi2=26.9173 , p=0.0000  , df=1
likelihood ratio test: chi2=10.2134 , p=0.0014  , df=1
parameter F test:         F=13.4587 , p=0.0350  , df_denom=3, df_num=1
Lag 1: p-value = 0.035034105622199885
At lag 1, we reject the null hypothesis. Sentiment of esg-related news influences sentiment of greenwashing-related news.



In [57]:
# Filter the dataset for rows where 'company' is 'Marks & Spencer'
MarksSpencer_merged_df = merged_df[merged_df['company'] == 'Marks & Spencer']
MarksSpencer_merged_df.head()

Unnamed: 0,company,date,esg_sent_d,greenwashing_sent_d
7,Marks & Spencer,2024-05-22,0.062016,0.0818
8,Marks & Spencer,2024-05-30,0.0422,0.058874
9,Marks & Spencer,2024-06-02,0.014254,0.028
10,Marks & Spencer,2024-06-03,0.055667,0.081371
11,Marks & Spencer,2024-06-06,0.037,0.080703


In [62]:
# Perform Granger causality test for MARKS & SPENCER
# To determine if Sentiment of greenwashing-related news 
# influences sentiment of esg-related news.
# Set the maximum number of lags to test
max_lag = 5

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(MarksSpencer_merged_df[['esg_sent_d','greenwashing_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of greenwashing-related news influences sentiment of esg-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.3663  , p=0.5507  , df_denom=24, df_num=1
ssr based chi2 test:   chi2=0.4120  , p=0.5209  , df=1
likelihood ratio test: chi2=0.4089  , p=0.5225  , df=1
parameter F test:         F=0.3663  , p=0.5507  , df_denom=24, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.0250  , p=0.3761  , df_denom=21, df_num=2
ssr based chi2 test:   chi2=2.5381  , p=0.2811  , df=2
likelihood ratio test: chi2=2.4218  , p=0.2979  , df=2
parameter F test:         F=1.0250  , p=0.3761  , df_denom=21, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.0165  , p=0.4085  , df_denom=18, df_num=3
ssr based chi2 test:   chi2=4.2354  , p=0.2371  , df=3
likelihood ratio test: chi2=3.9126  , p=0.2711  , df=3
parameter F test:         F=1.0165  , p=0.4085  , df_denom=18, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.1459  , p=0.3730  , df_d

In [67]:
# Perform Granger causality test for MARKS & SPENCER
# To determine if sentiment of esg-related news 
# influences sentiment of greenwashing-related news.
# Set the maximum number of lags to test
max_lag = 8

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(MarksSpencer_merged_df[['greenwashing_sent_d', 'esg_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of esg-related news influences sentiment of greenwashing-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.0198  , p=0.8892  , df_denom=24, df_num=1
ssr based chi2 test:   chi2=0.0223  , p=0.8813  , df=1
likelihood ratio test: chi2=0.0223  , p=0.8813  , df=1
parameter F test:         F=0.0198  , p=0.8892  , df_denom=24, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.1163  , p=0.8907  , df_denom=21, df_num=2
ssr based chi2 test:   chi2=0.2881  , p=0.8659  , df=2
likelihood ratio test: chi2=0.2865  , p=0.8665  , df=2
parameter F test:         F=0.1163  , p=0.8907  , df_denom=21, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.0427  , p=0.3976  , df_denom=18, df_num=3
ssr based chi2 test:   chi2=4.3444  , p=0.2266  , df=3
likelihood ratio test: chi2=4.0056  , p=0.2609  , df=3
parameter F test:         F=1.0427  , p=0.3976  , df_denom=18, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.5349  , p=0.2425  , df_d

In [68]:
# Filter the dataset for rows where 'company' is 'Ocado'
Ocado_merged_df = merged_df[merged_df['company'] == 'Ocado']
Ocado_merged_df.head()

Unnamed: 0,company,date,esg_sent_d,greenwashing_sent_d
35,Ocado,2024-07-16,0.074819,0.102704
36,Ocado,2024-07-17,0.0635,0.133714
37,Ocado,2024-07-19,-0.002001,0.052571
38,Ocado,2024-07-22,0.017997,0.028151
39,Ocado,2024-07-23,0.0126,0.025828


In [70]:
# Perform Granger causality test for OCADO
# To determine if Sentiment of greenwashing-related news 
# influences sentiment of esg-related news.
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Ocado_merged_df[['esg_sent_d','greenwashing_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of greenwashing-related news influences sentiment of esg-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.3040  , p=0.6107  , df_denom=4, df_num=1
ssr based chi2 test:   chi2=0.5320  , p=0.4658  , df=1
likelihood ratio test: chi2=0.5127  , p=0.4740  , df=1
parameter F test:         F=0.3040  , p=0.6107  , df_denom=4, df_num=1
Lag 1: p-value = 0.6107499466075142
At lag 1, we fail to reject the null hypothesis.



In [72]:
# Perform Granger causality test for OCADO
# To determine if sentiment of esg-related news 
# influences sentiment of greenwashing-related news.
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Ocado_merged_df[['greenwashing_sent_d', 'esg_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of esg-related news influences sentiment of greenwashing-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.2261  , p=0.3303  , df_denom=4, df_num=1
ssr based chi2 test:   chi2=2.1456  , p=0.1430  , df=1
likelihood ratio test: chi2=1.8715  , p=0.1713  , df=1
parameter F test:         F=1.2261  , p=0.3303  , df_denom=4, df_num=1
Lag 1: p-value = 0.3302760786455514
At lag 1, we fail to reject the null hypothesis.



In [73]:
# Filter the dataset for rows where 'company' is 'Polestar'
Polestar_merged_df = merged_df[merged_df['company'] == 'Polestar']
Polestar_merged_df.head()

Unnamed: 0,company,date,esg_sent_d,greenwashing_sent_d
43,Polestar,2024-06-28,-0.007145,0.036007


In [74]:
# Perform Granger causality test for POLESTAR
# To determine if Sentiment of greenwashing-related news 
# influences sentiment of esg-related news.
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Polestar_merged_df[['esg_sent_d','greenwashing_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of greenwashing-related news influences sentiment of esg-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")


ValueError: Insufficient observations. Maximum allowable lag is -1

In [75]:
# Perform Granger causality test for POLESTAR
# To determine if sentiment of esg-related news 
# influences sentiment of greenwashing-related news.
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Polestar_merged_df[['greenwashing_sent_d', 'esg_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of esg-related news influences sentiment of greenwashing-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")


ValueError: Insufficient observations. Maximum allowable lag is -1

In [76]:
# Filter the dataset for rows where 'company' is 'Sainsburys'
Sainsburys_merged_df = merged_df[merged_df['company'] == 'Sainsburys']
Sainsburys_merged_df.head()

Unnamed: 0,company,date,esg_sent_d,greenwashing_sent_d


In [78]:
# Filter the dataset for rows where 'company' is 'Stellantis'
Stellantis_merged_df = merged_df[merged_df['company'] == 'Stellantis']
Stellantis_merged_df.head()

Unnamed: 0,company,date,esg_sent_d,greenwashing_sent_d
44,Stellantis,2024-06-25,0.0055,-0.00425
45,Stellantis,2024-07-25,0.0105,0.085975


In [79]:
# Perform Granger causality test for STELLANTIS
# To determine if Sentiment of greenwashing-related news 
# influences sentiment of esg-related news.
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Stellantis_merged_df[['esg_sent_d','greenwashing_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of greenwashing-related news influences sentiment of esg-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")


ValueError: Insufficient observations. Maximum allowable lag is -1

In [80]:
# Perform Granger causality test for STELLANTIS
# To determine if sentiment of esg-related news 
# influences sentiment of greenwashing-related news.
# Set the maximum number of lags to test
max_lag = 1

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Stellantis_merged_df[['greenwashing_sent_d', 'esg_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of esg-related news influences sentiment of greenwashing-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")


ValueError: Insufficient observations. Maximum allowable lag is -1

In [81]:
# Filter the dataset for rows where 'company' is 'Tesco'
Tesco_merged_df = merged_df[merged_df['company'] == 'Tesco']
Tesco_merged_df.head()

Unnamed: 0,company,date,esg_sent_d,greenwashing_sent_d
46,Tesco,2024-05-30,0.028,-0.01875
47,Tesco,2024-05-31,0.02725,0.0428
48,Tesco,2024-06-06,0.0095,0.0152
49,Tesco,2024-06-26,0.048238,0.065026
50,Tesco,2024-06-27,0.086183,0.153817


In [84]:
# Perform Granger causality test for TESCO
# To determine if Sentiment of greenwashing-related news 
# influences sentiment of esg-related news.
# Set the maximum number of lags to test
max_lag = 6

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Tesco_merged_df[['esg_sent_d','greenwashing_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of greenwashing-related news influences sentiment of esg-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.1873  , p=0.6701  , df_denom=19, df_num=1
ssr based chi2 test:   chi2=0.2169  , p=0.6414  , df=1
likelihood ratio test: chi2=0.2158  , p=0.6423  , df=1
parameter F test:         F=0.1873  , p=0.6701  , df_denom=19, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.6106  , p=0.5552  , df_denom=16, df_num=2
ssr based chi2 test:   chi2=1.6029  , p=0.4487  , df=2
likelihood ratio test: chi2=1.5447  , p=0.4619  , df=2
parameter F test:         F=0.6106  , p=0.5552  , df_denom=16, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.9433  , p=0.4481  , df_denom=13, df_num=3
ssr based chi2 test:   chi2=4.3537  , p=0.2257  , df=3
likelihood ratio test: chi2=3.9390  , p=0.2681  , df=3
parameter F test:         F=0.9433  , p=0.4481  , df_denom=13, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.6697  , p=0.6277  , df_d

In [88]:
# Perform Granger causality test for TESCO
# To determine if sentiment of esg-related news 
# influences sentiment of greenwashing-related news.
# Set the maximum number of lags to test
max_lag = 6

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Tesco_merged_df[['greenwashing_sent_d', 'esg_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of esg-related news influences sentiment of greenwashing-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.4226  , p=0.2477  , df_denom=19, df_num=1
ssr based chi2 test:   chi2=1.6472  , p=0.1993  , df=1
likelihood ratio test: chi2=1.5884  , p=0.2075  , df=1
parameter F test:         F=1.4226  , p=0.2477  , df_denom=19, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.2483  , p=0.3135  , df_denom=16, df_num=2
ssr based chi2 test:   chi2=3.2768  , p=0.1943  , df=2
likelihood ratio test: chi2=3.0449  , p=0.2182  , df=2
parameter F test:         F=1.2483  , p=0.3135  , df_denom=16, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.3248  , p=0.3088  , df_denom=13, df_num=3
ssr based chi2 test:   chi2=6.1143  , p=0.1062  , df=3
likelihood ratio test: chi2=5.3350  , p=0.1488  , df=3
parameter F test:         F=1.3248  , p=0.3088  , df_denom=13, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.8304  , p=0.5355  , df_d

In [89]:
# Filter the dataset for rows where 'company' is 'Tesla'
Tesla_merged_df = merged_df[merged_df['company'] == 'Tesla']
Tesla_merged_df.head()

Unnamed: 0,company,date,esg_sent_d,greenwashing_sent_d
69,Tesla,2024-05-10,0.003834,-0.040808
70,Tesla,2024-05-14,-0.002999,0.046262
71,Tesla,2024-05-17,-0.013748,-0.0015
72,Tesla,2024-05-28,-0.0055,0.0688
73,Tesla,2024-05-31,0.021714,0.046512


In [92]:
# Perform Granger causality test for TESLA
# To determine if Sentiment of greenwashing-related news 
# influences sentiment of esg-related news.
# Set the maximum number of lags to test
max_lag = 6

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Tesla_merged_df[['esg_sent_d','greenwashing_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of greenwashing-related news influences sentiment of esg-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=4.7096  , p=0.0422  , df_denom=20, df_num=1
ssr based chi2 test:   chi2=5.4160  , p=0.0200  , df=1
likelihood ratio test: chi2=4.8635  , p=0.0274  , df=1
parameter F test:         F=4.7096  , p=0.0422  , df_denom=20, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=2.1331  , p=0.1491  , df_denom=17, df_num=2
ssr based chi2 test:   chi2=5.5209  , p=0.0633  , df=2
likelihood ratio test: chi2=4.9259  , p=0.0852  , df=2
parameter F test:         F=2.1331  , p=0.1491  , df_denom=17, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=2.4098  , p=0.1105  , df_denom=14, df_num=3
ssr based chi2 test:   chi2=10.8443 , p=0.0126  , df=3
likelihood ratio test: chi2=8.7431  , p=0.0329  , df=3
parameter F test:         F=2.4098  , p=0.1105  , df_denom=14, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.4735  , p=0.2756  , df_d

In [96]:
# Perform Granger causality test for TESLA
# To determine if sentiment of esg-related news 
# influences sentiment of greenwashing-related news.
# Set the maximum number of lags to test
max_lag = 6

# Perform Granger causality test to see if Sent_d (sentiment) Granger-causes daily_return (market performance)
granger_results = grangercausalitytests(Tesla_merged_df[['greenwashing_sent_d', 'esg_sent_d']], max_lag)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment of esg-related news influences sentiment of greenwashing-related news.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.2220  , p=0.6426  , df_denom=20, df_num=1
ssr based chi2 test:   chi2=0.2553  , p=0.6134  , df=1
likelihood ratio test: chi2=0.2539  , p=0.6144  , df=1
parameter F test:         F=0.2220  , p=0.6426  , df_denom=20, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.4145  , p=0.6671  , df_denom=17, df_num=2
ssr based chi2 test:   chi2=1.0729  , p=0.5848  , df=2
likelihood ratio test: chi2=1.0476  , p=0.5923  , df=2
parameter F test:         F=0.4145  , p=0.6671  , df_denom=17, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.4445  , p=0.7249  , df_denom=14, df_num=3
ssr based chi2 test:   chi2=2.0004  , p=0.5723  , df=3
likelihood ratio test: chi2=1.9107  , p=0.5911  , df=3
parameter F test:         F=0.4445  , p=0.7249  , df_denom=14, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.6637  , p=0.6301  , df_d

### Conclusions: 
For FORD the sentiment of esg-related news influences sentiment of greenwashing-related news. <br>
For M&S the sentiment of greenwashing-related news influences sentiment of esg-related news.



### End of main