## Objective Three

#### Determine the correlation between corporate ESG and greenwashing from news stories

In [79]:
# Import libraries

import pandas as pd
import numpy as np
import os
import string
import torch
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
import re

# Load FinBERT tokenizer and model
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Initialize sentiment analysis pipeline
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

nltk.download('stopwords')

# Granger's casuality test library
from statsmodels.tsa.stattools import grangercausalitytests

import warnings
warnings.filterwarnings('ignore')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
[nltk_data] Downloading package stopwords to /Users/luca/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Step 1: Load the greenwashing stories and perform sentiment analysis

In [48]:
# Set up folders
greenwashing_news_raw_file_path = '../Data/Input/Eikon/refinitiv_greenwashing_stories_raw.csv'
greenwashing_news_clean_file_path = '../Data/Output/greenwashing_news_df.csv'
greenwashing_news_summary_file_path = '../Data/Output/greenwashing_news_summary_df.csv'
greenwashing_news_sentiment_file_path = '../Data/Output/greenwashing_news_sentiment_df.csv'

# Read the CSV file into a DataFrame
greenwashing_news_df = pd.read_csv(greenwashing_news_raw_file_path)

greenwashing_news_df

Unnamed: 0,story,date,storyId,company,ticker
0,"<div class=""storyContent"" lang=""en""><p><a href...",2024-07-03 14:11:06.165000+00:00,urn:newsml:newswire.refinitiv.com:20240703:nGL...,Ford,alternative energy
1,"<div class=""storyContent"" lang=""en""><p><a href...",2024-05-20 13:26:40.057000+00:00,urn:newsml:newswire.refinitiv.com:20240520:nGL...,Ford,alternative energy
2,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 20:13:28+00:00,urn:newsml:newswire.refinitiv.com:20240729:nL1...,Ford,fair
3,"<div class=""storyContent"" lang=""en""></div>",2024-07-29 19:50:26.009000+00:00,urn:newsml:newswire.refinitiv.com:20240729:nAQ...,Ford,fair
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 19:01:36.228000+00:00,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,fair
...,...,...,...,...,...
2644,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 11:00:00+00:00,urn:newsml:newswire.refinitiv.com:20240628:nL8...,Tesco,clean energy
2645,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 06:02:34.048000+00:00,urn:newsml:newsroom.refinitiv.com:20240628:nRS...,Tesco,clean energy
2646,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 06:00:03+00:00,urn:newsml:newswire.refinitiv.com:20240628:nRS...,Tesco,clean energy
2647,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-27 16:34:38.473000+00:00,urn:newsml:newswire.refinitiv.com:20240627:nRT...,Tesco,clean energy


In [49]:
# list of companies
companies = greenwashing_news_df['company'].unique().tolist()
companies

['Ford',
 'Polestar',
 'Stellantis',
 'Tesla',
 'Toyota',
 'Marks & Spencer',
 'Ocado',
 'Tesco']

In [42]:
# Drop storyId column

greenwashing_news_df = greenwashing_news_df.drop(['storyId'], axis= 1)
greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,"<div class=""storyContent"" lang=""en""><p><a href...",2024-07-03 14:11:06.165000+00:00,Ford,alternative energy
1,"<div class=""storyContent"" lang=""en""><p><a href...",2024-05-20 13:26:40.057000+00:00,Ford,alternative energy
2,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 20:13:28+00:00,Ford,fair
3,"<div class=""storyContent"" lang=""en""></div>",2024-07-29 19:50:26.009000+00:00,Ford,fair
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 19:01:36.228000+00:00,Ford,fair
...,...,...,...,...
2644,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 11:00:00+00:00,Tesco,clean energy
2645,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 06:02:34.048000+00:00,Tesco,clean energy
2646,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 06:00:03+00:00,Tesco,clean energy
2647,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-27 16:34:38.473000+00:00,Tesco,clean energy


In [43]:
# Convert multiple columns to string type
columns_to_convert = ['story', 'company', 'ticker']
greenwashing_news_df[columns_to_convert] = greenwashing_news_df[columns_to_convert].astype(str)
greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,"<div class=""storyContent"" lang=""en""><p><a href...",2024-07-03 14:11:06.165000+00:00,Ford,alternative energy
1,"<div class=""storyContent"" lang=""en""><p><a href...",2024-05-20 13:26:40.057000+00:00,Ford,alternative energy
2,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 20:13:28+00:00,Ford,fair
3,"<div class=""storyContent"" lang=""en""></div>",2024-07-29 19:50:26.009000+00:00,Ford,fair
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 19:01:36.228000+00:00,Ford,fair
...,...,...,...,...
2644,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 11:00:00+00:00,Tesco,clean energy
2645,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 06:02:34.048000+00:00,Tesco,clean energy
2646,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28 06:00:03+00:00,Tesco,clean energy
2647,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-27 16:34:38.473000+00:00,Tesco,clean energy


In [44]:
# Convert the 'date' column to datetime
greenwashing_news_df['date'] = pd.to_datetime(greenwashing_news_df['date'], errors='coerce')

# Format the datetime to 'YYYY-MM-DD HH:MM'
greenwashing_news_df['date'] = greenwashing_news_df['date'].dt.strftime('%Y-%m-%d')

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,"<div class=""storyContent"" lang=""en""><p><a href...",2024-07-03,Ford,alternative energy
1,"<div class=""storyContent"" lang=""en""><p><a href...",2024-05-20,Ford,alternative energy
2,"<div class=""storyContent"" lang=""en""><style typ...",,Ford,fair
3,"<div class=""storyContent"" lang=""en""></div>",2024-07-29,Ford,fair
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,fair
...,...,...,...,...
2644,"<div class=""storyContent"" lang=""en""><style typ...",,Tesco,clean energy
2645,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28,Tesco,clean energy
2646,"<div class=""storyContent"" lang=""en""><style typ...",,Tesco,clean energy
2647,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-27,Tesco,clean energy


In [45]:
# list of tickers
tickers = greenwashing_news_df['ticker'].unique().tolist()
tickers

['alternative energy',
 'fair',
 'low carbon',
 'transition',
 'renew',
 'organic',
 'fossil free',
 'clean energy',
 'zero waste',
 'green',
 'climate',
 'waste',
 'natural',
 'carbon-neutral']

In [7]:
# After inspecting the dataset, there are a few rows with compromised data. 
# Keep every rows unless date column contains a timestamp.

# Define a regular expression pattern to detect timestamps (format: YYYY-MM-DD HH:MM:SS)
timestamp_pattern = r'\d{4}-\d{2}-\d{2}'

# Filter the rows that contain timestamps in the 'date' column
greenwashing_news_df = greenwashing_news_df[greenwashing_news_df['date'].apply(lambda x: bool(re.match(timestamp_pattern, str(x))))]

# Reset the index of the filtered DataFrame
greenwashing_news_df = greenwashing_news_df.reset_index(drop=True)

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,"<div class=""storyContent"" lang=""en""><p><a href...",2024-07-03,Ford,alternative energy
1,"<div class=""storyContent"" lang=""en""><p><a href...",2024-05-20,Ford,alternative energy
2,"<div class=""storyContent"" lang=""en""></div>",2024-07-29,Ford,fair
3,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,fair
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,fair
...,...,...,...,...
1468,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-01,Tesco,clean energy
1469,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28,Tesco,clean energy
1470,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28,Tesco,clean energy
1471,"<div class=""storyContent"" lang=""en""><style typ...",2024-06-28,Tesco,clean energy


In [8]:
# Functions to remove text within <> and {}:

def remove_text_within_angle_brackets(text):
    if pd.notnull(text):  # Check if the text is not null
        return re.sub(r'<.*?>', '', text)
    return text  # Return the text as is if it's null

# Apply the function to the 'story' column
greenwashing_news_df['story'] = greenwashing_news_df['story'].apply(remove_text_within_angle_brackets)


def remove_text_within_curly_brackets(text):
    if pd.notnull(text):  # Check if the text is not null
        return re.sub(r'{.*?}', '', text)
    return text  # Return the text as is if it's null

# Apply the function to the 'story' column
greenwashing_news_df['story'] = greenwashing_news_df['story'].apply(remove_text_within_curly_brackets)

# Reset the index of the filtered DataFrame
greenwashing_news_df = greenwashing_news_df.reset_index(drop=True)

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,https://filings.ica.int.thomsonreuters.com/fil...,2024-07-03,Ford,alternative energy
1,https://filings.ica.int.thomsonreuters.com/fil...,2024-05-20,Ford,alternative energy
2,,2024-07-29,Ford,fair
3,".storyContent * NEW YORK, NY / ACCESSWIRE / Ju...",2024-07-29,Ford,fair
4,".storyContent * LOS ANGELES, CA / ACCESSWIRE /...",2024-07-29,Ford,fair
...,...,...,...,...
1468,.storyContent * RNS Number : 4659UTesco PLC01 ...,2024-07-01,Tesco,clean energy
1469,.storyContent * Tesco and Asda are being sued ...,2024-06-28,Tesco,clean energy
1470,.storyContent * Lawyers acting for two people ...,2024-06-28,Tesco,clean energy
1471,.storyContent * RNS Number : 2479UTesco PLC28 ...,2024-06-28,Tesco,clean energy


In [9]:
# Remove '.storyContent * ' substring from the 'story' column
greenwashing_news_df['story'] = greenwashing_news_df['story'].str.replace('.storyContent * ', '', regex=False)

# Reset the index of the filtered DataFrame
greenwashing_news_df = greenwashing_news_df.reset_index(drop=True)

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,https://filings.ica.int.thomsonreuters.com/fil...,2024-07-03,Ford,alternative energy
1,https://filings.ica.int.thomsonreuters.com/fil...,2024-05-20,Ford,alternative energy
2,,2024-07-29,Ford,fair
3,"NEW YORK, NY / ACCESSWIRE / July 29, 2024 / Le...",2024-07-29,Ford,fair
4,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,Ford,fair
...,...,...,...,...
1468,RNS Number : 4659UTesco PLC01 July 2024 Tesco...,2024-07-01,Tesco,clean energy
1469,"Tesco and Asda are being sued by customers, in...",2024-06-28,Tesco,clean energy
1470,Lawyers acting for two people who fell ill in ...,2024-06-28,Tesco,clean energy
1471,RNS Number : 2479UTesco PLC28 June 2024 Tesco...,2024-06-28,Tesco,clean energy


In [10]:
# Remove rows with empty cells
greenwashing_news_df = greenwashing_news_df.dropna(how='any')

# Reset the index of the filtered DataFrame
greenwashing_news_df = greenwashing_news_df.reset_index(drop=True)

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,https://filings.ica.int.thomsonreuters.com/fil...,2024-07-03,Ford,alternative energy
1,https://filings.ica.int.thomsonreuters.com/fil...,2024-05-20,Ford,alternative energy
2,,2024-07-29,Ford,fair
3,"NEW YORK, NY / ACCESSWIRE / July 29, 2024 / Le...",2024-07-29,Ford,fair
4,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,Ford,fair
...,...,...,...,...
1468,RNS Number : 4659UTesco PLC01 July 2024 Tesco...,2024-07-01,Tesco,clean energy
1469,"Tesco and Asda are being sued by customers, in...",2024-06-28,Tesco,clean energy
1470,Lawyers acting for two people who fell ill in ...,2024-06-28,Tesco,clean energy
1471,RNS Number : 2479UTesco PLC28 June 2024 Tesco...,2024-06-28,Tesco,clean energy


In [11]:
# Function to count the number of words in a string
def word_count(text):
    if pd.notnull(text):  # Check if the text is not null
        return len(text.split())
    return 0  # Return 0 if the text is null

# Filter out rows where the word count in 'story' column is less than 15
greenwashing_news_df = greenwashing_news_df[greenwashing_news_df['story'].apply(word_count) >= 15]

# Filter out rows where any column is empty or contains only whitespace
greenwashing_news_df = greenwashing_news_df[(greenwashing_news_df != '') & (greenwashing_news_df != ' ')].dropna()

# Reset the index of the filtered DataFrame
greenwashing_news_df = greenwashing_news_df.reset_index(drop=True)

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,"NEW YORK, NY / ACCESSWIRE / July 29, 2024 / Le...",2024-07-29,Ford,fair
1,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,Ford,fair
2,"Jul 29, 2024Ford Dealership by JeepersMedia / ...",2024-07-29,Ford,fair
3,"NEW YORK, NY / ACCESSWIRE / July 29, 2024 / Le...",2024-07-29,Ford,fair
4,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,Ford,fair
...,...,...,...,...
1290,RNS Number : 4659UTesco PLC01 July 2024 Tesco...,2024-07-01,Tesco,clean energy
1291,"Tesco and Asda are being sued by customers, in...",2024-06-28,Tesco,clean energy
1292,Lawyers acting for two people who fell ill in ...,2024-06-28,Tesco,clean energy
1293,RNS Number : 2479UTesco PLC28 June 2024 Tesco...,2024-06-28,Tesco,clean energy


In [12]:
# Strip whitespace from all columns and replace empty strings with NaN
greenwashing_news_df = greenwashing_news_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
greenwashing_news_df.replace('', pd.NA, inplace=True)

# Drop rows with any NaN values
greenwashing_news_df.dropna(inplace=True)

# Reset the index of the filtered DataFrame
greenwashing_news_df = greenwashing_news_df.reset_index(drop=True)

greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,"NEW YORK, NY / ACCESSWIRE / July 29, 2024 / Le...",2024-07-29,Ford,fair
1,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,Ford,fair
2,"Jul 29, 2024Ford Dealership by JeepersMedia / ...",2024-07-29,Ford,fair
3,"NEW YORK, NY / ACCESSWIRE / July 29, 2024 / Le...",2024-07-29,Ford,fair
4,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,Ford,fair
...,...,...,...,...
1290,RNS Number : 4659UTesco PLC01 July 2024 Tesco...,2024-07-01,Tesco,clean energy
1291,"Tesco and Asda are being sued by customers, in...",2024-06-28,Tesco,clean energy
1292,Lawyers acting for two people who fell ill in ...,2024-06-28,Tesco,clean energy
1293,RNS Number : 2479UTesco PLC28 June 2024 Tesco...,2024-06-28,Tesco,clean energy


In [13]:
# Aggregate the 'company' column and calculate the total number of rows for each unique value
company_counts = greenwashing_news_df.groupby('company').size().reset_index(name='total_rows')
# Sort the results
company_counts = company_counts.sort_values(by='total_rows', ascending=False)


# Display the aggregated DataFrame
print(company_counts)

           company  total_rows
7           Toyota         332
0             Ford         241
1  Marks & Spencer         201
6            Tesla         197
5            Tesco         168
2            Ocado         146
4       Stellantis           7
3         Polestar           3


In [14]:
# Group by both 'company' and 'ticker' columns and calculate the number of rows for each group
aggregated_counts = greenwashing_news_df.groupby(['company', 'ticker']).size().reset_index(name='total_rows')

# aggregated_counts = aggregated_counts.sort_values(by='total_rows', ascending=False)

# Display the aggregated DataFrame
print(aggregated_counts)

            company              ticker  total_rows
0              Ford        clean energy          58
1              Ford                fair          57
2              Ford         fossil free          60
3              Ford          low carbon           3
4              Ford             organic           2
5              Ford               renew          58
6              Ford          transition           3
7   Marks & Spencer        clean energy          45
8   Marks & Spencer                fair          56
9   Marks & Spencer         fossil free          51
10  Marks & Spencer          low carbon           2
11  Marks & Spencer             organic           2
12  Marks & Spencer               renew          45
13            Ocado        clean energy          34
14            Ocado                fair          38
15            Ocado         fossil free          38
16            Ocado             organic           2
17            Ocado               renew          34
18         P

In [15]:
# Removing punctuation to optimise the polarity

# Define function to remove punctuation
def remove_punctuation(text):
    if pd.isna(text):
        return text
    return text.translate(str.maketrans('', '', string.punctuation))

# Apply the function to the 'story' column
greenwashing_news_df['story'] = greenwashing_news_df['story'].apply(remove_punctuation)

# Display the DataFrame
print(greenwashing_news_df[['story']])

                                                  story
0     NEW YORK NY  ACCESSWIRE  July 29 2024  Levi am...
1     LOS ANGELES CA  ACCESSWIRE  July 29 2024  The ...
2     Jul 29 2024Ford Dealership by JeepersMedia  BY...
3     NEW YORK NY  ACCESSWIRE  July 29 2024  Levi am...
4     NEW YORK CITY NY  ACCESSWIRE  July 29 2024  Br...
...                                                 ...
1290  RNS Number  4659UTesco PLC01 July 2024  Tesco ...
1291  Tesco and Asda are being sued by customers inc...
1292  Lawyers acting for two people who fell ill in ...
1293  RNS Number  2479UTesco PLC28 June 2024  Tesco ...
1294  Click the following link to watch video httpss...

[1295 rows x 1 columns]


In [16]:
# Convert the 'story' column to lowercase to optimise polarity
greenwashing_news_df['story'] = greenwashing_news_df['story'].str.lower()

# Display the updated DataFrame
print(greenwashing_news_df[['story']])

                                                  story
0     new york ny  accesswire  july 29 2024  levi am...
1     los angeles ca  accesswire  july 29 2024  the ...
2     jul 29 2024ford dealership by jeepersmedia  by...
3     new york ny  accesswire  july 29 2024  levi am...
4     new york city ny  accesswire  july 29 2024  br...
...                                                 ...
1290  rns number  4659utesco plc01 july 2024  tesco ...
1291  tesco and asda are being sued by customers inc...
1292  lawyers acting for two people who fell ill in ...
1293  rns number  2479utesco plc28 june 2024  tesco ...
1294  click the following link to watch video httpss...

[1295 rows x 1 columns]


In [17]:
greenwashing_news_df

Unnamed: 0,story,date,company,ticker
0,new york ny accesswire july 29 2024 levi am...,2024-07-29,Ford,fair
1,los angeles ca accesswire july 29 2024 the ...,2024-07-29,Ford,fair
2,jul 29 2024ford dealership by jeepersmedia by...,2024-07-29,Ford,fair
3,new york ny accesswire july 29 2024 levi am...,2024-07-29,Ford,fair
4,new york city ny accesswire july 29 2024 br...,2024-07-29,Ford,fair
...,...,...,...,...
1290,rns number 4659utesco plc01 july 2024 tesco ...,2024-07-01,Tesco,clean energy
1291,tesco and asda are being sued by customers inc...,2024-06-28,Tesco,clean energy
1292,lawyers acting for two people who fell ill in ...,2024-06-28,Tesco,clean energy
1293,rns number 2479utesco plc28 june 2024 tesco ...,2024-06-28,Tesco,clean energy


In [18]:
# Save the dataframe

greenwashing_news_df.to_csv(greenwashing_news_clean_file_path, index=False)

In [19]:
# As the news stories are very long, let's use a text summarization model to condense the text 
# to a manageable length before feeding it into the sentiment classifier.
# Load the summarization pipeline with BART
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [21]:
# Let's summarise the content of the news articles

Text_Summary = []
for i in range(0, len(greenwashing_news_df['story'])):
  summary = summarizer(greenwashing_news_df['story'][i][:512], max_length=50, min_length=25, do_sample=False)[0]['summary_text']
  Text_Summary.append(summary)

Your max_length is set to 50, but your input_length is only 49. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 50, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max_length is set to 50, but your input_length is only 49. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 50, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max

In [22]:
# Let's add the summarised articles to the dataframe
greenwashing_news_df['Text Summary'] = Text_Summary

In [23]:
greenwashing_news_df

Unnamed: 0,story,date,company,ticker,Text Summary
0,new york ny accesswire july 29 2024 levi am...,2024-07-29,Ford,fair,ford released its second quarter earnings on j...
1,los angeles ca accesswire july 29 2024 the ...,2024-07-29,Ford,fair,The schall law firm a national shareholder rig...
2,jul 29 2024ford dealership by jeepersmedia by...,2024-07-29,Ford,fair,The ford motor company has been underperformin...
3,new york ny accesswire july 29 2024 levi am...,2024-07-29,Ford,fair,ford released its second quarter earnings on j...
4,new york city ny accesswire july 29 2024 br...,2024-07-29,Ford,fair,Investors who purchased ford securities are en...
...,...,...,...,...,...
1290,rns number 4659utesco plc01 july 2024 tesco ...,2024-07-01,Tesco,clean energy,Tesco plc announces that on 28 june 2024 it ha...
1291,tesco and asda are being sued by customers inc...,2024-06-28,Tesco,clean energy,tesco and asda are being sued by customers in...
1292,lawyers acting for two people who fell ill in ...,2024-06-28,Tesco,clean energy,Lawyers acting for two people who fell ill in ...
1293,rns number 2479utesco plc28 june 2024 tesco ...,2024-06-28,Tesco,clean energy,Tesco plc has purchased in accordance with the...


In [24]:
# Save the dataframe - greenwashing_news summarised

greenwashing_news_df.to_csv(greenwashing_news_summary_file_path, index=False)

In [25]:
# Let's initialise the 'stop words' function for English 
stop = stopwords.words('english')
stop[:5]

['i', 'me', 'my', 'myself', 'we']

In [26]:
# Let's remove 'stop words' from the summarised articles

greenwashing_news_df['Text Summary'] = greenwashing_news_df['Text Summary'].apply(lambda x: ' '.join([w for w in x.split() if w not in stop]))

In [27]:
greenwashing_news_df.head()

Unnamed: 0,story,date,company,ticker,Text Summary
0,new york ny accesswire july 29 2024 levi am...,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...
1,los angeles ca accesswire july 29 2024 the ...,2024-07-29,Ford,fair,The schall law firm national shareholder right...
2,jul 29 2024ford dealership by jeepersmedia by...,2024-07-29,Ford,fair,The ford motor company underperforming compare...
3,new york ny accesswire july 29 2024 levi am...,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...
4,new york city ny accesswire july 29 2024 br...,2024-07-29,Ford,fair,Investors purchased ford securities encouraged...


In [28]:
# Let's initialise FinBERT for sentiment analysis

model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [29]:
# Let's run the sentiment analysis for each news article to obtain 'sentiment', 'label' and 'confidence score'.

greenwashing_news_sentiment_df = greenwashing_news_df.assign(sentiment = lambda x: x['Text Summary'].apply(lambda s: classifier(s))).assign(label = lambda x: x['sentiment'].apply(lambda s: (s[0]['label'])),score = lambda x: x['sentiment'].apply(lambda s: (s[0]['score'])))

In [31]:
# Save the dataframe - greenwashing news sentiment analysis
greenwashing_news_sentiment_df.to_csv(greenwashing_news_sentiment_file_path, index=False)

In [32]:
greenwashing_news_sentiment_df.head()

Unnamed: 0,story,date,company,ticker,Text Summary,sentiment,label,score
0,new york ny accesswire july 29 2024 levi am...,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...,"[{'label': 'negative', 'score': 0.887848019599...",negative,0.887848
1,los angeles ca accesswire july 29 2024 the ...,2024-07-29,Ford,fair,The schall law firm national shareholder right...,"[{'label': 'negative', 'score': 0.915385782718...",negative,0.915386
2,jul 29 2024ford dealership by jeepersmedia by...,2024-07-29,Ford,fair,The ford motor company underperforming compare...,"[{'label': 'negative', 'score': 0.565261840820...",negative,0.565262
3,new york ny accesswire july 29 2024 levi am...,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...,"[{'label': 'negative', 'score': 0.887848019599...",negative,0.887848
4,new york city ny accesswire july 29 2024 br...,2024-07-29,Ford,fair,Investors purchased ford securities encouraged...,"[{'label': 'neutral', 'score': 0.9397768378257...",neutral,0.939777


In [33]:
# Let's drop the story and sentiment columns

greenwashing_news_sentiment_df.drop(['story','sentiment'], axis=1, inplace=True)

In [34]:
greenwashing_news_sentiment_df.columns = ['Timestamp', 'Company', 'Ticker', 'Text Summary', 'Sentiment', 'Confidence Score']

In [36]:
greenwashing_news_sentiment_df.head()

Unnamed: 0,Timestamp,Company,Ticker,Text Summary,Sentiment,Confidence Score
0,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...,negative,0.887848
1,2024-07-29,Ford,fair,The schall law firm national shareholder right...,negative,0.915386
2,2024-07-29,Ford,fair,The ford motor company underperforming compare...,negative,0.565262
3,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...,negative,0.887848
4,2024-07-29,Ford,fair,Investors purchased ford securities encouraged...,neutral,0.939777


In [37]:
# Define the mapping to convert the Sentiment labels into corresponding values 
sentiment_mapping = {
    'negative': -1,
    'neutral': 0,
    'positive' : 1
}

# Apply the mapping to create the new 'Sentiment Value' column
greenwashing_news_sentiment_df['Sentiment Value'] = greenwashing_news_sentiment_df['Sentiment'].map(sentiment_mapping)

greenwashing_news_sentiment_df

Unnamed: 0,Timestamp,Company,Ticker,Text Summary,Sentiment,Confidence Score,Sentiment Value
0,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...,negative,0.887848,-1
1,2024-07-29,Ford,fair,The schall law firm national shareholder right...,negative,0.915386,-1
2,2024-07-29,Ford,fair,The ford motor company underperforming compare...,negative,0.565262,-1
3,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...,negative,0.887848,-1
4,2024-07-29,Ford,fair,Investors purchased ford securities encouraged...,neutral,0.939777,0
...,...,...,...,...,...,...,...
1290,2024-07-01,Tesco,clean energy,Tesco plc announces 28 june 2024 purchased acc...,neutral,0.933536,0
1291,2024-06-28,Tesco,clean energy,tesco asda sued customers including family 11y...,negative,0.952730,-1
1292,2024-06-28,Tesco,clean energy,Lawyers acting two people fell ill e coli outb...,negative,0.838622,-1
1293,2024-06-28,Tesco,clean energy,Tesco plc purchased accordance authority grant...,neutral,0.926502,0


In [38]:
# Define the composite score as the result of weight x sentiment_score

greenwashing_news_sentiment_df['Composite Score'] = greenwashing_news_sentiment_df['Confidence Score'] * greenwashing_news_sentiment_df['Sentiment Value']
greenwashing_news_sentiment_df 


Unnamed: 0,Timestamp,Company,Ticker,Text Summary,Sentiment,Confidence Score,Sentiment Value,Composite Score
0,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...,negative,0.887848,-1,-0.887848
1,2024-07-29,Ford,fair,The schall law firm national shareholder right...,negative,0.915386,-1,-0.915386
2,2024-07-29,Ford,fair,The ford motor company underperforming compare...,negative,0.565262,-1,-0.565262
3,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...,negative,0.887848,-1,-0.887848
4,2024-07-29,Ford,fair,Investors purchased ford securities encouraged...,neutral,0.939777,0,0.000000
...,...,...,...,...,...,...,...,...
1290,2024-07-01,Tesco,clean energy,Tesco plc announces 28 june 2024 purchased acc...,neutral,0.933536,0,0.000000
1291,2024-06-28,Tesco,clean energy,tesco asda sued customers including family 11y...,negative,0.952730,-1,-0.952730
1292,2024-06-28,Tesco,clean energy,Lawyers acting two people fell ill e coli outb...,negative,0.838622,-1,-0.838622
1293,2024-06-28,Tesco,clean energy,Tesco plc purchased accordance authority grant...,neutral,0.926502,0,0.000000


In [39]:
# Group by company and calculate the average composite_score across the entire period
average_scores = greenwashing_news_sentiment_df.groupby('Company')['Composite Score'].mean().reset_index()
average_scores

Unnamed: 0,Company,Composite Score
0,Ford,-0.404126
1,Marks & Spencer,0.060601
2,Ocado,0.122721
3,Polestar,0.0
4,Stellantis,0.108331
5,Tesco,-0.011187
6,Tesla,-0.177432
7,Toyota,-0.071564


In [50]:
# Convert 'date' column to datetime
greenwashing_news_sentiment_df['Timestamp'] = pd.to_datetime(greenwashing_news_sentiment_df['Timestamp'])

# Extract month and year from 'date'
greenwashing_news_sentiment_df['Month'] = greenwashing_news_sentiment_df['Timestamp'].dt.to_period('M')

greenwashing_news_sentiment_df

Unnamed: 0,Timestamp,Company,Ticker,Text Summary,Sentiment,Confidence Score,Sentiment Value,Composite Score,Month
0,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...,negative,0.887848,-1,-0.887848,2024-07
1,2024-07-29,Ford,fair,The schall law firm national shareholder right...,negative,0.915386,-1,-0.915386,2024-07
2,2024-07-29,Ford,fair,The ford motor company underperforming compare...,negative,0.565262,-1,-0.565262,2024-07
3,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...,negative,0.887848,-1,-0.887848,2024-07
4,2024-07-29,Ford,fair,Investors purchased ford securities encouraged...,neutral,0.939777,0,0.000000,2024-07
...,...,...,...,...,...,...,...,...,...
1290,2024-07-01,Tesco,clean energy,Tesco plc announces 28 june 2024 purchased acc...,neutral,0.933536,0,0.000000,2024-07
1291,2024-06-28,Tesco,clean energy,tesco asda sued customers including family 11y...,negative,0.952730,-1,-0.952730,2024-06
1292,2024-06-28,Tesco,clean energy,Lawyers acting two people fell ill e coli outb...,negative,0.838622,-1,-0.838622,2024-06
1293,2024-06-28,Tesco,clean energy,Tesco plc purchased accordance authority grant...,neutral,0.926502,0,0.000000,2024-06


In [51]:
# Group by company and month, then calculate the average composite_score
monthly_average_composite_scores = greenwashing_news_sentiment_df.groupby(['Company', 'Month'])['Composite Score'].mean().reset_index()
monthly_average_composite_scores

Unnamed: 0,Company,Month,Composite Score
0,Ford,2024-05,0.128378
1,Ford,2024-07,-0.415408
2,Marks & Spencer,2024-05,0.157038
3,Marks & Spencer,2024-06,0.085274
4,Marks & Spencer,2024-07,0.036276
5,Ocado,2024-07,0.122721
6,Polestar,2024-06,0.0
7,Polestar,2024-07,0.0
8,Stellantis,2024-06,0.252772
9,Stellantis,2024-07,0.0


In [60]:
# Convert 'date' column to datetime
greenwashing_news_sentiment_df['Timestamp'] = pd.to_datetime(greenwashing_news_sentiment_df['Timestamp'])

# Extract week of the year and year from 'date'
greenwashing_news_sentiment_df['week_of_year'] = greenwashing_news_sentiment_df['Timestamp'].dt.isocalendar().week
greenwashing_news_sentiment_df['year'] = greenwashing_news_sentiment_df['Timestamp'].dt.year

greenwashing_news_sentiment_df.head()

Unnamed: 0,Timestamp,Company,Ticker,Text Summary,Sentiment,Confidence Score,Sentiment Value,Composite Score,Month,week_of_year,year
0,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...,negative,0.887848,-1,-0.887848,2024-07,31,2024
1,2024-07-29,Ford,fair,The schall law firm national shareholder right...,negative,0.915386,-1,-0.915386,2024-07,31,2024
2,2024-07-29,Ford,fair,The ford motor company underperforming compare...,negative,0.565262,-1,-0.565262,2024-07,31,2024
3,2024-07-29,Ford,fair,ford released second quarter earnings july 25 ...,negative,0.887848,-1,-0.887848,2024-07,31,2024
4,2024-07-29,Ford,fair,Investors purchased ford securities encouraged...,neutral,0.939777,0,0.0,2024-07,31,2024


In [62]:
# Group by company and week of the year, then calculate the average composite_score
weekly_average_greenwashing_composite_scores = greenwashing_news_sentiment_df.groupby(['Company', 'year', 'week_of_year'])['Composite Score'].mean().reset_index()

weekly_average_greenwashing_composite_scores.to_csv('../Data/Output/obj1_weekly_average_greenwashing_composite_scores.csv', index=False)

weekly_average_greenwashing_composite_scores

Unnamed: 0,Company,year,week_of_year,Composite Score
0,Ford,2024,19,0.0
1,Ford,2024,21,0.641892
2,Ford,2024,22,0.0
3,Ford,2024,28,0.566291
4,Ford,2024,30,-0.425757
5,Ford,2024,31,-0.383099
6,Marks & Spencer,2024,21,0.471113
7,Marks & Spencer,2024,22,-0.410016
8,Marks & Spencer,2024,23,0.017212
9,Marks & Spencer,2024,24,0.342992


In [68]:
# Rename column Composite Score

weekly_average_greenwashing_composite_scores = weekly_average_greenwashing_composite_scores.rename(columns={'Composite Score': 'Composite Score Greenwashing'})

In [69]:
weekly_average_greenwashing_composite_scores.head()

Unnamed: 0,Company,year,week_of_year,Composite Score Greenwashing
0,Ford,2024,19,0.0
1,Ford,2024,21,0.641892
2,Ford,2024,22,0.0
3,Ford,2024,28,0.566291
4,Ford,2024,30,-0.425757


#### Step 2: Load the ESG weekly average composite scores from Objective one

In [63]:
# Read the CSV file into a DataFrame
weekly_average_esg_composite_scores = pd.read_csv('../Data/Output/obj1_weekly_average_esg_composite_scores.csv')
weekly_average_esg_composite_scores

Unnamed: 0,Company,year,week_of_year,Composite Score
0,Asda,2024,23,0.682767
1,Asda,2024,28,0.883994
2,Asda,2024,30,-0.436915
3,Ford,2024,18,0.000000
4,Ford,2024,25,-0.227749
...,...,...,...,...
74,Toyota,2024,26,0.000000
75,Toyota,2024,28,0.091921
76,Toyota,2024,29,-0.152090
77,Toyota,2024,30,-0.178072


In [70]:
# Rename column Composite Score

weekly_average_esg_composite_scores = weekly_average_esg_composite_scores.rename(columns={'Composite Score': 'Composite Score ESG'})

In [71]:
weekly_average_esg_composite_scores.head()

Unnamed: 0,Company,year,week_of_year,Composite Score ESG
0,Asda,2024,23,0.682767
1,Asda,2024,28,0.883994
2,Asda,2024,30,-0.436915
3,Ford,2024,18,0.0
4,Ford,2024,25,-0.227749


#### Step 3: Perform Granger's casuality testing between 'esg composite scores' and 'greenwashing composite scores' 

In [74]:
# Merge the 'esg composite scores' and 'greenwashing composite scores' on 'company', 'year' and 'week_of_year'
merged_df = pd.merge(weekly_average_esg_composite_scores[['Company', 'year', 'week_of_year', 'Composite Score ESG']], 
                     weekly_average_greenwashing_composite_scores[['Company', 'year', 'week_of_year', 'Composite Score Greenwashing']], 
                     on=['Company', 'year', 'week_of_year'], how='inner')


In [75]:
merged_df.head()

Unnamed: 0,Company,year,week_of_year,Composite Score ESG,Composite Score Greenwashing
0,Ford,2024,28,0.032804,0.566291
1,Ford,2024,30,-0.331602,-0.425757
2,Ford,2024,31,-0.278115,-0.383099
3,Marks & Spencer,2024,21,0.942227,0.471113
4,Marks & Spencer,2024,22,-0.239176,-0.410016


In [77]:
# Drop any rows with missing data, as Granger causality tests require complete cases
merged_df = merged_df.dropna()
merged_df.head()

Unnamed: 0,Company,year,week_of_year,Composite Score ESG,Composite Score Greenwashing
0,Ford,2024,28,0.032804,0.566291
1,Ford,2024,30,-0.331602,-0.425757
2,Ford,2024,31,-0.278115,-0.383099
3,Marks & Spencer,2024,21,0.942227,0.471113
4,Marks & Spencer,2024,22,-0.239176,-0.410016


In [91]:
# Set the maximum number of lags to test
max_lag = 5

# Perform Granger causality test to see if greenwashing-related news Granger-causes ESG-related news
granger_results = grangercausalitytests(merged_df[['Composite Score ESG', 'Composite Score Greenwashing']], max_lag, verbose=True)

# Print summary of the test results
for lag, test_result in granger_results.items():
    p_value = test_result[0]['ssr_ftest'][1]  # Extract the p-value for each lag's F-test
    print(f"Lag {lag}: p-value = {p_value}")
    if p_value < 0.05:
        print(f"At lag {lag}, we reject the null hypothesis. Sentiment influences stock market performance.\n")
    else:
        print(f"At lag {lag}, we fail to reject the null hypothesis.\n")



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=3.3212  , p=0.0755  , df_denom=42, df_num=1
ssr based chi2 test:   chi2=3.5585  , p=0.0592  , df=1
likelihood ratio test: chi2=3.4248  , p=0.0642  , df=1
parameter F test:         F=3.3212  , p=0.0755  , df_denom=42, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.3517  , p=0.2707  , df_denom=39, df_num=2
ssr based chi2 test:   chi2=3.0500  , p=0.2176  , df=2
likelihood ratio test: chi2=2.9489  , p=0.2289  , df=2
parameter F test:         F=1.3517  , p=0.2707  , df_denom=39, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.4468  , p=0.2453  , df_denom=36, df_num=3
ssr based chi2 test:   chi2=5.1845  , p=0.1588  , df=3
likelihood ratio test: chi2=4.8950  , p=0.1796  , df=3
parameter F test:         F=1.4468  , p=0.2453  , df_denom=36, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.8141  , p=0.5253  , df_d

## END OF SECTION