## Objective One

This notebook has the following purposes:
- Clean the raw news stories downloaded from Eikon
- Run the sentiment analysis
- Aggregate the data to determine insights

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os
import string
import torch
from transformers import pipeline
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
import re

# Load Vader
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Initialize VADER sentiment intensity analyzer
analyzer = SentimentIntensityAnalyzer()

# Load FinBERT tokenizer and model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Initialize sentiment analysis pipeline
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)



import warnings
warnings.filterwarnings('ignore')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [2]:
file_path = '../Data/Input/Eikon/refinitiv_stories_raw.csv'
fname_out = '../Data/Output/news_df.csv'

# Read the CSV file into a DataFrame
news_df = pd.read_csv(file_path)

# Remove columns from the 8th column onwards
news_df = news_df.iloc[:, :5]  # Keep only the first 7 columns (0-indexed)

news_df


Unnamed: 0,story,date,storyId,company,ticker
0,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 18:45:56.488000+00:00,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
1,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 14:01:47.535000+00:00,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
2,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 13:40:20.688000+00:00,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
3,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 13:17:18.603000+00:00,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 12:45:00.106000+00:00,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg
...,...,...,...,...,...
5161,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-15 20:11:27.466000+00:00,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance
5162,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14 19:32:32.800000+00:00,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
5163,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14 15:09:52.109000+00:00,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
5164,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-20 22:33:04.975000+00:00,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability


In [5]:
# Convert multiple columns to string type
columns_to_convert = ['story', 'storyId', 'company', 'ticker']
news_df[columns_to_convert] = news_df[columns_to_convert].astype(str)

In [6]:
# Convert the 'date' column to datetime, handling possible timezone info and various formats
# Convert the 'date' column to datetime format while considering the timezone info
news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce')

# Format the datetime to 'YYYY-MM-DD HH:MM'
news_df['date'] = news_df['date'].dt.strftime('%Y-%m-%d')

news_df

Unnamed: 0,story,date,storyId,company,ticker
0,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
1,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
2,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
3,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg
...,...,...,...,...,...
5161,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance
5162,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
5163,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
5164,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability


In [7]:
# After inspecting the dataset, there are a few rows with compromised data. 
# Remove rows of dataset where date column doesn't have timestamp.

# Define a regular expression pattern to detect timestamps (format: YYYY-MM-DD HH:MM:SS)
timestamp_pattern = r'\d{4}-\d{2}-\d{2}'

# Filter the rows that contain timestamps in the 'date' column
news_df = news_df[news_df['date'].apply(lambda x: bool(re.match(timestamp_pattern, str(x))))]

# Reset the index of the filtered DataFrame
news_df = news_df.reset_index(drop=True)

news_df


Unnamed: 0,story,date,storyId,company,ticker
0,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
1,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
2,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
3,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg
...,...,...,...,...,...
1078,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance
1079,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
1080,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
1081,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability


In [8]:
# Functions to remove text within <> and {}:

def remove_text_within_angle_brackets(text):
    if pd.notnull(text):  # Check if the text is not null
        return re.sub(r'<.*?>', '', text)
    return text  # Return the text as is if it's null

# Apply the function to the 'story' column
news_df['story'] = news_df['story'].apply(remove_text_within_angle_brackets)


def remove_text_within_curly_brackets(text):
    if pd.notnull(text):  # Check if the text is not null
        return re.sub(r'{.*?}', '', text)
    return text  # Return the text as is if it's null

# Apply the function to the 'story' column
news_df['story'] = news_df['story'].apply(remove_text_within_curly_brackets)

# Reset the index of the filtered DataFrame
news_df = news_df.reset_index(drop=True)

news_df


Unnamed: 0,story,date,storyId,company,ticker
0,".storyContent * LOS ANGELES, CA / ACCESSWIRE /...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
1,".storyContent * NEW YORK CITY, NY / ACCESSWIRE...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
2,.storyContent * FORD ALERT: Bragar Eagel &amp;...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
3,".storyContent * FIRST ATLANTIC NICKEL CORP (""F...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
4,".storyContent * PALM BEACH, Fla., July 29, 20...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg
...,...,...,...,...,...
1078,".storyContent * Tesco, the UK's largest superm...",2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance
1079,.storyContent * Tesco has been accused of givi...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
1080,.storyContent * Tesco boss Ken Murphy has seen...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
1081,.storyContent * Tesco has apologised after a B...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability


In [9]:
# Remove '.storyContent * ' substring from the 'story' column
news_df['story'] = news_df['story'].str.replace('.storyContent * ', '', regex=False)

# Reset the index of the filtered DataFrame
news_df = news_df.reset_index(drop=True)

news_df


Unnamed: 0,story,date,storyId,company,ticker
0,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
1,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
2,"FORD ALERT: Bragar Eagel &amp; Squire, P.C. is...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
3,"FIRST ATLANTIC NICKEL CORP (""FAN-V"")\nALASKA E...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
4,"PALM BEACH, Fla., July 29, 2024 (GLOBE NEWSW...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg
...,...,...,...,...,...
1078,"Tesco, the UK's largest supermarket chain, has...",2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance
1079,Tesco has been accused of giving struggling wo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
1080,Tesco boss Ken Murphy has seen his pay deal mo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
1081,Tesco has apologised after a Black publisher s...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability


In [10]:
news_df

Unnamed: 0,story,date,storyId,company,ticker
0,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
1,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
2,"FORD ALERT: Bragar Eagel &amp; Squire, P.C. is...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
3,"FIRST ATLANTIC NICKEL CORP (""FAN-V"")\nALASKA E...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
4,"PALM BEACH, Fla., July 29, 2024 (GLOBE NEWSW...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg
...,...,...,...,...,...
1078,"Tesco, the UK's largest supermarket chain, has...",2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance
1079,Tesco has been accused of giving struggling wo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
1080,Tesco boss Ken Murphy has seen his pay deal mo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
1081,Tesco has apologised after a Black publisher s...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability


In [11]:
# Remove rows with empty cells

news_df = news_df.dropna(how='any')

# Reset the index of the filtered DataFrame
news_df = news_df.reset_index(drop=True)

news_df

Unnamed: 0,story,date,storyId,company,ticker
0,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
1,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
2,"FORD ALERT: Bragar Eagel &amp; Squire, P.C. is...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
3,"FIRST ATLANTIC NICKEL CORP (""FAN-V"")\nALASKA E...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
4,"PALM BEACH, Fla., July 29, 2024 (GLOBE NEWSW...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg
...,...,...,...,...,...
1078,"Tesco, the UK's largest supermarket chain, has...",2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance
1079,Tesco has been accused of giving struggling wo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
1080,Tesco boss Ken Murphy has seen his pay deal mo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
1081,Tesco has apologised after a Black publisher s...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability


In [12]:
# Function to count the number of words in a string
def word_count(text):
    if pd.notnull(text):  # Check if the text is not null
        return len(text.split())
    return 0  # Return 0 if the text is null

# Filter out rows where the word count in 'story' column is less than 15
news_df = news_df[news_df['story'].apply(word_count) >= 15]

# Filter out rows where any column is empty or contains only whitespace
news_df = news_df[(news_df != '') & (news_df != ' ')].dropna()

# Reset the index of the filtered DataFrame
news_df = news_df.reset_index(drop=True)

news_df

Unnamed: 0,story,date,storyId,company,ticker
0,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
1,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
2,"FORD ALERT: Bragar Eagel &amp; Squire, P.C. is...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
3,"FIRST ATLANTIC NICKEL CORP (""FAN-V"")\nALASKA E...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
4,"PALM BEACH, Fla., July 29, 2024 (GLOBE NEWSW...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg
...,...,...,...,...,...
985,"Tesco, the UK's largest supermarket chain, has...",2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance
986,Tesco has been accused of giving struggling wo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
987,Tesco boss Ken Murphy has seen his pay deal mo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
988,Tesco has apologised after a Black publisher s...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability


In [13]:
# Strip whitespace from all columns and replace empty strings with NaN
news_df = news_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
news_df.replace('', pd.NA, inplace=True)

# Drop rows with any NaN values
news_df.dropna(inplace=True)

# Reset the index of the filtered DataFrame
news_df = news_df.reset_index(drop=True)

news_df

Unnamed: 0,story,date,storyId,company,ticker
0,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
1,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
2,"FORD ALERT: Bragar Eagel &amp; Squire, P.C. is...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
3,"FIRST ATLANTIC NICKEL CORP (""FAN-V"")\nALASKA E...",2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
4,"PALM BEACH, Fla., July 29, 2024 (GLOBE NEWSW...",2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg
...,...,...,...,...,...
985,"Tesco, the UK's largest supermarket chain, has...",2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance
986,Tesco has been accused of giving struggling wo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
987,Tesco boss Ken Murphy has seen his pay deal mo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
988,Tesco has apologised after a Black publisher s...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability


In [14]:
news_df.to_csv(fname_out, index=False)

In [17]:
# Aggregate the 'company' column and calculate the total number of rows for each unique value
company_counts = news_df.groupby('company').size().reset_index(name='total_rows')
# Sort the results
company_counts = company_counts.sort_values(by='total_rows', ascending=False)


# Display the aggregated DataFrame
print(company_counts)

           company  total_rows
9           Toyota         324
1             Ford         261
8            Tesla         198
7            Tesco          58
2  Marks & Spencer          56
3            Ocado          38
6       Stellantis          31
5       Sainsburys           9
4         Polestar           8
0             Asda           7


In [18]:
# Group by both 'company' and 'ticker' columns and calculate the number of rows for each group
aggregated_counts = news_df.groupby(['company', 'ticker']).size().reset_index(name='total_rows')

# aggregated_counts = aggregated_counts.sort_values(by='total_rows', ascending=False)

# Display the aggregated DataFrame
print(aggregated_counts)

            company          ticker  total_rows
0              Asda     environment           1
1              Asda             esg           4
2              Asda  sustainability           2
3              Ford     environment           3
4              Ford             esg          99
5              Ford      governance          99
6              Ford          social          57
7              Ford  sustainability           3
8   Marks & Spencer     environment           4
9   Marks & Spencer             esg           3
10  Marks & Spencer          social          45
11  Marks & Spencer  sustainability           4
12            Ocado             esg           2
13            Ocado          social          36
14         Polestar     environment           4
15         Polestar             esg           1
16         Polestar  sustainability           3
17       Sainsburys     environment           3
18       Sainsburys             esg           4
19       Sainsburys  sustainability     

In [19]:
# Removing punctuation to optimise the polarity

# Define function to remove punctuation
def remove_punctuation(text):
    if pd.isna(text):
        return text
    return text.translate(str.maketrans('', '', string.punctuation))

# Apply the function to the 'story' column
news_df['story'] = news_df['story'].apply(remove_punctuation)

# Display the DataFrame
print(news_df[['story']])

                                                 story
0    LOS ANGELES CA  ACCESSWIRE  July 29 2024  The ...
1    NEW YORK CITY NY  ACCESSWIRE  July 29 2024  Br...
2    FORD ALERT Bragar Eagel amp Squire PC is Inves...
3    FIRST ATLANTIC NICKEL CORP FANV\nALASKA ENERGY...
4    PALM BEACH Fla July  29 2024  GLOBE NEWSWIRE  ...
..                                                 ...
985  Tesco the UKs largest supermarket chain has sp...
986  Tesco has been accused of giving struggling wo...
987  Tesco boss Ken Murphy has seen his pay deal mo...
988  Tesco has apologised after a Black publisher s...
989  Tesco Ireland the Republic of Ireland based su...

[990 rows x 1 columns]


In [20]:
# Convert the 'story' column to lowercase to optimise polarity
news_df['story'] = news_df['story'].str.lower()

# Display the updated DataFrame
print(news_df[['story']])

                                                 story
0    los angeles ca  accesswire  july 29 2024  the ...
1    new york city ny  accesswire  july 29 2024  br...
2    ford alert bragar eagel amp squire pc is inves...
3    first atlantic nickel corp fanv\nalaska energy...
4    palm beach fla july  29 2024  globe newswire  ...
..                                                 ...
985  tesco the uks largest supermarket chain has sp...
986  tesco has been accused of giving struggling wo...
987  tesco boss ken murphy has seen his pay deal mo...
988  tesco has apologised after a black publisher s...
989  tesco ireland the republic of ireland based su...

[990 rows x 1 columns]


In [21]:
news_df

Unnamed: 0,story,date,storyId,company,ticker
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
1,new york city ny accesswire july 29 2024 br...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg
...,...,...,...,...,...
985,tesco the uks largest supermarket chain has sp...,2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance
986,tesco has been accused of giving struggling wo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
987,tesco boss ken murphy has seen his pay deal mo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
988,tesco has apologised after a black publisher s...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability


In [22]:
# Function to calculate normalized sentiment score between 0 and 1
def calculate_normalized_score(text):
    sentiment = analyzer.polarity_scores(text)
    # VADER's compound score ranges from -1 to 1, normalize to 0 to 1
    normalized_score = (sentiment['compound'] + 1) / 2
    return normalized_score

# Apply the function to the DataFrame column
news_df['sentiment_score'] = news_df['story'].apply(calculate_normalized_score)

news_df


Unnamed: 0,story,date,storyId,company,ticker,sentiment_score
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.76335
1,new york city ny accesswire july 29 2024 br...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.16265
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.95170
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.99970
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg,0.99965
...,...,...,...,...,...,...
985,tesco the uks largest supermarket chain has sp...,2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance,0.97300
986,tesco has been accused of giving struggling wo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99495
987,tesco boss ken murphy has seen his pay deal mo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99360
988,tesco has apologised after a black publisher s...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability,0.02185


In [23]:
# Sentiment analysis using FinBERT
# Because the limit of FinBERT is 512 tokens, splitting the text 
# into chunks of 512 tokens or less before processing is required.

def analyse_sentiment_with_chunking(text, max_length=512):
    # Tokenize and split the text into chunks of max_length tokens
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_length, padding='max_length')

    # Split the inputs into smaller chunks if they exceed the max_length
    input_ids = inputs['input_ids'].squeeze(0)
    attention_mask = inputs['attention_mask'].squeeze(0)

    chunks = []
    for i in range(0, len(input_ids), max_length):
        chunk_input_ids = input_ids[i:i + max_length].unsqueeze(0)
        chunk_attention_mask = attention_mask[i:i + max_length].unsqueeze(0)
        chunks.append((chunk_input_ids, chunk_attention_mask))
    
    sentiments = []
    for chunk_input_ids, chunk_attention_mask in chunks:
        with torch.no_grad():
            outputs = model(chunk_input_ids, attention_mask=chunk_attention_mask)
        logits = outputs.logits
        sentiment = torch.argmax(logits, dim=1).item()
        sentiments.append(sentiment)

    # Return the most frequent sentiment across all chunks
    return max(set(sentiments), key=sentiments.count)

# Apply the function to the 'story' column
news_df['finbert_score'] = news_df['story'].apply(analyse_sentiment_with_chunking)

news_df

Unnamed: 0,story,date,storyId,company,ticker,sentiment_score,finbert_score
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.76335,1
1,new york city ny accesswire july 29 2024 br...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.16265,2
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.95170,2
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.99970,2
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg,0.99965,2
...,...,...,...,...,...,...,...
985,tesco the uks largest supermarket chain has sp...,2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance,0.97300,1
986,tesco has been accused of giving struggling wo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99495,1
987,tesco boss ken murphy has seen his pay deal mo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99360,0
988,tesco has apologised after a black publisher s...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability,0.02185,2


In [24]:
# Define the mapping from sentiment scores to labels
sentiment_mapping = {
    0: 'negative',
    1: 'neutral',
    2: 'positive'
}

# Apply the mapping to create the new 'sentiment' column
news_df['sentiment'] = news_df['finbert_score'].map(sentiment_mapping)

news_df

Unnamed: 0,story,date,storyId,company,ticker,sentiment_score,finbert_score,sentiment
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.76335,1,neutral
1,new york city ny accesswire july 29 2024 br...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.16265,2,positive
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.95170,2,positive
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.99970,2,positive
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg,0.99965,2,positive
...,...,...,...,...,...,...,...,...
985,tesco the uks largest supermarket chain has sp...,2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance,0.97300,1,neutral
986,tesco has been accused of giving struggling wo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99495,1,neutral
987,tesco boss ken murphy has seen his pay deal mo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99360,0,negative
988,tesco has apologised after a black publisher s...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability,0.02185,2,positive


In [25]:
# Define the weights sentiment labels
sentiment_mapping = {
    'negative': -1,
    'neutral': 0,
    'positive': 1
}

# Apply the mapping to create the new 'sentiment' column
news_df['weight'] = news_df['sentiment'].map(sentiment_mapping)

news_df

Unnamed: 0,story,date,storyId,company,ticker,sentiment_score,finbert_score,sentiment,weight
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.76335,1,neutral,0
1,new york city ny accesswire july 29 2024 br...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.16265,2,positive,1
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.95170,2,positive,1
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.99970,2,positive,1
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg,0.99965,2,positive,1
...,...,...,...,...,...,...,...,...,...
985,tesco the uks largest supermarket chain has sp...,2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance,0.97300,1,neutral,0
986,tesco has been accused of giving struggling wo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99495,1,neutral,0
987,tesco boss ken murphy has seen his pay deal mo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99360,0,negative,-1
988,tesco has apologised after a black publisher s...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability,0.02185,2,positive,1


In [26]:
# Define the composite score as the result of weight x sentiment_score

news_df['composite_score'] = news_df['weight'] * news_df['sentiment_score']
news_df 



Unnamed: 0,story,date,storyId,company,ticker,sentiment_score,finbert_score,sentiment,weight,composite_score
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.76335,1,neutral,0,0.00000
1,new york city ny accesswire july 29 2024 br...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.16265,2,positive,1,0.16265
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.95170,2,positive,1,0.95170
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.99970,2,positive,1,0.99970
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg,0.99965,2,positive,1,0.99965
...,...,...,...,...,...,...,...,...,...,...
985,tesco the uks largest supermarket chain has sp...,2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance,0.97300,1,neutral,0,0.00000
986,tesco has been accused of giving struggling wo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99495,1,neutral,0,0.00000
987,tesco boss ken murphy has seen his pay deal mo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99360,0,negative,-1,-0.99360
988,tesco has apologised after a black publisher s...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability,0.02185,2,positive,1,0.02185


In [27]:
# Group by company and calculate the average composite_score across the entire period
average_scores = news_df.groupby('company')['composite_score'].mean().reset_index()
average_scores

Unnamed: 0,company,composite_score
0,Asda,-0.004857
1,Ford,0.356515
2,Marks & Spencer,0.794787
3,Ocado,0.206816
4,Polestar,0.691638
5,Sainsburys,0.76165
6,Stellantis,0.323995
7,Tesco,0.600731
8,Tesla,0.413659
9,Toyota,0.445728


In [28]:
# Convert 'date' column to datetime
news_df['date'] = pd.to_datetime(news_df['date'])

# Extract month and year from 'date'
news_df['month'] = news_df['date'].dt.to_period('M')

news_df


Unnamed: 0,story,date,storyId,company,ticker,sentiment_score,finbert_score,sentiment,weight,composite_score,month
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.76335,1,neutral,0,0.00000,2024-07
1,new york city ny accesswire july 29 2024 br...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.16265,2,positive,1,0.16265,2024-07
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.95170,2,positive,1,0.95170,2024-07
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.99970,2,positive,1,0.99970,2024-07
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg,0.99965,2,positive,1,0.99965,2024-07
...,...,...,...,...,...,...,...,...,...,...,...
985,tesco the uks largest supermarket chain has sp...,2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance,0.97300,1,neutral,0,0.00000,2024-05
986,tesco has been accused of giving struggling wo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99495,1,neutral,0,0.00000,2024-05
987,tesco boss ken murphy has seen his pay deal mo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99360,0,negative,-1,-0.99360,2024-05
988,tesco has apologised after a black publisher s...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability,0.02185,2,positive,1,0.02185,2024-05


In [29]:
# Group by company and month, then calculate the average composite_score
monthly_average_scores = news_df.groupby(['company', 'month'])['composite_score'].mean().reset_index()
monthly_average_scores

Unnamed: 0,company,month,composite_score
0,Asda,2024-06,0.0
1,Asda,2024-07,-0.005667
2,Ford,2024-05,-0.9981
3,Ford,2024-06,0.420789
4,Ford,2024-07,0.360452
5,Marks & Spencer,2024-05,0.493237
6,Marks & Spencer,2024-06,0.7597
7,Marks & Spencer,2024-07,0.857465
8,Ocado,2024-05,0.9968
9,Ocado,2024-07,0.185465


In [30]:
# Convert 'date' column to datetime
news_df['date'] = pd.to_datetime(news_df['date'])

# Extract week of the year and year from 'date'
news_df['week_of_year'] = news_df['date'].dt.isocalendar().week
news_df['year'] = news_df['date'].dt.year

news_df

Unnamed: 0,story,date,storyId,company,ticker,sentiment_score,finbert_score,sentiment,weight,composite_score,month,week_of_year,year
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.76335,1,neutral,0,0.00000,2024-07,31,2024
1,new york city ny accesswire july 29 2024 br...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg,0.16265,2,positive,1,0.16265,2024-07,31,2024
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.95170,2,positive,1,0.95170,2024-07,31,2024
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg,0.99970,2,positive,1,0.99970,2024-07,31,2024
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg,0.99965,2,positive,1,0.99965,2024-07,31,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,tesco the uks largest supermarket chain has sp...,2024-05-15,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance,0.97300,1,neutral,0,0.00000,2024-05,20,2024
986,tesco has been accused of giving struggling wo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99495,1,neutral,0,0.00000,2024-05,20,2024
987,tesco boss ken murphy has seen his pay deal mo...,2024-05-14,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance,0.99360,0,negative,-1,-0.99360,2024-05,20,2024
988,tesco has apologised after a black publisher s...,2024-05-20,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability,0.02185,2,positive,1,0.02185,2024-05,21,2024


In [31]:
# Set display options to show more rows and columns
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns


In [33]:
# Group by company and week of the year, then calculate the average composite_score
weekly_average_composite_scores = news_df.groupby(['company', 'year', 'week_of_year'])['composite_score'].mean().reset_index()


weekly_average_composite_scores.to_csv('../Data/Output/obj1_weekly_average_composite_scores.csv', index=False)

weekly_average_composite_scores

Unnamed: 0,company,year,week_of_year,composite_score
0,Asda,2024,23,0.0
1,Asda,2024,28,-0.98
2,Asda,2024,30,0.4815
3,Ford,2024,18,-0.9981
4,Ford,2024,25,0.368219
5,Ford,2024,26,0.441817
6,Ford,2024,27,0.49202
7,Ford,2024,28,0.421264
8,Ford,2024,29,0.464823
9,Ford,2024,30,0.226375
