This notebook has the following purposes:
- Clean the raw news stories downloaded from Eikon
- Run the sentiment analysis
- Aggregate the data to determine insights

In [2]:
# Import libraries

import pandas as pd
import numpy as np
import os
import string
import torch
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
import re

# Load FinBERT tokenizer and model
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Initialize sentiment analysis pipeline
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

nltk.download('stopwords')

import warnings
warnings.filterwarnings('ignore')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
[nltk_data] Downloading package stopwords to /Users/luca/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
file_path = '../Data/Input/Eikon/refinitiv_stories_raw.csv'
fname_out = '../Data/Output/news_df.csv'

# Read the CSV file into a DataFrame
news_df = pd.read_csv(file_path)

# Remove columns from the 8th column onwards
news_df = news_df.iloc[:, :5]  # Keep only the first 7 columns (0-indexed)

news_df


Unnamed: 0,story,date,storyId,company,ticker
0,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 18:45:56.488000+00:00,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
1,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 14:01:47.535000+00:00,urn:newsml:newswire.refinitiv.com:20240729:nAC...,Ford,esg
2,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 13:40:20.688000+00:00,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
3,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 13:17:18.603000+00:00,urn:newsml:newsroom.refinitiv.com:20240729:nVM...,Ford,esg
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 12:45:00.106000+00:00,urn:newsml:newswire.refinitiv.com:20240729:nGN...,Ford,esg
...,...,...,...,...,...
5161,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-15 20:11:27.466000+00:00,urn:newsml:newswire.refinitiv.com:20240515:nNR...,Tesco,governance
5162,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14 19:32:32.800000+00:00,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
5163,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14 15:09:52.109000+00:00,urn:newsml:newswire.refinitiv.com:20240514:nNR...,Tesco,governance
5164,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-20 22:33:04.975000+00:00,urn:newsml:newswire.refinitiv.com:20240520:nNR...,Tesco,sustainability


In [4]:
# Drop storyId column

news_df = news_df.drop(['storyId'], axis= 1)
news_df

Unnamed: 0,story,date,company,ticker
0,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 18:45:56.488000+00:00,Ford,esg
1,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 14:01:47.535000+00:00,Ford,esg
2,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 13:40:20.688000+00:00,Ford,esg
3,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 13:17:18.603000+00:00,Ford,esg
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29 12:45:00.106000+00:00,Ford,esg
...,...,...,...,...
5161,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-15 20:11:27.466000+00:00,Tesco,governance
5162,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14 19:32:32.800000+00:00,Tesco,governance
5163,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14 15:09:52.109000+00:00,Tesco,governance
5164,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-20 22:33:04.975000+00:00,Tesco,sustainability


In [5]:
# Convert multiple columns to string type
columns_to_convert = ['story', 'company', 'ticker']
news_df[columns_to_convert] = news_df[columns_to_convert].astype(str)

In [6]:
# Convert the 'date' column to datetime, handling possible timezone info and various formats
# Convert the 'date' column to datetime format while considering the timezone info
news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce')

# Format the datetime to 'YYYY-MM-DD HH:MM'
news_df['date'] = news_df['date'].dt.strftime('%Y-%m-%d')

news_df

Unnamed: 0,story,date,company,ticker
0,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,esg
1,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,esg
2,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,esg
3,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,esg
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,esg
...,...,...,...,...
5161,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-15,Tesco,governance
5162,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14,Tesco,governance
5163,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14,Tesco,governance
5164,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-20,Tesco,sustainability


In [7]:
# After inspecting the dataset, there are a few rows with compromised data. 
# Remove rows of dataset where date column doesn't have timestamp.

# Define a regular expression pattern to detect timestamps (format: YYYY-MM-DD HH:MM:SS)
timestamp_pattern = r'\d{4}-\d{2}-\d{2}'

# Filter the rows that contain timestamps in the 'date' column
news_df = news_df[news_df['date'].apply(lambda x: bool(re.match(timestamp_pattern, str(x))))]

# Reset the index of the filtered DataFrame
news_df = news_df.reset_index(drop=True)

news_df


Unnamed: 0,story,date,company,ticker
0,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,esg
1,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,esg
2,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,esg
3,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,esg
4,"<div class=""storyContent"" lang=""en""><style typ...",2024-07-29,Ford,esg
...,...,...,...,...
1078,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-15,Tesco,governance
1079,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14,Tesco,governance
1080,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-14,Tesco,governance
1081,"<div class=""storyContent"" lang=""en""><style typ...",2024-05-20,Tesco,sustainability


In [8]:
# Functions to remove text within <> and {}:

def remove_text_within_angle_brackets(text):
    if pd.notnull(text):  # Check if the text is not null
        return re.sub(r'<.*?>', '', text)
    return text  # Return the text as is if it's null

# Apply the function to the 'story' column
news_df['story'] = news_df['story'].apply(remove_text_within_angle_brackets)


def remove_text_within_curly_brackets(text):
    if pd.notnull(text):  # Check if the text is not null
        return re.sub(r'{.*?}', '', text)
    return text  # Return the text as is if it's null

# Apply the function to the 'story' column
news_df['story'] = news_df['story'].apply(remove_text_within_curly_brackets)

# Reset the index of the filtered DataFrame
news_df = news_df.reset_index(drop=True)

news_df


Unnamed: 0,story,date,company,ticker
0,".storyContent * LOS ANGELES, CA / ACCESSWIRE /...",2024-07-29,Ford,esg
1,".storyContent * NEW YORK CITY, NY / ACCESSWIRE...",2024-07-29,Ford,esg
2,.storyContent * FORD ALERT: Bragar Eagel &amp;...,2024-07-29,Ford,esg
3,".storyContent * FIRST ATLANTIC NICKEL CORP (""F...",2024-07-29,Ford,esg
4,".storyContent * PALM BEACH, Fla., July 29, 20...",2024-07-29,Ford,esg
...,...,...,...,...
1078,".storyContent * Tesco, the UK's largest superm...",2024-05-15,Tesco,governance
1079,.storyContent * Tesco has been accused of givi...,2024-05-14,Tesco,governance
1080,.storyContent * Tesco boss Ken Murphy has seen...,2024-05-14,Tesco,governance
1081,.storyContent * Tesco has apologised after a B...,2024-05-20,Tesco,sustainability


In [9]:
# Remove '.storyContent * ' substring from the 'story' column
news_df['story'] = news_df['story'].str.replace('.storyContent * ', '', regex=False)

# Reset the index of the filtered DataFrame
news_df = news_df.reset_index(drop=True)

news_df


Unnamed: 0,story,date,company,ticker
0,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,Ford,esg
1,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,Ford,esg
2,"FORD ALERT: Bragar Eagel &amp; Squire, P.C. is...",2024-07-29,Ford,esg
3,"FIRST ATLANTIC NICKEL CORP (""FAN-V"")\nALASKA E...",2024-07-29,Ford,esg
4,"PALM BEACH, Fla., July 29, 2024 (GLOBE NEWSW...",2024-07-29,Ford,esg
...,...,...,...,...
1078,"Tesco, the UK's largest supermarket chain, has...",2024-05-15,Tesco,governance
1079,Tesco has been accused of giving struggling wo...,2024-05-14,Tesco,governance
1080,Tesco boss Ken Murphy has seen his pay deal mo...,2024-05-14,Tesco,governance
1081,Tesco has apologised after a Black publisher s...,2024-05-20,Tesco,sustainability


In [10]:
news_df

Unnamed: 0,story,date,company,ticker
0,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,Ford,esg
1,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,Ford,esg
2,"FORD ALERT: Bragar Eagel &amp; Squire, P.C. is...",2024-07-29,Ford,esg
3,"FIRST ATLANTIC NICKEL CORP (""FAN-V"")\nALASKA E...",2024-07-29,Ford,esg
4,"PALM BEACH, Fla., July 29, 2024 (GLOBE NEWSW...",2024-07-29,Ford,esg
...,...,...,...,...
1078,"Tesco, the UK's largest supermarket chain, has...",2024-05-15,Tesco,governance
1079,Tesco has been accused of giving struggling wo...,2024-05-14,Tesco,governance
1080,Tesco boss Ken Murphy has seen his pay deal mo...,2024-05-14,Tesco,governance
1081,Tesco has apologised after a Black publisher s...,2024-05-20,Tesco,sustainability


In [11]:
# Remove rows with empty cells

news_df = news_df.dropna(how='any')

# Reset the index of the filtered DataFrame
news_df = news_df.reset_index(drop=True)

news_df

Unnamed: 0,story,date,company,ticker
0,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,Ford,esg
1,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,Ford,esg
2,"FORD ALERT: Bragar Eagel &amp; Squire, P.C. is...",2024-07-29,Ford,esg
3,"FIRST ATLANTIC NICKEL CORP (""FAN-V"")\nALASKA E...",2024-07-29,Ford,esg
4,"PALM BEACH, Fla., July 29, 2024 (GLOBE NEWSW...",2024-07-29,Ford,esg
...,...,...,...,...
1078,"Tesco, the UK's largest supermarket chain, has...",2024-05-15,Tesco,governance
1079,Tesco has been accused of giving struggling wo...,2024-05-14,Tesco,governance
1080,Tesco boss Ken Murphy has seen his pay deal mo...,2024-05-14,Tesco,governance
1081,Tesco has apologised after a Black publisher s...,2024-05-20,Tesco,sustainability


In [12]:
# Function to count the number of words in a string
def word_count(text):
    if pd.notnull(text):  # Check if the text is not null
        return len(text.split())
    return 0  # Return 0 if the text is null

# Filter out rows where the word count in 'story' column is less than 15
news_df = news_df[news_df['story'].apply(word_count) >= 15]

# Filter out rows where any column is empty or contains only whitespace
news_df = news_df[(news_df != '') & (news_df != ' ')].dropna()

# Reset the index of the filtered DataFrame
news_df = news_df.reset_index(drop=True)

news_df

Unnamed: 0,story,date,company,ticker
0,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,Ford,esg
1,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,Ford,esg
2,"FORD ALERT: Bragar Eagel &amp; Squire, P.C. is...",2024-07-29,Ford,esg
3,"FIRST ATLANTIC NICKEL CORP (""FAN-V"")\nALASKA E...",2024-07-29,Ford,esg
4,"PALM BEACH, Fla., July 29, 2024 (GLOBE NEWSW...",2024-07-29,Ford,esg
...,...,...,...,...
985,"Tesco, the UK's largest supermarket chain, has...",2024-05-15,Tesco,governance
986,Tesco has been accused of giving struggling wo...,2024-05-14,Tesco,governance
987,Tesco boss Ken Murphy has seen his pay deal mo...,2024-05-14,Tesco,governance
988,Tesco has apologised after a Black publisher s...,2024-05-20,Tesco,sustainability


In [13]:
# Strip whitespace from all columns and replace empty strings with NaN
news_df = news_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
news_df.replace('', pd.NA, inplace=True)

# Drop rows with any NaN values
news_df.dropna(inplace=True)

# Reset the index of the filtered DataFrame
news_df = news_df.reset_index(drop=True)

news_df

Unnamed: 0,story,date,company,ticker
0,"LOS ANGELES, CA / ACCESSWIRE / July 29, 2024 /...",2024-07-29,Ford,esg
1,"NEW YORK CITY, NY / ACCESSWIRE / July 29, 2024...",2024-07-29,Ford,esg
2,"FORD ALERT: Bragar Eagel &amp; Squire, P.C. is...",2024-07-29,Ford,esg
3,"FIRST ATLANTIC NICKEL CORP (""FAN-V"")\nALASKA E...",2024-07-29,Ford,esg
4,"PALM BEACH, Fla., July 29, 2024 (GLOBE NEWSW...",2024-07-29,Ford,esg
...,...,...,...,...
985,"Tesco, the UK's largest supermarket chain, has...",2024-05-15,Tesco,governance
986,Tesco has been accused of giving struggling wo...,2024-05-14,Tesco,governance
987,Tesco boss Ken Murphy has seen his pay deal mo...,2024-05-14,Tesco,governance
988,Tesco has apologised after a Black publisher s...,2024-05-20,Tesco,sustainability


In [14]:
# Aggregate the 'company' column and calculate the total number of rows for each unique value
company_counts = news_df.groupby('company').size().reset_index(name='total_rows')
# Sort the results
company_counts = company_counts.sort_values(by='total_rows', ascending=False)


# Display the aggregated DataFrame
print(company_counts)

           company  total_rows
9           Toyota         324
1             Ford         261
8            Tesla         198
7            Tesco          58
2  Marks & Spencer          56
3            Ocado          38
6       Stellantis          31
5       Sainsburys           9
4         Polestar           8
0             Asda           7


In [15]:
# Group by both 'company' and 'ticker' columns and calculate the number of rows for each group
aggregated_counts = news_df.groupby(['company', 'ticker']).size().reset_index(name='total_rows')

# aggregated_counts = aggregated_counts.sort_values(by='total_rows', ascending=False)

# Display the aggregated DataFrame
print(aggregated_counts)

            company          ticker  total_rows
0              Asda     environment           1
1              Asda             esg           4
2              Asda  sustainability           2
3              Ford     environment           3
4              Ford             esg          99
5              Ford      governance          99
6              Ford          social          57
7              Ford  sustainability           3
8   Marks & Spencer     environment           4
9   Marks & Spencer             esg           3
10  Marks & Spencer          social          45
11  Marks & Spencer  sustainability           4
12            Ocado             esg           2
13            Ocado          social          36
14         Polestar     environment           4
15         Polestar             esg           1
16         Polestar  sustainability           3
17       Sainsburys     environment           3
18       Sainsburys             esg           4
19       Sainsburys  sustainability     

In [16]:
# Removing punctuation to optimise the polarity

# Define function to remove punctuation
def remove_punctuation(text):
    if pd.isna(text):
        return text
    return text.translate(str.maketrans('', '', string.punctuation))

# Apply the function to the 'story' column
news_df['story'] = news_df['story'].apply(remove_punctuation)

# Display the DataFrame
print(news_df[['story']])

                                                 story
0    LOS ANGELES CA  ACCESSWIRE  July 29 2024  The ...
1    NEW YORK CITY NY  ACCESSWIRE  July 29 2024  Br...
2    FORD ALERT Bragar Eagel amp Squire PC is Inves...
3    FIRST ATLANTIC NICKEL CORP FANV\nALASKA ENERGY...
4    PALM BEACH Fla July  29 2024  GLOBE NEWSWIRE  ...
..                                                 ...
985  Tesco the UKs largest supermarket chain has sp...
986  Tesco has been accused of giving struggling wo...
987  Tesco boss Ken Murphy has seen his pay deal mo...
988  Tesco has apologised after a Black publisher s...
989  Tesco Ireland the Republic of Ireland based su...

[990 rows x 1 columns]


In [17]:
# Convert the 'story' column to lowercase to optimise polarity
news_df['story'] = news_df['story'].str.lower()

# Display the updated DataFrame
print(news_df[['story']])

                                                 story
0    los angeles ca  accesswire  july 29 2024  the ...
1    new york city ny  accesswire  july 29 2024  br...
2    ford alert bragar eagel amp squire pc is inves...
3    first atlantic nickel corp fanv\nalaska energy...
4    palm beach fla july  29 2024  globe newswire  ...
..                                                 ...
985  tesco the uks largest supermarket chain has sp...
986  tesco has been accused of giving struggling wo...
987  tesco boss ken murphy has seen his pay deal mo...
988  tesco has apologised after a black publisher s...
989  tesco ireland the republic of ireland based su...

[990 rows x 1 columns]


In [18]:
news_df

Unnamed: 0,story,date,company,ticker
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,Ford,esg
1,new york city ny accesswire july 29 2024 br...,2024-07-29,Ford,esg
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,Ford,esg
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,Ford,esg
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,Ford,esg
...,...,...,...,...
985,tesco the uks largest supermarket chain has sp...,2024-05-15,Tesco,governance
986,tesco has been accused of giving struggling wo...,2024-05-14,Tesco,governance
987,tesco boss ken murphy has seen his pay deal mo...,2024-05-14,Tesco,governance
988,tesco has apologised after a black publisher s...,2024-05-20,Tesco,sustainability


In [19]:
# Save the dataframe

news_df.to_csv(fname_out, index=False)

In [19]:
# As the news stories are very long, let's use a text summarization model to condense the text 
# to a manageable length before feeding it into the sentiment classifier.
# Load the summarization pipeline with BART
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [20]:
# Let's summarise the content of the news articles

Text_Summary = []
for i in range(0, len(news_df['story'])):
  summary = summarizer(news_df['story'][i][:512], max_length=50, min_length=25, do_sample=False)[0]['summary_text']
  Text_Summary.append(summary)

Your max_length is set to 50, but your input_length is only 49. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 50, but your input_length is only 49. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 50, but your input_length is only 49. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 50, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max

In [23]:
# Let's add the summarised articles to the dataframe

news_df['Text Summary'] = Text_Summary

In [24]:
news_df.head()

Unnamed: 0,story,date,company,ticker,Text Summary
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,Ford,esg,The schall law firm a national shareholder rig...
1,new york city ny accesswire july 29 2024 br...,2024-07-29,Ford,esg,Investors who purchased ford securities are en...
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,Ford,esg,ford alert bragar eagel amp squire pc is inve...
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,Ford,esg,The development of awaruite deposits in canada...
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,Ford,esg,The refining of sulfide nickel deposits usuall...


In [27]:
# Let's initialise the 'stop words' function for English 

stop = stopwords.words('english')
stop[:5]

['i', 'me', 'my', 'myself', 'we']

In [28]:
# Let's remove 'stop words' from the summarised articles

news_df['Text Summary'] = news_df['Text Summary'].apply(lambda x: ' '.join([w for w in x.split() if w not in stop]))

In [29]:
news_df.head()

Unnamed: 0,story,date,company,ticker,Text Summary
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,Ford,esg,The schall law firm national shareholder right...
1,new york city ny accesswire july 29 2024 br...,2024-07-29,Ford,esg,Investors purchased ford securities encouraged...
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,Ford,esg,ford alert bragar eagel amp squire pc investig...
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,Ford,esg,The development awaruite deposits canada may h...
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,Ford,esg,The refining sulfide nickel deposits usually r...


In [30]:
# Let's initialise FinBERT for sentiment analysis

model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [32]:
# Let's run the sentiment analysis for each news article to obtain 'sentiment', 'label' and 'confidence score'.

news_df = news_df.assign(sentiment = lambda x: x['Text Summary'].apply(lambda s: classifier(s))).assign(label = lambda x: x['sentiment'].apply(lambda s: (s[0]['label'])),score = lambda x: x['sentiment'].apply(lambda s: (s[0]['score'])))

In [33]:
news_df.head()

Unnamed: 0,story,date,company,ticker,Text Summary,sentiment,label,score
0,los angeles ca accesswire july 29 2024 the ...,2024-07-29,Ford,esg,The schall law firm national shareholder right...,"[{'label': 'negative', 'score': 0.915385782718...",negative,0.915386
1,new york city ny accesswire july 29 2024 br...,2024-07-29,Ford,esg,Investors purchased ford securities encouraged...,"[{'label': 'neutral', 'score': 0.9397768378257...",neutral,0.939777
2,ford alert bragar eagel amp squire pc is inves...,2024-07-29,Ford,esg,ford alert bragar eagel amp squire pc investig...,"[{'label': 'neutral', 'score': 0.8667923808097...",neutral,0.866792
3,first atlantic nickel corp fanv\nalaska energy...,2024-07-29,Ford,esg,The development awaruite deposits canada may h...,"[{'label': 'neutral', 'score': 0.7519849538803...",neutral,0.751985
4,palm beach fla july 29 2024 globe newswire ...,2024-07-29,Ford,esg,The refining sulfide nickel deposits usually r...,"[{'label': 'neutral', 'score': 0.8543457984924...",neutral,0.854346


In [34]:
# Let's drop the story and sentiment columns

news_df.drop(['story','sentiment'], axis=1, inplace=True)

In [36]:
news_df.columns = ['Timestamp', 'Company', 'Ticker', 'Text Summary', 'Sentiment', 'Confidence Score']

In [37]:
news_df.head()

Unnamed: 0,Timestamp,Company,Ticker,Text Summary,Sentiment,Confidence Score
0,2024-07-29,Ford,esg,The schall law firm national shareholder right...,negative,0.915386
1,2024-07-29,Ford,esg,Investors purchased ford securities encouraged...,neutral,0.939777
2,2024-07-29,Ford,esg,ford alert bragar eagel amp squire pc investig...,neutral,0.866792
3,2024-07-29,Ford,esg,The development awaruite deposits canada may h...,neutral,0.751985
4,2024-07-29,Ford,esg,The refining sulfide nickel deposits usually r...,neutral,0.854346


In [None]:
# Define the mapping to convert the Sentiment labels into corresponding values 
sentiment_mapping = {
    'negative': -1,
    'neutral': 0,
    'positive' : 1
}

# Apply the mapping to create the new 'Sentiment Value' column
news_df['Sentiment Value'] = news_df['Sentiment'].map(sentiment_mapping)

news_df

In [59]:
# Define the composite score as the result of weight x sentiment_score

news_df['Composite Score'] = news_df['Confidence Score'] * news_df['Sentiment Value']
news_df 



Unnamed: 0,Timestamp,Company,Ticker,Text Summary,Sentiment,Confidence Score,Sentiment Value,Composite Score
0,2024-07-29,Ford,esg,The schall law firm national shareholder right...,negative,0.915386,-1,-0.915386
1,2024-07-29,Ford,esg,Investors purchased ford securities encouraged...,neutral,0.939777,0,0.000000
2,2024-07-29,Ford,esg,ford alert bragar eagel amp squire pc investig...,neutral,0.866792,0,0.000000
3,2024-07-29,Ford,esg,The development awaruite deposits canada may h...,neutral,0.751985,0,0.000000
4,2024-07-29,Ford,esg,The refining sulfide nickel deposits usually r...,neutral,0.854346,0,0.000000
...,...,...,...,...,...,...,...,...
985,2024-05-15,Tesco,governance,Tesco sparked controversy found ceo ken murphy...,negative,0.943846,-1,-0.943846
986,2024-05-14,Tesco,governance,Crisisken murphy given £99m pay perks double p...,positive,0.752490,1,0.752490
987,2024-05-14,Tesco,governance,Ken murphy received pay package worth £993 mil...,positive,0.810848,1,0.810848
988,2024-05-20,Tesco,sustainability,serlina boyd 42 two children branch hampshire....,neutral,0.810907,0,0.000000


In [60]:
# Group by company and calculate the average composite_score across the entire period
average_scores = news_df.groupby('Company')['Composite Score'].mean().reset_index()
average_scores

Unnamed: 0,Company,Composite Score
0,Asda,0.100442
1,Ford,-0.238266
2,Marks & Spencer,0.076164
3,Ocado,0.015891
4,Polestar,-0.18329
5,Sainsburys,0.244242
6,Stellantis,-0.280576
7,Tesco,-0.057992
8,Tesla,-0.141529
9,Toyota,-0.116886


In [61]:
# Convert 'date' column to datetime
news_df['Timestamp'] = pd.to_datetime(news_df['Timestamp'])

# Extract month and year from 'date'
news_df['Month'] = news_df['Timestamp'].dt.to_period('M')

news_df


Unnamed: 0,Timestamp,Company,Ticker,Text Summary,Sentiment,Confidence Score,Sentiment Value,Composite Score,Month
0,2024-07-29,Ford,esg,The schall law firm national shareholder right...,negative,0.915386,-1,-0.915386,2024-07
1,2024-07-29,Ford,esg,Investors purchased ford securities encouraged...,neutral,0.939777,0,0.000000,2024-07
2,2024-07-29,Ford,esg,ford alert bragar eagel amp squire pc investig...,neutral,0.866792,0,0.000000,2024-07
3,2024-07-29,Ford,esg,The development awaruite deposits canada may h...,neutral,0.751985,0,0.000000,2024-07
4,2024-07-29,Ford,esg,The refining sulfide nickel deposits usually r...,neutral,0.854346,0,0.000000,2024-07
...,...,...,...,...,...,...,...,...,...
985,2024-05-15,Tesco,governance,Tesco sparked controversy found ceo ken murphy...,negative,0.943846,-1,-0.943846,2024-05
986,2024-05-14,Tesco,governance,Crisisken murphy given £99m pay perks double p...,positive,0.752490,1,0.752490,2024-05
987,2024-05-14,Tesco,governance,Ken murphy received pay package worth £993 mil...,positive,0.810848,1,0.810848,2024-05
988,2024-05-20,Tesco,sustainability,serlina boyd 42 two children branch hampshire....,neutral,0.810907,0,0.000000,2024-05


In [62]:
# Group by company and month, then calculate the average composite_score
monthly_average_composite_scores = news_df.groupby(['Company', 'Month'])['Composite Score'].mean().reset_index()
monthly_average_composite_scores

Unnamed: 0,Company,Month,Composite Score
0,Asda,2024-06,0.682767
1,Asda,2024-07,0.003388
2,Ford,2024-05,0.0
3,Ford,2024-06,-0.329255
4,Ford,2024-07,-0.2293
5,Marks & Spencer,2024-05,0.235557
6,Marks & Spencer,2024-06,0.110732
7,Marks & Spencer,2024-07,0.03218
8,Ocado,2024-05,0.0
9,Ocado,2024-07,0.016321


In [63]:
# Convert 'date' column to datetime
news_df['Timestamp'] = pd.to_datetime(news_df['Timestamp'])

# Extract week of the year and year from 'date'
news_df['week_of_year'] = news_df['Timestamp'].dt.isocalendar().week
news_df['year'] = news_df['Timestamp'].dt.year

news_df

Unnamed: 0,Timestamp,Company,Ticker,Text Summary,Sentiment,Confidence Score,Sentiment Value,Composite Score,Month,week_of_year,year
0,2024-07-29,Ford,esg,The schall law firm national shareholder right...,negative,0.915386,-1,-0.915386,2024-07,31,2024
1,2024-07-29,Ford,esg,Investors purchased ford securities encouraged...,neutral,0.939777,0,0.000000,2024-07,31,2024
2,2024-07-29,Ford,esg,ford alert bragar eagel amp squire pc investig...,neutral,0.866792,0,0.000000,2024-07,31,2024
3,2024-07-29,Ford,esg,The development awaruite deposits canada may h...,neutral,0.751985,0,0.000000,2024-07,31,2024
4,2024-07-29,Ford,esg,The refining sulfide nickel deposits usually r...,neutral,0.854346,0,0.000000,2024-07,31,2024
...,...,...,...,...,...,...,...,...,...,...,...
985,2024-05-15,Tesco,governance,Tesco sparked controversy found ceo ken murphy...,negative,0.943846,-1,-0.943846,2024-05,20,2024
986,2024-05-14,Tesco,governance,Crisisken murphy given £99m pay perks double p...,positive,0.752490,1,0.752490,2024-05,20,2024
987,2024-05-14,Tesco,governance,Ken murphy received pay package worth £993 mil...,positive,0.810848,1,0.810848,2024-05,20,2024
988,2024-05-20,Tesco,sustainability,serlina boyd 42 two children branch hampshire....,neutral,0.810907,0,0.000000,2024-05,21,2024


In [31]:
# # Set display options to show more rows and columns
# pd.set_option('display.max_rows', None)  # Show all rows
# pd.set_option('display.max_columns', None)  # Show all columns


In [64]:
# Group by company and week of the year, then calculate the average composite_score
weekly_average_composite_scores = news_df.groupby(['Company', 'year', 'week_of_year'])['Composite Score'].mean().reset_index()

weekly_average_composite_scores.to_csv('../Data/Output/obj1_weekly_average_composite_scores.csv', index=False)

weekly_average_composite_scores

Unnamed: 0,Company,year,week_of_year,Composite Score
0,Asda,2024,23,0.682767
1,Asda,2024,28,0.883994
2,Asda,2024,30,-0.436915
3,Ford,2024,18,0.000000
4,Ford,2024,25,-0.227749
...,...,...,...,...
74,Toyota,2024,26,0.000000
75,Toyota,2024,28,0.091921
76,Toyota,2024,29,-0.152090
77,Toyota,2024,30,-0.178072


### END OF SECTION