# News Processing and Financial Sentiment Extraction Notebook

This notebook cleans the raw news (tweets) CSV file and then uses the processed news data to extract financial sentiment with the FinBERT model.

## News Processing

### Pip Install Commands

In [1]:
%pip install demoji
%pip install contractions

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Libraries

In [2]:
import demoji
import contractions
import pandas as pd

### Const Values

In [3]:
# column names used in the news dataframe
DATE_COLUMN_NAME = "Date"
TIME_COLUMN_NAME = "Time"
NEWS_COLUMN_NAME = "Tweet"
LEFTOVER_COLUMN_NAME = "Leftover"

# columns kept from the raw news CSV; every other column is dropped
NEWS_COLUMNS_LIST = [
    DATE_COLUMN_NAME,
    NEWS_COLUMN_NAME,
    LEFTOVER_COLUMN_NAME
]

# ampersand entity without its trailing semicolon; it is replaced with "&"
# by replace_raw_ampersand_value
RAW_AMPERSAND_VALUE = "&amp"

### Methods

In [4]:
def get_date_data(time_stamp):
    """Return the date part of a time stamp: the text before the first space."""
    date_part, _, _ = time_stamp.partition(" ")
    return date_part

In [5]:
def get_time_data(time_stamp):
    """Return the time part of a time stamp.

    The time is the second space-separated field of the time stamp, cut at
    the first "+". An IndexError is raised when the time stamp contains no
    space.
    """
    # second space-separated field, e.g. "01:00:00+00:00"
    clock_with_suffix = time_stamp.split(" ")[1]

    # keep only what precedes the first "+"
    clock, _, _ = clock_with_suffix.partition("+")
    return clock

In [6]:
def replace_raw_ampersand_value(text):
    """Return text with raw ampersand entities collapsed to a plain "&"."""
    # first turn every "&amp" into "&", then tidy up any "&;" that remains
    return text.replace(RAW_AMPERSAND_VALUE, "&").replace("&;", "&")

In [7]:
def merge_news_leftover(row):
    """Return the row's news text with its leftover text appended.

    When the leftover cell is missing (NaN) the news text is returned
    unchanged. Otherwise the leftover text is appended directly to the news
    text, with no separator between the two parts. If the news text contains
    the raw ampersand value, it is normalised before the leftover is appended.
    """
    merged_text = row[NEWS_COLUMN_NAME]
    has_raw_ampersand = RAW_AMPERSAND_VALUE in merged_text
    leftover_text = row[LEFTOVER_COLUMN_NAME]

    # nothing to merge for this row
    if pd.isna(leftover_text):
        return merged_text

    if has_raw_ampersand:
        merged_text = replace_raw_ampersand_value(merged_text)

    # join news and leftover text without any separator
    merged_text += f"{leftover_text}"
    return merged_text

In [8]:
def process_news_data(news_data):
    """Clean one raw news text and return the cleaned string.

    Steps, in order: keep the first line only, cut the text at the first
    "http", remove emojis, normalise raw ampersands, expand contractions and
    trim surrounding whitespace.
    """
    # keep only the main title: everything before the first line break
    headline, _, _ = news_data.partition("\n")

    # cut the text at the first link; the link and anything after it is dropped
    headline, _, _ = headline.partition("http")

    # remove emoji characters
    headline = demoji.replace(headline, repl="")

    headline = replace_raw_ampersand_value(headline)

    # expand contractions; slang expansion is switched off
    headline = contractions.fix(headline, slang=False)

    # trim leading and trailing whitespace
    return headline.strip()

### Code

Loading the news dataframe and deleting any columns other than the Date, news (Tweet) and Leftover columns

In [9]:
# load the raw news (tweets) CSV into a dataframe
news_df = pd.read_csv("../Data/tweets.csv")

In [10]:
# collect every column that is not listed in NEWS_COLUMNS_LIST ...
unwanted_columns = [
    column_name
    for column_name in news_df.columns
    if column_name not in NEWS_COLUMNS_LIST
]

# ... and drop them all in a single call
news_df = news_df.drop(columns=unwanted_columns)

Splitting Date data to separate Date and Time columns

In [11]:
# raw time stamps, read once before the date column is overwritten
raw_time_stamps = news_df[DATE_COLUMN_NAME]

# new column holding the time part of each time stamp
news_df[TIME_COLUMN_NAME] = raw_time_stamps.apply(get_time_data)

# the existing date column now keeps only the date part
news_df[DATE_COLUMN_NAME] = raw_time_stamps.apply(get_date_data)

Processing the news data

In [12]:
# first cleaning pass over the news column
cleaned_news = news_df[NEWS_COLUMN_NAME].apply(process_news_data)
news_df[NEWS_COLUMN_NAME] = cleaned_news

Concatenating misaligned news data with the main news column

In [13]:
# append each row's leftover text to its news text
merged_news = news_df.apply(merge_news_leftover, axis="columns")
news_df[NEWS_COLUMN_NAME] = merged_news

# the leftover column is no longer needed once it has been merged
news_df = news_df.drop(columns=LEFTOVER_COLUMN_NAME)

In [14]:
# clean the news column again, now that leftover text has been appended to it
recleaned_news = news_df[NEWS_COLUMN_NAME].apply(process_news_data)
news_df[NEWS_COLUMN_NAME] = recleaned_news

In [15]:
# preview the first rows of the processed news dataframe
news_df.head()

Unnamed: 0,Date,Tweet,Time
0,2023-04-10,"Tesla to open a new Megafactory in Shanghai, C...",01:00:00
1,2023-04-10,#5things: China holds military drills around T...,00:06:05
2,2023-04-09,WATCH: Tesla Chief Executive Elon Musk is maki...,22:10:00
3,2023-04-09,Tesla’s Model S and X are starting to show the...,21:41:02
4,2023-04-09,"How the market’s biggest companies, from Apple...",20:00:00


Saving processed dataframe

In [16]:
# write the processed news to disk; the dataframe index is written as the
# first CSV column because index=False is not passed
news_df.to_csv("../Data/processed_tweets.csv")

## Financial Sentiment Extraction

### Pip Install Commands

In [17]:
%pip install torch
%pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Libraries

In [18]:
import torch
from enum import Enum
from torch.nn.functional import softmax
from transformers import BertTokenizer, BertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


### Const Values

In [19]:
# name of the sentiment column added to the stock dataframe
SENTIMENT_COLUMN_NAME = "Sentiment"

class Sentiment(Enum):
    """Positions of the sentiment classes in the model's probability vector.

    The member values are used as indices into the array returned by
    get_sentiment_probabilities.

    NOTE(review): assumed to match the label order of ProsusAI/finbert;
    confirm against the model config (id2label).
    """
    POSITIVE = 0
    NEGATIVE = 1
    NEUTRAL  = 2

# tokenizer loaded from the 'ProsusAI/finbert' checkpoint
TOKENIZER = BertTokenizer.from_pretrained(
    'ProsusAI/finbert')

# sequence classification model loaded from the same checkpoint
MODEL = BertForSequenceClassification.from_pretrained(
    'ProsusAI/finbert')

### Methods

In [20]:
def get_news_from_date(news_df, date):
    """Return the rows of news_df whose date column, as text, equals date."""
    dates_as_text = news_df[DATE_COLUMN_NAME].astype(str)
    is_on_date = dates_as_text == date
    return news_df[is_on_date]

In [21]:
def get_sentiment_probabilities(news_data):
    """Return the model's class probabilities for one news text.

    The text is tokenized with TOKENIZER, passed through MODEL without
    gradient tracking, and the logits are turned into probabilities with a
    softmax over the class dimension.

    Args:
        news_data: the news text to classify.

    Returns:
        A numpy array with one probability per class for the first input
        sequence; positions are described by the Sentiment enum.
    """
    # tokenize input; max_length is passed explicitly so that truncation does
    # not depend on the tokenizer declaring its own maximum length, and the
    # encoded input can never exceed the model's position embeddings
    inputs = TOKENIZER(news_data,
                       return_tensors='pt',
                       truncation=True,
                       max_length=MODEL.config.max_position_embeddings,
                       padding=True)

    # perform prediction without tracking gradients
    with torch.no_grad():
        outputs = MODEL(**inputs)

    # get logits and apply softmax over the class dimension
    logits = outputs.logits
    probabilities = softmax(logits, dim=1)

    # convert pytorch tensor to numpy array and keep the first sequence's row
    return probabilities.numpy()[0]

In [22]:
def get_sentiment_score(date):
    """Return the news sentiment category for a date.

    The news published on the given date is read from the notebook-level
    news_df. The positive and negative probabilities of every news item are
    averaged and compared.

    Returns:
        0 when no news was published on the date, 1 when the average
        positive probability is strictly greater than the average negative
        probability, and -1 otherwise (ties included).
    """
    # news published on the given date
    news_of_the_day = get_news_from_date(news_df, date)
    news_count = len(news_of_the_day)

    # no news on that date: nothing to score
    if news_count == 0:
        return 0

    positive_total = 0
    negative_total = 0

    for headline in news_of_the_day[NEWS_COLUMN_NAME]:
        probabilities = get_sentiment_probabilities(headline)

        # accumulate the positive and negative probabilities
        positive_total += probabilities[Sentiment.POSITIVE.value]
        negative_total += probabilities[Sentiment.NEGATIVE.value]

    # turn the sums into averages over the day's news
    positive_avg = positive_total / news_count
    negative_avg = negative_total / news_count

    # positive wins only when strictly larger than negative
    return 1 if positive_avg > negative_avg else -1

### Code

Loading TESLA stock data frame

In [23]:
# load the TESLA stock data and preview its first rows
tesla_stock_df = pd.read_csv("../Data/TSLA.csv")
tesla_stock_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,3.8,5.0,3.508,4.778,4.778,93831500
1,2010-06-30,5.158,6.084,4.66,4.766,4.766,85935500
2,2010-07-01,5.0,5.184,4.054,4.392,4.392,41094000
3,2010-07-02,4.6,4.62,3.742,3.84,3.84,25699000
4,2010-07-06,4.0,4.0,3.166,3.222,3.222,34334500


Getting sentiment score for each trading day in TESLA stock data frame

In [24]:
# score every trading day from the news published on that date
trading_dates = tesla_stock_df[DATE_COLUMN_NAME]
tesla_stock_df[SENTIMENT_COLUMN_NAME] = trading_dates.apply(
    get_sentiment_score)

Saving the processed TESLA stock data frame

In [30]:
# write the stock data with its new sentiment column to disk; the dataframe
# index is written as the first CSV column because index=False is not passed
tesla_stock_df.to_csv("../Data/processed_TSLA.csv")