<a href="https://colab.research.google.com/github/nikhilmenon06/Financial-News-Classification-using-BERT/blob/main/FinancialTextClassify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import tensorflow_text as text

In [2]:
# Load the sentence/label file into a two-column DataFrame ('text', 'label').
# A multi-character `sep` is interpreted by pandas as a regular expression, which only the
# Python parsing engine supports. Requesting engine='python' explicitly avoids the
# ParserWarning that is emitted when pandas silently falls back from the C engine.
# NOTE(review): in the regex '.@' the dot matches ANY character, so the character right
# before '@' is consumed as part of the separator — confirm this is intended.
data = pd.read_csv('/Sentences_50Agree.txt', sep='.@', names=['text','label'], encoding = "ISO-8859-1", engine='python')

  return func(*args, **kwargs)


In [3]:
# Render the loaded DataFrame with the notebook's rich display to inspect the raw rows.
display(data)

Unnamed: 0,text,label
0,"According.. to Gran , the company has no plans...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive
...,...,...
4841,LONDON MarketWatch -- Share prices ended lower...,negative
4842,Rinkuskiai 's beer sales fell by 6.5 per cent ...,neutral
4843,Operating profit fell to EUR 35.4 mn from EUR ...,negative
4844,Net sales of the Paper segment decreased to EU...,negative


In [9]:
# Clean the raw text in three steps:
#   1. lower-case everything
#   2. strip leading/trailing whitespace
#   3. remove every character that is neither a word character nor whitespace
# The pattern is a raw string so that \w and \s reach the regex engine verbatim instead of
# being parsed as (invalid) Python string escapes, which triggers a warning on newer Pythons.
# NOTE(review): step 3 also removes decimal points, so "6.5" becomes "65" — confirm this
# loss of numeric information is acceptable for the model.
data['clean_text'] = (
    data['text']
    .str.lower()
    .str.strip()
    .str.replace(r'[^\w \s]', '', regex=True)
)

In [8]:
# Display the DataFrame again to inspect the added clean_text column next to the raw text.
display(data)

Unnamed: 0,text,label,clean_text
0,"According.. to Gran , the company has no plans...",neutral,according to gran the company has no plans to...
1,Technopolis plans to develop in stages an area...,neutral,technopolis plans to develop in stages an area...
2,The international electronic industry company ...,negative,the international electronic industry company ...
3,With the new production plant the company woul...,positive,with the new production plant the company woul...
4,According to the company 's updated strategy f...,positive,according to the company s updated strategy fo...
...,...,...,...
4841,LONDON MarketWatch -- Share prices ended lower...,negative,london marketwatch share prices ended lower i...
4842,Rinkuskiai 's beer sales fell by 6.5 per cent ...,neutral,rinkuskiai s beer sales fell by 65 per cent to...
4843,Operating profit fell to EUR 35.4 mn from EUR ...,negative,operating profit fell to eur 354 mn from eur 6...
4844,Net sales of the Paper segment decreased to EU...,negative,net sales of the paper segment decreased to eu...


In [10]:
# Optional sanity check of the class distribution: data.groupby('label').describe()

# One-hot encode the sentiment label -> positive: [1,0,0], neutral: [0,1,0];
# any other label value falls through to the negative encoding [0,0,1].
# The lookup table is rebuilt on every call so each row receives its own list object.
data['label_enc'] = data['label'].apply(
    lambda sentiment: {'positive': [1,0,0], 'neutral': [0,1,0]}.get(sentiment, [0,0,1])
)
display(data)


Unnamed: 0,text,label,clean_text,label_enc
0,"According.. to Gran , the company has no plans...",neutral,according to gran the company has no plans to...,"[0, 1, 0]"
1,Technopolis plans to develop in stages an area...,neutral,technopolis plans to develop in stages an area...,"[0, 1, 0]"
2,The international electronic industry company ...,negative,the international electronic industry company ...,"[0, 0, 1]"
3,With the new production plant the company woul...,positive,with the new production plant the company woul...,"[1, 0, 0]"
4,According to the company 's updated strategy f...,positive,according to the company s updated strategy fo...,"[1, 0, 0]"
...,...,...,...,...
4841,LONDON MarketWatch -- Share prices ended lower...,negative,london marketwatch share prices ended lower i...,"[0, 0, 1]"
4842,Rinkuskiai 's beer sales fell by 6.5 per cent ...,neutral,rinkuskiai s beer sales fell by 65 per cent to...,"[0, 1, 0]"
4843,Operating profit fell to EUR 35.4 mn from EUR ...,negative,operating profit fell to eur 354 mn from eur 6...,"[0, 0, 1]"
4844,Net sales of the Paper segment decreased to EU...,negative,net sales of the paper segment decreased to eu...,"[0, 0, 1]"


In [11]:
# Hold out 30% of the rows for testing.
# Stratifying on the label keeps the ratio of classes the same in the train and test sets
# (note: this is not the same as balancing the classes).
# A fixed random_state makes the split identical on every run, so downstream results can be
# reproduced after a kernel restart.
X_train, X_test, Y_train, Y_test = train_test_split(
    data['clean_text'],
    data['label_enc'],
    test_size=0.30,
    stratify=data['label'],
    random_state=42,
)

In [12]:
# Print the class distribution of the training labels first, then of the test labels.
for split_labels in (Y_train, Y_test):
    print(split_labels.value_counts())

[0, 1, 0]    2015
[1, 0, 0]     954
[0, 0, 1]     423
Name: label_enc, dtype: int64
[0, 1, 0]    864
[1, 0, 0]    409
[0, 0, 1]    181
Name: label_enc, dtype: int64


In [13]:
# TF Hub URLs for the Small BERT pipeline:
#   preprocess_url - preprocessor (model that transforms text into a BERT-compatible input format)
#   encoder_url    - encoder (the actual BERT model)
preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
encoder_url = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2'

In [14]:
# hub.KerasLayer wraps a saved model as a Keras layer.
# The preprocessor is loaded first, then the encoder; the resulting layers can be
# called like ordinary functions.
bert_preprocess, bert_enc = (
    hub.KerasLayer(model_url) for model_url in (preprocess_url, encoder_url)
)

In [37]:
# Demo of the BERT preprocessor on two short sample sentences.
text_test = ["it is such an honour", "king has a lot of power"]

# Calling the preprocessing layer like a function transforms the text into BERT input form;
# the result is a dict object.
text_test_pre = bert_preprocess(text_test)

# Each call prints a heading followed by the corresponding value on the next line.
print("Keys are:", text_test_pre.keys(), sep="\n")
print("Return type:", type(text_test_pre), sep="\n")
print("Input word ID", text_test_pre["input_word_ids"], sep="\n")
print("Input Mask", text_test_pre["input_mask"], sep="\n")

Keys are:
dict_keys(['input_word_ids', 'input_mask', 'input_type_ids'])
Return type:
<class 'dict'>
Input word ID
tf.Tensor(
[[ 101 2009 2003 2107 2019 6225  102    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [ 101 2332 2038 1037 2843 1997 2373  102    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    

# Preparing text for BERT
### The BERT preprocessing layer takes a set of texts as input and transforms them into a format that is compatible with the input expected by the BERT encoder model.
### The BERT preprocessing layer returns a dictionary object with 3 keys.


1.   **input_word_ids** - Represents each token in the text segment as an ID from the existing, pre-trained vocabulary, along with the IDs for the special [CLS] token at the beginning and the [SEP] token marking the end of the segment.
2.   **input_mask** - Padding mask, i.e., it shows whether a given token position contains an actual token or is a zero-padded position.
3. **input_type_ids** - Segment IDs, i.e., they show which segment each token belongs to. This is evident only if there are 2 segments in a given text.

### Each of the above is a tensor of shape (dataset size, seq_length). With the preprocessing used here, the sequence length is 128 (the original BERT model supports up to 512). If a sequence is shorter than 128 tokens, padding is added up to 128 so that every data point has the same length.




In [39]:
# Feed the preprocessed demo batch (text_test_pre) to the BERT encoder layer.
bert_output = bert_enc(text_test_pre)

In [41]:
# List the keys of the encoder result to see which outputs it provides.
print(bert_output.keys())

dict_keys(['pooled_output', 'encoder_outputs', 'sequence_output', 'default'])
