# Load Document and Pre-process Data

Import all the required packages that is required for the set of tasks.

Note: The overall results are available at the bottom of the notebook (Last 4 cells)

In [5]:
!pip install nltk gensim
!python -m spacy download en_core_web_sm
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import gensim.downloader as api
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import statistics as st
import pandas as pd
import math
import re

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [6]:
# Function to read the sample.txt data
def read_text_file(file_path):
    try:
        with open(file_path, 'r') as file:
            text = file.read()
        return text
    except FileNotFoundError:
        print("File not found.")
        return None

# Read the txt file
file_path = "sample.txt"  # Change this to the path of your text document
text = read_text_file(file_path)

The data seems to be like a social media dataset, very similar to twitter. Data should be preprocessed accordingly.

In [7]:
#Function to split the tweets in to array of tweets
def split_into_tweets(text):
    pattern = re.compile(r'(RT\s)?@\w+:?\s?')
    tweets = pattern.split(text)
    # Remove None and Text values "RT "
    tweets = [x for x in tweets if x is not None and x != 'RT ']
    tweets = [tweet.strip() for tweet in tweets]
    return tweets


In [8]:
tweets = split_into_tweets(text)

#Verification Commands
print("Array length of tweets:")
len(tweets)

Array length of tweets:


408

In [9]:
# Preprocessing function
def preprocess(document):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    words = word_tokenize(document.lower())
    filtered_words = [stemmer.stem(word) for word in words if word not in stop_words and word.isalnum()]
    return filtered_words

# Identify Tags

## Gold Tagging

In [10]:
# Gold as a keyword
keywords = ['gold']

In [11]:
# Filter documents based on keywords
filtered_documents = []
for document in tweets:
    words = preprocess(document)
    if any(keyword in words for keyword in keywords):
        filtered_documents.append(document)

In [12]:
len(filtered_documents)

220

In [13]:
# Loading a pre-trained word2vec model
model = api.load("glove-wiki-gigaword-50")



In [14]:
# A function to check if the document is related to financial commodities
def is_financial_context(doc, model):
    financial_keywords = ['finance', 'investment', 'market', 'economy', 'stock', 'trade']
    doc_tokens = preprocess(doc)
    for token in doc_tokens:
        for keyword in financial_keywords:
            try:
                if model.similarity(token, keyword) > 0.8:  # Adjust threshold as needed, decided 0.8 after trial and error method
                    return True
            except KeyError:  # In case token is not in the vocabulary
                continue
    return False

In [15]:
# Apply the function to filter documents and fetch gold referred as financial commodity documents
financial_documents = [doc for doc in filtered_documents if is_financial_context(doc, model)]

## Silver Tag

In [16]:
# Silver as a keyword
silver_keywords = ['silver']

In [17]:
# Filter documents based on keywords
silver_filtered_documents = []
for document in tweets:
    words = preprocess(document)
    if any(keyword in words for keyword in silver_keywords):
        silver_filtered_documents.append(document)

In [18]:
len(silver_filtered_documents)

74

In [19]:
# Apply the function to filter documents and fetch silver referred as financial commodity documents
silver_financial_documents = [doc for doc in silver_filtered_documents if is_financial_context(doc, model)]


Going ahead to understand how different sentiment analyzer models are performing, we will be using VADER and BERT.

# VADER Sentiment Analysis

In [20]:
# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

## Gold Sentiments

In [35]:
# Array of sentences
vgold_sentences = financial_documents
vgold_final_scores = []
# Perform sentiment analysis for each sentence
for sentence in vgold_sentences:
    # Get sentiment scores
    scores = sid.polarity_scores(sentence)
    # Print the sentence and its sentiment scores
    vgold_final_scores.append(scores['compound'])

## Silver Sentiments

In [22]:
# Array of sentences
vsilver_sentences = silver_financial_documents
vsilver_final_scores = []
# Perform sentiment analysis for each sentence
for sentence in vsilver_sentences:
    # Get sentiment scores
    scores = sid.polarity_scores(sentence)
    # Print the sentence and its sentiment scores
    vsilver_final_scores.append(scores['compound'])


# BERT Sentiment Analysis

In [23]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax
from transformers import pipeline
import torch

In [24]:
# Load BERT tokenizer and model, we will be using a pretained model that is avaiable in the huggingface site
bert_model = BertForSequenceClassification.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")
bert_tokenizer = BertTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis",max_length=4056)
bert_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/369 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/464k [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [25]:
def analyze_sentiment_bert(sentence):
    # Tokenize input
    inputs = bert_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    # Forward pass
    outputs = bert_model(**inputs)
    # Get probabilities
    probabilities = softmax(outputs.logits, dim=1)
    # Get predicted label (0 for negative, 1 for positive)
    predicted_label = torch.argmax(probabilities, dim=1).item()
    return probabilities.tolist()[0], predicted_label

In [26]:
def max_count_value(arr):
    # Count occurrences of 0 and 1
    count_1 = arr.count(1)
    threshold = math.ceil(len(arr)/2)

    # Return the value with maximum count
    if count_1 > threshold:
        return 1
    else:
        # If counts are equal, return 0
        return 0

## Gold Sentiments

In [27]:
# Array of sentences
sentences = financial_documents

In [29]:
# Perform sentiment analysis for each sentence using BERT
bgold_predicted_label_arr = []
for sentence in sentences:
  probabilities, predicted_label = analyze_sentiment_bert(sentence)
  bgold_predicted_label_arr.append(predicted_label)

## Silver Sentiments

In [30]:
# Array of sentences
sentences = silver_financial_documents

In [31]:
bsilver_predicted_label_arr = []
for sentence in sentences:
  probabilities, predicted_label = analyze_sentiment_bert(sentence)
  bsilver_predicted_label_arr.append(predicted_label)

# Results

In [32]:
def overall_sentiment(isVader,input_arr):
  if isVader:
    average_sentiment_score = st.mean(input_arr)
    if average_sentiment_score >= 0.5:
      print("Overall Sentiment to Gold based on VADER is:\t Positive")
    else:
      print("Overall Sentiment to Gold based on VADER is:\t Positive")
  else:
    if max_count_value(input_arr) == 1:
      print("Overall Sentiment to Gold based on BERT is:\t Positive")
    else:
      print("Overall Sentiment to Gold based on BERT is:\t Positive")

In [37]:
# VADER Overall Gold Sentiments
overall_sentiment(True, vgold_final_scores)

Overall Sentiment to Gold based on VADER is:	 Positive


In [38]:
# VADER Overall Silver Sentiments
overall_sentiment(True, vsilver_final_scores)

Overall Sentiment to Gold based on VADER is:	 Positive


In [39]:
# BERT Overall Gold Sentiments
overall_sentiment(False, bgold_predicted_label_arr)

Overall Sentiment to Gold based on BERT is:	 Positive


In [40]:
# BERT Overall Silver Sentiments
overall_sentiment(False, bsilver_predicted_label_arr)

Overall Sentiment to Gold based on BERT is:	 Positive


The above sentiments towards gold and silver seems to be mostly positive. That clearly indicates that it's a good investment plan to invest in Gold and Silver according to the dataset.