# Sentiment Analysis

## Set up

### Write to excel

In [None]:
import time
import os
import pandas as pd
from google.colab import drive
# Mount Google Drive so the workbooks below are reachable under /content/drive.
drive.mount('/content/drive')

# Output workbooks: the model functions are called with one of these paths and
# write their results into it as a sheet.
numerical_excel_path = '/content/drive/My Drive/Colab Notebooks/numerical_vTest.xlsx'
contextual_excel_path = '/content/drive/My Drive/Colab Notebooks/contextual_vTest.xlsx'


def write_to_excel(result, sheet_name, excel_path):
    """Write sentiment results to a sheet of an Excel workbook.

    Args:
        result: Sequence of ``(title, sentiment, confidence)`` rows. An empty
            sequence produces a sheet that contains only the header row.
        sheet_name: Name of the sheet to write.
        excel_path: Path of the ``.xlsx`` workbook.

    The workbook is opened in append mode when it already exists; a clashing
    sheet name then gets a new sheet instead of overwriting the old one. When
    the workbook does not exist yet it is created, because append mode would
    fail on a missing file.
    """
    # Passing ``columns=`` to the constructor also works for an empty result,
    # whereas assigning ``df.columns`` afterwards raises a length mismatch.
    df = pd.DataFrame(result, columns=['title', 'sentiment', 'confidence'])

    # ``if_sheet_exists`` is only accepted in append mode, so the options
    # have to be chosen together with the mode.
    if os.path.exists(excel_path):
        writer_options = {'mode': 'a', 'if_sheet_exists': 'new'}
    else:
        writer_options = {'mode': 'w'}

    with pd.ExcelWriter(excel_path, engine='openpyxl', **writer_options) as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
!ls -l {numerical_excel_path}

In [None]:
!unzip -t {numerical_excel_path}

### Reading news data - [Twitter Value](https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment)

In [None]:
numerical_filename = 'Numerical.xlsx'
search_path = '/content/drive/My Drive/Colab Notebooks'

# Walk the Drive folder and stop at the first directory that contains the
# workbook.
file_path = None
for root, dirs, files in os.walk(search_path):
    if numerical_filename in files:
        file_path = os.path.join(root, numerical_filename)
        break

# Fail with a clear message instead of handing None to pandas.
if file_path is None:
    raise FileNotFoundError(
        f"{numerical_filename} not found under {search_path}"
    )

numerical_df = pd.read_excel(file_path)

# The first column is what the model functions receive as titles.
numerical_titles = numerical_df.iloc[:, 0]
numerical_titles

In [None]:
contextual_filename = 'Contextual.xlsx'
search_path = '/content/drive/My Drive/Colab Notebooks'

# Walk the Drive folder and stop at the first directory that contains the
# workbook.
file_path = None
for root, dirs, files in os.walk(search_path):
    if contextual_filename in files:
        file_path = os.path.join(root, contextual_filename)
        break

# Fail with a clear message instead of handing None to pandas.
if file_path is None:
    raise FileNotFoundError(
        f"{contextual_filename} not found under {search_path}"
    )

contextual_df = pd.read_excel(file_path)

# The first column is what the model functions receive as titles.
contextual_titles = contextual_df.iloc[:, 0]
contextual_titles

## Sentiment analysis functions for each model

### 1. [TextBlob](https://github.com/sloria/TextBlob) - NaiveBayesAnalyzer

`NaiveBayesAnalyzer` uses [`nltk`](https://github.com/nltk/nltk); the model was trained on a movie-review corpus.

In [None]:
# Run TextBlob's corpus downloader.
!python -m textblob.download_corpora

In [None]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

def textblob(titles, excel_path):
    """Classify each title with TextBlob's NaiveBayesAnalyzer and save the results.

    Args:
        titles: Iterable of title strings to analyse.
        excel_path: Path of the ``.xlsx`` workbook that receives a
            "TextBlob" sheet.

    Returns:
        Wall-clock seconds spent in the classification loop. Writing the
        workbook is not included.
    """
    # Share one analyzer across all titles. The analyzer trains lazily per
    # instance, so constructing a new one for every title repeats the
    # training for every title.
    analyzer = NaiveBayesAnalyzer()
    excel_result = []

    start_time = time.time()
    for title in titles:
        sentiment = TextBlob(title, analyzer=analyzer).sentiment
        excel_result.append(
            (title, sentiment.classification, sentiment.p_pos, sentiment.p_neg)
        )
    duration = time.time() - start_time

    df = pd.DataFrame(
        excel_result, columns=['title', 'classification', 'p_pos', 'p_neg']
    )

    # Append to an existing workbook, or create it when it is missing.
    # ``if_sheet_exists`` is only accepted in append mode.
    if os.path.exists(excel_path):
        writer_options = {'mode': 'a', 'if_sheet_exists': 'new'}
    else:
        writer_options = {'mode': 'w'}

    with pd.ExcelWriter(excel_path, engine='openpyxl', **writer_options) as writer:
        # The index column is written too, as before.
        df.to_excel(writer, sheet_name="TextBlob")

    return duration

### 2. [VADER](https://github.com/cjhutto/vaderSentiment)

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from zipfile import BadZipFile

# Extra location on Drive where NLTK should look for its data.
custom_nltk_path = '/content/drive/My Drive/Colab Notebooks/nltk_data'
nltk.data.path.append(custom_nltk_path)

try:
    nltk.download('vader_lexicon', force=True)
except BadZipFile as e:
    # BadZipFile is a plain Exception without a ``filename`` attribute, so
    # report the exception text itself rather than ``e.filename``.
    print(f"Failed to open: {e}")


def vader(titles, excel_path):
    """Score each title with VADER and store the results in a "VADER" sheet.

    The compound score decides the label: ``>= 0.05`` is positive,
    ``<= -0.05`` is negative, anything in between is neutral. The absolute
    compound score is recorded as the confidence.

    Args:
        titles: Iterable of title strings to analyse.
        excel_path: Path of the workbook passed on to ``write_to_excel``.

    Returns:
        Wall-clock seconds spent scoring the titles. Writing the workbook is
        not included.
    """
    analyzer = SentimentIntensityAnalyzer()
    rows = []

    started = time.time()
    for headline in titles:
        compound = analyzer.polarity_scores(headline)['compound']

        label = (
            'positive' if compound >= 0.05
            else 'negative' if compound <= -0.05
            else 'neutral'
        )

        # The magnitude of the compound score doubles as the confidence.
        rows.append((headline, label, abs(compound)))
    finished = time.time()

    elapsed = finished - started

    write_to_excel(rows, "VADER", excel_path)

    return elapsed

### 3. fine-tuned [DistilRoBERTa](https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis) (Transformers!)

This is a transformer model that has been fine-tuned on the [financial phrasebank](https://huggingface.co/datasets/financial_phrasebank) dataset.

In [None]:
from transformers import pipeline

def distilroberta(titles, excel_path):
    """Classify each title with the fine-tuned DistilRoBERTa model and save the results.

    Args:
        titles: Iterable of title strings to analyse.
        excel_path: Path of the workbook passed on to ``write_to_excel``;
            results go to a "DistilRoBERTa" sheet.

    Returns:
        Wall-clock seconds spent in the classification loop. Loading the
        model and writing the workbook are not included.
    """
    MODEL = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
    pipe = pipeline("text-classification", model=MODEL)
    excel_result = []

    start_time = time.time()

    # Iterate the titles that were passed in rather than the module-level
    # ``numerical_titles``, so the contextual run analyses its own data.
    for title in titles:
        data = pipe(title)
        excel_result.append((title, data[0]['label'], data[0]['score']))

    duration = time.time() - start_time

    write_to_excel(excel_result, "DistilRoBERTa", excel_path)

    return duration

### 4. [FinBERT](https://huggingface.co/yiyanghkust/finbert-tone) - 2022

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Placeholder Hugging Face token.
# NOTE(review): no code visible in this notebook reads it — confirm whether it is still needed.
HF_TOKEN = "HF_TOKEN"

def finbert(titles, excel_path):
    """Classify each title with FinBERT (finbert-tone) and save the results.

    Args:
        titles: Iterable of titles; every item is converted with ``str()``
            before it is analysed.
        excel_path: Path of the workbook passed on to ``write_to_excel``;
            results go to a "FinBERT" sheet.

    Returns:
        Wall-clock seconds spent in the classification loop. Loading the
        model and writing the workbook are not included.
    """
    # Named ``model`` so the local does not shadow this function's own name.
    model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
    tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

    sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

    titles = [str(title) for title in titles]

    # The earlier untimed batch call over all titles was dropped: its result
    # was never used and it ran every inference twice.
    start_time = time.time()
    excel_result = []

    for title in titles:
        sentiment_result = sentiment_analysis(title)
        excel_result.append((title, sentiment_result[0]['label'], sentiment_result[0]['score']))

    duration = time.time() - start_time

    write_to_excel(excel_result, "FinBERT", excel_path)

    return duration

### 5. [BART Large Mnli](https://huggingface.co/facebook/bart-large-mnli)

In [None]:
# Install the Hugging Face transformers package.
# NOTE(review): earlier cells already import transformers — consider moving this install to the setup section.
!pip install transformers

In [None]:
from transformers import pipeline

def bart(titles, excel_path):
    """Zero-shot classify each title with facebook/bart-large-mnli and save the results.

    Each title is scored against the candidate labels "positive", "negative"
    and "neutral"; the first label and score of the classifier's result are
    recorded.

    Args:
        titles: Iterable of title strings to analyse.
        excel_path: Path of the workbook passed on to ``write_to_excel``.

    Returns:
        Wall-clock seconds spent in the classification loop. Loading the
        model and writing the workbook are not included.
    """
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

    candidate_labels = ["positive", "negative", "neutral"]

    start_time = time.time()
    excel_result = []

    # Iterate the titles that were passed in rather than the module-level
    # ``numerical_titles``, so the contextual run analyses its own data.
    for title in titles:
        result = classifier(title, candidate_labels)
        sentiment = result['labels'][0]
        confidence = result['scores'][0]
        excel_result.append((title, sentiment, confidence))

    duration = time.time() - start_time

    # NOTE(review): the sheet name says "Mini" although the model is "MNLI";
    # it is left unchanged so existing workbooks keep the same sheet name.
    write_to_excel(excel_result, "BART Large Mini", excel_path)

    return duration

### 6. [BERT](https://huggingface.co/docs/transformers/model_doc/bert)

Since BERT itself doesn't natively support zero-shot classification the way models like `facebook/bart-large-mnli` do, we use a model that has been fine-tuned for sequence classification and can infer sentiment directly. One such model is [nlptown/bert-base-multilingual-uncased-sentiment](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment), which is available on the Hugging Face model hub. It predicts a star rating ("1 star" to "5 stars"), which the function below maps to negative (1–2 stars), neutral (3 stars), or positive (4–5 stars).

In [None]:
from transformers import pipeline

def bert(titles, excel_path):
    """Classify each title with nlptown's multilingual BERT model and save the results.

    The model's star-rating labels are collapsed into three classes:
    "1 star"/"2 stars" become negative, "4 stars"/"5 stars" become positive,
    and every other label becomes neutral.

    Args:
        titles: Iterable of title strings to analyse.
        excel_path: Path of the workbook passed on to ``write_to_excel``;
            results go to a "BERT" sheet.

    Returns:
        Wall-clock seconds spent in the classification loop. Loading the
        model and writing the workbook are not included.
    """
    classifier = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

    excel_result = []

    start_time = time.time()

    # Iterate the titles that were passed in rather than the module-level
    # ``numerical_titles``, so the contextual run analyses its own data.
    for title in titles:
        results = classifier(title)

        sentiment = results[0]['label']
        if sentiment in ("1 star", "2 stars"):
            generalized_sentiment = "negative"
        elif sentiment in ("4 stars", "5 stars"):
            generalized_sentiment = "positive"
        else:
            generalized_sentiment = "neutral"
        confidence = results[0]['score']

        excel_result.append((title, generalized_sentiment, confidence))

    duration = time.time() - start_time

    write_to_excel(excel_result, "BERT", excel_path)

    return duration

## Conducting sentiment analysis

In [None]:
# Run every model on both title sets. Each call writes a sheet into the
# matching workbook and returns the seconds spent classifying.

# 1. TextBlob (NaiveBayesAnalyzer)
numerical_textblob_duration = textblob(numerical_titles, numerical_excel_path)
contextual_textblob_duration = textblob(contextual_titles, contextual_excel_path)
print(f"numerical_textblob_duration: {numerical_textblob_duration} seconds")
print(f"contextual_textblob_duration: {contextual_textblob_duration} seconds")

# 2. VADER
numerical_vader_duration = vader(numerical_titles, numerical_excel_path)
contextual_vader_duration = vader(contextual_titles, contextual_excel_path)
print(f"numerical_vader_duration: {numerical_vader_duration} seconds")
print(f"contextual_vader_duration: {contextual_vader_duration} seconds")

# 3. DistilRoBERTa (fine-tuned on financial news)
numerical_distilroberta_duration = distilroberta(numerical_titles, numerical_excel_path)
contextual_distilroberta_duration = distilroberta(contextual_titles, contextual_excel_path)
print(f"numerical_distilroberta_duration: {numerical_distilroberta_duration} seconds")
print(f"contextual_distilroberta_duration: {contextual_distilroberta_duration} seconds")

# 4. FinBERT (finbert-tone)
numerical_finbert_duration = finbert(numerical_titles, numerical_excel_path)
contextual_finbert_duration = finbert(contextual_titles, contextual_excel_path)
print(f"numerical_finbert_duration: {numerical_finbert_duration} seconds")
print(f"contextual_finbert_duration: {contextual_finbert_duration} seconds")

# 5. BART Large MNLI (zero-shot)
numerical_bart_duration = bart(numerical_titles, numerical_excel_path)
contextual_bart_duration = bart(contextual_titles, contextual_excel_path)
print(f"numerical_bart_duration: {numerical_bart_duration} seconds")
print(f"contextual_bart_duration: {contextual_bart_duration} seconds")

# 6. BERT (nlptown multilingual sentiment model)
numerical_bert_duration = bert(numerical_titles, numerical_excel_path)
contextual_bert_duration = bert(contextual_titles, contextual_excel_path)
print(f"numerical_bert_duration: {numerical_bert_duration} seconds")
print(f"contextual_bert_duration: {contextual_bert_duration} seconds")