In [58]:

import numpy as np
import pandas as pd

import nltk
import re
import string
import time
import csv
import scipy
nltk.download('stopwords')
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig
from transformers import RobertaTokenizer, RobertaForSequenceClassification

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:

def clean_text_data(tweet):
  # Transform to lower case
  tweet = tweet.lower()
  #Remove url, square and round brackets, mentions
  tweet = re.sub(r"(http[s]?\://\S+)|([\[\(].*[\)\]])|([@]\S+)", "",tweet)
  #Remove numbers
  tweet = re.sub(r'\d', '', tweet) #reconsider this step
  #Remove stop words
  tweet = ' '.join([word for word in tweet.split() if word not in cachedStopWords  and  word.isascii()])
  #Remove Punctuations
  tweet = tweet.translate(str.maketrans('', '','!"#$%&.\'()*+,-/:;<=>?@[\]^_`{|}~'))
  #Remove extra spaces
  tweet = re.sub('[\s]+', ' ', tweet)

  return tweet

In [60]:
finbert_tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert")
finbert_model = BertForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [61]:
labels = finbert_model.config.id2label.values()
def getSenimentScoreByFinBERT(clean_text):
  clean_data = finbert_tokenizer(clean_text, padding=True, truncation=True, return_tensors='pt')
  logits = finbert_model(**clean_data).logits
  scores = {k: v for k, v in zip(labels, scipy.special.softmax(logits.detach().numpy().squeeze()))}

  return scores

In [62]:
 positive_sent = clean_text_data('Bullish on Bitcoin! sats stacking and feeling confident about the future of this innovative tech. #Bitcoin #Cryptocurrency')

In [63]:
negative_sent = clean_text_data('#Bitcoin down another 10% today. ‍♂️  This volatility is insane! Who can trust a currency that swings wildly like this?  More like a digital rollercoaster than a store of value.  #cryptocrash  #bubblebursting')
negative_sent

'bitcoin another today volatility insane trust currency swings wildly like this like digital rollercoaster store value cryptocrash bubblebursting'

In [64]:
result_1 = getSenimentScoreByFinBERT(negative_sent)
result_1

{'positive': 0.037037645, 'negative': 0.083885126, 'neutral': 0.8790772}

In [65]:
result_2 = getSenimentScoreByFinBERT(positive_sent)
result_2

{'positive': 0.7741774, 'negative': 0.011159301, 'neutral': 0.21466333}

In [66]:
roberta_tokenizer = RobertaTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
roberta_model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [67]:
labels = roberta_model.config.id2label.values()
from scipy import special
def getSenimentScorebyTwRoberta(clean_text):
  clean_data = roberta_tokenizer(clean_text, padding=True, truncation=True, return_tensors='pt')
  logits = roberta_model(**clean_data).logits
  scores = {k: v for k, v in zip(labels, scipy.special.softmax(logits.detach().numpy().squeeze()))}
  return scores

In [68]:
result_3 = getSenimentScorebyTwRoberta(negative_sent)
result_3

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'negative': 0.31796905, 'neutral': 0.580458, 'positive': 0.10157295}

In [69]:
result_4 = getSenimentScorebyTwRoberta(positive_sent)
result_4

{'negative': 0.006130634, 'neutral': 0.26067412, 'positive': 0.73319525}

In [70]:
df = pd.DataFrame(data = {'FinBert_Neg': result_1, 'FinBert_Pos': result_2, 'TwRoberta_Neg': result_3, 'TwRoberta_Pos': result_4} )
df

Unnamed: 0,FinBert_Neg,FinBert_Pos,TwRoberta_Neg,TwRoberta_Pos
positive,0.037038,0.774177,0.101573,0.733195
negative,0.083885,0.011159,0.317969,0.006131
neutral,0.879077,0.214663,0.580458,0.260674
