<a href="https://colab.research.google.com/github/nakhimchea/sentiment_analysis_ipynb/blob/main/SmartSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# *Import Libraries*

**SmartSA: Sentiment Analysis of Twitter and Its Application to Strategy**

In [None]:
!pip install transformers
import re
import time

import numpy
import pandas
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

**Twitter Scraping**

In [None]:
!pip install snscrape
import snscrape.modules.twitter as twitter

# *Loading RoBERTa model*

**Get Model**

In [69]:
# Checkpoint identifier; the same name is reused to load the matching tokenizer.
RoBERTa = 'cardiffnlp/twitter-roberta-base-sentiment'
# Sequence-classification model loaded from that checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(RoBERTa)

**Initialize Tokenizer**

In [70]:
# Tokenizer loaded from the same checkpoint identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(RoBERTa)

**Sparse Labels**

In [71]:
# Class names in index order: the argmax index of the model's per-class scores
# is used to look up the annotation in this list.
labels = [
  'Negative',
  'Neutral',
  'Positive',
]

# *Getting Data from Social Network (Twitter)*

**Search Query**

In [77]:
# The search window is expressed in Unix epoch seconds. Its end is pinned to a
# fixed timestamp; the original author left int(time.time()) as the live alternative.
untilTime = 1663002000
# NOTE(review): 246400 s is roughly 2.85 days - confirm whether a round
# 3 days (259200 s) was intended.
windowSeconds = 246400
sinceTime = untilTime - windowSeconds

queryTemplate = 'Bitcoin BTC Ethereum ETH (cryptocurrency) min_replies:0 min_faves:0 min_retweets:0 since_time:{0} until_time:{1}'
query = queryTemplate.format(str(sinceTime), str(untilTime))

# Maximum number of tweets kept after filtering.
limit = 10000

**Query Tweets and Preprocessing**

In [78]:
# Words that mark a tweet as relevant when a (preprocessed) word starts with one of them.
keywordPrefixes = (
  'Bitcoin', 'bitcoin',
  'BTC', 'ETH',
  'Ethereum', 'ethereum',
  'Cryptocurrency', 'cryptocurrency',
)

# Preprocessed tweets that begin with one of these phrases are always discarded.
excludedPrefixes = (
  '#mention price update',
  '1 Bitcoin',
  'Current #mention price Alert',
)


def normalizeToken(word):
  """Map one whitespace-free token to its placeholder, or return it unchanged.

  '@name' -> '@user', anything starting with 'http' -> 'http',
  '#tag' / '$tag' -> '#mention', '-x' -> 'bad', '+x' -> 'good'.
  A lone '@', '#', '$', '-' or '+' is left as is.
  """
  if word.startswith('http'):
    return 'http'
  if len(word) > 1:
    if word.startswith('@'):
      return '@user'
    if word.startswith(('#', '$')):
      return '#mention'
    # presumably targets signed figures such as "-5%" / "+5%" - confirm with the author
    if word.startswith('-'):
      return 'bad'
    if word.startswith('+'):
      return 'good'
  return word


def preprocessTweet(content):
  """Normalize every token of a tweet while keeping its whitespace untouched.

  The text is split on runs of ANY whitespace (not just single spaces), so a
  mention, link or tag that follows a line break is normalized too. The
  capturing group makes re.split return the separators, which are re-inserted
  verbatim when joining.
  """
  pieces = re.split(r'(\s+)', content)
  # With one capturing group, even positions are tokens and odd positions are separators.
  pieces[0::2] = [normalizeToken(piece) for piece in pieces[0::2]]
  return ''.join(pieces)


def isRelevant(tweetContent):
  """Return True when a preprocessed tweet should be kept.

  A tweet is kept when at least one word starts with a keyword prefix and the
  tweet does not begin with one of the excluded phrases.
  """
  if tweetContent.startswith(excludedPrefixes):
    return False
  # split() breaks on any whitespace, so keywords right after a newline are seen.
  return any(word.startswith(keywordPrefixes) for word in tweetContent.split())


tweets = []
for tweet in twitter.TwitterSearchScraper(query).get_items():
  # Stop before doing any work on a tweet that could not be stored anyway.
  if len(tweets) >= limit:
    break

  tweetContent = preprocessTweet(tweet.content)
  if isRelevant(tweetContent):
    tweets.append([tweet.date, tweet.username, tweetContent])

tweetsDF = pandas.DataFrame(tweets, columns=['Date', 'User', 'Tweet'])

**Print DataFrame**

In [79]:
# Plain-text preview of the scraped tweets; the printed form elides the middle rows and long cells.
print(tweetsDF)

                         Date            User  \
0   2022-09-12 16:56:31+00:00   cryptoalerted   
1   2022-09-12 16:18:08+00:00    CurrencyNews   
2   2022-09-12 16:05:35+00:00        coin3net   
3   2022-09-12 16:04:03+00:00  FoxCryptoNews2   
4   2022-09-12 16:00:02+00:00   tycoonsuccess   
..                        ...             ...   
226 2022-09-09 21:25:10+00:00    CurrencyNews   
227 2022-09-09 21:07:54+00:00        coin3net   
228 2022-09-09 20:56:07+00:00   cryptoalerted   
229 2022-09-09 20:55:42+00:00    CryptoCoinz_   
230 2022-09-09 20:54:07+00:00    CurrencyNews   

                                                 Tweet  
0    As the Ethereum Merge Draws Near, Google Launc...  
1    Crypto's Big Number: Daily Bitcoin Mining Reve...  
2    While the banks were closed, Bitcoin reached 5...  
3    Bitcoin bounced up before the US announced CPI...  
4    Ethereum Merge/ETH 2.0\n\nThe Merge is expecte...  
..                                                 ...  
226  Bitcoin

# *Sentiment Analysis*

**Tweet Classifications**

In [80]:
# Annotate every tweet with the label whose score is highest.
# `target` collects [tweet text, label] pairs in the order of tweetsDF.

# assumes the checkpoint accepts at most 512 tokens per input - TODO confirm against the model config
maxTokens = 512

target = []
# Inference only: disabling autograd avoids building a graph for each forward pass.
with torch.no_grad():
  # Iterate over the values directly so the loop does not depend on the frame's index labels.
  for tweetText in tweetsDF.Tweet:
    # Truncate over-long inputs instead of letting the forward pass fail on them.
    encodedTweet = tokenizer(tweetText, return_tensors='pt', truncation=True, max_length=maxTokens)
    modelOutput = model(**encodedTweet)
    # First (and only) row of the batch: one raw score per class, turned into probabilities.
    probabilities = softmax(modelOutput.logits[0].numpy())
    target.append([tweetText, labels[numpy.argmax(probabilities)]])

**Classification Table**

In [81]:
# Pair each tweet with its predicted label in a two-column frame, then print a text preview.
targetTable = pandas.DataFrame(target, columns=['Tweet', 'Annotation'])
print(targetTable)

                                                 Tweet Annotation
0    As the Ethereum Merge Draws Near, Google Launc...    Neutral
1    Crypto's Big Number: Daily Bitcoin Mining Reve...    Neutral
2    While the banks were closed, Bitcoin reached 5...   Positive
3    Bitcoin bounced up before the US announced CPI...   Positive
4    Ethereum Merge/ETH 2.0\n\nThe Merge is expecte...    Neutral
..                                                 ...        ...
226  Bitcoin spikes 10% on news of potential bannin...    Neutral
227  3 reasons why Bitcoin traders should be bullis...    Neutral
228  3 Bitcoin price metrics suggest Sept. 9’s 10% ...    Neutral
229  Why The Ethereum Price Could Rally Above #ment...    Neutral
230  Market Wrap: Bitcoin Notches Biggest Gain in 6...   Positive

[231 rows x 2 columns]


**Normalize Result**

In [82]:
# Tally how many tweets received each annotation, in the same order as `labels`.
annotations = list(targetTable.Annotation)
countLabels = [annotations.count(label) for label in labels]

totalCount = sum(countLabels)
# Nothing is printed when no tweet was annotated (avoids dividing by zero).
if totalCount != 0:
  # Index of the most frequent label; it drives both the verdict and its share.
  topIndex = numpy.argmax(countLabels)
  shares = [count/totalCount for count in countLabels]
  print('Probability      : {:.2f} {:.2f} {:.2f}'.format(*shares))
  print('Final Analysis   : {}'.format(labels[topIndex]))
  # NOTE(review): the printed label says "Confidentiality"; "Confidence" was presumably meant - confirm before renaming.
  print('Confidentiality  : {:.2f}%'.format(countLabels[topIndex]/totalCount*100))

Probability      : 0.06 0.75 0.19
Final Analysis   : Neutral
Confidentiality  : 74.89%
