In [None]:
pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.61.13-py3-none-any.whl (221 kB)
[K     |████████████████████████████████| 221 kB 8.1 MB/s 
[?25hCollecting wandb>=0.10.32
  Downloading wandb-0.12.1-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 52.7 MB/s 
[?25hCollecting tensorboardx
  Downloading tensorboardX-2.4-py2.py3-none-any.whl (124 kB)
[K     |████████████████████████████████| 124 kB 37.7 MB/s 
[?25hCollecting transformers>=4.2.0
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 51.6 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 37.0 MB/s 
[?25hCollecting streamlit
  Downloading streamlit-0.87.0-py2.py3-none-any.whl (8.0 MB)
[K     |████████████████████████████████| 8.0 MB 12.7 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17

In [None]:
import itertools
import math
import random
import re
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer


from simpletransformers.classification import ClassificationModel, ClassificationArgs
import torch
cuda_available = torch.cuda.is_available()


plt.style.use('classic')
%matplotlib inline
sns.set()

from google.colab import drive
drive.mount('/gdrive')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Mounted at /gdrive


### Functions

In [None]:
def filter_df(df, df_column, lower_bound=False, upper_bound=False):
    """Return the rows of ``df`` whose ``df_column`` value lies within the given bounds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    df_column : pandas.Series
        Values compared against the bounds; the resulting boolean mask is passed
        to ``df.loc``.
    lower_bound, upper_bound : optional
        Inclusive bounds. ``False`` (the default) or ``None`` means "no bound on
        this side". Every other value, including falsy ones such as ``0``, is
        treated as a real bound.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when no bound is given, otherwise ``df.loc[mask]``.

    Notes
    -----
    When both bounds are given and ``lower_bound > upper_bound`` the range wraps
    around: rows with a value ``>= lower_bound`` OR ``<= upper_bound`` are kept.
    """
    def _is_set(bound):
        # Identity checks, not truthiness: a bound of 0 must not be read as "unset".
        return bound is not None and bound is not False

    has_lower = _is_set(lower_bound)
    has_upper = _is_set(upper_bound)

    if not has_lower and not has_upper:
        return df

    if has_lower and has_upper:
        if lower_bound <= upper_bound:
            mask = (df_column >= lower_bound) & (df_column <= upper_bound)
        else:
            # Inverted bounds select the two tails instead of the middle.
            mask = (df_column >= lower_bound) | (df_column <= upper_bound)
    elif has_lower:
        mask = (df_column >= lower_bound)
    else:
        mask = (df_column <= upper_bound)
    return df.loc[mask]


def tweets_cleaner(text):
    """Normalize a raw tweet to lowercase ASCII words separated by single spaces.

    Steps:
      1. lowercase the text;
      2. drop URLs (``www.…``, ``http://…``, ``https://…``) and any other token
         starting at ``http``;
      3. replace every character that is not ``a``-``z`` with a space, which also
         removes the ``@`` / ``#`` markers while keeping the mention / hashtag word;
      4. collapse runs of whitespace and trim.

    URLs are removed *before* punctuation is stripped: once ``.`` and ``/`` have
    been turned into spaces a link can no longer be recognised, and fragments
    such as ``t co abc`` would be left in the cleaned text.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    text = text.lower()

    # Links first, while their punctuation is still intact.
    text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', text)
    text = re.sub(r'http\S+', '', text)

    # Anything that is not a plain letter (digits, punctuation, symbols) becomes a separator.
    text = re.sub(r'[^a-z]', ' ', text)

    return re.sub(r'\s+', ' ', text).strip()

# 2019

## Datasets

In [None]:
# Load the 2019 tweets, parse the timestamps and index the frame chronologically.
df_tweets_2019 = (
    pd.read_csv("/gdrive/My Drive/TextAnalytics/datasets/df_tweets_2019.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index("date")
    .sort_index()
)

df_tweets_2019

Unnamed: 0_level_0,text
date,Unnamed: 1_level_1
2019-05-06 00:00:01,h long btc btc short btc btc ls vs vs
2019-05-06 00:00:01,current prices and changes in the last hour bt...
2019-05-06 00:00:02,binance btt btc npxs btc dent btc bcn btc hot btc
2019-05-06 00:00:03,btc eth etc bch xrp xem lsk mona bitcoin bitfl...
2019-05-06 00:00:05,total market cap btc btc dominance update time...
...,...
2019-09-26 23:59:39,bought lots of bags on sale over the past hour...
2019-09-26 23:59:53,here is my bitcoin prediction of tomorrow pred...
2019-09-26 23:59:55,pay me in bitcoin using my t co pmgta gg t co ...
2019-09-26 23:59:57,the future is open source stake amp earn rewar...


## Vader Sentiment

In [None]:
vader = SentimentIntensityAnalyzer()

In [None]:
df_tweets_2019["vader_polarity"] = df_tweets_2019["text"].apply(lambda text: vader.polarity_scores(str(text)).get('compound'))

In [None]:
def fromPolarity_toSentiment(polarity, pos_threshold=0.7, neg_threshold=-0.3):
    """Map a polarity score to a discrete sentiment label.

    Parameters
    ----------
    polarity : float
        Polarity score to discretize.
    pos_threshold : float, default 0.7
        Scores greater than or equal to this value are labelled positive.
    neg_threshold : float, default -0.3
        Scores less than or equal to this value are labelled negative.

    Returns
    -------
    int
        ``1`` (positive), ``-1`` (negative) or ``0`` (everything in between).
    """
    if polarity >= pos_threshold:
        return 1
    if polarity <= neg_threshold:
        return -1
    return 0

df_tweets_2019["vader_sentiment"] = df_tweets_2019["vader_polarity"].apply(fromPolarity_toSentiment)

In [None]:
df_tweets_2019.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2389238 entries, 2019-05-06 00:00:01 to 2019-09-27 00:00:00
Data columns (total 3 columns):
 #   Column           Dtype  
---  ------           -----  
 0   text             object 
 1   vader_polarity   float64
 2   vader_sentiment  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 72.9+ MB


In [None]:
df_tweets_2019

Unnamed: 0_level_0,text,vader_polarity,vader_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-05-06 00:00:01,h long btc btc short btc btc ls vs vs,0.0000,0
2019-05-06 00:00:01,current prices and changes in the last hour bt...,0.0000,0
2019-05-06 00:00:02,binance btt btc npxs btc dent btc bcn btc hot btc,0.0000,0
2019-05-06 00:00:03,btc eth etc bch xrp xem lsk mona bitcoin bitfl...,0.0000,0
2019-05-06 00:00:05,total market cap btc btc dominance update time...,0.2023,0
...,...,...,...
2019-09-26 23:59:39,bought lots of bags on sale over the past hour...,0.0000,0
2019-09-26 23:59:53,here is my bitcoin prediction of tomorrow pred...,0.0000,0
2019-09-26 23:59:55,pay me in bitcoin using my t co pmgta gg t co ...,0.2023,0
2019-09-26 23:59:57,the future is open source stake amp earn rewar...,0.8176,1


In [None]:
df_tweets_2019["vader_sentiment"].value_counts()

 0    1870236
 1     273835
-1     245167
Name: vader_sentiment, dtype: int64

In [None]:
df_tweets_2019["vader_sentiment"].value_counts(normalize=True)

 0    0.782775
 1    0.114612
-1    0.102613
Name: vader_sentiment, dtype: float64

In [None]:
# NOTE(review): the counts recorded below differ from the previous value_counts cell;
# presumably they come from a run with the ranges named in the trailing comment — confirm.
df_tweets_2019["vader_sentiment"].value_counts() # with ranges [-1; -0.5], [-0.5; 0.5], [0.5, 1]

 0    1736076
 1     520066
-1     133096
Name: vader_sentiment, dtype: int64

In [None]:
# Keep only the polarized tweets (label != 0). The explicit copy makes the result an
# independent frame, so overwriting its columns later does not trigger
# pandas' SettingWithCopyWarning.
df_tweets_polarized_2019 = df_tweets_2019.loc[df_tweets_2019["vader_sentiment"] != 0].copy()

In [None]:
df_tweets_polarized_2019

Unnamed: 0_level_0,text,vader_polarity,vader_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-05-06 02:25:41,wish token giveaway retweet and like this twee...,0.7351,1
2019-05-06 02:29:30,wish is a decentralized reward platform powere...,0.9451,1
2019-05-06 03:34:48,new tasks added to contest enter to win huge c...,0.8442,1
2019-05-06 05:29:29,achain congrats our community member artemiycr...,0.8625,1
2019-05-06 05:50:31,join on kingcuan our bot will automatically ma...,0.7506,1
...,...,...,...
2019-09-26 23:57:50,on september daps will make its long awaited t...,0.7884,1
2019-09-26 23:58:06,judge denies release of canadian man accused o...,-0.8316,-1
2019-09-26 23:58:30,dutchsunset alexkjanssen alexcobb this message...,-0.4588,-1
2019-09-26 23:58:46,pezosaso aaronlevi ltc apompliano changing the...,-0.5994,-1


In [None]:
# Collapse the labels to binary: negative (-1) becomes 0, positive stays 1.
# .assign builds a new frame instead of writing into a slice of another frame,
# which is what raises SettingWithCopyWarning.
df_tweets_polarized_2019 = df_tweets_polarized_2019.assign(
    vader_sentiment=lambda frame: frame["vader_sentiment"].replace(-1, 0)
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
del df_tweets_2019

In [None]:
df_tweets_polarized_2019.to_csv('/gdrive/My Drive/TextAnalytics/datasets/df_tweets_polarized_2019.csv')

## Transformers Sentiment

In [None]:
!pip install simpletransformers

In [None]:
# Reload the polarized 2019 tweets from Drive and restore the chronological DatetimeIndex.
#df_tweets_polarized_2019 = pd.read_csv(r"C:\Users\aine2\Downloads\df_tweets_polarized_2019.csv")
df_tweets_polarized_2019 = (
    pd.read_csv("/gdrive/My Drive/TextAnalytics/datasets/df_tweets_polarized_2019.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index("date")
    .sort_index()
)

df_tweets_polarized_2019

Unnamed: 0_level_0,text,vader_polarity,vader_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-05-06 02:25:41,wish token giveaway retweet and like this twee...,0.7351,1
2019-05-06 02:29:30,wish is a decentralized reward platform powere...,0.9451,1
2019-05-06 03:34:48,new tasks added to contest enter to win huge c...,0.8442,1
2019-05-06 05:29:29,achain congrats our community member artemiycr...,0.8625,1
2019-05-06 05:50:31,join on kingcuan our bot will automatically ma...,0.7506,1
...,...,...,...
2019-09-26 23:57:50,on september daps will make its long awaited t...,0.7884,1
2019-09-26 23:58:06,judge denies release of canadian man accused o...,-0.8316,0
2019-09-26 23:58:30,dutchsunset alexkjanssen alexcobb this message...,-0.4588,0
2019-09-26 23:58:46,pezosaso aaronlevi ltc apompliano changing the...,-0.5994,0


In [None]:
# Data format reference: https://simpletransformers.ai/docs/classification-data-formats/

# Training window: every tweet up to and including 2019-09-01 00:00:00, reduced to
# the text and its label, with the label column renamed to "labels".
df_train_transformers_2019 = (
    df_tweets_polarized_2019[["text", "vader_sentiment"]]
    .loc[lambda frame: frame.index <= "2019-09-01 00:00:00"]
    .rename(columns={"vader_sentiment": "labels"})
)

In [None]:
df_train_transformers_2019

Unnamed: 0_level_0,text,labels
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-05-06 02:25:41,wish token giveaway retweet and like this twee...,1
2019-05-06 02:29:30,wish is a decentralized reward platform powere...,1
2019-05-06 03:34:48,new tasks added to contest enter to win huge c...,1
2019-05-06 05:29:29,achain congrats our community member artemiycr...,1
2019-05-06 05:50:31,join on kingcuan our bot will automatically ma...,1
...,...,...
2019-08-31 23:58:48,iam platform curated tweet report telegram tok...,0
2019-08-31 23:59:01,i m guessing leo will always hold due to reput...,0
2019-08-31 23:59:17,for if you transactpurposefully transactneedfu...,0
2019-08-31 23:59:43,i ve been using coinbase which makes it really...,1


In [None]:
# Test window: every tweet strictly after 2019-09-01 00:00:00, reduced to the text
# and its label, with the label column renamed to "labels".
df_test_transformers_2019 = (
    df_tweets_polarized_2019[["text", "vader_sentiment"]]
    .loc[lambda frame: frame.index > "2019-09-01 00:00:00"]
    .rename(columns={"vader_sentiment": "labels"})
)

In [None]:
df_test_transformers_2019

Unnamed: 0_level_0,text,labels
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-09-01 00:00:02,txs size kb stripped kb time reward btc fees b...,1
2019-09-01 00:00:02,binance margin trade bnb delta exchange upto x...,0
2019-09-01 00:00:02,you suffer but why bitcoin just hit time to re...,0
2019-09-01 00:00:03,join the signal premium sign up here t co npgo...,1
2019-09-01 00:00:26,bitcoin a very easy way to earn money do you w...,1
...,...,...
2019-09-26 23:57:50,on september daps will make its long awaited t...,1
2019-09-26 23:58:06,judge denies release of canadian man accused o...,0
2019-09-26 23:58:30,dutchsunset alexkjanssen alexcobb this message...,0
2019-09-26 23:58:46,pezosaso aaronlevi ltc apompliano changing the...,0


In [None]:
# Split the training window in time: tweets after 2019-08-21 00:00:00 become the
# validation set, everything up to that timestamp stays as training data.
# NOTE: df_train_transformers_2019 is overwritten here, so re-running this cell on
# its own would produce an empty validation set.
df_validation_transformers_2019 = df_train_transformers_2019[
    df_train_transformers_2019.index > "2019-08-21 00:00:00"
].copy()

df_train_transformers_2019 = df_train_transformers_2019[
    df_train_transformers_2019.index <= "2019-08-21 00:00:00"
].copy()

In [None]:
df_train_transformers_2019

Unnamed: 0_level_0,text,labels
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-05-06 02:25:41,wish token giveaway retweet and like this twee...,1
2019-05-06 02:29:30,wish is a decentralized reward platform powere...,1
2019-05-06 03:34:48,new tasks added to contest enter to win huge c...,1
2019-05-06 05:29:29,achain congrats our community member artemiycr...,1
2019-05-06 05:50:31,join on kingcuan our bot will automatically ma...,1
...,...,...
2019-08-20 23:59:02,t dd kanuuker homelessonmoon bitcoinerrorlog b...,0
2019-08-20 23:59:08,mohtasem bgiradji jesusxrp i m sorry huh you s...,0
2019-08-20 23:59:30,who is making big moves yes lit and ftm are do...,1
2019-08-20 23:59:35,people dont believe in local international wea...,1


In [None]:
df_validation_transformers_2019

Unnamed: 0_level_0,text,labels
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-08-21 00:00:04,crypto h losers egt volume btc bcd volume btc ...,0
2019-08-21 00:00:05,btc top last h rvn st npxs st gxc st xin st wo...,0
2019-08-21 00:00:32,facebook s libra coin will be a useful and wel...,1
2019-08-21 00:00:32,who s firing oracle space x who s hiring sony ...,0
2019-08-21 00:00:44,i don t know but anything for randomness since...,0
...,...,...
2019-08-31 23:58:48,iam platform curated tweet report telegram tok...,0
2019-08-31 23:59:01,i m guessing leo will always hold due to reput...,0
2019-08-31 23:59:17,for if you transactpurposefully transactneedfu...,0
2019-08-31 23:59:43,i ve been using coinbase which makes it really...,1


### Transformers Fine Tuning

In [None]:
# https://simpletransformers.ai/docs/classification-models/#training-a-classification-model

# Fine-tuning configuration: 10 epochs at most, evaluation during training with
# early stopping enabled, and the output directory allowed to be overwritten.
model_args = ClassificationArgs(
    num_train_epochs=10,
    evaluate_during_training=True,
    evaluate_during_training_verbose=True,
    use_early_stopping=True,
    overwrite_output_dir=True,
    # Fixed seed so repeated fine-tuning runs are comparable.
    manual_seed=42,
    best_model_dir="/gdrive/My Drive/TextAnalytics/models/fineTuned_transformers_sentiment",
)
#model_args.best_model_dir = r"C:\Users\aine2\Desktop\fineTuned_transformers_sentiment"

In [None]:
"""
transformers_model = ClassificationModel("bert", "bert-base-uncased", args=model_args, use_cuda=cuda_available)

transformers_model.train_model(df_train_transformers, eval_df=df_validation_transformers)
"""

'\ntransformers_model = ClassificationModel("bert", "bert-base-uncased", args=model_args, use_cuda=cuda_available)\n\ntransformers_model.train_model(df_train_transformers, eval_df=df_validation_transformers)\n'

In [None]:
#https://huggingface.co/siebert/sentiment-roberta-large-english

transformers_model = ClassificationModel("roberta", "siebert/sentiment-roberta-large-english", args=model_args, use_cuda=cuda_available)

transformers_model.train_model(df_train_transformers_2019, eval_df=df_validation_transformers_2019)

In [None]:
# Load a previously saved model (same directory that is configured as best_model_dir
# for fine-tuning in this notebook).

transformers_model = ClassificationModel("roberta", "/gdrive/My Drive/TextAnalytics/models/fineTuned_transformers_sentiment", use_cuda=cuda_available)
#transformers_model = ClassificationModel("roberta", r"C:\Users\aine2\Downloads\fineTuned_transformers_sentiment", use_cuda=cuda_available)

In [None]:
transformers_model.predict(["bitcoin to the moon", "bitcoin flop"])[0]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[1, 0]

In [None]:
transformers_model.eval_model(df_test_transformers_2019)

In [None]:
df_tweets_polarized_2019["transformers_sentiment"] = transformers_model.predict(list(df_tweets_polarized_2019["text"]))[0]

  0%|          | 0/519002 [00:00<?, ?it/s]

  0%|          | 0/64876 [00:00<?, ?it/s]

In [None]:
df_tweets_polarized_2019

Unnamed: 0_level_0,text,vader_polarity,vader_sentiment,transformers_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-05-06 02:25:41,wish token giveaway retweet and like this twee...,0.7351,1,1
2019-05-06 02:29:30,wish is a decentralized reward platform powere...,0.9451,1,1
2019-05-06 03:34:48,new tasks added to contest enter to win huge c...,0.8442,1,1
2019-05-06 05:29:29,achain congrats our community member artemiycr...,0.8625,1,1
2019-05-06 05:50:31,join on kingcuan our bot will automatically ma...,0.7506,1,1
...,...,...,...,...
2019-09-26 23:57:50,on september daps will make its long awaited t...,0.7884,1,1
2019-09-26 23:58:06,judge denies release of canadian man accused o...,-0.8316,0,0
2019-09-26 23:58:30,dutchsunset alexkjanssen alexcobb this message...,-0.4588,0,0
2019-09-26 23:58:46,pezosaso aaronlevi ltc apompliano changing the...,-0.5994,0,0


In [None]:
df_tweets_polarized_2019.to_csv('/gdrive/My Drive/TextAnalytics/datasets/df_tweets_sentiment_2019.csv')
#df_tweets_polarized_2019.to_csv(r"C:\Users\aine2\Desktop\df_tweets_sentiment_2019.csv")

# 2021

## Datasets

In [None]:
# Load the 2021 tweets, parse the timestamps and index the frame chronologically.
df_tweets_2021 = (
    pd.read_csv("/gdrive/My Drive/TextAnalytics/datasets/df_tweets_2021.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index("date")
    .sort_index()
)

df_tweets_2021

Unnamed: 0_level_0,user_name,user_created,user_followers,user_friends,user_favourites,user_verified,text,hashtags,source
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-02-05 10:52:04,Iconic Holding,2021-01-05 13:22:24,301.0,1075,361,False,debunking bitcoin myths by patrick lowry crypt...,"['Bitcoin', 'cryptocurrency', 'bitcoin', 'cryp...",Twitter Web App
2021-02-05 10:52:04,Iconic Holding,2021-01-05 13:22:24,301.0,1075,361,False,weekend read keen to learn about crypto assets...,['crypto'],Twitter Web App
2021-02-05 10:52:06,Iconic Holding,2021-01-05 13:22:24,301.0,1075,361,False,bloomberg lp cryptooutlook with mikemcglone cr...,"['CryptoOutlook', 'cryptocurrency', 'bitcoin',...",Twitter Web App
2021-02-05 10:52:07,Iconic Holding,2021-01-05 13:22:24,301.0,1075,361,False,blockchain by delrayman forbes forbescrypto cr...,"['Blockchain', 'cryptocurrency', 'bitcoin', 'c...",Twitter Web App
2021-02-05 10:52:26,Nick Doevendans,2020-06-12 16:50:07,37.0,123,410,False,reddcoin rdd reddcoin to the moon altcoin turn...,"['reddcoin', 'rdd', 'altcoin', 'turnreddcoinin...",Twitter for iPhone
...,...,...,...,...,...,...,...,...,...
2021-07-30 23:59:56,Nitin Dass ∞/21M,2020-10-25 12:23:58,265.0,415,15391,False,hodlingcarla wooohoooo k in sight bitcoin t co...,['bitcoin'],Twitter for iPhone
2021-07-30 23:59:56,The Last Sat,2020-09-14 04:00:03,3.0,18,11,False,before you know it we ll be under sats left t ...,"['Bitcoin', 'stayhumblestacksats']",Twitter for Android
2021-07-30 23:59:57,Air アーロン,2021-02-15 21:42:55,64.0,451,920,False,btc just hit jackie robinson,['btc'],Twitter for Android
2021-07-30 23:59:58,Greg Pearson,2021-05-31 21:50:18,8.0,12,3,False,btc archive yes if the weekly close tomorrow i...,['BTC'],Twitter Web App


## Vader Sentiment

In [None]:
vader = SentimentIntensityAnalyzer()

In [None]:
df_tweets_2021["vader_polarity"] = df_tweets_2021["text"].apply(lambda text: vader.polarity_scores(str(text)).get('compound'))

In [None]:
def fromPolarity_toSentiment(polarity, pos_threshold=0.8, neg_threshold=-0.2):
    """Map a polarity score to a discrete sentiment label (2021 thresholds by default).

    Parameters
    ----------
    polarity : float
        Polarity score to discretize.
    pos_threshold : float, default 0.8
        Scores greater than or equal to this value are labelled positive.
    neg_threshold : float, default -0.2
        Scores less than or equal to this value are labelled negative.

    Returns
    -------
    int
        ``1`` (positive), ``-1`` (negative) or ``0`` (everything in between).
    """
    if polarity >= pos_threshold:
        return 1
    if polarity <= neg_threshold:
        return -1
    return 0

df_tweets_2021["vader_sentiment"] = df_tweets_2021["vader_polarity"].apply(fromPolarity_toSentiment)

In [None]:
df_tweets_2021.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 649001 entries, 2021-02-05 10:52:04 to 2021-07-30 23:59:59
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   user_name        649001 non-null  object 
 1   user_created     649001 non-null  object 
 2   user_followers   649001 non-null  float64
 3   user_friends     649001 non-null  int64  
 4   user_favourites  649001 non-null  int64  
 5   user_verified    649001 non-null  bool   
 6   text             649001 non-null  object 
 7   hashtags         635818 non-null  object 
 8   source           649001 non-null  object 
 9   vader_polarity   649001 non-null  float64
 10  vader_sentiment  649001 non-null  int64  
dtypes: bool(1), float64(2), int64(3), object(5)
memory usage: 55.1+ MB


In [None]:
df_tweets_2021

Unnamed: 0_level_0,user_name,user_created,user_followers,user_friends,user_favourites,user_verified,text,hashtags,source,vader_polarity,vader_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-02-05 10:52:04,Iconic Holding,2021-01-05 13:22:24,301.0,1075,361,False,debunking bitcoin myths by patrick lowry crypt...,"['Bitcoin', 'cryptocurrency', 'bitcoin', 'cryp...",Twitter Web App,0.0000,0
2021-02-05 10:52:04,Iconic Holding,2021-01-05 13:22:24,301.0,1075,361,False,weekend read keen to learn about crypto assets...,['crypto'],Twitter Web App,0.4939,0
2021-02-05 10:52:06,Iconic Holding,2021-01-05 13:22:24,301.0,1075,361,False,bloomberg lp cryptooutlook with mikemcglone cr...,"['CryptoOutlook', 'cryptocurrency', 'bitcoin',...",Twitter Web App,0.0000,0
2021-02-05 10:52:07,Iconic Holding,2021-01-05 13:22:24,301.0,1075,361,False,blockchain by delrayman forbes forbescrypto cr...,"['Blockchain', 'cryptocurrency', 'bitcoin', 'c...",Twitter Web App,0.0000,0
2021-02-05 10:52:26,Nick Doevendans,2020-06-12 16:50:07,37.0,123,410,False,reddcoin rdd reddcoin to the moon altcoin turn...,"['reddcoin', 'rdd', 'altcoin', 'turnreddcoinin...",Twitter for iPhone,0.0000,0
...,...,...,...,...,...,...,...,...,...,...,...
2021-07-30 23:59:56,Nitin Dass ∞/21M,2020-10-25 12:23:58,265.0,415,15391,False,hodlingcarla wooohoooo k in sight bitcoin t co...,['bitcoin'],Twitter for iPhone,0.0000,0
2021-07-30 23:59:56,The Last Sat,2020-09-14 04:00:03,3.0,18,11,False,before you know it we ll be under sats left t ...,"['Bitcoin', 'stayhumblestacksats']",Twitter for Android,0.3182,0
2021-07-30 23:59:57,Air アーロン,2021-02-15 21:42:55,64.0,451,920,False,btc just hit jackie robinson,['btc'],Twitter for Android,0.0000,0
2021-07-30 23:59:58,Greg Pearson,2021-05-31 21:50:18,8.0,12,3,False,btc archive yes if the weekly close tomorrow i...,['BTC'],Twitter Web App,0.6486,0


In [None]:
df_tweets_2021["vader_sentiment"].value_counts()

 0    477941
 1     87419
-1     83641
Name: vader_sentiment, dtype: int64

In [None]:
df_tweets_2021["vader_sentiment"].value_counts(normalize=True)

 0    0.736426
 1    0.134698
-1    0.128877
Name: vader_sentiment, dtype: float64

In [None]:
# NOTE(review): the counts recorded below differ from the previous value_counts cell;
# presumably they come from a run with the ranges named in the trailing comment — confirm.
df_tweets_2021["vader_sentiment"].value_counts()  # with ranges [-1; -0.5], [-0.5; 0.5], [0.5, 1]

 0    408680
 1    207540
-1     32781
Name: vader_sentiment, dtype: int64

In [None]:
# Keep only the polarized tweets (label != 0). The explicit copy makes the result an
# independent frame, so overwriting its columns later does not trigger
# pandas' SettingWithCopyWarning.
df_tweets_polarized_2021 = df_tweets_2021.loc[df_tweets_2021["vader_sentiment"] != 0].copy()

In [None]:
df_tweets_polarized_2021

Unnamed: 0_level_0,user_name,user_created,user_followers,user_friends,user_favourites,user_verified,text,hashtags,source,vader_polarity,vader_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-02-05 11:14:18,EmilyNews,2017-01-16 10:58:53,439.0,1,6,False,warning powerearn outside project fast scam si...,"['EmilyNews', 'invest', 'HYIPs', 'bitcoin', 'c...",IFTTT,-0.7269,0
2021-02-05 11:20:58,Trading MarcoDaCosta,2010-03-01 19:35:17,5367.0,927,34484,False,twitter ceo jack dorsey has fired up a full bi...,"['Twitter', 'Dorsey', 'Bitcoin', 'BTC']",Twitter for iPhone,-0.5574,0
2021-02-05 11:25:28,Kris Ninakos,2018-09-01 12:06:16,741.0,1315,3996,False,a possible big move for btc bitcoin is coming ...,"['BTC', 'BITCOIN']",Twitter for Android,-0.2023,0
2021-02-05 11:39:20,Emanuel Siddhartha,2021-02-02 14:01:33,15.0,55,59,False,no vcs no ico no unlimited supply no pump and ...,"['znn', 'aliens', 'BTC', 'Bitcoin']",Twitter Web App,-0.8910,0
2021-02-05 11:44:02,Bitcoin Mate (BTC News App),2015-03-08 06:31:18,3312.0,1976,201,False,bitcoin s wild ride renews worries about its m...,,Twibble.io,-0.4215,0
...,...,...,...,...,...,...,...,...,...,...,...
2021-07-30 23:59:16,Alt Center Signals,2020-10-27 15:34:54,197.0,2,8,False,binance futures srm usdt all take profit targe...,"['SRM', 'Signals', 'CryptoSignals', 'Crypto', ...",IFTTT,0.8271,1
2021-07-30 23:59:36,George Brien,2018-07-27 18:41:18,12443.0,140,13523,False,bitcoin at k will hit k soon hodl yours don t ...,['Bitcoin'],Twitter for iPhone,-0.5106,0
2021-07-30 23:59:38,HODL21,2011-09-23 14:41:38,477.0,914,8322,False,i m actually one of the few people i know who ...,['bitcoin'],Twitter for iPhone,-0.2960,0
2021-07-30 23:59:50,David Barge,2009-09-11 20:20:46,53.0,138,2584,False,bitcoin cruising by k no problem,['Bitcoin'],Twitter for Android,-0.5994,0


In [None]:
# Collapse the labels to binary: negative (-1) becomes 0, positive stays 1.
# .assign builds a new frame instead of writing into a slice of another frame,
# which is what raises SettingWithCopyWarning.
df_tweets_polarized_2021 = df_tweets_polarized_2021.assign(
    vader_sentiment=lambda frame: frame["vader_sentiment"].replace(-1, 0)
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
del df_tweets_2021

In [None]:
df_tweets_polarized_2021.to_csv('/gdrive/My Drive/TextAnalytics/datasets/df_tweets_polarized_2021.csv')

## Transformers Sentiment

In [None]:
# Reload the polarized 2021 tweets from Drive and restore the chronological DatetimeIndex.
#df_tweets_polarized_2021 = pd.read_csv(r"C:\Users\aine2\Downloads\df_tweets_polarized_2021.csv")
df_tweets_polarized_2021 = (
    pd.read_csv("/gdrive/My Drive/TextAnalytics/datasets/df_tweets_polarized_2021.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index("date")
    .sort_index()
)

df_tweets_polarized_2021

Unnamed: 0_level_0,user_name,user_created,user_followers,user_friends,user_favourites,user_verified,text,hashtags,source,vader_polarity,vader_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-02-05 11:14:18,EmilyNews,2017-01-16 10:58:53,439.0,1,6,False,warning powerearn outside project fast scam si...,"['EmilyNews', 'invest', 'HYIPs', 'bitcoin', 'c...",IFTTT,-0.7269,0
2021-02-05 11:20:58,Trading MarcoDaCosta,2010-03-01 19:35:17,5367.0,927,34484,False,twitter ceo jack dorsey has fired up a full bi...,"['Twitter', 'Dorsey', 'Bitcoin', 'BTC']",Twitter for iPhone,-0.5574,0
2021-02-05 11:25:28,Kris Ninakos,2018-09-01 12:06:16,741.0,1315,3996,False,a possible big move for btc bitcoin is coming ...,"['BTC', 'BITCOIN']",Twitter for Android,-0.2023,0
2021-02-05 11:39:20,Emanuel Siddhartha,2021-02-02 14:01:33,15.0,55,59,False,no vcs no ico no unlimited supply no pump and ...,"['znn', 'aliens', 'BTC', 'Bitcoin']",Twitter Web App,-0.8910,0
2021-02-05 11:44:02,Bitcoin Mate (BTC News App),2015-03-08 06:31:18,3312.0,1976,201,False,bitcoin s wild ride renews worries about its m...,,Twibble.io,-0.4215,0
...,...,...,...,...,...,...,...,...,...,...,...
2021-07-30 23:59:16,Alt Center Signals,2020-10-27 15:34:54,197.0,2,8,False,binance futures srm usdt all take profit targe...,"['SRM', 'Signals', 'CryptoSignals', 'Crypto', ...",IFTTT,0.8271,1
2021-07-30 23:59:36,George Brien,2018-07-27 18:41:18,12443.0,140,13523,False,bitcoin at k will hit k soon hodl yours don t ...,['Bitcoin'],Twitter for iPhone,-0.5106,0
2021-07-30 23:59:38,HODL21,2011-09-23 14:41:38,477.0,914,8322,False,i m actually one of the few people i know who ...,['bitcoin'],Twitter for iPhone,-0.2960,0
2021-07-30 23:59:50,David Barge,2009-09-11 20:20:46,53.0,138,2584,False,bitcoin cruising by k no problem,['Bitcoin'],Twitter for Android,-0.5994,0


In [None]:
# Load a previously saved model (the fine-tuned sentiment classifier stored on Drive).

transformers_model = ClassificationModel("roberta", "/gdrive/My Drive/TextAnalytics/models/fineTuned_transformers_sentiment", use_cuda=cuda_available)
#transformers_model = ClassificationModel("roberta", r"C:\Users\aine2\Downloads\fineTuned_transformers_sentiment", use_cuda=cuda_available)

In [None]:
df_tweets_polarized_2021["transformers_sentiment"] = transformers_model.predict(list(df_tweets_polarized_2021["text"]))[0]

  0%|          | 0/171060 [00:00<?, ?it/s]

  0%|          | 0/21383 [00:00<?, ?it/s]

In [None]:
df_tweets_polarized_2021

Unnamed: 0_level_0,user_name,user_created,user_followers,user_friends,user_favourites,user_verified,text,hashtags,source,vader_polarity,vader_sentiment,transformers_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-02-05 11:14:18,EmilyNews,2017-01-16 10:58:53,439.0,1,6,False,warning powerearn outside project fast scam si...,"['EmilyNews', 'invest', 'HYIPs', 'bitcoin', 'c...",IFTTT,-0.7269,0,0
2021-02-05 11:20:58,Trading MarcoDaCosta,2010-03-01 19:35:17,5367.0,927,34484,False,twitter ceo jack dorsey has fired up a full bi...,"['Twitter', 'Dorsey', 'Bitcoin', 'BTC']",Twitter for iPhone,-0.5574,0,1
2021-02-05 11:25:28,Kris Ninakos,2018-09-01 12:06:16,741.0,1315,3996,False,a possible big move for btc bitcoin is coming ...,"['BTC', 'BITCOIN']",Twitter for Android,-0.2023,0,1
2021-02-05 11:39:20,Emanuel Siddhartha,2021-02-02 14:01:33,15.0,55,59,False,no vcs no ico no unlimited supply no pump and ...,"['znn', 'aliens', 'BTC', 'Bitcoin']",Twitter Web App,-0.8910,0,1
2021-02-05 11:44:02,Bitcoin Mate (BTC News App),2015-03-08 06:31:18,3312.0,1976,201,False,bitcoin s wild ride renews worries about its m...,,Twibble.io,-0.4215,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-30 23:59:16,Alt Center Signals,2020-10-27 15:34:54,197.0,2,8,False,binance futures srm usdt all take profit targe...,"['SRM', 'Signals', 'CryptoSignals', 'Crypto', ...",IFTTT,0.8271,1,1
2021-07-30 23:59:36,George Brien,2018-07-27 18:41:18,12443.0,140,13523,False,bitcoin at k will hit k soon hodl yours don t ...,['Bitcoin'],Twitter for iPhone,-0.5106,0,0
2021-07-30 23:59:38,HODL21,2011-09-23 14:41:38,477.0,914,8322,False,i m actually one of the few people i know who ...,['bitcoin'],Twitter for iPhone,-0.2960,0,1
2021-07-30 23:59:50,David Barge,2009-09-11 20:20:46,53.0,138,2584,False,bitcoin cruising by k no problem,['Bitcoin'],Twitter for Android,-0.5994,0,1


In [None]:
df_tweets_polarized_2021.to_csv('/gdrive/My Drive/TextAnalytics/datasets/df_tweets_sentiment_2021.csv')
#df_tweets_polarized_2021.to_csv(r"C:\Users\aine2\Desktop\df_tweets_sentiment_2021.csv")