In [34]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import pipeline, AutoTokenizer, AutoModel

In [35]:
# Read the total file and choose the ones who mention bitcoin. We will use "buy" and "sell" as labels for the sentiment

tweets = pd.read_csv('./twitter_batch1_200904.csv')
tweets['text'] = tweets['text'].astype('str') 
tweets = tweets.drop(['to','id','permalink','hashtags','mentions','geo','Unnamed: 0'], axis=1)
tweetscrypto = tweets[tweets['text'].str.contains("bitcoin|Bitcoin|BTC|btc|bitcoins|Bitcoins")].reset_index()
tokenizer2 = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
labels = ['buy','sell']

In [None]:
tweetscrypto.head()

In [None]:
classifier = pipeline('zero-shot-classification',model="nlptown/bert-base-multilingual-uncased-sentiment",tokenizer=tokenizer2)

In [None]:
# I collect in a file all the tweets from bitcoin and the sentiments. The process takes around 15 hours.

for i in range (1,len(tweetscrypto)):
          
    with open('out2.txt', 'a') as f:
        
        print(tweetscrypto.iloc[i,2],classifier(tweetscrypto.iloc[i,1], labels), file=f).round()

In [None]:
# I take the file with the tweets + sentiment analysis and do some feature engineering for the time series:

data = pd.read_csv('out.txt',
                   sep="{'sequence': | 'scores':",
                   header=None,
                   engine='python')

data['Date']=data[0]
data[2]=data[2].str.slice(2,-2)
data['Text']=data[1].str.slice(1, -29)
data[['Buy', 'Sell']] = data[2].str.split(', ', n=1, expand=True)
data=data.drop([0], axis=1)
data=data.drop([1], axis=1)
data=data.drop([2], axis=1)
data['Date'] = pd.to_datetime(data['Date'],format="%Y-%m-%d")
data['Buy'] = pd.to_numeric(data['Buy'])
data['Sell'] = pd.to_numeric(data['Sell'])
data.head()

In [None]:
data.shape

In [None]:
# I group "Buy" and "Sell" by months... results are very bad, every month is the same sentiment.
# We have to think about another model - strategy


data_mean_buy = data['Buy'].groupby([data['Date'].dt.year, data['Date'].dt.month]).mean()
data_mean_sell = data['Sell'].groupby([data['Date'].dt.year, data['Date'].dt.month]).mean()

In [14]:
data_mean_buy

Date  Date
2018  1       0.560052
      2       0.558301
      3       0.560147
      4       0.562752
      5       0.566009
      6       0.562527
      7       0.562748
      8       0.562440
      9       0.561411
      10      0.563321
      11      0.558296
      12      0.563149
2019  1       0.561614
      2       0.565831
      3       0.566630
      4       0.562719
      5       0.562504
      6       0.565415
      7       0.565155
      8       0.561044
      9       0.562437
      10      0.559397
      11      0.562965
      12      0.562632
Name: Buy, dtype: float64

In [15]:
data_mean_sell

Date  Date
2018  1       0.439948
      2       0.441699
      3       0.439853
      4       0.437248
      5       0.433991
      6       0.437473
      7       0.437252
      8       0.437560
      9       0.438589
      10      0.436679
      11      0.441704
      12      0.436851
2019  1       0.438386
      2       0.434169
      3       0.433370
      4       0.437281
      5       0.437496
      6       0.434585
      7       0.434845
      8       0.438956
      9       0.437563
      10      0.440603
      11      0.437035
      12      0.437368
Name: Sell, dtype: float64

In [7]:
len(tweetscrypto)

144744

# LET'S CLEAN THE TEXT:

In [36]:
tweetscrypto.head(15)

Unnamed: 0,index,name,text,date,retweets,favorites
0,350,NextBillion,News via @livemint: @RBI bars banks from links...,2018-04-10 06:00:00+00:00,0,0
1,353,NextBillion,News via @livemint: @RBI bars banks from links...,2018-04-09 18:00:00+00:00,0,0
2,379,NextBillion,Newsfeed from @payments_source : How an Africa...,2018-04-02 17:58:40+00:00,0,0
3,2375,hmeisler,Is that Bitcoin? Not my thing,2019-04-11 20:20:00+00:00,0,1
4,3855,hmeisler,I have zero opinion on bitcoin.,2019-05-15 17:43:43+00:00,0,1
5,3858,hmeisler,bitcoin? you have the wrong girl here.,2019-05-15 17:06:11+00:00,0,3
6,3864,hmeisler,For months on end my timeline was silent on Bi...,2019-05-15 16:51:22+00:00,9,104
7,5179,hmeisler,not of the sort i use. but honestly i think if...,2019-06-21 18:42:24+00:00,0,0
8,5185,hmeisler,"I look at the stock market, not bitcoin. Sorry...",2019-06-21 18:31:06+00:00,0,0
9,5188,hmeisler,1) It's Helene with an E 2) I have no opinion ...,2019-06-21 18:13:57+00:00,0,0


In [37]:
# we remove all the labels (words starting with "@") and all the links (words starting with "http")


tweetscrypto['clean_text']=tweetscrypto['text'].str.replace('(\@\w+.*?)',"")


def clean_text(X):
    X = X.split()
    X_new = [x for x in X if not x.startswith("http")]
    return ' '.join(X_new)

tweetscrypto['clean_text'] = tweetscrypto['clean_text'].apply(clean_text)


import string 


def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

tweetscrypto['clean_text'] = tweetscrypto['clean_text'].apply(remove_punctuation)


# we apply lowercase to all capital letters:

tweetscrypto['clean_text']=tweetscrypto['clean_text'].apply(lambda x: x.lower())

In [38]:
tweetscrypto.head(15)

Unnamed: 0,index,name,text,date,retweets,favorites,clean_text
0,350,NextBillion,News via @livemint: @RBI bars banks from links...,2018-04-10 06:00:00+00:00,0,0,news via bars banks from links to cryptocurre...
1,353,NextBillion,News via @livemint: @RBI bars banks from links...,2018-04-09 18:00:00+00:00,0,0,news via bars banks from links to cryptocurre...
2,379,NextBillion,Newsfeed from @payments_source : How an Africa...,2018-04-02 17:58:40+00:00,0,0,newsfeed from how an african startup built a ...
3,2375,hmeisler,Is that Bitcoin? Not my thing,2019-04-11 20:20:00+00:00,0,1,is that bitcoin not my thing
4,3855,hmeisler,I have zero opinion on bitcoin.,2019-05-15 17:43:43+00:00,0,1,i have zero opinion on bitcoin
5,3858,hmeisler,bitcoin? you have the wrong girl here.,2019-05-15 17:06:11+00:00,0,3,bitcoin you have the wrong girl here
6,3864,hmeisler,For months on end my timeline was silent on Bi...,2019-05-15 16:51:22+00:00,9,104,for months on end my timeline was silent on bi...
7,5179,hmeisler,not of the sort i use. but honestly i think if...,2019-06-21 18:42:24+00:00,0,0,not of the sort i use but honestly i think if ...
8,5185,hmeisler,"I look at the stock market, not bitcoin. Sorry...",2019-06-21 18:31:06+00:00,0,0,i look at the stock market not bitcoin sorry i...
9,5188,hmeisler,1) It's Helene with an E 2) I have no opinion ...,2019-06-21 18:13:57+00:00,0,0,1 its helene with an e 2 i have no opinion on ...


In [39]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [40]:
tweetscrypto['text'][2]

'Newsfeed from @payments_source : How an African startup built a #bitcoin network while crypto’s reputation crumbled #cryptocurrency https://nextbillion.net/news/african-startup-built-bitcoin-network-cryptos-reputation-crumbled/'

In [41]:
tweetscrypto['clean_text'][2]

'newsfeed from  how an african startup built a bitcoin network while crypto’s reputation crumbled cryptocurrency'

In [42]:
stop_words2 = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'into', 'through', 'during', 'to', 'in', 'on', 'off', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'most', 'some', 'such', 'nor', 'only', 'own', 'same', 'so', 'than', 'very', 's', 't', 'can', 'just', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y',  ]
print(stop_words2)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'into', 'through', 'during', 'to', 'in', 'on', 'off', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'most', 'some', 'such', 'nor', 'only', 'own', 'same', 'so', 'than', 'very', 's', 't', 'can', 'just', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y']


In [43]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
 
  
# remove stopwords function 
def remove_stopwords(text):
    stop_words = set(stop_words2) 
    word_tokens = word_tokenize(text) 
    filtered_text = [word for word in word_tokens if word not in stop_words] 
    return filtered_text 

tweetscrypto['clean_text']=tweetscrypto['clean_text'].apply(lambda x: remove_stopwords(x))

In [44]:
tweetscrypto.head(15)

Unnamed: 0,index,name,text,date,retweets,favorites,clean_text
0,350,NextBillion,News via @livemint: @RBI bars banks from links...,2018-04-10 06:00:00+00:00,0,0,"[news, via, bars, banks, from, links, cryptocu..."
1,353,NextBillion,News via @livemint: @RBI bars banks from links...,2018-04-09 18:00:00+00:00,0,0,"[news, via, bars, banks, from, links, cryptocu..."
2,379,NextBillion,Newsfeed from @payments_source : How an Africa...,2018-04-02 17:58:40+00:00,0,0,"[newsfeed, from, african, startup, built, bitc..."
3,2375,hmeisler,Is that Bitcoin? Not my thing,2019-04-11 20:20:00+00:00,0,1,"[is, bitcoin, not, thing]"
4,3855,hmeisler,I have zero opinion on bitcoin.,2019-05-15 17:43:43+00:00,0,1,"[zero, opinion, bitcoin]"
5,3858,hmeisler,bitcoin? you have the wrong girl here.,2019-05-15 17:06:11+00:00,0,3,"[bitcoin, wrong, girl]"
6,3864,hmeisler,For months on end my timeline was silent on Bi...,2019-05-15 16:51:22+00:00,9,104,"[months, end, timeline, was, silent, bitcoin, ..."
7,5179,hmeisler,not of the sort i use. but honestly i think if...,2019-06-21 18:42:24+00:00,0,0,"[not, sort, use, honestly, think, want, bitcoi..."
8,5185,hmeisler,"I look at the stock market, not bitcoin. Sorry...",2019-06-21 18:31:06+00:00,0,0,"[look, stock, market, not, bitcoin, sorry, not..."
9,5188,hmeisler,1) It's Helene with an E 2) I have no opinion ...,2019-06-21 18:13:57+00:00,0,0,"[1, helene, e, 2, no, opinion, bitcoin]"


In [45]:
def untokenize(text):
    return [TreebankWordDetokenizer().detokenize(token) for token in text]

In [46]:
tweetscrypto['clean_text2']=tweetscrypto['clean_text'].apply(lambda x: " ".join(x))

In [47]:
tweetscrypto.head(15)

Unnamed: 0,index,name,text,date,retweets,favorites,clean_text,clean_text2
0,350,NextBillion,News via @livemint: @RBI bars banks from links...,2018-04-10 06:00:00+00:00,0,0,"[news, via, bars, banks, from, links, cryptocu...",news via bars banks from links cryptocurrencie...
1,353,NextBillion,News via @livemint: @RBI bars banks from links...,2018-04-09 18:00:00+00:00,0,0,"[news, via, bars, banks, from, links, cryptocu...",news via bars banks from links cryptocurrencie...
2,379,NextBillion,Newsfeed from @payments_source : How an Africa...,2018-04-02 17:58:40+00:00,0,0,"[newsfeed, from, african, startup, built, bitc...",newsfeed from african startup built bitcoin ne...
3,2375,hmeisler,Is that Bitcoin? Not my thing,2019-04-11 20:20:00+00:00,0,1,"[is, bitcoin, not, thing]",is bitcoin not thing
4,3855,hmeisler,I have zero opinion on bitcoin.,2019-05-15 17:43:43+00:00,0,1,"[zero, opinion, bitcoin]",zero opinion bitcoin
5,3858,hmeisler,bitcoin? you have the wrong girl here.,2019-05-15 17:06:11+00:00,0,3,"[bitcoin, wrong, girl]",bitcoin wrong girl
6,3864,hmeisler,For months on end my timeline was silent on Bi...,2019-05-15 16:51:22+00:00,9,104,"[months, end, timeline, was, silent, bitcoin, ...",months end timeline was silent bitcoin nonstop...
7,5179,hmeisler,not of the sort i use. but honestly i think if...,2019-06-21 18:42:24+00:00,0,0,"[not, sort, use, honestly, think, want, bitcoi...",not sort use honestly think want bitcoin perso...
8,5185,hmeisler,"I look at the stock market, not bitcoin. Sorry...",2019-06-21 18:31:06+00:00,0,0,"[look, stock, market, not, bitcoin, sorry, not...",look stock market not bitcoin sorry not journa...
9,5188,hmeisler,1) It's Helene with an E 2) I have no opinion ...,2019-06-21 18:13:57+00:00,0,0,"[1, helene, e, 2, no, opinion, bitcoin]",1 helene e 2 no opinion bitcoin


In [48]:
import csv
tweetscrypto.to_csv('clean_tweets.csv', index=False, quoting=csv.QUOTE_ALL)

# THIS MODEL DOES NOT SEEM TO PERFORM WELL ON THE TWEETS. LET'S TRY WITH THE 5 STARS MODEL

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

classifier = pipeline(task="sentiment-analysis", model=model,tokenizer=tokenizer, return_all_scores=True)

In [2]:
# Read the total file and choose the ones who mention bitcoin. We will use "buy" and "sell" as labels for the sentiment

tweets = pd.read_csv('./twitter_batch2_200909.csv')
tweets['text'] = tweets['text'].astype('str') 
tweets = tweets.drop(['to','retweets','favorites','id','permalink','hashtags','mentions','geo','Unnamed: 0'], axis=1)
tweetscrypto = tweets[tweets['text'].str.contains("bitcoin|Bitcoin")]


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
tweetscrypto

Unnamed: 0,name,text,date
350,NextBillion,News via @livemint: @RBI bars banks from links...,2018-04-10 06:00:00+00:00
353,NextBillion,News via @livemint: @RBI bars banks from links...,2018-04-09 18:00:00+00:00
379,NextBillion,Newsfeed from @payments_source : How an Africa...,2018-04-02 17:58:40+00:00
2375,hmeisler,Is that Bitcoin? Not my thing,2019-04-11 20:20:00+00:00
3855,hmeisler,I have zero opinion on bitcoin.,2019-05-15 17:43:43+00:00
...,...,...,...
5215648,GCGodfrey,India lifts ban on crypto currencies.. Still c...,2020-03-04 12:22:35
5215681,GCGodfrey,Should you invest in Bitcoin? Wait for the sur...,2020-02-26 15:35:51
5215712,GCGodfrey,Because it just wasn’t hot enough last time .....,2020-02-11 18:02:55
5216019,GCGodfrey,#Bitcoin on @SkyNews with @MoolaTeam https://m...,2018-01-16 10:25:57


In [8]:
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

classifier = pipeline(task="sentiment-analysis", model=model,tokenizer=tokenizer, return_all_scores=True)

In [15]:
for i in range (85267,len(tweetscrypto)):
          
    with open('out_news_sentiment_tweets_5stars.txt', 'a') as f:     #I put the "2" to not erase the file if we execute this
        
        print(tweetscrypto.iloc[i,2],';;;;',tweetscrypto.iloc[i,1],';;;;',classifier(tweetscrypto.iloc[i,1]), file=f)

In [5]:
tweetscrypto.head()

Unnamed: 0,name,text,date
350,NextBillion,News via @livemint: @RBI bars banks from links...,2018-04-10 06:00:00+00:00
353,NextBillion,News via @livemint: @RBI bars banks from links...,2018-04-09 18:00:00+00:00
379,NextBillion,Newsfeed from @payments_source : How an Africa...,2018-04-02 17:58:40+00:00
2375,hmeisler,Is that Bitcoin? Not my thing,2019-04-11 20:20:00+00:00
3855,hmeisler,I have zero opinion on bitcoin.,2019-05-15 17:43:43+00:00


In [25]:
data = pd.read_csv('out_news_sentiment_tweets_5stars.txt',
                   sep=";;;;",
                   header=None,
                   engine='python')

In [26]:
data

Unnamed: 0,0,1,2
0,2018-04-09 18:00:00+00:00,News via @livemint: @RBI bars banks from link...,"[[{'label': '1 star', 'score': 0.658743977546..."
1,2018-04-02 17:58:40+00:00,Newsfeed from @payments_source : How an Afric...,"[[{'label': '1 star', 'score': 0.685619950294..."
2,2019-04-11 20:20:00+00:00,Is that Bitcoin? Not my thing,"[[{'label': '1 star', 'score': 0.564680814743..."
3,2019-05-15 17:43:43+00:00,I have zero opinion on bitcoin.,"[[{'label': '1 star', 'score': 0.716885983943..."
4,2019-05-15 17:06:11+00:00,bitcoin? you have the wrong girl here.,"[[{'label': '1 star', 'score': 0.646257460117..."
...,...,...,...
132098,2020-03-04 12:22:35,India lifts ban on crypto currencies.. Still ...,"[[{'label': '1 star', 'score': 0.503425657749..."
132099,2020-02-26 15:35:51,Should you invest in Bitcoin? Wait for the su...,"[[{'label': '1 star', 'score': 0.329764157533..."
132100,2020-02-11 18:02:55,Because it just wasn’t hot enough last time ....,"[[{'label': '1 star', 'score': 0.395248651504..."
132101,2018-01-16 10:25:57,#Bitcoin on @SkyNews with @MoolaTeam https://...,"[[{'label': '1 star', 'score': 0.222680196166..."


In [27]:
data['Date']=data[0]
data['Tweet']=data[1]
data['Sentiment']=data[2]
data=data.drop([0,1,2], axis=1)

In [28]:
data.head()

Unnamed: 0,Date,Tweet,Sentiment
0,2018-04-09 18:00:00+00:00,News via @livemint: @RBI bars banks from link...,"[[{'label': '1 star', 'score': 0.658743977546..."
1,2018-04-02 17:58:40+00:00,Newsfeed from @payments_source : How an Afric...,"[[{'label': '1 star', 'score': 0.685619950294..."
2,2019-04-11 20:20:00+00:00,Is that Bitcoin? Not my thing,"[[{'label': '1 star', 'score': 0.564680814743..."
3,2019-05-15 17:43:43+00:00,I have zero opinion on bitcoin.,"[[{'label': '1 star', 'score': 0.716885983943..."
4,2019-05-15 17:06:11+00:00,bitcoin? you have the wrong girl here.,"[[{'label': '1 star', 'score': 0.646257460117..."


In [29]:
data['start_1stars'] = data['Sentiment'].str.find("'1 star', 'score': ") + len("'1 star', 'score': ")
data['end_1stars'] = data['Sentiment'].str.find("}, {'label': '2 stars'")
data['1_stars']= data.apply(lambda x: x['Sentiment'][slice(x['start_1stars'], x['end_1stars'])], axis=1)
data['1_stars']= data.apply(lambda x: x['Sentiment'][slice(x['start_1stars'], x['end_1stars'])], axis=1)
data['1_stars'] = data['1_stars'].astype('float64') 
data['1_stars'] = round(data['1_stars'],2)
data=data.drop(['start_1stars','end_1stars'], axis=1)


data['start_2stars'] = data['Sentiment'].str.find("'2 stars', 'score': ") + len("'2 stars', 'score': ")
data['end_2stars'] = data['Sentiment'].str.find("}, {'label': '3 stars'")
data['2_stars']= data.apply(lambda x: x['Sentiment'][slice(x['start_2stars'], x['end_2stars'])], axis=1)
data['2_stars'] = data['2_stars'].astype('float64') 
data['2_stars'] = round(data['2_stars'],2)
data=data.drop(['start_2stars','end_2stars'], axis=1)


data['start_3stars'] = data['Sentiment'].str.find("'3 stars', 'score': ") + len("'3 stars', 'score': ")
data['end_3stars'] = data['Sentiment'].str.find("}, {'label': '4 stars'")
data['3_stars']= data.apply(lambda x: x['Sentiment'][slice(x['start_3stars'], x['end_3stars'])], axis=1)
data['3_stars'] = data['3_stars'].astype('float64') 
data['3_stars'] = round(data['3_stars'],2)
data=data.drop(['start_3stars','end_3stars'], axis=1)

data['start_4stars'] = data['Sentiment'].str.find("'4 stars', 'score': ") + len("'4 stars', 'score': ")
data['end_4stars'] = data['Sentiment'].str.find("}, {'label': '5 stars'")
data['4_stars']= data.apply(lambda x: x['Sentiment'][slice(x['start_4stars'], x['end_4stars'])], axis=1)
data['4_stars'] = data['4_stars'].astype('float64') 
data['4_stars'] = round(data['4_stars'],2)
data=data.drop(['start_4stars','end_4stars'], axis=1)

data['start_5stars'] = data['Sentiment'].str.find("'5 stars', 'score': ") + len("'5 stars', 'score': ")
data['end_5stars'] = data['Sentiment'].str.find("}]]")
data['5_stars']= data.apply(lambda x: x['Sentiment'][slice(x['start_5stars'], x['end_5stars'])], axis=1)
data['5_stars'] = data['5_stars'].astype('float64') 
data['5_stars'] = round(data['5_stars'],2)
data=data.drop(['start_5stars','end_5stars'], axis=1)




data.head()

Unnamed: 0,Date,Tweet,Sentiment,1_stars,2_stars,3_stars,4_stars,5_stars
0,2018-04-09 18:00:00+00:00,News via @livemint: @RBI bars banks from link...,"[[{'label': '1 star', 'score': 0.658743977546...",0.66,0.2,0.09,0.03,0.02
1,2018-04-02 17:58:40+00:00,Newsfeed from @payments_source : How an Afric...,"[[{'label': '1 star', 'score': 0.685619950294...",0.69,0.2,0.08,0.02,0.01
2,2019-04-11 20:20:00+00:00,Is that Bitcoin? Not my thing,"[[{'label': '1 star', 'score': 0.564680814743...",0.56,0.29,0.13,0.01,0.0
3,2019-05-15 17:43:43+00:00,I have zero opinion on bitcoin.,"[[{'label': '1 star', 'score': 0.716885983943...",0.72,0.15,0.08,0.03,0.02
4,2019-05-15 17:06:11+00:00,bitcoin? you have the wrong girl here.,"[[{'label': '1 star', 'score': 0.646257460117...",0.65,0.19,0.09,0.03,0.04


In [30]:
data=data.drop(['Sentiment'], axis=1)
data['Date'] = pd.to_datetime(data['Date'],format="%Y-%m-%d")
data['Tweet'] = data['Tweet'].astype('str') 

In [39]:
# WE USE quoting=csv.quote so that it writes the values between " ", and then we don't have problems with
# tweets where a comma is included as part of the tweet.

import csv
data.to_csv('./sentiments_tweets.csv', index=False, quoting=csv.QUOTE_ALL)

# LET'S USE A FINANCIAL MODEL (BERT)

In [89]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

tokenizer = AutoTokenizer.from_pretrained("ipuneetrathore/bert-base-cased-finetuned-finBERT")

model = AutoModelForSequenceClassification.from_pretrained("ipuneetrathore/bert-base-cased-finetuned-finBERT")

In [90]:
classifier = pipeline(task="sentiment-analysis", model=model,tokenizer=tokenizer, return_all_scores=True)

In [132]:
import pandas as pd

tweets_good_finbert = pd.read_csv('./FINBERT/good_tweets_finbert.csv')


In [133]:
tweets_good_finbert.head(5)

Unnamed: 0,Date,Tweet
0,2020-08-12 08:00:02+00:00,"""Bitcoin is unstoppable."" #bitcoin #bunnies"
1,2020-08-10 08:00:02+00:00,"""it's definitely not a bubble"" #bitcoin #bunn..."
2,2020-05-15 22:20:42+00:00,I do think Bitcoin ends up being the new lead...
3,2018-11-22 19:03:33+00:00,I'm endlessly thankful for the wonderful Bitc...
4,2020-07-27 01:40:54+00:00,Bitcoin is incredibly undervalued at $10k.


In [134]:
tweets_good_finbert['Tweet'] = tweets_good_finbert['Tweet'].astype('str') 

In [135]:
for i in range (0,len(tweets_good_finbert)):
          
    with open('./FINBERT/good_tweets_finbert_final.csv', 'a') as f:     #I put the "2" to not erase the file if we execute this
        
        print(tweets_good_finbert.iloc[i,0],';;;;',tweets_good_finbert.iloc[i,1],';;;;',classifier(tweets_good_finbert.iloc[i,1]), file=f)

In [136]:
data = pd.read_csv('./FINBERT/good_tweets_finbert_final.csv',
                   sep=";;;;",
                   header=None,
                   engine='python')

In [137]:
data['Date']=data[0]
data['Tweet']=data[1]
data['Sentiment']=data[2]
data=data.drop([0,1,2], axis=1)

In [138]:
data['start_LABEL0'] = data['Sentiment'].str.find("'LABEL_0', 'score': ") + len("'LABEL_0', 'score': ")
data['end_LABEL0'] = data['Sentiment'].str.find("}, {'label': 'LABEL_1'")
data['0_stars']= data.apply(lambda x: x['Sentiment'][slice(x['start_LABEL0'], x['end_LABEL0'])], axis=1)
data['0_stars'] = data['0_stars'].astype('float64') 
data['0_stars'] = round(data['0_stars'],3)
data=data.drop(['start_LABEL0','end_LABEL0'], axis=1)


data['start_LABEL1'] = data['Sentiment'].str.find("'LABEL_1', 'score': ") + len("'LABEL_1', 'score': ")
data['end_LABEL1'] = data['Sentiment'].str.find("}, {'label': 'LABEL_2'")
data['1_star']= data.apply(lambda x: x['Sentiment'][slice(x['start_LABEL1'], x['end_LABEL1'])], axis=1)
data['1_star'] = data['1_star'].astype('float64') 
data['1_star'] = round(data['1_star'],2)
data=data.drop(['start_LABEL1','end_LABEL1'], axis=1)




data['start_LABEL2'] = data['Sentiment'].str.find("'LABEL_2', 'score': ") + len("'LABEL_2', 'score': ")
data['end_LABEL2'] = data['Sentiment'].str.find("}]]")
data['2_stars']= data.apply(lambda x: x['Sentiment'][slice(x['start_LABEL2'], x['end_LABEL2'])], axis=1)
data['2_stars'] = data['2_stars'].astype('float64') 
data['2_stars'] = round(data['2_stars'],2)
data=data.drop(['start_LABEL2','end_LABEL2','Sentiment'], axis=1)



In [139]:
data

Unnamed: 0,Date,Tweet,0_stars,1_star,2_stars
0,2020-08-12 08:00:02+00:00,"""Bitcoin is unstoppable."" #bitcoin #bunnies",0.0,1.00,0.00
1,2020-08-10 08:00:02+00:00,"""it's definitely not a bubble"" #bitcoin #bun...",0.0,1.00,0.00
2,2020-05-15 22:20:42+00:00,I do think Bitcoin ends up being the new lea...,0.0,1.00,0.00
3,2018-11-22 19:03:33+00:00,I'm endlessly thankful for the wonderful Bit...,0.0,0.00,1.00
4,2020-07-27 01:40:54+00:00,Bitcoin is incredibly undervalued at $10k.,1.0,0.00,0.00
...,...,...,...,...,...
65,2020-02-05 19:36:50,If each millionaire on Earth wanted a Bitcoi...,0.0,1.00,0.00
66,2019-09-27 17:17:43+00:00,The average lifespan of fiat currency is 27 ...,0.0,1.00,0.00
67,2018-02-06 13:58:45,"Hey, congrats to bitcoin, really happy for y...",0.0,0.46,0.54
68,2020-01-07 07:31:58+00:00,#Bitcoin is up 13% in the last 3 days.,0.0,0.00,1.00


In [140]:
import csv
data.to_csv('./FINBERT/good_tweets_finbert_final2.csv', index=False, quoting=csv.QUOTE_ALL)

In [141]:
tweets_bad_finbert = pd.read_csv('./FINBERT/bad_tweets_finbert.csv')
tweets_bad_finbert['Tweet'] = tweets_bad_finbert['Tweet'].astype('str') 
tweets_bad_finbert.head(5)

Unnamed: 0,Date,Tweet
0,2018-10-02 02:14:41+00:00,Bitcoin is screwed. Sell everything. Thanks f...
1,2019-11-25 04:24:57+00:00,"He meant it as ""don't acquire bitcoins""."
2,2019-09-24 19:34:09+00:00,Clearly Bitcoin is going down because democra...
3,2020-01-02 17:02:22+00:00,Bitcoin falls below $7000 for the first time ...
4,2018-09-06 05:42:08+00:00,Asia stocks head for 1y low on trade & EM anx...


In [142]:
for i in range (0,len(tweets_bad_finbert)):
          
    with open('./FINBERT/bad_tweets_finbert_final.csv', 'a') as f:     #I put the "2" to not erase the file if we execute this
        
        print(tweets_bad_finbert.iloc[i,0],';;;;',tweets_bad_finbert.iloc[i,1],';;;;',classifier(tweets_bad_finbert.iloc[i,1]), file=f)

In [143]:
data2 = pd.read_csv('./FINBERT/bad_tweets_finbert_final.csv',
                   sep=";;;;",
                   header=None,
                   engine='python')

In [144]:
data2['Date']=data2[0]
data2['Tweet']=data2[1]
data2['Sentiment']=data2[2]
data2=data2.drop([0,1,2], axis=1)

In [145]:
data2['start_LABEL0'] = data2['Sentiment'].str.find("'LABEL_0', 'score': ") + len("'LABEL_0', 'score': ")
data2['end_LABEL0'] = data2['Sentiment'].str.find("}, {'label': 'LABEL_1'")
data2['0_stars']= data2.apply(lambda x: x['Sentiment'][slice(x['start_LABEL0'], x['end_LABEL0'])], axis=1)
data2['0_stars'] = data2['0_stars'].astype('float64') 
data2['0_stars'] = round(data2['0_stars'],3)
data2=data2.drop(['start_LABEL0','end_LABEL0'], axis=1)


data2['start_LABEL1'] = data2['Sentiment'].str.find("'LABEL_1', 'score': ") + len("'LABEL_1', 'score': ")
data2['end_LABEL1'] = data2['Sentiment'].str.find("}, {'label': 'LABEL_2'")
data2['1_star']= data2.apply(lambda x: x['Sentiment'][slice(x['start_LABEL1'], x['end_LABEL1'])], axis=1)
data2['1_star'] = data2['1_star'].astype('float64') 
data2['1_star'] = round(data2['1_star'],2)
data2=data2.drop(['start_LABEL1','end_LABEL1'], axis=1)




data2['start_LABEL2'] = data2['Sentiment'].str.find("'LABEL_2', 'score': ") + len("'LABEL_2', 'score': ")
data2['end_LABEL2'] = data2['Sentiment'].str.find("}]]")
data2['2_stars']= data2.apply(lambda x: x['Sentiment'][slice(x['start_LABEL2'], x['end_LABEL2'])], axis=1)
data2['2_stars'] = data2['2_stars'].astype('float64') 
data2['2_stars'] = round(data2['2_stars'],2)
data2=data2.drop(['start_LABEL2','end_LABEL2','Sentiment'], axis=1)

In [146]:
data2

Unnamed: 0,Date,Tweet,0_stars,1_star,2_stars
0,2018-10-02 02:14:41+00:00,Bitcoin is screwed. Sell everything. Thanks ...,0.000,1.00,0.00
1,2019-11-25 04:24:57+00:00,"He meant it as ""don't acquire bitcoins"".",0.000,1.00,0.00
2,2019-09-24 19:34:09+00:00,Clearly Bitcoin is going down because democr...,1.000,0.00,0.00
3,2020-01-02 17:02:22+00:00,Bitcoin falls below $7000 for the first time...,1.000,0.00,0.00
4,2018-09-06 05:42:08+00:00,Asia stocks head for 1y low on trade & EM an...,0.342,0.66,0.00
...,...,...,...,...,...
59,2019-05-04 15:54:59,I'm happy bitcoin just dumped. Got tired of ...,0.001,0.99,0.01
60,2019-11-11 07:00:48+00:00,If you believe #bitcoin behaves as a safe ha...,0.000,1.00,0.00
61,2019-11-25 08:30:26+00:00,#Bitcoin is down 30% in just the last 20 day...,1.000,0.00,0.00
62,2019-11-22 15:11:02+00:00,#Bitcoin is down a whopping 48% since late J...,0.631,0.37,0.00


In [147]:
import csv
data2.to_csv('./FINBERT/bad_tweets_finbert_final2.csv', index=False, quoting=csv.QUOTE_ALL)