# Sample tweets sentiment analysis

In [1]:
import re

import pandas as pd
import torch
import vaderSentiment
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


### Load sample data

In [2]:
# Read the pipe-delimited tweet sample into a DataFrame.
sample = pd.read_csv(
    "../data/tweets/tweets_sample.txt",
    sep="|",
)
# A bare trailing expression makes the notebook render the frame.
sample

Unnamed: 0,name,handle,timestamp,location,text
0,chillipalmer™,@ynnmedianetwork,2015-07-06T09:35:08.000Z,"Southeast Dallas, Dallas",Twitter Reacts To JPP Fireworks Rumors http://...
1,Tyles,@tyles,2015-07-06T23:50:53.000Z,"California, USA",Congratulations to \n@obsolescence\n and ptrev...
2,Ken J Gonzalez,@That_nigga_Puto,2015-07-07T05:13:07.000Z,"Holyoke, MA",The 4th was too too real ! Thanks for awsome m...
3,Wole Wolexy,@iamWolexy,2015-07-07T03:34:18.000Z,"Mesquite, TX","Perfection is not attainable, But If we chase ..."
4,Lloy Ball #1,@LTPer,2015-07-06T19:20:43.000Z,"Indiana, USA",@LBpineapple\n sand vb training #evp #Plattsburgh
...,...,...,...,...,...
23411,Erin Sweeney,@e_sween15,2015-07-06T13:24:14.000Z,"Marysville, OH",I would https://twitter.com/exhllarating/stat...
23412,HomerJesse,@homerleal,2015-07-07T00:53:23.000Z,"San Antonio, TX",Monday funday!! #loveit #awesome #tispy #sanan...
23413,511NYMidHudson,@511NYMidHudson,2015-07-06T13:00:36.000Z,"Croton-on-Hudson, NY",Construction on #NorthSouthDivisionStreet BOTH...
23414,kels,@KelseyKielich,2015-07-06T22:11:20.000Z,"Moore, OK",You make me happy & it's Monday @ ur my crush ...


### Functions for preprocessing tweets

In [3]:
# A mention is "@" followed by at least one non-whitespace character and must
# start at a whitespace boundary, so e-mail addresses such as "a@b.com" are
# left alone. The whole token is replaced, including trailing punctuation.
_MENTION_RE = re.compile(r"(?<!\S)@\S+")

# A URL is either a whitespace-delimited token that starts with "http", or an
# "http(s)://" link glued to preceding text. Only printable ASCII is consumed,
# so an emoji attached to either end of the link is kept in the tweet.
_URL_RE = re.compile(r"(?<!\S)http[!-~]*|https?://[!-~]+")


def preprocess_tweet(tweet: str) -> str:
    """Mask user mentions and URLs in a tweet.

    Every mention token is replaced by the placeholder ``@user`` and every
    URL by the placeholder ``http``. All other characters, including the
    original whitespace (spaces, tabs, newlines), are returned unchanged.

    Unlike a plain ``tweet.split(' ')``, tokens are recognised after any
    whitespace character, so a mention that follows a newline is masked too,
    and a URL that touches an emoji is masked without dropping the emoji.

    Args:
        tweet: Raw tweet text.

    Returns:
        The tweet text with mentions and URLs replaced by placeholders.
    """
    # Mentions first: a token such as "@http://x" is a mention, not a URL.
    masked = _MENTION_RE.sub("@user", tweet)
    return _URL_RE.sub("http", masked)


### RoBERTa

In [4]:
# Checkpoint identifier shared by the tokenizer and the classifier so the two
# are always loaded from the same pretrained source.
roberta = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(roberta)
model = AutoModelForSequenceClassification.from_pretrained(roberta)

# Display names for the scores: labels[i] is printed next to scores[i].
# NOTE(review): order is assumed to match the checkpoint's class indices;
# confirm against the model card.
labels = ["Negative", "Neutral", "Positive"]

In [11]:
# Alternative input: a tweet taken from the loaded sample.
# sample_tweet = sample['text'].iloc[12014]

# Known failure case: the model rates this tweet as strongly Positive (see the
# printed scores), which was flagged as the wrong sentiment for this text.
sample_tweet = 'I am SO incredibly excited to get Away from here for 5 days😩 I just need a break... ¿You know?'
preprocessed = preprocess_tweet(sample_tweet)
print(preprocessed)

processed = tokenizer(preprocessed, return_tensors='pt')

# Inference only: with autograd disabled no gradient graph is built, so the
# result can be converted to NumPy directly without a .detach() call.
with torch.no_grad():
    output = model(**processed)

# First element of the model output, first (and only) item of the batch;
# softmax turns those raw scores into values that sum to 1.
scores = softmax(output[0][0].numpy())

# Pair each label with its score instead of indexing both by position.
for label, score in zip(labels, scores):
    print(label, ":", score)

I am SO incredibly excited to get Away from here for 5 days😩 I just need a break... ¿You know?
Negative : 0.0050168945
Neutral : 0.011381342
Positive : 0.98360175
