#

In [1]:
# https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest?text=you+might+be+a+if...+%23%23sjw+%23liberal+%23politics
# Twitter-roBERTa-base for Sentiment Analysis - UPDATED (2022)

In [2]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import pandas as pd


In [3]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)


In [4]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL, local_files_only=True)
config = AutoConfig.from_pretrained(MODEL)

In [5]:
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def predict(text):
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    return softmax(scores)

In [7]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)


In [18]:
df_train_cleaned = pd.read_csv("..\\..\\..\\data\\twitter_hate-speech\\train_cleaned.csv")
df_train_cleaned = df_train_cleaned[df_train_cleaned.tweet_cleaned.notna()]
df_train_cleaned.head()

Unnamed: 0,id,label,tweet,tweet_cleaned,user_handle,hashtags,emojis
0,8886,0,@user #cinemaaawards final rehearsals!! geari...,cinema award final rehearsal gear evening butt...,1,"['#cinemaaawards', '#butterflies', '#stage']",
1,909,0,istg this is the best cheese ta but dayum expe...,tg good cheese day um expensive,0,[],
2,27613,0,this was amazing. the weather was not. #musica...,amazing weather musical london matilda west en...,0,"['#musical', '#london', '#matilda', '#westend'...",
3,15999,0,yes! #talented #sexy ‘criminal minds’ casts ...,yes talented sexy ' criminal mind ' cast serie...,2,"['#talented', '#sexy']",
4,23817,0,want to be while being #successful? see how ...,want successful see work life balance help,2,"['#successful', '#worklifebalance']",


In [9]:
# df_train_cleaned.head(5).tweet_cleaned.apply(predict)

In [10]:
# predict("@user #cinemaaawards final rehearsals!! gearing up for the evening!! #butterflies #stage  ! hope u all like it")

### Predict "tweet_cleaned"

In [20]:
df_train_cleaned["tweet_cleaned_roberta_predictions"] = df_train_cleaned["tweet_cleaned"].apply(predict)

In [21]:
df_train_cleaned

Unnamed: 0,id,label,tweet,tweet_cleaned,user_handle,hashtags,emojis,tweet_cleaned_roberta_predictions
0,8886,0,@user #cinemaaawards final rehearsals!! geari...,cinema award final rehearsal gear evening butt...,1,"['#cinemaaawards', '#butterflies', '#stage']",,"[0.011631846, 0.8329207, 0.15544751]"
1,909,0,istg this is the best cheese ta but dayum expe...,tg good cheese day um expensive,0,[],,"[0.17947705, 0.32253772, 0.49798524]"
2,27613,0,this was amazing. the weather was not. #musica...,amazing weather musical london matilda west en...,0,"['#musical', '#london', '#matilda', '#westend'...",,"[0.0014606817, 0.025883349, 0.97265595]"
3,15999,0,yes! #talented #sexy ‘criminal minds’ casts ...,yes talented sexy ' criminal mind ' cast serie...,2,"['#talented', '#sexy']",,"[0.0022572489, 0.062179178, 0.93556356]"
4,23817,0,want to be while being #successful? see how ...,want successful see work life balance help,2,"['#successful', '#worklifebalance']",,"[0.0322919, 0.43325263, 0.5344555]"
...,...,...,...,...,...,...,...,...
20174,16407,0,joshwin is always like this! 😂😂 what's wrong w...,josh win always like face tear joy tear joy wr...,0,[],"__face_with_tears_of_joy__,__face_with_tears_o...","[0.3770243, 0.42588902, 0.19708675]"
20175,6526,0,#makaveli #day i hit 1000 plays with this ...,makaveli day hit play one hear even thugge lil...,0,"['#makaveli', '#day', '#kingtutkafafi']",,"[0.018977983, 0.52240974, 0.45861232]"
20176,8002,1,video men and women malayalees xxx pictureban...,video man woman malaya lee xxx picture banglad...,0,[],,"[0.017293481, 0.8894819, 0.09322464]"
20177,323,0,hahaha.. this is me last #friday &amp; #weird,hahaha last friday weird,0,"['#friday', '#weird']",,"[0.18371728, 0.61801475, 0.19826792]"


### Predict "tweet"

In [22]:
df_train_cleaned["tweet_uncleaned_roberta_predictions"] = df_train_cleaned["tweet"].apply(predict)

### Save prediction results

In [26]:
df_train_cleaned.to_csv("roberta_results.csv")

In [27]:
results = pd.read_csv("roberta_results.csv")
results.head()

Unnamed: 0.1,Unnamed: 0,id,label,tweet,tweet_cleaned,user_handle,hashtags,emojis,tweet_cleaned_roberta_predictions,tweet_uncleaned_roberta_predictions
0,0,8886,0,@user #cinemaaawards final rehearsals!! geari...,cinema award final rehearsal gear evening butt...,1,"['#cinemaaawards', '#butterflies', '#stage']",,[0.01163185 0.8329207 0.15544751],[0.00121513 0.01826618 0.9805187 ]
1,1,909,0,istg this is the best cheese ta but dayum expe...,tg good cheese day um expensive,0,[],,[0.17947705 0.32253772 0.49798524],[0.41535568 0.22157288 0.3630715 ]
2,2,27613,0,this was amazing. the weather was not. #musica...,amazing weather musical london matilda west en...,0,"['#musical', '#london', '#matilda', '#westend'...",,[0.00146068 0.02588335 0.97265595],[0.00485588 0.02015036 0.9749937 ]
3,3,15999,0,yes! #talented #sexy ‘criminal minds’ casts ...,yes talented sexy ' criminal mind ' cast serie...,2,"['#talented', '#sexy']",,[0.00225725 0.06217918 0.93556356],[0.00162262 0.04763915 0.95073825]
4,4,23817,0,want to be while being #successful? see how ...,want successful see work life balance help,2,"['#successful', '#worklifebalance']",,[0.0322919 0.43325263 0.5344555 ],[0.0041232 0.24599265 0.7498841 ]
