In [None]:
# https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest?text=you+might+be+a+if...+%23%23sjw+%23liberal+%23politics
# Twitter-roBERTa-base for Sentiment Analysis - UPDATED (2022)

In [None]:
import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from sklearn.metrics import classification_report
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig


In [None]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single space characters. A token that starts with
    ``@`` and is longer than one character becomes ``@user``; a token that
    starts with ``http`` becomes ``http``; every other token is kept as is.
    The tokens are then joined back with single spaces.
    """
    def _placeholder(token):
        # A lone "@" is not treated as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))


In [None]:
# Hugging Face Hub id of the checkpoint used throughout this notebook.
# (Plain string: the original f-string had no placeholders.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# NOTE(review): local_files_only=True means the tokenizer is only looked up
# locally, while the config below and the model in the next cell are loaded
# without that flag — confirm this difference is intended.
tokenizer = AutoTokenizer.from_pretrained(MODEL, local_files_only=True)
config = AutoConfig.from_pretrained(MODEL)

In [None]:
# Load the sequence-classification model for the checkpoint named by MODEL.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Disabled: would save the loaded model under the path given by MODEL.
# model.save_pretrained(MODEL)

In [None]:
def predict(text):
    """Return softmax-normalised model scores for a single text.

    Uses the notebook-level ``tokenizer`` and ``model``. The text is passed
    to the tokenizer as given; ``preprocess`` is not applied here.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    numpy.ndarray
        Softmax of the first row of the model's first output
        (presumably the per-class logits — confirm against the model docs).
    """
    encoded_input = tokenizer(text, return_tensors='pt')
    # Inference only: disable autograd so no graph is recorded per call.
    with torch.no_grad():
        output = model(**encoded_input)
    # First output element, first (and only) item of the batch.
    scores = output[0][0].numpy()
    return softmax(scores)

In [None]:
# Preprocess text (username and link placeholders)
# NOTE(review): this cell re-defines the `preprocess` helper that an earlier
# cell already defines; the later definition is the one left in effect.
def preprocess(text):
    """Swap user mentions and links in ``text`` for fixed placeholders.

    Works on the pieces obtained by splitting on single spaces: a piece that
    starts with ``@`` and has more than one character is replaced by
    ``@user``, a piece that starts with ``http`` is replaced by ``http``,
    anything else is kept. The pieces are joined back with single spaces.
    """
    masked = []
    for word in text.split(" "):
        if word.startswith('@') and len(word) > 1:
            word = '@user'
        elif word.startswith('http'):
            word = 'http'
        masked.append(word)
    return " ".join(masked)


### Data

In [None]:
# Load the cleaned training tweets and keep only rows that have cleaned text.
# Forward slashes keep the relative path usable on Windows, Linux and macOS
# (the backslash-separated form only resolves on Windows).
df_train_cleaned = pd.read_csv("../../../data/twitter_hate-speech/train_cleaned.csv")
df_train_cleaned = df_train_cleaned[df_train_cleaned.tweet_cleaned.notna()]
df_train_cleaned.head()

In [None]:
# Load the RNN-style cleaned training tweets and keep only rows that have
# cleaned text. Forward slashes keep the relative path usable on Windows,
# Linux and macOS (the backslash-separated form only resolves on Windows).
df_train_cleaned_rnn = pd.read_csv("../../../data/twitter_hate-speech/train_cleaned_rnn.csv")
df_train_cleaned_rnn = df_train_cleaned_rnn[df_train_cleaned_rnn.tweet_cleaned.notna()]
df_train_cleaned_rnn.head()

### Result DF

In [None]:
# Result frame: label, raw tweet and both cleaned variants side by side.
# The rnn column is matched to the rows by index label, not by position.
# NOTE(review): the two source frames are filtered independently, and the
# last rows of the frame displayed further down pair tweets with rnn texts
# that look unrelated — confirm both CSVs share the same row index.
results = (
    df_train_cleaned.loc[:, ["label", "tweet", "tweet_cleaned"]]
    .copy()
    .assign(tweet_cleaned_rnn=df_train_cleaned_rnn["tweet_cleaned"])
)
results.head()

### Predict "tweet_cleaned"

In [None]:
# Enabled: the comparison section reads this column back from the saved CSV,
# so a fresh run has to produce it before the results are written.
results["tweet_cleaned_roberta_predictions"] = df_train_cleaned["tweet_cleaned"].apply(predict)

### Predict "tweet"

In [None]:
# Enabled: the comparison section reads this column back from the saved CSV,
# so a fresh run has to produce it before the results are written.
results["tweet_uncleaned_roberta_predictions"] = df_train_cleaned["tweet"].apply(predict)

### Predict "tweet_cleaned_rnn"

In [None]:
# Score every RNN-style cleaned tweet with predict(); each entry of the new
# column holds the value predict() returned for that row.
results["tweet_cleaned_rnn_roberta_predictions"] = df_train_cleaned_rnn["tweet_cleaned"].apply(predict)

### Save prediction results

In [None]:
# Persist the result frame (including its index) next to the notebook.
# The prediction columns come back as text such as "[0.01 0.83 0.15]" when
# the file is re-read, which is why the comparison section parses strings.
results.to_csv("roberta_result.csv")

In [None]:
# Reload the saved predictions, using the first CSV column as the index.
results = pd.read_csv("roberta_result.csv", index_col=0)
results.head()

### Compare Results

In [10]:
# Load the saved predictions again under the name `result`, which is the
# frame the remaining cells of this section work on.
result = pd.read_csv("roberta_result.csv", index_col=0)
result.head()

Unnamed: 0,label,tweet,tweet_cleaned,tweet_cleaned_rnn,tweet_cleaned_roberta_predictions,tweet_uncleaned_roberta_predictions,tweet_cleaned_rnn_roberta_predictions
0,0,@user #cinemaaawards final rehearsals!! geari...,cinema award final rehearsal gear evening butt...,cinemaaawards final rehearsals gearing up for ...,[0.01163185 0.8329207 0.15544751],[0.00121513 0.01826618 0.9805187 ],[0.00206753 0.07825466 0.91967785]
1,0,istg this is the best cheese ta but dayum expe...,tg good cheese day um expensive,istg this is the best cheese ta but dayum expe...,[0.17947705 0.32253772 0.49798524],[0.41535568 0.22157288 0.3630715 ],[0.3564552 0.21312353 0.4304212 ]
2,0,this was amazing. the weather was not. #musica...,amazing weather musical london matilda west en...,this was amazing the weather was not musical l...,[0.00146068 0.02588335 0.97265595],[0.00485588 0.02015036 0.9749937 ],[0.00503316 0.01394822 0.98101866]
3,0,yes! #talented #sexy ‘criminal minds’ casts ...,yes talented sexy ' criminal mind ' cast serie...,yes talented sexy ‘criminal minds’ casts as se...,[0.00225725 0.06217918 0.93556356],[0.00162262 0.04763915 0.95073825],[0.00205533 0.05398567 0.943959 ]
4,0,want to be while being #successful? see how ...,want successful see work life balance help,want to be while being successful see how work...,[0.0322919 0.43325263 0.5344555 ],[0.0041232 0.24599265 0.7498841 ],[0.01058821 0.2610585 0.72835326]


In [29]:
# Drop rows with any missing value. Rebinding to the returned frame (instead
# of inplace=True) leaves the previously loaded object untouched.
result = result.dropna()

In [30]:
def get_prediction_label(values):
    """Turn a stringified score vector into a binary label.

    ``values`` is text such as ``"[0.18 0.32 0.49]"``: brackets are removed,
    the rest is split on single spaces and every non-empty piece is parsed
    as a float.

    Returns 1 when the first score equals the largest score, otherwise 0.
    Raises ``IndexError`` when no score can be parsed.
    """
    pieces = values.replace("[", "").replace("]", "").split(" ")
    scores = [float(piece) for piece in pieces if piece]
    # default=NaN: an empty vector must fail on the index access below,
    # not inside max().
    top_score = max(scores, default=float("nan"))
    return 1 if scores[0] == top_score else 0

In [31]:
# Binary label derived from the scores predicted on `tweet_cleaned`.
result['tweet_cleaned_roberta_predictions_max'] = result['tweet_cleaned_roberta_predictions'].apply(get_prediction_label)

In [32]:
# Binary label derived from the scores predicted on the raw `tweet` text.
result['tweet_uncleaned_roberta_predictions_max'] = result['tweet_uncleaned_roberta_predictions'].apply(get_prediction_label)

In [33]:
# Binary label derived from the scores predicted on `tweet_cleaned_rnn`.
result['tweet_cleaned_rnn_roberta_predictions_max'] = result['tweet_cleaned_rnn_roberta_predictions'].apply(get_prediction_label)

In [34]:
# Show the frame including the three derived `*_max` label columns.
result

Unnamed: 0,label,tweet,tweet_cleaned,tweet_cleaned_rnn,tweet_cleaned_roberta_predictions,tweet_uncleaned_roberta_predictions,tweet_cleaned_rnn_roberta_predictions,tweet_cleaned_roberta_predictions_max,tweet_uncleaned_roberta_predictions_max,tweet_cleaned_rnn_roberta_predictions_max
0,0,@user #cinemaaawards final rehearsals!! geari...,cinema award final rehearsal gear evening butt...,cinemaaawards final rehearsals gearing up for ...,[0.01163185 0.8329207 0.15544751],[0.00121513 0.01826618 0.9805187 ],[0.00206753 0.07825466 0.91967785],0,0,0
1,0,istg this is the best cheese ta but dayum expe...,tg good cheese day um expensive,istg this is the best cheese ta but dayum expe...,[0.17947705 0.32253772 0.49798524],[0.41535568 0.22157288 0.3630715 ],[0.3564552 0.21312353 0.4304212 ],0,1,0
2,0,this was amazing. the weather was not. #musica...,amazing weather musical london matilda west en...,this was amazing the weather was not musical l...,[0.00146068 0.02588335 0.97265595],[0.00485588 0.02015036 0.9749937 ],[0.00503316 0.01394822 0.98101866],0,0,0
3,0,yes! #talented #sexy ‘criminal minds’ casts ...,yes talented sexy ' criminal mind ' cast serie...,yes talented sexy ‘criminal minds’ casts as se...,[0.00225725 0.06217918 0.93556356],[0.00162262 0.04763915 0.95073825],[0.00205533 0.05398567 0.943959 ],0,0,0
4,0,want to be while being #successful? see how ...,want successful see work life balance help,want to be while being successful see how work...,[0.0322919 0.43325263 0.5344555 ],[0.0041232 0.24599265 0.7498841 ],[0.01058821 0.2610585 0.72835326],0,0,0
...,...,...,...,...,...,...,...,...,...,...
20173,0,"@user it's laughable to see all these ""celebri...","laughable see "" celebrity "" ali funeral funera...",is so typical could you be anymore just look ...,[0.3770243 0.42588902 0.19708675],[0.8019254 0.17868435 0.01939025],[0.9280245 0.06675169 0.00522374],0,1,1
20174,0,joshwin is always like this! 😂😂 what's wrong w...,josh win always like face tear joy tear joy wr...,going to give some a whirl ;healthy,[0.01897798 0.52240974 0.45861232],[0.00553541 0.0707955 0.92366904],[0.02291691 0.5061897 0.47089338],0,0,0
20175,0,#makaveli #day i hit 1000 plays with this ...,makaveli day hit play one hear even thugge lil...,on flipside of praise for reminder that reale...,[0.01729348 0.8894819 0.09322464],[0.01132669 0.815968 0.17270525],[0.01583459 0.26939365 0.71477175],0,0,0
20176,1,video men and women malayalees xxx pictureban...,video man woman malaya lee xxx picture banglad...,cbfc wants outwit makers of udtapunjab hence ...,[0.18371728 0.61801475 0.19826792],[0.15314105 0.46144542 0.38541353],[0.6473514 0.3379626 0.01468601],0,0,1


In [35]:
# Ground-truth labels and the three derived prediction columns that the
# classification reports below compare against them.
y_true = result["label"]
y_pred_clean = result["tweet_cleaned_roberta_predictions_max"]
y_pred_unclean = result["tweet_uncleaned_roberta_predictions_max"]
y_pred_clean_rnn = result["tweet_cleaned_rnn_roberta_predictions_max"]

In [36]:
# Report for the labels derived from the `tweet_cleaned` predictions.
print(classification_report(y_true, y_pred_clean))

              precision    recall  f1-score   support

           0       0.94      0.81      0.87     18845
           1       0.07      0.22      0.11      1331

    accuracy                           0.77     20176
   macro avg       0.50      0.51      0.49     20176
weighted avg       0.88      0.77      0.82     20176



In [37]:
# Report for the labels derived from the raw `tweet` predictions.
print(classification_report(y_true, y_pred_unclean))

              precision    recall  f1-score   support

           0       0.94      0.75      0.83     18845
           1       0.07      0.28      0.11      1331

    accuracy                           0.72     20176
   macro avg       0.50      0.51      0.47     20176
weighted avg       0.88      0.72      0.79     20176



In [38]:
# Report for the labels derived from the `tweet_cleaned_rnn` predictions.
print(classification_report(y_true, y_pred_clean_rnn))

              precision    recall  f1-score   support

           0       0.94      0.74      0.83     18845
           1       0.07      0.27      0.11      1331

    accuracy                           0.71     20176
   macro avg       0.50      0.51      0.47     20176
weighted avg       0.88      0.71      0.78     20176

