In [76]:
import re
from transformers import pipeline
classifier = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment",
    truncation=True,      # <--- important
    padding=True,         # <--- for batching
    max_length=512        
)

Device set to use mps:0


In [31]:
#positive
print(classifier("I didn't think this would be good, but it was amazing."))

#neutral
print(classifier("It was neutral."))

#negative
print(classifier("I hated the whole experience."))

[{'label': 'LABEL_2', 'score': 0.8122838735580444}]
[{'label': 'LABEL_1', 'score': 0.6917058825492859}]
[{'label': 'LABEL_0', 'score': 0.9754863381385803}]


In [32]:
import pandas as pd
df = pd.read_csv("data/test.csv", encoding = "latin-1")
df = df[["text", "sentiment"]]

In [88]:
texts = df["text"].astype(str).tolist()
results = classifier(texts, truncation=True, padding=True, max_length=512, batch_size=32)
df["roberta"] = results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["roberta"] = results


In [90]:
df.head()

Unnamed: 0,text,sentiment,label,roberta,roberta1
0,Last session of the day http://twitpic.com/67ezh,neutral,neutral,"{'label': 'LABEL_1', 'score': 0.9038955569267273}","{'label': 'LABEL_1', 'score': 0.9038954377174377}"
1,Shanghai is also really exciting (precisely -...,positive,positive,"{'label': 'LABEL_2', 'score': 0.9856458306312561}","{'label': 'LABEL_2', 'score': 0.9856458306312561}"
2,"Recession hit Veronique Branquinho, she has to...",negative,negative,"{'label': 'LABEL_0', 'score': 0.9084553122520447}","{'label': 'LABEL_0', 'score': 0.9084553718566895}"
3,happy bday!,positive,positive,"{'label': 'LABEL_2', 'score': 0.9839832782745361}","{'label': 'LABEL_2', 'score': 0.9839832782745361}"
4,http://twitpic.com/4w75p - I like it!!,positive,positive,"{'label': 'LABEL_2', 'score': 0.9848798513412476}","{'label': 'LABEL_2', 'score': 0.9848798513412476}"


Note: roberta is taking around 1.5 minutes to classify each piece of text in the dataset, and this is a relatively small dataset with really short text

In [45]:
df["name"] = df["roberta1"].apply(lambda x: x["label"])

def roberta_label(name):
    if name == 'LABEL_2':
        return "positive"
    elif name == 'LABEL_1':
        return "neutral"
    else:
        return "negative"
df["label"] = df["name"].apply(roberta_label)

df = df[["text", "sentiment", "label"]]

TP = FP = TN = FN = T0 = F0 = 0
def compare(row):
    global TP, FP, TN, FN, T0, F0

    trueVal = row["sentiment"]
    robertaVal = row["label"]

    if robertaVal == "positive":
        if robertaVal == trueVal:
            TP += 1
        else:
            FP += 1
    elif robertaVal == "negative":
        if robertaVal == trueVal:
            TN += 1
        else:
            FN += 1
    else:
        if robertaVal == trueVal:
            T0 += 1
        else:
            F0 += 1

df.apply(compare, axis = 1)

accuracy = (TP+TN+T0)/(TP+FP+TN+FN+T0+F0)

print("Accuracy: ", accuracy)

mistakeDF = df[df["sentiment"] != df["label"]].copy()

mistakeDF.head(15)

Accuracy:  0.7178834182229767


Unnamed: 0,text,sentiment,label
9,What did you get? My day is alright.. haven`...,neutral,positive
12,.. and you`re on twitter! Did the tavern bore...,neutral,negative
16,Miss you,negative,neutral
19,"I`m going into a spiritual stagnentation, its ...",neutral,negative
26,"hey peoples, dont you just hate being grounded...",neutral,negative
37,So I really need to put the laptop down & star...,neutral,negative
38,I`m sorry at least it`s Friday?,negative,neutral
41,I always forget SOMETHING when I travel. I am...,neutral,negative
42,Should have left car and walked home! I might ...,neutral,negative
46,i miss my old phone it worked so good until i ...,neutral,positive


Dataset #2

In [74]:
#from datasets import load_dataset

#ds = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")
#ds = ds["train"]
#df2 = ds.to_pandas()

splits = {'train': 'train_df.csv', 'validation': 'val_df.csv', 'test': 'test_df.csv'}
df2 = pd.read_csv("hf://datasets/Sp1786/multiclass-sentiment-analysis-dataset/" + splits["train"])

df2 = df2.dropna(subset = ["text"])
df2 = df2[["text", "sentiment"]]

def cleaning(text):
    text = re.sub(r'https?:\/\/.\S', "", text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'/', '', text)
    text = re.sub(r'-', '', text)
    text = re.sub(r'_', '', text)
    return text

df2["text"] = df2["text"].apply(cleaning)
df2.head()

df2_shape = df2["text"].shape
count = df.count(axis = 1)
print(f"DataFrame shape (rows, columns): {df2_shape}")
print("non-nan elements per row", count)


DataFrame shape (rows, columns): (31232,)
non-nan elements per row 0       3
1       3
2       3
3       3
4       3
       ..
3529    3
3530    3
3531    3
3532    3
3533    3
Length: 3534, dtype: int64


In [79]:
#df2["roberta"]= df2["text"].apply(roberta_score)

texts = df2["text"].astype(str).tolist()
results = classifier(texts, truncation=True, padding=True, max_length=512, batch_size=32)
df2["roberta"] = results

In [81]:
df2.head()

Unnamed: 0,text,sentiment,roberta
0,"Cooking microwave pizzas, yummy",positive,"{'label': 'LABEL_2', 'score': 0.9240697026252747}"
1,Any plans of allowing sub tasks to show up in ...,neutral,"{'label': 'LABEL_1', 'score': 0.9161937832832336}"
2,"I love the humor, I just reworded it. Like sa...",positive,"{'label': 'LABEL_2', 'score': 0.9173114895820618}"
3,naw idk what ur talkin about,neutral,"{'label': 'LABEL_1', 'score': 0.5904852151870728}"
4,That sucks to hear. I hate days like that,negative,"{'label': 'LABEL_0', 'score': 0.9711525440216064}"


In [82]:
df2["name"] = df2["roberta"].apply(lambda x: x["label"])
df2["label"] = df2["name"].apply(roberta_label)
df2 = df2[["text", "sentiment", "label"]]

In [84]:
TP = FP = TN = FN = T0 = F0 = 0
df2.apply(compare, axis = 1)
accuracy2 = (TP+TN+T0)/(TP+FP+TN+FN+T0+F0)
print("Accuracy: ", accuracy2)

mistakeDF2 = df2[df2["sentiment"] != df2["label"]].copy()
mistakeDF2.head(15)

Accuracy:  0.701812243852459


Unnamed: 0,text,sentiment,label
18,Early monday cramming... yay. Only a few weeks...,neutral,positive
19,ok its FF soooo why isn`t anyone following MEE...,neutral,negative
22,gonna have a lazy day today,negative,neutral
29,"Guess I`m gonna try the nap thing again 2day, ...",negative,neutral
35,Not as useful as Wunderlist... Yet!! Would pre...,neutral,negative
36,"() UGH, i love tila. haters need to shut up",neutral,positive
40,is chilling at home,positive,neutral
52,Hebrew language doesn't work...,positive,negative
61,"6:29 pm ok, let`s go now through bowman strat...",negative,neutral
65,*hug*,positive,neutral


Reddit Dataset:

In [93]:
df3 = pd.read_csv("/Users/mnatali/Downloads/Reddit_Data.csv", encoding = "latin-1")
df3.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [95]:
df3 = df3.dropna(subset = ["clean_comment"])
texts = df3["clean_comment"].astype(str).tolist()
results = classifier(texts, truncation=True, padding=True, max_length=512, batch_size=32)
df3["roberta"] = results

KeyboardInterrupt: 