# Sentiment Analysis with a Hugging Face Transformer


In [16]:
from transformers import AutoModelForSequenceClassification, pipeline

# Checkpoint identifier used for both the model weights and the tokenizer.
model_name = 'jitesh/emotion-english'
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Hand the loaded model to the pipeline; the tokenizer is resolved from the
# same checkpoint name.
classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=model_name,
)



In [6]:
text = "how do i do this. I'm confused "

# For a single input string the pipeline returns a list holding one
# {'label', 'score'} dict.
prediction = classifier(text)
print(prediction)

# Show the top (only) prediction next to the sentence it was computed for.
top_prediction = prediction[0]
print(top_prediction, text)

[{'label': 'confuse', 'score': 0.9659364223480225}]
{'label': 'confuse', 'score': 0.9659364223480225} how do i do this. I'm confused 


In [None]:
# Emotion label names, in alphabetical order. Several of them (e.g. 'confuse',
# 'anger', 'neutral') appear in the classifier outputs in this notebook.
# NOTE(review): presumably the model's full label set — confirm against
# model.config.id2label.
emotion_labels = [
    "anger",
    "cheeky",
    "confuse",
    "curious",
    "disgust",
    "empathetic",
    "energetic",
    "fear",
    "grumpy",
    "guilty",
    "impatient",
    "joy",
    "love",
    "neutral",
    "sadness",
    "serious",
    "surprise",
    "suspicious",
    "think",
    "whiny",
]


In [20]:
import pandas as pd
# Load the cleaned dialogue lines from disk.
cleaned_lines = pd.read_csv(filepath_or_buffer="lines_cleaned.csv")

# Evaluated but not rendered: a cell only displays its final expression.
cleaned_lines.columns

# (rows, columns) of the loaded frame — this is the value the cell displays.
cleaned_lines.shape

(200369, 6)

For each row of the dataframe, classify the sentiment of that line's text.

In [19]:

# Preview the text of the first 10 rows (index labels 0 through 9).
for row in range(10):
    print(cleaned_lines.loc[row, 'text'])


They do not!
They do to!
I hope so.
She okay?
Let's go.
Wow
Okay -- you're gonna need to learn how to lie.
No
Like my fear of wearing pastels?
What good stuff?


In [None]:
from tqdm import tqdm

In [23]:
from transformers import AutoTokenizer

# Reuse the tokenizer the pipeline already holds instead of loading a second
# copy of the same checkpoint's tokenizer.
tokenizer = classifier.tokenizer

line_ids = []
sentiments = []

# Classify the raw text of every line, keeping the lineID alongside each result.
#
# Truncation is delegated to the pipeline: the text-classification pipeline
# forwards `truncation` / `max_length` to its tokenizer. The previous approach
# (encode -> truncate -> decode -> classify) was lossy, because decoding yields
# the tokenizer's normalised text rather than the original string, and the
# decoded text is not guaranteed to re-tokenize to <= 512 tokens. It also
# tokenized every line twice.
for line_id, text in tqdm(
    zip(cleaned_lines["lineID"], cleaned_lines["text"]),
    total=len(cleaned_lines),
):
    # A single-string call returns a one-element list:
    # [{'label': ..., 'score': ...}] — the shape the later cells index into.
    sentiment = classifier(text, truncation=True, max_length=512)

    line_ids.append(line_id)
    sentiments.append(sentiment)




100%|██████████| 200369/200369 [1:03:03<00:00, 52.95it/s]


## Now that each line has a predicted sentiment, merge the predictions back into our dataframe

In [34]:
# Inspect the first stored result: each entry of `sentiments` is the pipeline's
# return value for one line, i.e. a one-element list holding a {'label', 'score'} dict.
sentiments[0]

[{'label': 'anger', 'score': 0.9859177470207214}]

In [None]:
# Each stored result is a one-element list; pull out its single prediction dict.
top_predictions = [result[0] for result in sentiments]

# Split the predictions into parallel label / score columns.
sentiment_labels = [pred["label"] for pred in top_predictions]
confidence = [pred['score'] for pred in top_predictions]

# One row per classified line, keyed by lineID so it can be joined back later.
sentiments_df = pd.DataFrame(
    dict(
        lineID=line_ids,
        sentiment=sentiment_labels,
        confidence=confidence,
    )
)

Unnamed: 0,lineID,sentiment,confidence
0,L1045,anger,0.985918
1,L1044,anger,0.987566
2,L985,neutral,0.704899
3,L984,whiny,0.629378
4,L925,impatient,0.983746
...,...,...,...
200364,L665991,empathetic,0.498676
200365,L665990,joy,0.920127
200366,L665989,curious,0.917601
200367,L665988,joy,0.968985


In [38]:
# Attach each line's predicted sentiment and confidence by joining the two
# frames on their shared lineID column (default inner join).
line_sentiments_df = cleaned_lines.merge(sentiments_df, on="lineID")


In [40]:
# Check the merged frame's columns: the cleaned-lines columns plus
# 'sentiment' and 'confidence'.
line_sentiments_df.columns

Index(['lineID', 'characterID', 'movieID', 'character', 'text', 'count',
       'sentiment', 'confidence'],
      dtype='object')

In [47]:
# Replace the generic 'count' column name with a more descriptive one, then
# display the resulting column index.
column_renames = {'count': 'num_lines_per_movie_count'}
line_sentiments_df = line_sentiments_df.rename(columns=column_renames)
line_sentiments_df.columns

Index(['lineID', 'characterID', 'movieID', 'character', 'text',
       'num_lines_per_movie_count', 'sentiment', 'confidence'],
      dtype='object')

In [None]:
# Persist the merged frame. index=False drops the default integer index, which
# carries no information here and would otherwise be written as an unnamed
# first column (reloaded by pandas as 'Unnamed: 0').
line_sentiments_df.to_csv("line_sentiment.csv", index=False)
