In [None]:
# pip install transformers

In [None]:
# pip install accelerate -U

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from nltk.tokenize import TweetTokenizer
from scipy.special import softmax


In [None]:
# Load the preprocessed tweets into a DataFrame.
# NOTE(review): `id` comes back as float64 (see the data.info() output below) and the
# ids shown in data.head() are ~1.77e18, which is more digits than float64 can store
# exactly. Confirm whether exact ids matter and, if so, read that column as text or a
# nullable integer.
data = pd.read_csv('preprocessed_data.csv')

In [None]:
# Summarize the loaded frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287188 entries, 0 to 287187
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   287187 non-null  float64
 1   createdAt            287188 non-null  object 
 2   fullName             287149 non-null  object 
 3   userName             287188 non-null  object 
 4   profileImage         287188 non-null  object 
 5   fullText             287188 non-null  object 
 6   replyTo              287188 non-null  float64
 7   lang                 287188 non-null  object 
 8   quoteCount           287188 non-null  int64  
 9   retweetCount         287188 non-null  int64  
 10  replyCount           287188 non-null  int64  
 11  likeCount            287188 non-null  int64  
 12  viewCount            103504 non-null  float64
 13  sentimentLabel1      1790 non-null    float64
 14  sentimentLabel2      1201 non-null    float64
 15  sentimentLabel3  

In [None]:
#remove stop words
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources used below: the stop-word lists and the Punkt tokenizer data
# that nltk.word_tokenize depends on.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# downloading both keeps this cell working across NLTK versions.
nltk.download('punkt_tab')

def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split with ``nltk.word_tokenize``; every token whose lowercase
    form appears in NLTK's English stop-word list is dropped, and the remaining
    tokens are joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        A new string containing only the non-stop-word tokens.
    """
    # Building the set reads the stop-word corpus; do it once and reuse it,
    # instead of once per row when this function is applied to a whole column.
    stop_words = getattr(remove_stopwords, "_stop_words", None)
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words

    word_tokens = nltk.word_tokenize(text)
    return ' '.join(word for word in word_tokens if word.lower() not in stop_words)

# Strip stop words from every entry of 'cleaned_tweet', overwriting the column
data['cleaned_tweet'] = data['cleaned_tweet'].map(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Emotion Label Model

In [None]:
# Load the pretrained emotion classifier and its tokenizer from the model hub.
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# The Trainer is used only to run batched inference (trainer.predict) further down.
# A DataLoaderConfiguration(...) assignment that used to follow was removed: the name
# is never imported in this notebook (NameError on a fresh kernel) and its result was
# not used anywhere.
trainer = Trainer(model=model)


In [None]:
# Create class for data preparation
class SimpleDataset:
    """Map-style dataset view over a tokenizer's batched output.

    ``tokenized_texts`` is a mapping from field name (it must include
    ``"input_ids"``) to a sequence holding one entry per example.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # The number of examples equals the number of encoded input sequences.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Gather the idx-th entry of every field into a single per-example dict.
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example

In [None]:
# Model inputs: the non-missing cleaned tweets, cast to str, as a plain Python list
tweets = (
    data['cleaned_tweet']
    .dropna()
    .astype('str')
    .tolist()
)

In [None]:
# Encode all tweets in a single tokenizer call, with truncation and padding enabled
tokenized_texts = tokenizer(
    tweets,
    padding=True,
    truncation=True,
)

In [None]:
# Wrap the encoded batch so it can be indexed one example at a time
prediction_dataset = SimpleDataset(tokenized_texts=tokenized_texts)

In [None]:
# Run the model over every encoded tweet; the raw class scores are read from the
# result in the following cells
predictions = trainer.predict(test_dataset=prediction_dataset)

In [None]:
# Turn the raw class scores into a predicted label and a confidence per tweet.
preds = predictions.predictions.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
# Use scipy's softmax (imported at the top) rather than exp(x) / sum(exp(x)):
# it shifts each row by its maximum before exponentiating, so large scores cannot
# overflow to inf/NaN, and the exponentials are computed only once.
scores = softmax(predictions.predictions, axis=-1).max(axis=1)

In [None]:
# Per-tweet probability distribution over the model's classes (one row per tweet).
# scipy's softmax is used instead of a hand-written exp(x) / sum(exp(x)) because it
# shifts each row by its maximum first and therefore cannot overflow.
temp = softmax(predictions.predictions, axis=-1)


In [None]:
# Split the probability matrix `temp` into one score vector per emotion, taking one
# row per entry of `tweets`.
# Column positions are fixed: 0 anger, 1 disgust, 2 fear, 3 joy, 4 neutral,
# 5 sadness, 6 surprise.
# NOTE(review): this order is presumed to match model.config.id2label — confirm
# before reusing the cell with a different model.
assert len(temp) >= len(tweets), "fewer prediction rows than tweets"

# Transposing the (n_tweets, 7) slice yields seven rows, one per emotion, which
# replaces the per-row Python loop of appends.
anger, disgust, fear, joy, neutral, sadness, surprise = temp[:len(tweets), :7].T

In [None]:
# The scores were computed for `tweets`, i.e. the rows of `data` whose 'cleaned_tweet'
# is not missing. Index every result by those rows so each value lands on the tweet
# it was computed for; rows that were not scored (if any) are left as NaN instead of
# the assignment failing on a length mismatch or shifting positions.
scored_index = data['cleaned_tweet'].dropna().index

emotion_scores = {
    'anger': anger,
    'disgust': disgust,
    'fear': fear,
    'joy': joy,
    'neutral': neutral,
    'sadness': sadness,
    'surprise': surprise,
}
for emotion, values in emotion_scores.items():
    # pd.Series raises ValueError if the number of scores differs from the number
    # of scored rows, which surfaces stale upstream state immediately.
    data[emotion] = pd.Series(values, index=scored_index)

# `labels` carries a fresh 0..n-1 index; drop it and re-index by the scored rows.
data['emotion_label'] = pd.Series(labels.to_numpy(), index=scored_index)

In [None]:
# Re-inspect the frame after adding the seven emotion score columns and 'emotion_label'
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287188 entries, 0 to 287187
Data columns (total 28 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   287187 non-null  float64
 1   createdAt            287188 non-null  object 
 2   fullName             287149 non-null  object 
 3   userName             287188 non-null  object 
 4   profileImage         287188 non-null  object 
 5   fullText             287188 non-null  object 
 6   replyTo              287188 non-null  float64
 7   lang                 287188 non-null  object 
 8   quoteCount           287188 non-null  int64  
 9   retweetCount         287188 non-null  int64  
 10  replyCount           287188 non-null  int64  
 11  likeCount            287188 non-null  int64  
 12  viewCount            103504 non-null  float64
 13  sentimentLabel1      1790 non-null    float64
 14  sentimentLabel2      1201 non-null    float64
 15  sentimentLabel3  

In [None]:
# Count how many tweets received each predicted emotion label
data['emotion_label'].value_counts()

emotion_label
neutral     143590
anger        49114
sadness      27897
surprise     22168
joy          21979
fear         13746
disgust       8694
Name: count, dtype: int64

In [3]:
# Preview the first rows, including the new emotion score and label columns
data.head()

Unnamed: 0,id,createdAt,fullName,userName,profileImage,fullText,replyTo,lang,quoteCount,retweetCount,...,cleaned_tweet_vader,tokens,anger,disgust,fear,joy,neutral,sadness,surprise,emotion_label
0,1.773239e+18,2024-03-28 06:43:19+00:00,Stella Patchouli,StellaPatch,https://pbs.twimg.com/profile_images/175397939...,@nytimes The curse on Musk iis working!\r\n🔥☔🐱🐱🤹💥,1772931578835939584,en,0,0,...,curse musk iis working,"['the', 'curse', 'on', 'musk', 'ii', 'working']",0.007012,0.004977,0.007745,0.00755,0.59275,0.312809,0.067157,neutral
1,1.772992e+18,2024-03-27 14:22:04+00:00,Boston Strong 🍀,bostonstronggg,https://pbs.twimg.com/profile_images/174182833...,@Sp4rksaflyin @nytimes These Electric Tesla ca...,1772942184456614144,en,0,0,...,electric tesla cars start new england cold fri...,"['these', 'electric', 'tesla', 'car', 'cannot'...",0.036987,0.001647,0.003299,0.211342,0.514728,0.020631,0.211366,neutral
2,1.772956e+18,2024-03-27 11:56:58+00:00,"Don't Start, Won't Be",GetBackToNo,https://pbs.twimg.com/profile_images/134400549...,@nytimes Is there something wrong with the Chi...,1772931578835939584,en,0,0,...,something wrong chinese communist party,"['is', 'there', 'something', 'wrong', 'with', ...",0.361652,0.008085,0.011581,0.002932,0.06683,0.513652,0.035267,sadness
3,1.772946e+18,2024-03-27 11:17:08+00:00,novus locus,HuttonRich55685,https://pbs.twimg.com/profile_images/170782492...,@nytimes I love my Chinese IPhone. Thank you c...,1772931578835939584,en,0,0,...,love chinese iphone thank comrade apple,"['i', 'love', 'my', 'chinese', 'iphone', 'than...",0.004483,0.000653,0.001027,0.759493,0.071811,0.01278,0.149753,joy
4,1.772946e+18,2024-03-27 11:16:54+00:00,Politics Barn,PoliticsBarn,https://pbs.twimg.com/profile_images/119881591...,@nytimes And we can read all about it on our m...,1772931578835939584,en,0,0,...,read madeinchina devices,"['and', 'we', 'can', 'read', 'all', 'about', '...",0.004016,0.002191,0.002639,0.026681,0.587482,0.037722,0.33927,neutral


In [None]:
# Persist the labeled frame; index=False keeps the row index out of the CSV
data.to_csv('emotion_labeled_data.csv', index=False)