In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
df = pd.read_csv('text.csv.zip', compression='zip')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,label
count,416809.0,416809.0
mean,208404.0,1.554271
std,120322.538513,1.490453
min,0.0,0.0
25%,104202.0,0.0
50%,208404.0,1.0
75%,312606.0,3.0
max,416808.0,5.0


In [5]:
texts = df["text"].tolist()
labels = df["label"].tolist()

In [6]:
def build_vocab(texts, min_freq=1):
    counter = Counter()
    for text in texts:
        tokens = text.lower().split()
        counter.update(tokens)
    vocab = {word: idx for idx, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

In [7]:
vocab = build_vocab(texts, min_freq=1)

In [8]:
vocab

{'i': 0,
 'just': 1,
 'feel': 2,
 'really': 3,
 'helpless': 4,
 'and': 5,
 'heavy': 6,
 'hearted': 7,
 'ive': 8,
 'enjoyed': 9,
 'being': 10,
 'able': 11,
 'to': 12,
 'slouch': 13,
 'about': 14,
 'relax': 15,
 'unwind': 16,
 'frankly': 17,
 'needed': 18,
 'it': 19,
 'after': 20,
 'those': 21,
 'last': 22,
 'few': 23,
 'weeks': 24,
 'around': 25,
 'the': 26,
 'end': 27,
 'of': 28,
 'uni': 29,
 'expo': 30,
 'have': 31,
 'lately': 32,
 'started': 33,
 'find': 34,
 'myself': 35,
 'feeling': 36,
 'a': 37,
 'bit': 38,
 'listless': 39,
 'which': 40,
 'is': 41,
 'never': 42,
 'good': 43,
 'thing': 44,
 'gave': 45,
 'up': 46,
 'my': 47,
 'internship': 48,
 'with': 49,
 'dmrg': 50,
 'am': 51,
 'distraught': 52,
 'dont': 53,
 'know': 54,
 'so': 55,
 'lost': 56,
 'kindergarten': 57,
 'teacher': 58,
 'thoroughly': 59,
 'weary': 60,
 'job': 61,
 'having': 62,
 'taken': 63,
 'university': 64,
 'entrance': 65,
 'exam': 66,
 'suffered': 67,
 'from': 68,
 'anxiety': 69,
 'for': 70,
 'as': 71,
 'did': 72

In [9]:
def vectorize_text(text, vocab):
    tokens = text.lower().split()
    vectorized = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    return vectorized

vectorized_texts = [vectorize_text(text, vocab) for text in texts]
labels = [int(label) for label in labels]


In [10]:
class EmotionDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

In [11]:
def collate_fn(batch):
    texts, labels = zip(*batch)
    lengths = [len(text) for text in texts]
    max_length = max(lengths)
    padded_texts = [text + [vocab['<PAD>']] * (max_length - len(text)) for text in texts]
    return torch.LongTensor(padded_texts), torch.LongTensor(labels)
    #return torch.tensor(padded_texts, dtype=torch.long), torch.tensor(labels, dtype=torch.long), torch.tensor(lengths, dtype=torch.long)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(vectorized_texts, labels, test_size=0.2, random_state=42)
train_databaset = EmotionDataset(X_train, y_train)
test_dataset = EmotionDataset(X_test, y_test)

train_dataloader = DataLoader(train_databaset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [13]:
from unicodedata import bidirectional


class BiLSTMEmotionClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(BiLSTMEmotionClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab['<PAD>'])
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out[:, -1, :])
        return out

In [14]:
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 64
output_dim = len(set(labels))

In [15]:
model = BiLSTMEmotionClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion.to(device)

epochs = 20

for epoch in range(epochs):
    for batch in train_dataloader:
        texts, labels = batch
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')
    
# evaluation

correct_predictions = 0
total_predictions = 0
with torch.no_grad():
    for batch in test_dataloader:
        texts, labels = batch
        texts, labels = texts.to(device), labels.to(device)

        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()
        
# accuracy
accuracy = correct_predictions / total_predictions
print(f'Test Accuracy: {accuracy:.4f}')
        

Epoch 1/20, Loss: 1.1800
Epoch 2/20, Loss: 0.9903
Epoch 3/20, Loss: 0.0021
Epoch 4/20, Loss: 0.0067
Epoch 5/20, Loss: 0.0031
Epoch 6/20, Loss: 0.1301
Epoch 7/20, Loss: 0.0002
Epoch 8/20, Loss: 0.0002
Epoch 9/20, Loss: 0.1167
Epoch 10/20, Loss: 0.0000
Epoch 11/20, Loss: 0.2009
Epoch 12/20, Loss: 0.0015
Epoch 13/20, Loss: 0.1733
Epoch 14/20, Loss: 0.2000
Epoch 15/20, Loss: 0.0074
Epoch 16/20, Loss: 0.0002
Epoch 17/20, Loss: 0.1154
Epoch 18/20, Loss: 0.1917
Epoch 19/20, Loss: 0.0826
Epoch 20/20, Loss: 0.1142
Test Accuracy: 0.9233


In [21]:
def predict_emotion(text, model, vocab, device):
    model.eval()
    vectorized_text = vectorize_text(text, vocab)
    input_tensor = torch.LongTensor([vectorized_text]).to(device)
    with torch.no_grad():
        output = model(input_tensor)
        _, predicted = torch.max(output.data, 1)
    return predicted.item()

new_sentences = [
    "I am so happy and joyful today!",
    "I feel sad and lonely.",
    "I would not have expected that!",
    "I love spending time with my family.",
    "I am angry about the situation.",
    "I am scared of the dark."
]
for sentence in new_sentences:
    predicted_emotion = predict_emotion(sentence, model, vocab, device)
    label_name = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'][predicted_emotion]
    print(f"Sentence: '{sentence}' => Predicted Sentiment Label: {label_name}")

Sentence: 'I am so happy and joyful today!' => Predicted Sentiment Label: joy
Sentence: 'I feel sad and lonely.' => Predicted Sentiment Label: sadness
Sentence: 'I would not have expected that!' => Predicted Sentiment Label: sadness
Sentence: 'I love spending time with my family.' => Predicted Sentiment Label: joy
Sentence: 'I am angry about the situation.' => Predicted Sentiment Label: anger
Sentence: 'I am scared of the dark.' => Predicted Sentiment Label: fear
