In [79]:
import pandas as pd
import numpy as np

In [80]:
df = pd.read_csv(r"/content/twitter_training.csv" ,
                 header=None,
                names=['number','source','label','text'])

In [81]:
df.shape

(74682, 4)

In [82]:
df.head()

Unnamed: 0,number,source,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [83]:
df.tail()

Unnamed: 0,number,source,label,text
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...
74681,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


In [84]:
df['source'].value_counts()

Unnamed: 0_level_0,count
source,Unnamed: 1_level_1
Microsoft,2400
MaddenNFL,2400
TomClancysRainbowSix,2400
LeagueOfLegends,2394
CallOfDuty,2394
Verizon,2382
CallOfDutyBlackopsColdWar,2376
ApexLegends,2376
Facebook,2370
WorldOfCraft,2364


In [85]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Negative,22542
Positive,20832
Neutral,18318
Irrelevant,12990


In [86]:
import re
import collections
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [87]:
import re

url_re      = re.compile(r'https?://\S+|www\.\S+')
user_re     = re.compile(r'@\w+')
hashtag_re  = re.compile(r'#(\w+)')
repeat_re   = re.compile(r'(.)\1{2,}')  # e.g. loooove → lo love
emoji_re    = re.compile("["
                 u"\U0001F600-\U0001F64F"
                 u"\U0001F300-\U0001F5FF"
                 # … add ranges as desired
                 "]+", flags=re.UNICODE)

In [88]:
def clean(text):
    text = text.lower()
    text = url_re.sub('<URL>', text)
    text = user_re.sub('<USER>', text)
    text = hashtag_re.sub(r'\1', text)
    text = emoji_re.sub('', text)
    text = re.sub(r'&amp;',' and ', text)
    text = re.sub(r'[^\w\s<>]',' ', text)       # keep <URL> <USER>
    text = repeat_re.sub(r'\1\1', text)         # loooove→loove
    text = re.sub(r'\s+',' ', text).strip()
    return text

In [89]:
def tokenize(text):
    return text.split()

In [90]:
df = df.dropna(subset=['text']).copy()

In [91]:
df['tokens'] = df['text'].map(clean).map(tokenize)

In [92]:
label2idx = {'Negative':0, 'Neutral':1, 'Positive':2 ,'Irrelevant':3}
df['label_id'] = df['label'].map(label2idx)

In [93]:
train_df, test_df = train_test_split(df, test_size=0.2,
                                     stratify=df['label_id'],
                                     random_state=42)

In [94]:
train_df, val_df  = train_test_split(train_df, test_size=0.1,
                                     stratify=train_df['label_id'],
                                     random_state=42)

In [95]:
counter = collections.Counter()
for tokens in train_df['tokens']:
    counter.update(tokens)

In [96]:
vocab_size = 20000
most_common = counter.most_common(vocab_size-2)   # reserve 0,1
itos = ['<PAD>', '<UNK>'] + [w for w,_ in most_common]
stoi = {w:i for i,w in enumerate(itos)}

In [97]:
PAD_IDX = stoi['<PAD>']
UNK_IDX = stoi['<UNK>']

In [98]:
max_len = 50

def encode_and_pad(tokens):
    ids = [stoi.get(t, UNK_IDX) for t in tokens]
    if len(ids) < max_len:
        ids = ids + [PAD_IDX]*(max_len - len(ids))
    else:
        ids = ids[:max_len]
    return ids


In [99]:
for split in (train_df, val_df, test_df):
    split['input_ids'] = split['tokens'].map(encode_and_pad)

In [100]:
class TweetSentimentDataset(Dataset):
    def __init__(self, df):
        self.x = df['input_ids'].tolist()
        self.y = df['label_id'].tolist()
    def __len__(self):
        return len(self.y)
    def __getitem__(self, i):
        return torch.tensor(self.x[i], dtype=torch.long), \
               torch.tensor(self.y[i], dtype=torch.long)


In [101]:
batch_size = 32
train_ds = TweetSentimentDataset(train_df)
val_ds   = TweetSentimentDataset(val_df)
test_ds  = TweetSentimentDataset(test_df)


In [102]:
train_loader = DataLoader(train_ds, batch_size, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size)
test_loader  = DataLoader(test_ds,  batch_size)

In [103]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [104]:
class BiLSTMAttentionSentiment(nn.Module):
    def __init__(self,
                 vocab_size,
                 emb_dim,
                 hidden_dim,
                 output_dim,
                 padding_idx,
                 dropout=0.2):
        super().__init__()
        # 1) embedding
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=padding_idx
        )
        # 2) bi‐LSTM encoder
        self.encoder = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        # 3) attention: score = v^T tanh(W h_i + b)
        self.attn_W = nn.Linear(hidden_dim*2, hidden_dim*2)
        self.attn_v = nn.Linear(hidden_dim*2, 1, bias=False)
        # 4) classifier on context vector
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids):
        # input_ids: [batch_size, seq_len]
        mask = (input_ids != PAD_IDX)                # [B, T]
        emb = self.dropout(self.embedding(input_ids)) # [B, T, E]

        # run Bi‐LSTM
        outputs, _ = self.encoder(emb)                # outputs: [B, T, 2H]

        # compute attention scores
        #  a) energy: [B, T, 2H]
        energy = torch.tanh(self.attn_W(outputs))
        #  b) scores: [B, T, 1] → squeeze → [B, T]
        scores = self.attn_v(energy).squeeze(-1)

        # mask out PAD positions before softmax
        scores = scores.masked_fill(~mask, -1e9)

        # normalized weights [B, T]
        attn_weights = F.softmax(scores, dim=1)

        # context vector = weighted sum [B, 2H]
        #   unsqueeze weights for broadcast: [B, T, 1]
        context = torch.sum(attn_weights.unsqueeze(-1) * outputs, dim=1)

        # final classification
        context = self.dropout(context)
        logits = self.fc(context)                     # [B, output_dim]
        return logits, attn_weights


In [105]:
vocab_size = len(itos)
emb_dim     = 100
hidden_dim  = 128
output_dim  = 4    # negative/neutral/positive
PAD_IDX     = stoi['<PAD>']

In [106]:
model = BiLSTMAttentionSentiment(
    vocab_size, emb_dim, hidden_dim, output_dim, PAD_IDX
)

In [107]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [108]:
model.to(device)

BiLSTMAttentionSentiment(
  (embedding): Embedding(20000, 100, padding_idx=0)
  (encoder): LSTM(100, 128, batch_first=True, bidirectional=True)
  (attn_W): Linear(in_features=256, out_features=256, bias=True)
  (attn_v): Linear(in_features=256, out_features=1, bias=False)
  (fc): Linear(in_features=256, out_features=4, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [109]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
criterion.to(device)


CrossEntropyLoss()

In [110]:
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss, total_correct, total_count = 0., 0, 0
    for input_ids, labels in loader:
        input_ids, labels = input_ids.to(device), labels.to(device)
        optimizer.zero_grad()
        logits, _ = model(input_ids)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * labels.size(0)
        preds = logits.argmax(dim=1)
        total_correct += (preds == labels).sum().item()
        total_count   += labels.size(0)

    avg_loss = total_loss / total_count
    accuracy = total_correct / total_count
    return avg_loss, accuracy

In [111]:
def eval_epoch(model, loader, criterion):
    model.eval()
    total_loss, total_correct, total_count = 0., 0, 0
    with torch.no_grad():
        for input_ids, labels in loader:
            input_ids, labels = input_ids.to(device), labels.to(device)
            logits, _ = model(input_ids)
            loss = criterion(logits, labels)

            total_loss += loss.item() * labels.size(0)
            preds = logits.argmax(dim=1)
            total_correct += (preds == labels).sum().item()
            total_count   += labels.size(0)

    avg_loss = total_loss / total_count
    accuracy = total_correct / total_count
    return avg_loss, accuracy

In [112]:
n_epochs = 5
for epoch in range(1, n_epochs+1):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion)
    val_loss,   val_acc   = eval_epoch(model, val_loader,   criterion)
    print(f'Epoch {epoch} | '
          f'train loss {train_loss:.3f}, acc {train_acc:.3f} | '
          f' val loss {val_loss:.3f}, acc {val_acc:.3f}')

Epoch 1 | train loss 1.108, acc 0.528 |  val loss 0.929, acc 0.623
Epoch 2 | train loss 0.830, acc 0.670 |  val loss 0.730, acc 0.717
Epoch 3 | train loss 0.617, acc 0.764 |  val loss 0.586, acc 0.772
Epoch 4 | train loss 0.475, acc 0.820 |  val loss 0.492, acc 0.812
Epoch 5 | train loss 0.378, acc 0.857 |  val loss 0.453, acc 0.834


In [113]:
test_loss, test_acc = eval_epoch(model, test_loader, criterion)
print(f'Test loss: {test_loss:.3f} | Test accuracy: {test_acc:.3f}')

Test loss: 0.455 | Test accuracy: 0.836


In [114]:
from sklearn.metrics import classification_report, confusion_matrix

all_preds = []
all_labels = []
model.eval()
with torch.no_grad():
    for input_ids, labels in test_loader:
        input_ids = input_ids.to(device)
        logits, _  = model(input_ids)
        preds = logits.argmax(dim=1).cpu().tolist()
        all_preds.extend(preds)
        all_labels.extend(labels.tolist())

# map back to label names
idx2label = {v:k for k,v in label2idx.items()}
y_true = [idx2label[i] for i in all_labels]
y_pred = [idx2label[i] for i in all_preds]

print(classification_report(y_true, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

  Irrelevant     0.8256    0.7868    0.8057      2575
    Negative     0.8187    0.8958    0.8555      4472
     Neutral     0.8961    0.7714    0.8291      3622
    Positive     0.8185    0.8589    0.8382      4131

    accuracy                         0.8361     14800
   macro avg     0.8397    0.8282    0.8321     14800
weighted avg     0.8388    0.8361    0.8355     14800

Confusion matrix:
 [[2026  224  101  224]
 [ 109 4006  103  254]
 [ 166  353 2794  309]
 [ 153  310  120 3548]]


In [115]:
import torch.nn.functional as F
def infer_tweet(text):
    model.eval()
    # 1) clean & tokenize
    cleaned = clean(text)
    tokens  = tokenize(cleaned)

    # 2) encode & pad to max_len
    ids     = encode_and_pad(tokens)           # list of length max_len
    tensor  = torch.tensor([ids], dtype=torch.long).to(device)  # [1, T]

    # 3) forward
    with torch.no_grad():
        logits, attn = model(tensor)           # logits: [1, C], attn: [1, T]
        probs = F.softmax(logits, dim=1).cpu().squeeze(0).tolist()
        pred_idx = int(torch.argmax(logits, dim=1).cpu().item())

    # 4) map back to label name
    pred_label = idx2label[pred_idx]

    # 5) cut attention to length of tokens
    attn_scores = attn.squeeze(0).cpu().tolist()[:len(tokens)]

    return {
        'original_text': text,
        'cleaned_text' : cleaned,
        'tokens'       : tokens,
        'pred_label'   : pred_label,
        'probs'        : { idx2label[i]: p for i,p in enumerate(probs) },
        'attn_scores'  : attn_scores
    }

In [116]:
tweet = "I am really excited about this competition and hope to success"
res = infer_tweet(tweet)

In [117]:
print(f"\nRAW TWEET:    {res['original_text']}")
print(f"CLEANED:      {res['cleaned_text']}")
print(f"PREDICTION:   {res['pred_label']}  (probs: {res['probs']})")
print("\nAttention:")
for tok, score in zip(res['tokens'], res['attn_scores']):
    print(f"  {tok:>12} : {score:.3f}")


RAW TWEET:    I am really excited about this competition and hope to success
CLEANED:      i am really excited about this competition and hope to success
PREDICTION:   Positive  (probs: {'Negative': 0.26906195282936096, 'Neutral': 0.006114725023508072, 'Positive': 0.6784508228302002, 'Irrelevant': 0.046372465789318085})

Attention:
             i : 0.000
            am : 0.002
        really : 0.003
       excited : 0.372
         about : 0.012
          this : 0.014
   competition : 0.241
           and : 0.027
          hope : 0.305
            to : 0.003
       success : 0.021


In [118]:
tweet = "it is rude to talk with your mom like this , you have to show more respect"
res = infer_tweet(tweet)

In [119]:
print(f"\nRAW TWEET:    {res['original_text']}")
print(f"CLEANED:      {res['cleaned_text']}")
print(f"PREDICTION:   {res['pred_label']}  (probs: {res['probs']})")
print("\nAttention:")
for tok, score in zip(res['tokens'], res['attn_scores']):
    print(f"  {tok:>12} : {score:.3f}")


RAW TWEET:    it is rude to talk with your mom like this , you have to show more respect
CLEANED:      it is rude to talk with your mom like this you have to show more respect
PREDICTION:   Neutral  (probs: {'Negative': 0.010749028995633125, 'Neutral': 0.5673827528953552, 'Positive': 0.40572792291641235, 'Irrelevant': 0.016140274703502655})

Attention:
            it : 0.023
            is : 0.016
          rude : 0.009
            to : 0.008
          talk : 0.036
          with : 0.004
          your : 0.002
           mom : 0.005
          like : 0.005
          this : 0.007
           you : 0.022
          have : 0.071
            to : 0.013
          show : 0.013
          more : 0.065
       respect : 0.700


In [122]:
tweet = "you are too rude and difficult to talk with you"
res = infer_tweet(tweet)

In [123]:
print(f"\nRAW TWEET:    {res['original_text']}")
print(f"CLEANED:      {res['cleaned_text']}")
print(f"PREDICTION:   {res['pred_label']}  (probs: {res['probs']})")
print("\nAttention:")
for tok, score in zip(res['tokens'], res['attn_scores']):
    print(f"  {tok:>12} : {score:.3f}")


RAW TWEET:    you are too rude and difficult to talk with you
CLEANED:      you are too rude and difficult to talk with you
PREDICTION:   Irrelevant  (probs: {'Negative': 0.02282084710896015, 'Neutral': 0.008260281756520271, 'Positive': 0.06202007085084915, 'Irrelevant': 0.9068987965583801})

Attention:
           you : 0.004
           are : 0.010
           too : 0.044
          rude : 0.146
           and : 0.020
     difficult : 0.659
            to : 0.019
          talk : 0.086
          with : 0.007
           you : 0.005
