In [None]:
# Fetch the 'jp797498e/twitter-entity-sentiment-analysis' dataset through kagglehub
# and keep whatever dataset_download returns in the variable below
# (presumably the local directory that holds the files — confirm against the kagglehub docs).
import kagglehub
jp797498e_twitter_entity_sentiment_analysis_path = kagglehub.dataset_download('jp797498e/twitter-entity-sentiment-analysis')

# Simple progress marker; it is only reached when the download call did not raise.
print('Data source import complete.')


In [None]:
!pip install transformers datasets accelerate -q


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:

import os
import warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore", category=UserWarning)

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

# Seed the random number generators before any model or DataLoader is created so that
# batch shuffling and the freshly initialised classification head are repeatable
# after a kernel restart.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)  # seeds the CPU generator and, when present, the CUDA generators

# Report the runtime so the saved notebook records what it was executed on.
print("PyTorch Version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device Name:", torch.cuda.get_device_name(0))

# Make sure the checkpoint directory exists before training starts.
os.makedirs("./sentiment_model", exist_ok=True)


In [None]:

def resolve_data_path(default_path, download_dir=None):
    """Pick the location of the data file that actually exists on this machine.

    Args:
        default_path: Preferred path. It is returned when it exists and is also
            the fallback when no candidate exists, so a later read fails with
            this path in its error message.
        download_dir: Optional directory searched for a file that has the same
            basename as ``default_path``.

    Returns:
        ``default_path`` if it exists; otherwise ``<download_dir>/<basename>``
        if that exists; otherwise ``default_path`` unchanged.
    """
    if os.path.exists(default_path) or not download_dir:
        return default_path
    candidate = os.path.join(str(download_dir), os.path.basename(default_path))
    return candidate if os.path.exists(candidate) else default_path


class Config:
    """Paths and hyper-parameters read by the other cells of the notebook."""

    # Hugging Face model identifier used for both the tokenizer and the classifier.
    MODEL_NAME = 'prajjwal1/bert-tiny'
    # The Kaggle mount point is tried first. When it is missing (e.g. outside Kaggle)
    # the directory stored by the kagglehub download cell is searched instead, if that
    # cell has been run; globals().get keeps this class usable when it has not.
    DATA_PATH = resolve_data_path(
        '/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv',
        globals().get('jp797498e_twitter_entity_sentiment_analysis_path'),
    )
    # Directory that receives the best checkpoint and its tokenizer.
    SAVE_PATH = './sentiment_model'
    # Token length every sample is padded or truncated to.
    MAX_LEN = 128
    EPOCHS = 30
    BATCH_SIZE = 64
    LEARNING_RATE = 2e-5
    # Train on the GPU when one is visible, otherwise fall back to the CPU.
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


config = Config()


In [None]:

print("\n--- Loading and Preparing Data ---")

# Integer class ids used as training targets; 'Irrelevant' rows are intentionally
# left out of the mapping and therefore dropped below.
label_map = {'Positive': 2, 'Neutral': 1, 'Negative': 0}

# The file is read without a header row, so the column names are assigned here.
raw_df = pd.read_csv(config.DATA_PATH, header=None)
raw_df.columns = ['id', 'entity', 'label', 'text']

# Build the cleaned frame as an explicit copy instead of mutating a filtered slice:
# that avoids pandas' SettingWithCopyWarning and keeps raw_df intact for inspection.
# Only rows whose label is a key of label_map survive, so the mapped column can never
# contain NaN and is safe to cast to integers.
df = (
    raw_df.loc[raw_df['label'].isin(list(label_map))]
    .dropna(subset=['text'])
    .copy()
)
df['label'] = df['label'].map(label_map).astype('int64')
df = df.reset_index(drop=True)

# Stratify so both splits keep the same class proportions.
train_df, val_df = train_test_split(df, test_size=0.15, random_state=42, stratify=df['label'])


In [None]:
# Load the tokenizer that belongs to config.MODEL_NAME; this same object is what the
# training cell writes to config.SAVE_PATH together with the best checkpoint.
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
class SentimentClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per lookup.

    Args:
        dataframe: Frame exposing ``text`` and ``label`` columns.
        tokenizer: Callable tokenizer; it is invoked with Hugging Face style
            keyword arguments and must return ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Number of tokens every sample is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = dataframe.text.tolist()
        self.labels = dataframe.label.tolist()
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tokenized sample at ``index`` as a dict of 1-D tensors."""
        # str() guards against non-string cells (e.g. numbers) in the text column.
        text, label = str(self.text[index]), self.labels[index]
        # Calling the tokenizer directly is the supported API; encode_plus is
        # documented as deprecated in favour of __call__.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # return_tensors='pt' yields a leading batch axis of size one; flatten it
        # away so the DataLoader can stack samples into (batch, max_len).
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }

# Wrap both splits in the tokenizing dataset; the two are built identically.
train_dataset, val_dataset = (
    SentimentClassificationDataset(split_frame, tokenizer, config.MAX_LEN)
    for split_frame in (train_df, val_df)
)

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=config.BATCH_SIZE,
    num_workers=2,
)
val_loader = DataLoader(
    val_dataset,
    shuffle=False,
    batch_size=config.BATCH_SIZE,
    num_workers=2,
)



In [None]:
# Pretrained encoder with a 3-way classification head (Negative / Neutral / Positive).
model = AutoModelForSequenceClassification.from_pretrained(
    config.MODEL_NAME,
    num_labels=3,
)
if torch.cuda.device_count() > 1:
    # More than one GPU visible: wrap the model so batches are split across them.
    model = torch.nn.DataParallel(model)
model.to(config.DEVICE)

optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE)
# NOTE: this criterion is handed to train_epoch / eval_model, which take the loss
# from the model output instead of calling it.
loss_fn = torch.nn.CrossEntropyLoss().to(config.DEVICE)

# The scheduler is stepped once per batch, so its horizon is batches-per-epoch x epochs.
total_training_steps = len(train_loader) * config.EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_training_steps,
    num_warmup_steps=0,
)


In [None]:



def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Sequence-classification model whose output exposes ``.loss``
            when called with ``labels``; may be wrapped in ``DataParallel``.
        data_loader: Yields dicts with ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        loss_fn: Accepted for interface compatibility; not used, because the
            loss is read from the model output.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    wrapped = isinstance(model, torch.nn.DataParallel)
    batch_losses = []

    for batch in tqdm(data_loader, desc="Training", leave=False):
        optimizer.zero_grad()

        features = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }
        loss = model(**features).loss
        if wrapped:
            # Under DataParallel the loss may hold one value per replica.
            loss = loss.mean()
        batch_losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    return sum(batch_losses) / len(data_loader)

def eval_model(model, data_loader, loss_fn, device, target_names=('Negative', 'Neutral', 'Positive')):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Sequence-classification model whose output exposes ``.loss`` and
            ``.logits``; may be wrapped in ``DataParallel``.
        data_loader: Yields dicts with ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        loss_fn: Accepted for interface compatibility; not used, because the
            loss is read from the model output.
        device: Device the batch tensors are moved to.
        target_names: Display name of each class, ordered by class id
            (id 0 first). Defaults to the three sentiment classes.

    Returns:
        Tuple ``(mean_loss, accuracy, report_str, weighted_f1)`` where
        ``report_str`` is the printable classification report and
        ``weighted_f1`` is the support-weighted F1 score.
    """
    class_ids = list(range(len(target_names)))
    is_parallel = isinstance(model, torch.nn.DataParallel)

    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            if is_parallel:
                # Under DataParallel the loss may hold one value per replica.
                loss = loss.mean()
            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    # Passing the full label set explicitly keeps target_names aligned with the class
    # ids even when some class is absent from both the labels and the predictions;
    # without it classification_report raises on the length mismatch.
    report_str = classification_report(
        all_labels, all_preds,
        labels=class_ids, target_names=list(target_names), zero_division=0,
    )
    report_dict = classification_report(
        all_labels, all_preds,
        labels=class_ids, zero_division=0, output_dict=True,
    )
    weighted_f1 = report_dict['weighted avg']['f1-score']
    return total_loss / len(data_loader), accuracy, report_str, weighted_f1



In [None]:

print("\n--- Starting Model Training ---")
# Per-epoch metrics, consumed by the plotting cell.
history = {'train_loss': [], 'val_loss': [], 'val_accuracy': [], 'val_f1': []}
best_accuracy = 0

for epoch in range(config.EPOCHS):
    print(f'\n--- Epoch {epoch + 1}/{config.EPOCHS} ---')

    # One optimisation pass followed by a full validation pass.
    train_loss = train_epoch(model, train_loader, loss_fn, optimizer, config.DEVICE, scheduler)
    print(f'Training Loss: {train_loss:.4f}')

    val_loss, val_acc, val_report_str, val_weighted_f1 = eval_model(model, val_loader, loss_fn, config.DEVICE)
    print(f'Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_acc:.4f}')
    print("Classification Report:\n", val_report_str)

    epoch_metrics = (
        ('train_loss', train_loss),
        ('val_loss', val_loss),
        ('val_accuracy', val_acc),
        ('val_f1', val_weighted_f1),
    )
    for metric_name, metric_value in epoch_metrics:
        history[metric_name].append(metric_value)

    # Nothing to checkpoint unless this epoch strictly beats the best so far.
    if not val_acc > best_accuracy:
        continue

    best_accuracy = val_acc
    # save_pretrained lives on the underlying model, not on the DataParallel wrapper.
    if isinstance(model, torch.nn.DataParallel):
        model_to_save = model.module
    else:
        model_to_save = model
    model_to_save.save_pretrained(config.SAVE_PATH)
    tokenizer.save_pretrained(config.SAVE_PATH)
    print(f"\nNew best model saved to {config.SAVE_PATH} with accuracy: {best_accuracy:.4f}")

print("\n--- Training Complete ---")
print(f"Best validation accuracy achieved: {best_accuracy:.4f}")



In [None]:

print("\n--- Plotting Training History ---")
plt.style.use('ggplot')
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Derive the x-axis from the epochs that were actually recorded rather than from
# config.EPOCHS, so the figure still renders after an interrupted or shortened run
# (matplotlib raises when x and y lengths differ).
epochs_range = range(1, len(history['train_loss']) + 1)

# Left panel: loss curves.
ax1.plot(epochs_range, history['train_loss'], 'b-o', label='Training Loss')
ax1.plot(epochs_range, history['val_loss'], 'r-o', label='Validation Loss')
ax1.set_title('Training and Validation Loss')
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Loss')
ax1.legend()
ax1.grid(True)

# Right panel: validation quality metrics.
ax2.plot(epochs_range, history['val_accuracy'], 'g-o', label='Validation Accuracy')
ax2.plot(epochs_range, history['val_f1'], 'm-o', label='Validation F1-Score')
ax2.set_title('Validation Accuracy and F1-Score')
ax2.set_xlabel('Epochs')
ax2.set_ylabel('Score')
ax2.legend()
ax2.grid(True)

plt.tight_layout()
plt.show()



In [None]:

print("\n--- Loading saved model for inference test ---")
# Reload both artifacts from disk so the test below exercises the saved checkpoint
# rather than the in-memory training objects.
final_tokenizer = AutoTokenizer.from_pretrained(config.SAVE_PATH)
final_model = AutoModelForSequenceClassification.from_pretrained(config.SAVE_PATH)

# Move to the target device and switch to inference mode (both calls return the module).
final_model = final_model.to(config.DEVICE).eval()
def predict_sentiment(text):
    """Classify one piece of text with the reloaded model.

    Args:
        text: Text to classify. It is coerced with ``str`` first, mirroring
            what the training dataset does with its inputs.

    Returns:
        Tuple ``(label, confidence)``: the key of ``label_map`` whose id has
        the highest logit, and the softmax probability of that same class.
    """
    # Calling the tokenizer directly is the supported API; encode_plus is
    # documented as deprecated in favour of __call__.
    inputs = final_tokenizer(
        str(text),
        add_special_tokens=True,
        max_length=config.MAX_LEN,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = inputs['input_ids'].to(config.DEVICE)
    attention_mask = inputs['attention_mask'].to(config.DEVICE)

    with torch.no_grad():
        logits = final_model(input_ids=input_ids, attention_mask=attention_mask).logits

    # Compute the distribution once and read the probability of the predicted class,
    # so the reported confidence always belongs to the returned label.
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    prediction = torch.argmax(logits, dim=1).item()
    confidence = probabilities[0, prediction].item()

    reverse_label_map = {class_id: name for name, class_id in label_map.items()}
    return reverse_label_map[prediction], confidence

# Hand-written smoke-test sentences.
test_texts = [
    "This game is absolutely fantastic, I can't stop playing!",
    "The customer service was terrible and I had to wait for hours.",
    "It's an okay product, not great but not bad either.",
]

print("\n--- Inference Examples ---")
# map() is lazy, so each sentence is still classified right before its line is printed.
for text, (sentiment, conf) in zip(test_texts, map(predict_sentiment, test_texts)):
    print(f"Text: '{text}'\nPredicted Sentiment: {sentiment} (Confidence: {conf:.4f})\n")
