In [None]:
import pandas as pd
import torch
import torch.nn as nn
import math
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Load the raw train/test spreadsheets from the working directory.
train_df = pd.read_excel("train.xlsx")
test_df = pd.read_excel("test.xlsx")

# Keep only the text column and the raw string label; drop rows missing either.
train_df = train_df[["content_hindi", "labels"]].dropna()
test_df = test_df[["content_hindi", "labels"]].dropna()

# Map the string sentiment labels to integer class ids.
label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
train_df["label"] = train_df["labels"].map(label_map)
test_df["label"] = test_df["labels"].map(label_map)

# Series.map yields NaN for any value that is not a key of label_map. Such rows
# would turn the label tensor into floats: the loss would reject it during
# training, and the accuracy loop would silently count those rows as wrong.
# Fail early with an explicit message instead.
for split_name, split_df in (("train", train_df), ("test", test_df)):
    unmapped = split_df.loc[split_df["label"].isna(), "labels"].unique().tolist()
    if unmapped:
        raise ValueError(
            f"Unexpected label values in {split_name} split: {unmapped}; "
            f"expected one of {list(label_map)}"
        )

# Tokenizer only; the pretrained model weights are not loaded here.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

In [None]:
def tokenize_data(df):
    """Turn a labelled dataframe into a ``TensorDataset`` of (input_ids, labels).

    Reads the ``content_hindi`` column as the texts and the ``label`` column as
    the targets. Every text is truncated or padded to exactly 32 token ids by
    the module-level ``tokenizer``. Only ``input_ids`` are kept; the attention
    mask returned by the tokenizer is not part of the dataset.
    """
    texts = df["content_hindi"].tolist()
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=32,
    )
    ids = torch.tensor(encoded["input_ids"])
    targets = torch.tensor(df["label"].tolist())
    return TensorDataset(ids, targets)

# 🧱 DataLoaders
# Tokenize each split once, then wrap it in a loader of 32-example batches.
# Only the training loader shuffles; the test loader keeps dataframe order.
train_dataset = tokenize_data(train_df)
test_dataset = tokenize_data(test_df)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [None]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a batch of embeddings.

    For position ``pos`` and even channel index ``k`` (0, 2, 4, ...):

        pe[pos, k]     = sin(pos / 10000 ** (k / d_model))
        pe[pos, k + 1] = cos(pos / 10000 ** (k / d_model))

    Args:
        d_model: Size of the embedding (last) dimension.
        max_len: Number of positions to precompute; inputs passed to
            ``forward`` must not be longer than this along dimension 1.
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()
        position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)  # (max_len, 1)
        # The even channel index already plays the role of "2i" in the usual
        # formula, so the exponent is index / d_model (not 2 * index / d_model).
        even_idx = torch.arange(0, d_model, 2, dtype=torch.float32)
        angles = position / torch.pow(10000.0, even_idx / d_model)  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine channel than sine channels.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a buffer so it follows the module across .to()/.cuda();
        # non-persistent so it is not written to (or required from) state_dict.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Add the encoding for the first ``x.size(1)`` positions to ``x``.

        ``x`` is indexed as (batch, position, d_model): the table is sliced
        along dimension 1 and broadcast over dimension 0.
        """
        # .to() is a no-op when the buffer already lives on x's device.
        return x + self.pe[:, :x.size(1)].to(x.device)

class TransformerClassifier(nn.Module):
    """Small Transformer-encoder text classifier trained from scratch.

    Pipeline: token embedding -> LayerNorm -> positional encoding -> dropout
    -> Transformer encoder -> attention pooling (plus a down-weighted mean
    pool) -> linear classification head.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding / hidden width.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2, num_classes=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.layernorm = nn.LayerNorm(d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        # batch_first=True is required: forward() feeds (batch, seq, d_model).
        # With the default (batch_first=False) the encoder would treat dim 0 as
        # the sequence axis and attend across different samples in the batch
        # instead of across the tokens of one sentence.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model, nhead, activation='gelu', batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)

        # Attention Pooling Layer: one scalar score per token.
        self.attention = nn.Linear(d_model, 1)
        nn.init.xavier_uniform_(self.attention.weight)

        self.fc = nn.Linear(d_model, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, src):
        """Return class logits of shape (batch, num_classes).

        Args:
            src: Integer token ids of shape (batch, seq_len).
        """
        x = self.embedding(src)   # (batch, seq, d_model)
        x = self.layernorm(x)
        x = self.pos_encoder(x)
        x = self.dropout(x)

        # NOTE(review): no padding mask is passed, so padded positions take
        # part in self-attention and in the pooling below.
        x = self.transformer(x)

        # Softmax over the sequence axis gives per-token pooling weights.
        weights = torch.softmax(self.attention(x), dim=1)
        x = (x * weights).sum(dim=1) + x.mean(dim=1) * 0.1  # residual pooling

        return self.fc(x)


In [None]:
# Seed PyTorch's global RNG so weight initialisation, DataLoader shuffling and
# dropout are repeatable across "Restart & Run All".
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The embedding table is sized from the tokenizer's vocabulary.
model = TransformerClassifier(vocab_size=tokenizer.vocab_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()




In [None]:
# Train for 100 passes over train_loader, printing the mean batch loss per epoch.
for epoch in range(100):
    model.train()  # enable dropout
    total_loss = 0
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Clear gradients from the previous step before computing new ones.
        optimizer.zero_grad()
        batch_loss = loss_fn(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")

Epoch 1 Loss: 0.4388
Epoch 2 Loss: 0.4440
Epoch 3 Loss: 0.4319
Epoch 4 Loss: 0.4242
Epoch 5 Loss: 0.4138
Epoch 6 Loss: 0.4282
Epoch 7 Loss: 0.4220
Epoch 8 Loss: 0.4014
Epoch 9 Loss: 0.4154
Epoch 10 Loss: 0.3989
Epoch 11 Loss: 0.4159
Epoch 12 Loss: 0.3990
Epoch 13 Loss: 0.3876
Epoch 14 Loss: 0.4045
Epoch 15 Loss: 0.3833
Epoch 16 Loss: 0.4018
Epoch 17 Loss: 0.3864
Epoch 18 Loss: 0.3828
Epoch 19 Loss: 0.3899
Epoch 20 Loss: 0.3771
Epoch 21 Loss: 0.3865
Epoch 22 Loss: 0.3846
Epoch 23 Loss: 0.3671
Epoch 24 Loss: 0.3683
Epoch 25 Loss: 0.3535
Epoch 26 Loss: 0.3504
Epoch 27 Loss: 0.3588
Epoch 28 Loss: 0.3405
Epoch 29 Loss: 0.3408
Epoch 30 Loss: 0.3322
Epoch 31 Loss: 0.3383
Epoch 32 Loss: 0.3518
Epoch 33 Loss: 0.3358
Epoch 34 Loss: 0.3289
Epoch 35 Loss: 0.3293
Epoch 36 Loss: 0.3548
Epoch 37 Loss: 0.3307
Epoch 38 Loss: 0.3078
Epoch 39 Loss: 0.3112
Epoch 40 Loss: 0.3194
Epoch 41 Loss: 0.3091
Epoch 42 Loss: 0.3123
Epoch 43 Loss: 0.3106
Epoch 44 Loss: 0.2864
Epoch 45 Loss: 0.2918
Epoch 46 Loss: 0.29

In [None]:
# Accuracy on the held-out split: fraction of examples whose arg-max logit
# matches the stored label.
model.eval()  # disable dropout
correct, total = 0, 0
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        preds = model(inputs).argmax(dim=1)
        correct += int((preds == targets).sum())
        total += targets.shape[0]
print(f"\nTest Accuracy: {correct / total:.2%}")


Test Accuracy: 58.94%


In [None]:
# Inference Function
def predict(text):
    """Classify a single text and return its sentiment name.

    The text is tokenized with the same settings as the training data
    (truncated/padded to 32 tokens), run through ``model`` in eval mode with
    gradients disabled, and the index of the largest logit is translated to
    "Negative", "Neutral" or "Positive".
    """
    class_names = ["Negative", "Neutral", "Positive"]
    model.eval()
    with torch.no_grad():
        encoded = tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            max_length=32,
            truncation=True,
        )
        input_ids = encoded["input_ids"].to(device)
        scores = model(input_ids)
        best = scores.argmax(dim=1).item()
    return class_names[best]

# Spot-check on a single hand-written sentence.
negative_example = "यह उत्पाद बहुत बेकार है"
print(predict(negative_example))

Negative


In [None]:
# Spot-check on a second hand-written sentence.
positive_example = "यह उत्पाद अच्छा है"
print(predict(positive_example))

Positive
