In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("dataset_cleaned.csv")

# X = complaint descriptions, y = already encoded labels
X = df["text"].tolist()
y = df["label"].values

# Train-test split (80/20 split is common)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [2]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta = RobertaModel.from_pretrained("roberta-base")

# Example encoding
sample = tokenizer("This is a complaint about service", 
                   return_tensors="pt", 
                   truncation=True, 
                   padding=True, 
                   max_length=128)

print(sample["input_ids"].shape)  # (1, seq_len)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([1, 8])


In [3]:
def get_embeddings(texts, batch_size=16):
    embeddings = []
    roberta.eval()
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            encodings = tokenizer(batch, return_tensors="pt",
                                  truncation=True, padding=True, max_length=128)
            outputs = roberta(**encodings)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # shape: (batch, hidden_size)
            embeddings.append(cls_embeddings)
    return torch.cat(embeddings).numpy()

In [4]:
X_train_em = get_embeddings(X_train)
X_test_em = get_embeddings(X_test)
print(X_train_em.shape)  # (num_samples, hidden_size)
type(X_train_em)

(1280, 768)


numpy.ndarray

In [5]:
import tensorflow as tf
from tensorflow.keras import layers, models

num_classes = len(set(y))
print(num_classes)
model = models.Sequential([
    layers.Input(shape=(768,)),
    layers.Dropout(0.3),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.Dense(num_classes, activation="softmax")
])

model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

4
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout (Dropout)           (None, 768)               0         
                                                                 
 dense (Dense)               (None, 256)               196864    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 4)                 516       
                                                                 
Total params: 230,276
Trainable params: 230,276
Non-trainable params: 0
_________________________________________________________________


In [7]:
history = model.fit(
    X_train_em, y_train,
    epochs=15
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [8]:
loss, acc = model.evaluate(X_test_em, y_test)
print(f"Test Accuracy: {acc:.4f}")


Test Accuracy: 0.9500


In [9]:
sample_text = ["public isse bus not coming late late"]

In [10]:
import numpy as np

sample_em = get_embeddings(sample_text)

pred = model.predict(sample_em)
pred = pred* 100
pred_label = np.argmax(pred, axis=1)

print("Predicted class:", pred)
print("Predicted class:", pred_label)

Predicted class: [[ 4.14604   59.75931    1.3531185 34.741528 ]]
Predicted class: [1]


In [11]:
model.save("model_new.h5")