In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cleaned CSV, then pull out the raw texts (X, a Python list) and the
# label column (y, a NumPy array) in one step.
df = pd.read_csv("dataset_cleaned.csv")
X, y = df["text"].tolist(), df["label"].values

In [15]:
# Stratified 80/20 split of the texts and labels loaded in the previous cell.
# `X`, `y`, `pd` and `train_test_split` all come from that cell, so they are
# neither re-derived nor re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Sanity checks: every sample lands in exactly one split, and texts and labels
# stay aligned within each split.
assert len(X_train) + len(X_test) == len(X), "train/test sizes do not add up"
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

print(f"Train set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Per-class counts for each split, to confirm stratification kept the balance.
for split_name, labels in (("y_train", y_train), ("y_test", y_test)):
    print(f"\nClass distribution in {split_name}:")
    print(pd.Series(labels).value_counts().sort_index())

Train set size: 1602
Test set size: 401

Class distribution in y_train:
0    418
1    398
2    397
3    389
Name: count, dtype: int64

Class distribution in y_test:
0    105
1    100
2     99
3     97
Name: count, dtype: int64


In [16]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Fetch the pretrained roberta-base tokenizer and encoder.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta = RobertaModel.from_pretrained("roberta-base")

# Smoke-test the tokenizer on one sentence, using the same settings that are
# used for the real batches later on.
sample = tokenizer(
    "This is a complaint about service",
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

print(sample["input_ids"].shape)  # (1, seq_len)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([1, 8])


In [7]:
def get_embeddings(texts, batch_size=16):
    """Embed texts with RoBERTa and return the position-0 hidden state of each.

    Uses the notebook-level ``tokenizer`` and ``roberta`` objects. Inputs are
    truncated/padded to at most 128 tokens and run through the model in
    batches with gradients disabled.

    Args:
        texts: A single string, or an iterable of strings (list, pandas
            Series, NumPy array, ...). A bare string is treated as one text
            rather than being sliced into character chunks.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        NumPy array of shape ``(number_of_texts, hidden_size)``. Empty input
        yields a ``(0, hidden_size)`` array instead of raising.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Normalise the input to a plain list so slicing and the tokenizer behave
    # the same for lists, Series and arrays.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # torch.cat([]) raises, so short-circuit with a correctly shaped empty array.
    if not texts:
        return torch.empty((0, roberta.config.hidden_size)).numpy()

    embeddings = []
    roberta.eval()  # disable dropout so embeddings are deterministic
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encodings = tokenizer(batch, return_tensors="pt",
                                  truncation=True, padding=True, max_length=128)
            outputs = roberta(**encodings)
            # Hidden state at sequence position 0 -> shape (batch, hidden_size)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings.append(cls_embeddings)
    return torch.cat(embeddings).numpy()

In [8]:
# Encode the train split first, then the test split, with the same function.
X_train_em, X_test_em = map(get_embeddings, (X_train, X_test))
print(X_train_em.shape)  # (num_samples, hidden_size)
type(X_train_em)

(1602, 768)


numpy.ndarray

In [9]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling repeat across "Restart & Run All" (same seed as the split).
tf.keras.utils.set_random_seed(42)

num_classes = len(set(y))
print(num_classes)

# Take the input width from the embeddings rather than hard-coding 768, so the
# classifier head still fits if the encoder is swapped for another size.
embedding_dim = X_train_em.shape[1]

# Small MLP head on top of the precomputed sentence embeddings.
model = models.Sequential([
    layers.Input(shape=(embedding_dim,)),
    layers.Dropout(0.3),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.Dense(num_classes, activation="softmax")
])

# Labels are integer class ids, hence the sparse variant of cross-entropy.
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

4


In [10]:
# Train the classifier head on the precomputed training embeddings;
# `history` keeps what model.fit returns for later inspection.
history = model.fit(x=X_train_em, y=y_train, epochs=15)

Epoch 1/15
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3664 - loss: 1.3405 
Epoch 2/15
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3664 - loss: 1.3405
Epoch 2/15
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6561 - loss: 0.9433
Epoch 3/15
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6561 - loss: 0.9433
Epoch 3/15
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8689 - loss: 0.4497
Epoch 4/15
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8689 - loss: 0.4497
Epoch 4/15
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9157 - loss: 0.2719
Epoch 5/15
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9157 - loss: 0.2719
Epoch 5/15
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [11]:
# Score the trained classifier on the held-out test embeddings.
loss, acc = model.evaluate(
    x=X_test_em,
    y=y_test,
)
print(f"Test Accuracy: {acc:.4f}")


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9676 - loss: 0.0824 
Test Accuracy: 0.9676
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9676 - loss: 0.0824 
Test Accuracy: 0.9676


In [12]:
# One free-form complaint used as a manual smoke test for the classifier.
# The text is kept verbatim (typos included) because it is model input.
sample_text = [
    "Hello i am very angry right now, there is so much waste in my area "
    "like cow dung, human feces, plastic bottlesm but no one is cleaning it. "
    "everyday i need to bear that bad smell. please help asap"
]

In [13]:
import numpy as np

sample_em = get_embeddings(sample_text)

# model.predict returns one row of softmax outputs per input text.
pred_proba = model.predict(sample_em)

# Pick the class from the raw probabilities, before any rescaling.
pred_label = np.argmax(pred_proba, axis=1)

# Percentages for display; kept under the original name `pred` so any later
# cell that reads it still sees the same values as before.
pred = pred_proba * 100

# The first line shows the per-class scores, not a class id, so label it so.
print("Predicted probabilities (%):", pred)
print("Predicted class:", pred_label)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Predicted class: [[1.9412288e-03 9.9997452e+01 4.0870742e-04 2.0090932e-04]]
Predicted class: [1]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Predicted class: [[1.9412288e-03 9.9997452e+01 4.0870742e-04 2.0090932e-04]]
Predicted class: [1]


In [None]:
# Write the trained classifier to disk, relative to the working directory.
model_path = "model_new.h5"
model.save(model_path)