In [1]:
!pip install simpletransformers pandas scikit-learn torch



In [2]:
import pandas as pd
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cpu


In [4]:
sentence_df = pd.read_csv("sentence-level-annotation.csv")

In [5]:
print("Dataset Preview:")
display(sentence_df.head())

Dataset Preview:


Unnamed: 0.1,Unnamed: 0,Sentence,Hate_speech,Sentiment,Humor,Billing or price,Customer service,Data,Network,Package,Service or product,None
0,0,Ammage Adarayta❤️Eka Dawasak Madi Neda❤️🙏❤️,Not offensive,Negative,Non-humorous,0,0,0,0,0,0,1
1,1,We need IPL Champions leak data offers ..pleas...,Not offensive,Neutral,Non-humorous,0,0,1,0,0,0,0
2,2,#VPN #ummmaaa #proud_be,Not offensive,Neutral,Non-humorous,0,0,0,0,0,0,1
3,3,chandimal.. uuu thama mulu tem ekama kaaa gaha...,Not offensive,Positive,Non-humorous,0,0,0,0,0,0,1
4,4,Batzgo,Not offensive,Neutral,Non-humorous,0,0,0,0,0,0,1


In [6]:
sentence_df = sentence_df[['Sentence', 'Sentiment']]
sentence_df.dropna(inplace=True)

In [7]:
sentence_df.columns = sentence_df.columns.str.strip()

In [8]:
sentence_df["Sentiment"] = sentence_df["Sentiment"].astype(str).str.strip()

In [9]:
label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
sentence_df["Sentiment"] = sentence_df["Sentiment"].map(label_map)

In [10]:
sentence_df = sentence_df.dropna()

In [11]:
sentence_df["Sentiment"] = sentence_df["Sentiment"].astype(int)

In [12]:
print("\nDataset After Preprocessing:")
display(sentence_df.head())


Dataset After Preprocessing:


Unnamed: 0,Sentence,Sentiment
0,Ammage Adarayta❤️Eka Dawasak Madi Neda❤️🙏❤️,0
1,We need IPL Champions leak data offers ..pleas...,1
2,#VPN #ummmaaa #proud_be,1
3,chandimal.. uuu thama mulu tem ekama kaaa gaha...,2
4,Batzgo,1


In [13]:
print("\nData Types:")
print(sentence_df.dtypes)


Data Types:
Sentence     object
Sentiment     int32
dtype: object


In [14]:
train_df, eval_df = train_test_split(sentence_df, test_size=0.2, random_state=42)

In [15]:
print(f"Training Samples: {len(train_df)}")
print(f"Testing Samples: {len(eval_df)}")

Training Samples: 10715
Testing Samples: 2679


In [16]:
model_args = ClassificationArgs()
model_args.num_train_epochs = 3
model_args.train_batch_size = 16
model_args.eval_batch_size = 16
model_args.learning_rate = 2e-5
model_args.overwrite_output_dir = True
model_args.save_best_model = True
model_args.evaluate_during_training = True
model_args.save_eval_checkpoints = False
model_args.save_model_every_epoch = False
model_args.output_dir = "mbert-sentiment-model"
model_args.best_model_dir = "mbert-best-model"

In [20]:
model = ClassificationModel(
    "bert", "bert-base-multilingual-cased",
    num_labels=3,
    args=model_args,
    use_cuda=torch.cuda.is_available()
)

print("\n✅ Model Loaded Successfully!")

model.safetensors:  23%|##3       | 168M/714M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]


✅ Model Loaded Successfully!


In [21]:
print("\n🚀 Training Started...")
model.train_model(train_df, eval_df=eval_df)


🚀 Training Started...




  0%|          | 0/21 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/670 [00:00<?, ?it/s]



  0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/670 [00:00<?, ?it/s]



  0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/670 [00:00<?, ?it/s]



  0%|          | 0/5 [00:00<?, ?it/s]



  0%|          | 0/5 [00:00<?, ?it/s]

(2010,
 defaultdict(list,
             {'global_step': [670, 1340, 2000, 2010],
              'train_loss': [0.8991503119468689,
               0.6450029015541077,
               0.5102134943008423,
               0.4914722442626953],
              'mcc': [0.44112149838566117,
               0.4971815694886369,
               0.49849141941881797,
               0.49911903524483503],
              'eval_loss': [0.6347097381949425,
               0.6101408862464485,
               0.6530977715072888,
               0.6529762708981123]}))

In [22]:
print("\n🔍 Evaluating Model...")
result, model_outputs, wrong_predictions = model.eval_model(eval_df, acc=accuracy_score)


🔍 Evaluating Model...




  0%|          | 0/5 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/168 [00:00<?, ?it/s]

In [23]:
print(f"\nModel Accuracy: {result['acc']:.4f}")


Model Accuracy: 0.7499


In [24]:
predictions, raw_outputs = model.predict(eval_df["Sentence"].tolist())
print("\n📊 Classification Report:\n")
print(classification_report(eval_df["Sentiment"], predictions))

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/168 [00:00<?, ?it/s]


📊 Classification Report:

              precision    recall  f1-score   support

           0       0.66      0.64      0.65       734
           1       0.80      0.83      0.82      1708
           2       0.62      0.49      0.55       237

    accuracy                           0.75      2679
   macro avg       0.69      0.66      0.67      2679
weighted avg       0.75      0.75      0.75      2679



In [25]:
sample_texts = ["à¶¯à·à¶±à·Š à·ƒà·’à¶¸à·Š à¶‘à¶šà¶§ code à¶‘à¶šà¶šà·Š à¶‘à¶ºà·’ à¶‘à¶š à¶¸à·™à¶šà¶§ à¶¯à·“à¶½à· register click à¶šà¶»à¶±à·Šà¶±."]
predictions, raw_outputs = model.predict(sample_texts)

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
reverse_label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
predictions = [reverse_label_map[p] for p in predictions]

In [27]:
print("\n💡 Sample Predictions:")
for text, pred in zip(sample_texts, predictions):
    print(f"📝 Text: {text} --> Predicted Sentiment: {pred}")


💡 Sample Predictions:
📝 Text: à¶¯à·à¶±à·Š à·ƒà·’à¶¸à·Š à¶‘à¶šà¶§ code à¶‘à¶šà¶šà·Š à¶‘à¶ºà·’ à¶‘à¶š à¶¸à·™à¶šà¶§ à¶¯à·“à¶½à· register click à¶šà¶»à¶±à·Šà¶±. --> Predicted Sentiment: Neutral


In [28]:
model.save_model("my_simpletransformers_model_mbert")
