In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight

In [56]:
# Clear CUDA cache
torch.cuda.empty_cache()

# Run garbage collection
import gc
gc.collect()

# Verify the GPU is cleared
torch.cuda.empty_cache()




In [57]:
# Load the dataset
df = pd.read_csv("combined_output.csv", on_bad_lines='skip')

In [58]:
df.head()

Unnamed: 0,original_id,source_name,title,url,body,date_published,language,date_modified,author_list,images,description,sentiment,emotions,entities,quotations,prValues,clipping,label,category
0,'5404c9d2fd24852afa122f2cc01cb3acba3c5d05b682d...,'kabargayo.com,'Kredit sepeda motor bisa terbayar jika Anda m...,'https://www.kabargayo.com/2024/09/19/kredit-s...,"'Jakarta, VIVA – Pembayaran kredit sepeda moto...",19/09/2024 22.32,'id,19/09/2024 22.32,'Aldi Hadad,'https://i1.wp.com/thumb.viva.co.id/media/fron...,,'positive,,"'Hari Pembayaran Berbayar atau Harcilnas 2024,...",,5250000,Adira,,
1,'e1e3f8d68b58568e8217b7562d48de634fceb0d837135...,'viva.co.id,'Kredit Motor Bisa Lunas Jika Bayar Cicilan Te...,'https://www.viva.co.id/otomotif/tips/1753596-...,"'Jakarta, VIVA – Cicilan kredit motor yang ser...",19/09/2024 22.30,'id,19/09/2024 22.30,'Krisna Wicaksono,'https://thumb.viva.co.id/media/frontend/thumb...,,'positive,,"'Harinya Cicilan Lunas,2024,PT Adira Dinamika ...",,5250000,Adira,,
2,'dca74b8fa4eabf60cebfa7b811ecb385872a0fd301eaf...,'kabarmegapolitan.pikiran-rakyat.com,'Adira Finance Umumkan Pemenang HARCILNAS 2024...,'https://kabarmegapolitan.pikiran-rakyat.com/b...,'KABARMEGAPOLITAN.com - PT Adira Dinamika Mult...,19/09/2024 21.45,'id,19/09/2024 21.45,'Yuliansyah,'https://assets.pikiran-rakyat.com/www/network...,,'positive,'HARCILNAS merupakan wujud apresiasi kami kepa...,"'PT Adira Dinamika,Cicilan Lunas HARCILNAS,12 ...","(Person :Tania Endah Budhi ,Quote : HARCILNAS ...",5250000,Adira,,
3,'56c73e2a6d254a17a5cc21dee7ed0b4660c3af70c093b...,'banggairaya.id,"'Dapatkan Promo Menarik, Yamaha Prima Motor Ra...",'https://banggairaya.id/dapatkan-promo-menarik...,'BANGGAI RAYA- Yamaha Prima Motor ramaikan pam...,19/09/2024 19.45,'id,19/09/2024 19.45,'Chikal Connect,'https://i0.wp.com/banggairaya.id/wp-content/u...,,'neutral,,"'RAYA- Yamaha Prima Motor,Banggai Goverment Ex...",,5250000,Adira,,
4,'1cd6c6db60224b6ee5f49cd5d6c62cd2850d9f3255721...,'jakarta.tribunnews.com,"'Sindikat Penipuan Leasing, Satu Bulan Ajukan ...",'https://jakarta.tribunnews.com/2024/09/19/sin...,'Laporan wartawan TribunJakarta.com Yusuf Bach...,19/09/2024 18.43,'id,19/09/2024 18.43,'Yusuf Bachtiar,'https://asset-2.tstatic.net/jakarta/foto/bank...,,'neutral,'Pelaku ini melakukan pembiayaan pembelian ken...,"'Yusuf Bachtiar TRIBUNJAKARTACOM,MEDAN,SATRIA,...","(Person :Dedi ,Quote : Pelaku ini melakukan pe...",5250000,Adira,,


In [59]:
# Clean extra characters from all columns in the DataFrame
for column in df.columns:
    if df[column].dtype == 'object':  # Check if the column is of string type
        df[column] = df[column].str.strip("'")  # Remove extra characters

# Filter relevant columns
df_filtered = df[['body', 'sentiment']].dropna()


In [60]:
df.head()

Unnamed: 0,original_id,source_name,title,url,body,date_published,language,date_modified,author_list,images,description,sentiment,emotions,entities,quotations,prValues,clipping,label,category
0,5404c9d2fd24852afa122f2cc01cb3acba3c5d05b682d4...,kabargayo.com,Kredit sepeda motor bisa terbayar jika Anda me...,https://www.kabargayo.com/2024/09/19/kredit-se...,"Jakarta, VIVA – Pembayaran kredit sepeda motor...",19/09/2024 22.32,id,19/09/2024 22.32,Aldi Hadad,https://i1.wp.com/thumb.viva.co.id/media/front...,,positive,,"Hari Pembayaran Berbayar atau Harcilnas 2024,P...",,5250000,Adira,,
1,e1e3f8d68b58568e8217b7562d48de634fceb0d8371356...,viva.co.id,Kredit Motor Bisa Lunas Jika Bayar Cicilan Tep...,https://www.viva.co.id/otomotif/tips/1753596-k...,"Jakarta, VIVA – Cicilan kredit motor yang seri...",19/09/2024 22.30,id,19/09/2024 22.30,Krisna Wicaksono,https://thumb.viva.co.id/media/frontend/thumbs...,,positive,,"Harinya Cicilan Lunas,2024,PT Adira Dinamika M...",,5250000,Adira,,
2,dca74b8fa4eabf60cebfa7b811ecb385872a0fd301eaf5...,kabarmegapolitan.pikiran-rakyat.com,Adira Finance Umumkan Pemenang HARCILNAS 2024:...,https://kabarmegapolitan.pikiran-rakyat.com/bi...,KABARMEGAPOLITAN.com - PT Adira Dinamika Multi...,19/09/2024 21.45,id,19/09/2024 21.45,Yuliansyah,https://assets.pikiran-rakyat.com/www/network/...,,positive,HARCILNAS merupakan wujud apresiasi kami kepad...,"PT Adira Dinamika,Cicilan Lunas HARCILNAS,12 p...","(Person :Tania Endah Budhi ,Quote : HARCILNAS ...",5250000,Adira,,
3,56c73e2a6d254a17a5cc21dee7ed0b4660c3af70c093b2...,banggairaya.id,"Dapatkan Promo Menarik, Yamaha Prima Motor Ram...",https://banggairaya.id/dapatkan-promo-menarik-...,BANGGAI RAYA- Yamaha Prima Motor ramaikan pame...,19/09/2024 19.45,id,19/09/2024 19.45,Chikal Connect,https://i0.wp.com/banggairaya.id/wp-content/up...,,neutral,,"RAYA- Yamaha Prima Motor,Banggai Goverment Exp...",,5250000,Adira,,
4,1cd6c6db60224b6ee5f49cd5d6c62cd2850d9f3255721d...,jakarta.tribunnews.com,"Sindikat Penipuan Leasing, Satu Bulan Ajukan K...",https://jakarta.tribunnews.com/2024/09/19/sind...,Laporan wartawan TribunJakarta.com Yusuf Bacht...,19/09/2024 18.43,id,19/09/2024 18.43,Yusuf Bachtiar,https://asset-2.tstatic.net/jakarta/foto/bank/...,,neutral,Pelaku ini melakukan pembiayaan pembelian kend...,"Yusuf Bachtiar TRIBUNJAKARTACOM,MEDAN,SATRIA,T...","(Person :Dedi ,Quote : Pelaku ini melakukan pe...",5250000,Adira,,


In [61]:
# Balance dataset classes
class_counts = df_filtered['sentiment'].value_counts()
min_class = class_counts.min()
df_balanced = df_filtered.groupby('sentiment').apply(lambda x: x.sample(min_class)).reset_index(drop=True)

In [62]:
# Ensure balanced classes during split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_filtered['body'].tolist(), df_filtered['sentiment'].tolist(),
    test_size=0.2, random_state=42, stratify=df_filtered['sentiment']
)

In [63]:
# Load tokenizer and tokenize texts
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


In [64]:
# Map sentiment labels to integers
label_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
train_labels = [label_mapping[label] for label in train_labels]
val_labels = [label_mapping[label] for label in val_labels]

In [65]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [66]:
# Create datasets
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

In [67]:
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [68]:
# Load Longformer model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Adjust model's loss function to include class weights
model.config.problem_type = "single_label_classification"
model.config.class_weights = class_weights.tolist()

In [70]:
from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    return {'eval_accuracy': acc}


In [71]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    seed=42,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,  # Extra epoch for thorough training
    learning_rate=2e-5,  # Smaller learning rate
    weight_decay=0.02,  # Higher regularization
    fp16=True,
    logging_dir="./logs",
    logging_steps=50,  # More frequent logging
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    lr_scheduler_type="linear",  # Simpler scheduler
    warmup_steps=100,  # Enough warmup
    save_total_limit=3,  # Keep more checkpoints
)




In [72]:
from transformers import Trainer
from transformers import EarlyStoppingCallback

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Optional: your metric computation function
    callbacks=[early_stopping_callback],  # Add the callback here
)


In [73]:
import torch

torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

In [74]:
# Train the model
trainer.train()

  0%|          | 0/2250 [00:00<?, ?it/s]

{'loss': 1.0931, 'grad_norm': 2.02677321434021, 'learning_rate': 1e-05, 'epoch': 0.11}
{'loss': 1.0318, 'grad_norm': 3.3961737155914307, 'learning_rate': 2e-05, 'epoch': 0.22}
{'loss': 0.9728, 'grad_norm': 3.2854626178741455, 'learning_rate': 1.9534883720930235e-05, 'epoch': 0.33}
{'loss': 0.9173, 'grad_norm': 4.789216995239258, 'learning_rate': 1.9069767441860468e-05, 'epoch': 0.44}
{'loss': 0.7961, 'grad_norm': 3.793623685836792, 'learning_rate': 1.86046511627907e-05, 'epoch': 0.56}
{'loss': 0.806, 'grad_norm': 10.360204696655273, 'learning_rate': 1.8158139534883723e-05, 'epoch': 0.67}
{'loss': 0.8054, 'grad_norm': 5.938619613647461, 'learning_rate': 1.7693023255813956e-05, 'epoch': 0.78}
{'loss': 0.7371, 'grad_norm': 5.0961689949035645, 'learning_rate': 1.7227906976744186e-05, 'epoch': 0.89}
{'loss': 0.7008, 'grad_norm': 5.982089519500732, 'learning_rate': 1.676279069767442e-05, 'epoch': 1.0}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_accuracy': 0.7179344808439756, 'eval_loss': 0.681283712387085, 'eval_runtime': 7.9097, 'eval_samples_per_second': 227.694, 'eval_steps_per_second': 7.206, 'epoch': 1.0}
{'loss': 0.657, 'grad_norm': 6.011390209197998, 'learning_rate': 1.6297674418604652e-05, 'epoch': 1.11}
{'loss': 0.6718, 'grad_norm': 5.365485191345215, 'learning_rate': 1.5832558139534885e-05, 'epoch': 1.22}
{'loss': 0.589, 'grad_norm': 4.217019557952881, 'learning_rate': 1.5367441860465118e-05, 'epoch': 1.33}
{'loss': 0.5849, 'grad_norm': 9.90270709991455, 'learning_rate': 1.490232558139535e-05, 'epoch': 1.44}
{'loss': 0.6598, 'grad_norm': 7.373530387878418, 'learning_rate': 1.4437209302325584e-05, 'epoch': 1.56}
{'loss': 0.6226, 'grad_norm': 4.560194969177246, 'learning_rate': 1.3972093023255815e-05, 'epoch': 1.67}
{'loss': 0.5764, 'grad_norm': 10.292522430419922, 'learning_rate': 1.3506976744186046e-05, 'epoch': 1.78}
{'loss': 0.5923, 'grad_norm': 6.346625328063965, 'learning_rate': 1.3041860465116281e-05, 'e

  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_accuracy': 0.7612437534702943, 'eval_loss': 0.5897889733314514, 'eval_runtime': 7.9179, 'eval_samples_per_second': 227.458, 'eval_steps_per_second': 7.199, 'epoch': 2.0}
{'loss': 0.465, 'grad_norm': 7.430761337280273, 'learning_rate': 1.2111627906976745e-05, 'epoch': 2.11}
{'loss': 0.4799, 'grad_norm': 9.780165672302246, 'learning_rate': 1.1646511627906978e-05, 'epoch': 2.22}
{'loss': 0.4742, 'grad_norm': 6.957958698272705, 'learning_rate': 1.118139534883721e-05, 'epoch': 2.33}
{'loss': 0.499, 'grad_norm': 6.316256046295166, 'learning_rate': 1.0716279069767443e-05, 'epoch': 2.44}
{'loss': 0.4941, 'grad_norm': 13.31041145324707, 'learning_rate': 1.0251162790697676e-05, 'epoch': 2.56}
{'loss': 0.5262, 'grad_norm': 5.265052795410156, 'learning_rate': 9.786046511627909e-06, 'epoch': 2.67}
{'loss': 0.4764, 'grad_norm': 6.188224792480469, 'learning_rate': 9.32093023255814e-06, 'epoch': 2.78}
{'loss': 0.4705, 'grad_norm': 8.415345191955566, 'learning_rate': 8.855813953488373e-06, 'epoc

  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_accuracy': 0.7856746252082176, 'eval_loss': 0.5483001470565796, 'eval_runtime': 7.9272, 'eval_samples_per_second': 227.193, 'eval_steps_per_second': 7.19, 'epoch': 3.0}
{'loss': 0.3741, 'grad_norm': 8.956130027770996, 'learning_rate': 7.925581395348837e-06, 'epoch': 3.11}
{'loss': 0.4114, 'grad_norm': 5.268881320953369, 'learning_rate': 7.460465116279071e-06, 'epoch': 3.22}
{'loss': 0.421, 'grad_norm': 13.39311695098877, 'learning_rate': 6.9953488372093025e-06, 'epoch': 3.33}
{'loss': 0.3942, 'grad_norm': 12.367246627807617, 'learning_rate': 6.5302325581395355e-06, 'epoch': 3.44}
{'loss': 0.3893, 'grad_norm': 10.593647956848145, 'learning_rate': 6.0651162790697684e-06, 'epoch': 3.56}
{'loss': 0.4045, 'grad_norm': 12.737212181091309, 'learning_rate': 5.600000000000001e-06, 'epoch': 3.67}
{'loss': 0.4143, 'grad_norm': 11.052546501159668, 'learning_rate': 5.134883720930233e-06, 'epoch': 3.78}
{'loss': 0.4131, 'grad_norm': 8.418912887573242, 'learning_rate': 4.669767441860466e-06, '

  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_accuracy': 0.7801221543586896, 'eval_loss': 0.5850068926811218, 'eval_runtime': 7.9121, 'eval_samples_per_second': 227.627, 'eval_steps_per_second': 7.204, 'epoch': 4.0}
{'loss': 0.3529, 'grad_norm': 13.765434265136719, 'learning_rate': 3.7395348837209304e-06, 'epoch': 4.11}
{'loss': 0.3577, 'grad_norm': 14.530871391296387, 'learning_rate': 3.2744186046511626e-06, 'epoch': 4.22}
{'loss': 0.2983, 'grad_norm': 4.338857173919678, 'learning_rate': 2.8093023255813956e-06, 'epoch': 4.33}
{'loss': 0.3213, 'grad_norm': 10.938959121704102, 'learning_rate': 2.344186046511628e-06, 'epoch': 4.44}
{'loss': 0.3785, 'grad_norm': 10.517203330993652, 'learning_rate': 1.8790697674418607e-06, 'epoch': 4.56}
{'loss': 0.2953, 'grad_norm': 9.850091934204102, 'learning_rate': 1.413953488372093e-06, 'epoch': 4.67}
{'loss': 0.3273, 'grad_norm': 17.5310115814209, 'learning_rate': 9.488372093023257e-07, 'epoch': 4.78}
{'loss': 0.3069, 'grad_norm': 12.290239334106445, 'learning_rate': 4.837209302325581e-07

  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_accuracy': 0.7884508606329816, 'eval_loss': 0.5802810788154602, 'eval_runtime': 7.9406, 'eval_samples_per_second': 226.81, 'eval_steps_per_second': 7.178, 'epoch': 5.0}
{'train_runtime': 609.7659, 'train_samples_per_second': 59.039, 'train_steps_per_second': 3.69, 'train_loss': 0.5403542370266384, 'epoch': 5.0}


TrainOutput(global_step=2250, training_loss=0.5403542370266384, metrics={'train_runtime': 609.7659, 'train_samples_per_second': 59.039, 'train_steps_per_second': 3.69, 'total_flos': 4768911396864000.0, 'train_loss': 0.5403542370266384, 'epoch': 5.0})

In [75]:
print(f"Number of training examples: {len(train_dataset)}")
print(f"Batch size: {training_args.per_device_train_batch_size}")


Number of training examples: 7200
Batch size: 16


In [76]:
# Save the trained model and tokenizer
model.save_pretrained('./distilbert6')
tokenizer.save_pretrained('./distilbert6')

('./distilbert6\\tokenizer_config.json',
 './distilbert6\\special_tokens_map.json',
 './distilbert6\\vocab.txt',
 './distilbert6\\added_tokens.json')

In [77]:
# Evaluate the model
results = trainer.evaluate()

  0%|          | 0/57 [00:00<?, ?it/s]

In [78]:
print(results)

{'eval_accuracy': 0.7884508606329816, 'eval_loss': 0.5802810788154602, 'eval_runtime': 7.94, 'eval_samples_per_second': 226.826, 'eval_steps_per_second': 7.179, 'epoch': 5.0}


In [79]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# After training, get predictions on the validation dataset
predictions = trainer.predict(val_dataset)

# Get the predicted logits
logits = predictions.predictions

# Convert logits to predicted class labels
predicted_labels = np.argmax(logits, axis=1)

# Get the true labels
true_labels = val_labels

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted')
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')

# Display metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


# Detailed classification report
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))


  0%|          | 0/57 [00:00<?, ?it/s]

Accuracy: 0.7885
F1 Score: 0.7883
Precision: 0.7883
Recall: 0.7885

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.73      0.73       338
           1       0.80      0.82      0.81       836
           2       0.79      0.78      0.79       627

    accuracy                           0.79      1801
   macro avg       0.78      0.78      0.78      1801
weighted avg       0.79      0.79      0.79      1801



In [80]:
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [81]:
# Load the saved model and tokenizer
model = LongformerForSequenceClassification.from_pretrained('./longformer_model6').to(device)
tokenizer = LongformerTokenizer.from_pretrained('./longformer_model6')

NameError: name 'LongformerForSequenceClassification' is not defined