In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight




In [2]:
# Load the dataset
df = pd.read_csv("combined_output.csv", on_bad_lines='skip')

In [3]:
df.head()

Unnamed: 0,original_id,source_name,title,url,body,date_published,language,date_modified,author_list,images,description,sentiment,emotions,entities,quotations,prValues,clipping,label,category
0,'5404c9d2fd24852afa122f2cc01cb3acba3c5d05b682d...,'kabargayo.com,'Kredit sepeda motor bisa terbayar jika Anda m...,'https://www.kabargayo.com/2024/09/19/kredit-s...,"'Jakarta, VIVA – Pembayaran kredit sepeda moto...",19/09/2024 22.32,'id,19/09/2024 22.32,'Aldi Hadad,'https://i1.wp.com/thumb.viva.co.id/media/fron...,,'positive,,"'Hari Pembayaran Berbayar atau Harcilnas 2024,...",,5250000,Adira,,
1,'e1e3f8d68b58568e8217b7562d48de634fceb0d837135...,'viva.co.id,'Kredit Motor Bisa Lunas Jika Bayar Cicilan Te...,'https://www.viva.co.id/otomotif/tips/1753596-...,"'Jakarta, VIVA – Cicilan kredit motor yang ser...",19/09/2024 22.30,'id,19/09/2024 22.30,'Krisna Wicaksono,'https://thumb.viva.co.id/media/frontend/thumb...,,'positive,,"'Harinya Cicilan Lunas,2024,PT Adira Dinamika ...",,5250000,Adira,,
2,'dca74b8fa4eabf60cebfa7b811ecb385872a0fd301eaf...,'kabarmegapolitan.pikiran-rakyat.com,'Adira Finance Umumkan Pemenang HARCILNAS 2024...,'https://kabarmegapolitan.pikiran-rakyat.com/b...,'KABARMEGAPOLITAN.com - PT Adira Dinamika Mult...,19/09/2024 21.45,'id,19/09/2024 21.45,'Yuliansyah,'https://assets.pikiran-rakyat.com/www/network...,,'positive,'HARCILNAS merupakan wujud apresiasi kami kepa...,"'PT Adira Dinamika,Cicilan Lunas HARCILNAS,12 ...","(Person :Tania Endah Budhi ,Quote : HARCILNAS ...",5250000,Adira,,
3,'56c73e2a6d254a17a5cc21dee7ed0b4660c3af70c093b...,'banggairaya.id,"'Dapatkan Promo Menarik, Yamaha Prima Motor Ra...",'https://banggairaya.id/dapatkan-promo-menarik...,'BANGGAI RAYA- Yamaha Prima Motor ramaikan pam...,19/09/2024 19.45,'id,19/09/2024 19.45,'Chikal Connect,'https://i0.wp.com/banggairaya.id/wp-content/u...,,'neutral,,"'RAYA- Yamaha Prima Motor,Banggai Goverment Ex...",,5250000,Adira,,
4,'1cd6c6db60224b6ee5f49cd5d6c62cd2850d9f3255721...,'jakarta.tribunnews.com,"'Sindikat Penipuan Leasing, Satu Bulan Ajukan ...",'https://jakarta.tribunnews.com/2024/09/19/sin...,'Laporan wartawan TribunJakarta.com Yusuf Bach...,19/09/2024 18.43,'id,19/09/2024 18.43,'Yusuf Bachtiar,'https://asset-2.tstatic.net/jakarta/foto/bank...,,'neutral,'Pelaku ini melakukan pembiayaan pembelian ken...,"'Yusuf Bachtiar TRIBUNJAKARTACOM,MEDAN,SATRIA,...","(Person :Dedi ,Quote : Pelaku ini melakukan pe...",5250000,Adira,,


In [4]:
# Clean extra characters from all columns in the DataFrame
for column in df.columns:
    if df[column].dtype == 'object':  # Check if the column is of string type
        df[column] = df[column].str.strip("'")  # Remove extra characters

# Filter relevant columns
df_filtered = df[['body', 'sentiment']].dropna()


In [5]:
df.head()

Unnamed: 0,original_id,source_name,title,url,body,date_published,language,date_modified,author_list,images,description,sentiment,emotions,entities,quotations,prValues,clipping,label,category
0,5404c9d2fd24852afa122f2cc01cb3acba3c5d05b682d4...,kabargayo.com,Kredit sepeda motor bisa terbayar jika Anda me...,https://www.kabargayo.com/2024/09/19/kredit-se...,"Jakarta, VIVA – Pembayaran kredit sepeda motor...",19/09/2024 22.32,id,19/09/2024 22.32,Aldi Hadad,https://i1.wp.com/thumb.viva.co.id/media/front...,,positive,,"Hari Pembayaran Berbayar atau Harcilnas 2024,P...",,5250000,Adira,,
1,e1e3f8d68b58568e8217b7562d48de634fceb0d8371356...,viva.co.id,Kredit Motor Bisa Lunas Jika Bayar Cicilan Tep...,https://www.viva.co.id/otomotif/tips/1753596-k...,"Jakarta, VIVA – Cicilan kredit motor yang seri...",19/09/2024 22.30,id,19/09/2024 22.30,Krisna Wicaksono,https://thumb.viva.co.id/media/frontend/thumbs...,,positive,,"Harinya Cicilan Lunas,2024,PT Adira Dinamika M...",,5250000,Adira,,
2,dca74b8fa4eabf60cebfa7b811ecb385872a0fd301eaf5...,kabarmegapolitan.pikiran-rakyat.com,Adira Finance Umumkan Pemenang HARCILNAS 2024:...,https://kabarmegapolitan.pikiran-rakyat.com/bi...,KABARMEGAPOLITAN.com - PT Adira Dinamika Multi...,19/09/2024 21.45,id,19/09/2024 21.45,Yuliansyah,https://assets.pikiran-rakyat.com/www/network/...,,positive,HARCILNAS merupakan wujud apresiasi kami kepad...,"PT Adira Dinamika,Cicilan Lunas HARCILNAS,12 p...","(Person :Tania Endah Budhi ,Quote : HARCILNAS ...",5250000,Adira,,
3,56c73e2a6d254a17a5cc21dee7ed0b4660c3af70c093b2...,banggairaya.id,"Dapatkan Promo Menarik, Yamaha Prima Motor Ram...",https://banggairaya.id/dapatkan-promo-menarik-...,BANGGAI RAYA- Yamaha Prima Motor ramaikan pame...,19/09/2024 19.45,id,19/09/2024 19.45,Chikal Connect,https://i0.wp.com/banggairaya.id/wp-content/up...,,neutral,,"RAYA- Yamaha Prima Motor,Banggai Goverment Exp...",,5250000,Adira,,
4,1cd6c6db60224b6ee5f49cd5d6c62cd2850d9f3255721d...,jakarta.tribunnews.com,"Sindikat Penipuan Leasing, Satu Bulan Ajukan K...",https://jakarta.tribunnews.com/2024/09/19/sind...,Laporan wartawan TribunJakarta.com Yusuf Bacht...,19/09/2024 18.43,id,19/09/2024 18.43,Yusuf Bachtiar,https://asset-2.tstatic.net/jakarta/foto/bank/...,,neutral,Pelaku ini melakukan pembiayaan pembelian kend...,"Yusuf Bachtiar TRIBUNJAKARTACOM,MEDAN,SATRIA,T...","(Person :Dedi ,Quote : Pelaku ini melakukan pe...",5250000,Adira,,


In [6]:
# Balance dataset classes
class_counts = df_filtered['sentiment'].value_counts()
min_class = class_counts.min()
df_balanced = df_filtered.groupby('sentiment').apply(lambda x: x.sample(min_class)).reset_index(drop=True)

In [7]:
# Ensure balanced classes during split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_filtered['body'].tolist(), df_filtered['sentiment'].tolist(),
    test_size=0.2, random_state=42, stratify=df_filtered['sentiment']
)

In [8]:
# Load tokenizer and tokenize texts
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


In [9]:
# Map sentiment labels to integers
label_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
train_labels = [label_mapping[label] for label in train_labels]
val_labels = [label_mapping[label] for label in val_labels]

In [10]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [11]:
# Create datasets
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

In [12]:
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [13]:
# Load Longformer model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Adjust model's loss function to include class weights
model.config.problem_type = "single_label_classification"
model.config.class_weights = class_weights.tolist()

In [15]:
from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    return {'eval_accuracy': acc}


In [16]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  # More frequent evaluation
    eval_steps=200,  # Evaluate more frequently for better monitoring
    save_strategy="steps",
    save_steps=200,
    per_device_train_batch_size=32,  # Larger batch size for stability
    per_device_eval_batch_size=64,  # Larger eval batch size
    gradient_accumulation_steps=2,  # Simulate larger batches
    num_train_epochs=6,  # Longer training for better results
    learning_rate=3e-5,  # Slightly reduced LR for stability
    weight_decay=0.01,  # Regularization for better generalization
    fp16=True,  # Mixed precision for faster training
    logging_dir="./logs",
    logging_steps=50,  # Frequent logging for monitoring
    load_best_model_at_end=True,  # Load best model at the end
    metric_for_best_model="eval_accuracy",  # Use accuracy as best model metric
    greater_is_better=True,
    lr_scheduler_type="cosine_with_restarts",  # Smooth learning rate decay
    warmup_steps=200,  # Longer warm-up for stability
    save_total_limit=3,  # Keep more checkpoints for safety
)




In [17]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Include custom metrics
)


In [18]:
import torch

torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

In [19]:
# Train the model
trainer.train()

  0%|          | 0/672 [00:00<?, ?it/s]

{'loss': 1.0582, 'grad_norm': 0.7676056623458862, 'learning_rate': 7.5e-06, 'epoch': 0.44}
{'loss': 1.0164, 'grad_norm': 1.9296014308929443, 'learning_rate': 1.5e-05, 'epoch': 0.89}
{'loss': 0.8712, 'grad_norm': 3.5376548767089844, 'learning_rate': 2.25e-05, 'epoch': 1.33}
{'loss': 0.7597, 'grad_norm': 11.02445125579834, 'learning_rate': 2.985e-05, 'epoch': 1.78}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_accuracy': 0.7057190449750139, 'eval_loss': 0.7058994174003601, 'eval_runtime': 22.1518, 'eval_samples_per_second': 81.303, 'eval_steps_per_second': 1.309, 'epoch': 1.78}
{'loss': 0.6544, 'grad_norm': 6.5381340980529785, 'learning_rate': 2.920929188742639e-05, 'epoch': 2.22}
{'loss': 0.6256, 'grad_norm': 4.113133907318115, 'learning_rate': 2.6859664429194316e-05, 'epoch': 2.67}
{'loss': 0.5674, 'grad_norm': 12.621991157531738, 'learning_rate': 2.3208620902784484e-05, 'epoch': 3.11}
{'loss': 0.4978, 'grad_norm': 8.399321556091309, 'learning_rate': 1.865680727413076e-05, 'epoch': 3.56}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_accuracy': 0.7623542476401999, 'eval_loss': 0.6019715070724487, 'eval_runtime': 21.8733, 'eval_samples_per_second': 82.338, 'eval_steps_per_second': 1.326, 'epoch': 3.56}
{'loss': 0.5004, 'grad_norm': 9.688924789428711, 'learning_rate': 1.370371519673206e-05, 'epoch': 4.0}
{'loss': 0.3993, 'grad_norm': 8.474952697753906, 'learning_rate': 8.892870472281623e-06, 'epoch': 4.44}
{'loss': 0.4022, 'grad_norm': 7.239161014556885, 'learning_rate': 4.7521894396680756e-06, 'epoch': 4.89}
{'loss': 0.3585, 'grad_norm': 4.980536937713623, 'learning_rate': 1.7360482628389473e-06, 'epoch': 5.33}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_accuracy': 0.7806774014436424, 'eval_loss': 0.5940728187561035, 'eval_runtime': 21.862, 'eval_samples_per_second': 82.381, 'eval_steps_per_second': 1.327, 'epoch': 5.33}
{'loss': 0.3411, 'grad_norm': 6.1303558349609375, 'learning_rate': 1.7542212366829236e-07, 'epoch': 5.78}
{'train_runtime': 1850.5512, 'train_samples_per_second': 23.344, 'train_steps_per_second': 0.363, 'train_loss': 0.6109332562912078, 'epoch': 5.97}


TrainOutput(global_step=672, training_loss=0.6109332562912078, metrics={'train_runtime': 1850.5512, 'train_samples_per_second': 23.344, 'train_steps_per_second': 0.363, 'total_flos': 5697259482120192.0, 'train_loss': 0.6109332562912078, 'epoch': 5.973333333333334})

In [20]:
print(f"Number of training examples: {len(train_dataset)}")
print(f"Batch size: {training_args.per_device_train_batch_size}")


Number of training examples: 7200
Batch size: 32


In [21]:
# Save the trained model and tokenizer
model.save_pretrained('./distilbert3')
tokenizer.save_pretrained('./distilbert3')

('./distilbert3\\tokenizer_config.json',
 './distilbert3\\special_tokens_map.json',
 './distilbert3\\vocab.txt',
 './distilbert3\\added_tokens.json')

In [22]:
# Evaluate the model
results = trainer.evaluate()

  0%|          | 0/29 [00:00<?, ?it/s]

In [23]:
print(results)

{'eval_accuracy': 0.7806774014436424, 'eval_loss': 0.5940728187561035, 'eval_runtime': 21.8487, 'eval_samples_per_second': 82.431, 'eval_steps_per_second': 1.327, 'epoch': 5.973333333333334}


In [24]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# After training, get predictions on the validation dataset
predictions = trainer.predict(val_dataset)

# Get the predicted logits
logits = predictions.predictions

# Convert logits to predicted class labels
predicted_labels = np.argmax(logits, axis=1)

# Get the true labels
true_labels = val_labels

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted')
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')

# Display metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


# Detailed classification report
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))


  0%|          | 0/29 [00:00<?, ?it/s]

Accuracy: 0.7807
F1 Score: 0.7796
Precision: 0.7820
Recall: 0.7807

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.65      0.70       338
           1       0.82      0.79      0.80       836
           2       0.75      0.84      0.79       627

    accuracy                           0.78      1801
   macro avg       0.77      0.76      0.76      1801
weighted avg       0.78      0.78      0.78      1801



In [25]:
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [26]:
# Load the saved model and tokenizer
model = LongformerForSequenceClassification.from_pretrained('./longformer_model6').to(device)
tokenizer = LongformerTokenizer.from_pretrained('./longformer_model6')

NameError: name 'LongformerForSequenceClassification' is not defined