In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight




In [2]:
# Clear CUDA cache
torch.cuda.empty_cache()

# Run garbage collection
import gc
gc.collect()

# Verify the GPU is cleared
torch.cuda.empty_cache()

In [3]:
# Load the dataset
df = pd.read_csv("combined_output.csv", on_bad_lines='skip')

In [4]:
df.head()

Unnamed: 0,original_id,source_name,title,url,body,date_published,language,date_modified,author_list,images,description,sentiment,emotions,entities,quotations,prValues,clipping,label,category
0,'5404c9d2fd24852afa122f2cc01cb3acba3c5d05b682d...,'kabargayo.com,'Kredit sepeda motor bisa terbayar jika Anda m...,'https://www.kabargayo.com/2024/09/19/kredit-s...,"'Jakarta, VIVA – Pembayaran kredit sepeda moto...",19/09/2024 22.32,'id,19/09/2024 22.32,'Aldi Hadad,'https://i1.wp.com/thumb.viva.co.id/media/fron...,,'positive,,"'Hari Pembayaran Berbayar atau Harcilnas 2024,...",,5250000,Adira,,
1,'e1e3f8d68b58568e8217b7562d48de634fceb0d837135...,'viva.co.id,'Kredit Motor Bisa Lunas Jika Bayar Cicilan Te...,'https://www.viva.co.id/otomotif/tips/1753596-...,"'Jakarta, VIVA – Cicilan kredit motor yang ser...",19/09/2024 22.30,'id,19/09/2024 22.30,'Krisna Wicaksono,'https://thumb.viva.co.id/media/frontend/thumb...,,'positive,,"'Harinya Cicilan Lunas,2024,PT Adira Dinamika ...",,5250000,Adira,,
2,'dca74b8fa4eabf60cebfa7b811ecb385872a0fd301eaf...,'kabarmegapolitan.pikiran-rakyat.com,'Adira Finance Umumkan Pemenang HARCILNAS 2024...,'https://kabarmegapolitan.pikiran-rakyat.com/b...,'KABARMEGAPOLITAN.com - PT Adira Dinamika Mult...,19/09/2024 21.45,'id,19/09/2024 21.45,'Yuliansyah,'https://assets.pikiran-rakyat.com/www/network...,,'positive,'HARCILNAS merupakan wujud apresiasi kami kepa...,"'PT Adira Dinamika,Cicilan Lunas HARCILNAS,12 ...","(Person :Tania Endah Budhi ,Quote : HARCILNAS ...",5250000,Adira,,
3,'56c73e2a6d254a17a5cc21dee7ed0b4660c3af70c093b...,'banggairaya.id,"'Dapatkan Promo Menarik, Yamaha Prima Motor Ra...",'https://banggairaya.id/dapatkan-promo-menarik...,'BANGGAI RAYA- Yamaha Prima Motor ramaikan pam...,19/09/2024 19.45,'id,19/09/2024 19.45,'Chikal Connect,'https://i0.wp.com/banggairaya.id/wp-content/u...,,'neutral,,"'RAYA- Yamaha Prima Motor,Banggai Goverment Ex...",,5250000,Adira,,
4,'1cd6c6db60224b6ee5f49cd5d6c62cd2850d9f3255721...,'jakarta.tribunnews.com,"'Sindikat Penipuan Leasing, Satu Bulan Ajukan ...",'https://jakarta.tribunnews.com/2024/09/19/sin...,'Laporan wartawan TribunJakarta.com Yusuf Bach...,19/09/2024 18.43,'id,19/09/2024 18.43,'Yusuf Bachtiar,'https://asset-2.tstatic.net/jakarta/foto/bank...,,'neutral,'Pelaku ini melakukan pembiayaan pembelian ken...,"'Yusuf Bachtiar TRIBUNJAKARTACOM,MEDAN,SATRIA,...","(Person :Dedi ,Quote : Pelaku ini melakukan pe...",5250000,Adira,,


In [5]:
# Clean extra characters from all columns in the DataFrame
for column in df.columns:
    if df[column].dtype == 'object':  # Check if the column is of string type
        df[column] = df[column].str.strip("'")  # Remove extra characters

# Filter relevant columns
df_filtered = df[['body', 'sentiment']].dropna()


In [6]:
df.head()

Unnamed: 0,original_id,source_name,title,url,body,date_published,language,date_modified,author_list,images,description,sentiment,emotions,entities,quotations,prValues,clipping,label,category
0,5404c9d2fd24852afa122f2cc01cb3acba3c5d05b682d4...,kabargayo.com,Kredit sepeda motor bisa terbayar jika Anda me...,https://www.kabargayo.com/2024/09/19/kredit-se...,"Jakarta, VIVA – Pembayaran kredit sepeda motor...",19/09/2024 22.32,id,19/09/2024 22.32,Aldi Hadad,https://i1.wp.com/thumb.viva.co.id/media/front...,,positive,,"Hari Pembayaran Berbayar atau Harcilnas 2024,P...",,5250000,Adira,,
1,e1e3f8d68b58568e8217b7562d48de634fceb0d8371356...,viva.co.id,Kredit Motor Bisa Lunas Jika Bayar Cicilan Tep...,https://www.viva.co.id/otomotif/tips/1753596-k...,"Jakarta, VIVA – Cicilan kredit motor yang seri...",19/09/2024 22.30,id,19/09/2024 22.30,Krisna Wicaksono,https://thumb.viva.co.id/media/frontend/thumbs...,,positive,,"Harinya Cicilan Lunas,2024,PT Adira Dinamika M...",,5250000,Adira,,
2,dca74b8fa4eabf60cebfa7b811ecb385872a0fd301eaf5...,kabarmegapolitan.pikiran-rakyat.com,Adira Finance Umumkan Pemenang HARCILNAS 2024:...,https://kabarmegapolitan.pikiran-rakyat.com/bi...,KABARMEGAPOLITAN.com - PT Adira Dinamika Multi...,19/09/2024 21.45,id,19/09/2024 21.45,Yuliansyah,https://assets.pikiran-rakyat.com/www/network/...,,positive,HARCILNAS merupakan wujud apresiasi kami kepad...,"PT Adira Dinamika,Cicilan Lunas HARCILNAS,12 p...","(Person :Tania Endah Budhi ,Quote : HARCILNAS ...",5250000,Adira,,
3,56c73e2a6d254a17a5cc21dee7ed0b4660c3af70c093b2...,banggairaya.id,"Dapatkan Promo Menarik, Yamaha Prima Motor Ram...",https://banggairaya.id/dapatkan-promo-menarik-...,BANGGAI RAYA- Yamaha Prima Motor ramaikan pame...,19/09/2024 19.45,id,19/09/2024 19.45,Chikal Connect,https://i0.wp.com/banggairaya.id/wp-content/up...,,neutral,,"RAYA- Yamaha Prima Motor,Banggai Goverment Exp...",,5250000,Adira,,
4,1cd6c6db60224b6ee5f49cd5d6c62cd2850d9f3255721d...,jakarta.tribunnews.com,"Sindikat Penipuan Leasing, Satu Bulan Ajukan K...",https://jakarta.tribunnews.com/2024/09/19/sind...,Laporan wartawan TribunJakarta.com Yusuf Bacht...,19/09/2024 18.43,id,19/09/2024 18.43,Yusuf Bachtiar,https://asset-2.tstatic.net/jakarta/foto/bank/...,,neutral,Pelaku ini melakukan pembiayaan pembelian kend...,"Yusuf Bachtiar TRIBUNJAKARTACOM,MEDAN,SATRIA,T...","(Person :Dedi ,Quote : Pelaku ini melakukan pe...",5250000,Adira,,


In [7]:
# Balance dataset classes
class_counts = df_filtered['sentiment'].value_counts()
min_class = class_counts.min()
df_balanced = df_filtered.groupby('sentiment').apply(lambda x: x.sample(min_class)).reset_index(drop=True)

In [8]:
# Ensure balanced classes during split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_filtered['body'].tolist(), df_filtered['sentiment'].tolist(),
    test_size=0.2, random_state=42, stratify=df_filtered['sentiment']
)

In [9]:

# Load tokenizer and tokenize texts
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


In [10]:
# Map sentiment labels to integers
label_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
train_labels = [label_mapping[label] for label in train_labels]
val_labels = [label_mapping[label] for label in val_labels]

In [11]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [12]:
# Create datasets
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

In [13]:
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [14]:
# Load XLNet model for sequence classification
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=3)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Adjust model's loss function to include class weights
model.config.problem_type = "single_label_classification"
model.config.class_weights = class_weights.tolist()

In [16]:
from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    return {'eval_accuracy': acc}


In [17]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  # Evaluate more frequently
    eval_steps=200,              # Adjust based on dataset size
    save_strategy="steps",
    save_steps=200,
    per_device_train_batch_size=8,  # Lower batch size for better generalization
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,  # Simulates a larger batch size
    num_train_epochs=6,            # Train for more epochs
    learning_rate=1e-5,            # Lower learning rate for better fine-tuning
    weight_decay=0.01,
    warmup_steps=500,              # Gradual learning rate increase
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    lr_scheduler_type="cosine_with_restarts",  # More effective for fine-tuning
    fp16=True,                   # Enable mixed precision training
)




In [18]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Include custom metrics
)


In [19]:
import torch

torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

In [20]:
# Train the model
trainer.train()

  0%|          | 0/1800 [00:00<?, ?it/s]

{'loss': 1.0472, 'grad_norm': 18.638700485229492, 'learning_rate': 1.8922222222222224e-05, 'epoch': 0.22}
{'loss': 0.8686, 'grad_norm': 19.8570556640625, 'learning_rate': 1.7811111111111112e-05, 'epoch': 0.44}
{'loss': 0.7459, 'grad_norm': 19.30426597595215, 'learning_rate': 1.67e-05, 'epoch': 0.67}
{'loss': 0.7409, 'grad_norm': 13.437138557434082, 'learning_rate': 1.558888888888889e-05, 'epoch': 0.89}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_accuracy': 0.7179344808439756, 'eval_loss': 0.6741003394126892, 'eval_runtime': 360.7419, 'eval_samples_per_second': 4.992, 'eval_steps_per_second': 0.158, 'epoch': 1.0}
{'loss': 0.6517, 'grad_norm': 15.205403327941895, 'learning_rate': 1.447777777777778e-05, 'epoch': 1.11}
{'loss': 0.5937, 'grad_norm': 11.223296165466309, 'learning_rate': 1.3366666666666669e-05, 'epoch': 1.33}
{'loss': 0.5848, 'grad_norm': 17.775564193725586, 'learning_rate': 1.2255555555555556e-05, 'epoch': 1.56}
{'loss': 0.5825, 'grad_norm': 16.878305435180664, 'learning_rate': 1.1144444444444445e-05, 'epoch': 1.78}
{'loss': 0.5773, 'grad_norm': 17.598421096801758, 'learning_rate': 1.0044444444444446e-05, 'epoch': 2.0}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_accuracy': 0.7445863409217102, 'eval_loss': 0.6423030495643616, 'eval_runtime': 361.0947, 'eval_samples_per_second': 4.988, 'eval_steps_per_second': 0.158, 'epoch': 2.0}
{'loss': 0.4558, 'grad_norm': 12.489052772521973, 'learning_rate': 8.933333333333333e-06, 'epoch': 2.22}
{'loss': 0.4652, 'grad_norm': 12.065128326416016, 'learning_rate': 7.822222222222224e-06, 'epoch': 2.44}
{'loss': 0.4915, 'grad_norm': 16.34526824951172, 'learning_rate': 6.711111111111111e-06, 'epoch': 2.67}
{'loss': 0.487, 'grad_norm': 15.237283706665039, 'learning_rate': 5.600000000000001e-06, 'epoch': 2.89}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_accuracy': 0.7684619655746807, 'eval_loss': 0.579537570476532, 'eval_runtime': 359.8464, 'eval_samples_per_second': 5.005, 'eval_steps_per_second': 0.158, 'epoch': 3.0}
{'loss': 0.4446, 'grad_norm': 22.059194564819336, 'learning_rate': 4.488888888888889e-06, 'epoch': 3.11}
{'loss': 0.3964, 'grad_norm': 16.094192504882812, 'learning_rate': 3.377777777777778e-06, 'epoch': 3.33}
{'loss': 0.3887, 'grad_norm': 20.18824005126953, 'learning_rate': 2.266666666666667e-06, 'epoch': 3.56}
{'loss': 0.3723, 'grad_norm': 13.33903694152832, 'learning_rate': 1.1666666666666668e-06, 'epoch': 3.78}
{'loss': 0.3921, 'grad_norm': 34.81520080566406, 'learning_rate': 5.555555555555556e-08, 'epoch': 4.0}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_accuracy': 0.7712382009994447, 'eval_loss': 0.6496511697769165, 'eval_runtime': 362.0603, 'eval_samples_per_second': 4.974, 'eval_steps_per_second': 0.157, 'epoch': 4.0}
{'train_runtime': 16599.3214, 'train_samples_per_second': 1.735, 'train_steps_per_second': 0.108, 'train_loss': 0.5714531347486708, 'epoch': 4.0}


TrainOutput(global_step=1800, training_loss=0.5714531347486708, metrics={'train_runtime': 16599.3214, 'train_samples_per_second': 1.735, 'train_steps_per_second': 0.108, 'total_flos': 8204620087296000.0, 'train_loss': 0.5714531347486708, 'epoch': 4.0})

In [21]:
print(f"Number of training examples: {len(train_dataset)}")
print(f"Batch size: {training_args.per_device_train_batch_size}")


Number of training examples: 7200
Batch size: 16


In [22]:
# Save the trained model and tokenizer
model.save_pretrained('./xlnet_model3')
tokenizer.save_pretrained('./xlnet_model3')

('./xlnet_model2\\tokenizer_config.json',
 './xlnet_model2\\special_tokens_map.json',
 './xlnet_model2\\spiece.model',
 './xlnet_model2\\added_tokens.json')

In [23]:
# Evaluate the model
results = trainer.evaluate()

  0%|          | 0/57 [00:00<?, ?it/s]

In [24]:
print(results)

{'eval_accuracy': 0.7712382009994447, 'eval_loss': 0.6496511697769165, 'eval_runtime': 362.6635, 'eval_samples_per_second': 4.966, 'eval_steps_per_second': 0.157, 'epoch': 4.0}


In [25]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# After training, get predictions on the validation dataset
predictions = trainer.predict(val_dataset)

# Get the predicted logits
logits = predictions.predictions

# Convert logits to predicted class labels
predicted_labels = np.argmax(logits, axis=1)

# Get the true labels
true_labels = val_labels

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted')
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')

# Display metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))


  0%|          | 0/57 [00:00<?, ?it/s]

Accuracy: 0.7712
F1 Score: 0.7697
Precision: 0.7710
Recall: 0.7712

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.63      0.69       338
           1       0.78      0.80      0.79       836
           2       0.77      0.80      0.78       627

    accuracy                           0.77      1801
   macro avg       0.77      0.75      0.76      1801
weighted avg       0.77      0.77      0.77      1801



In [26]:
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda
