In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight

In [39]:
# Load the dataset
df = pd.read_csv("combined_output.csv", on_bad_lines='skip')

In [40]:
df.head()

Unnamed: 0,original_id,source_name,title,url,body,date_published,language,date_modified,author_list,images,description,sentiment,emotions,entities,quotations,prValues,clipping,label,category
0,'5404c9d2fd24852afa122f2cc01cb3acba3c5d05b682d...,'kabargayo.com,'Kredit sepeda motor bisa terbayar jika Anda m...,'https://www.kabargayo.com/2024/09/19/kredit-s...,"'Jakarta, VIVA – Pembayaran kredit sepeda moto...",19/09/2024 22.32,'id,19/09/2024 22.32,'Aldi Hadad,'https://i1.wp.com/thumb.viva.co.id/media/fron...,,'positive,,"'Hari Pembayaran Berbayar atau Harcilnas 2024,...",,5250000,Adira,,
1,'e1e3f8d68b58568e8217b7562d48de634fceb0d837135...,'viva.co.id,'Kredit Motor Bisa Lunas Jika Bayar Cicilan Te...,'https://www.viva.co.id/otomotif/tips/1753596-...,"'Jakarta, VIVA – Cicilan kredit motor yang ser...",19/09/2024 22.30,'id,19/09/2024 22.30,'Krisna Wicaksono,'https://thumb.viva.co.id/media/frontend/thumb...,,'positive,,"'Harinya Cicilan Lunas,2024,PT Adira Dinamika ...",,5250000,Adira,,
2,'dca74b8fa4eabf60cebfa7b811ecb385872a0fd301eaf...,'kabarmegapolitan.pikiran-rakyat.com,'Adira Finance Umumkan Pemenang HARCILNAS 2024...,'https://kabarmegapolitan.pikiran-rakyat.com/b...,'KABARMEGAPOLITAN.com - PT Adira Dinamika Mult...,19/09/2024 21.45,'id,19/09/2024 21.45,'Yuliansyah,'https://assets.pikiran-rakyat.com/www/network...,,'positive,'HARCILNAS merupakan wujud apresiasi kami kepa...,"'PT Adira Dinamika,Cicilan Lunas HARCILNAS,12 ...","(Person :Tania Endah Budhi ,Quote : HARCILNAS ...",5250000,Adira,,
3,'56c73e2a6d254a17a5cc21dee7ed0b4660c3af70c093b...,'banggairaya.id,"'Dapatkan Promo Menarik, Yamaha Prima Motor Ra...",'https://banggairaya.id/dapatkan-promo-menarik...,'BANGGAI RAYA- Yamaha Prima Motor ramaikan pam...,19/09/2024 19.45,'id,19/09/2024 19.45,'Chikal Connect,'https://i0.wp.com/banggairaya.id/wp-content/u...,,'neutral,,"'RAYA- Yamaha Prima Motor,Banggai Goverment Ex...",,5250000,Adira,,
4,'1cd6c6db60224b6ee5f49cd5d6c62cd2850d9f3255721...,'jakarta.tribunnews.com,"'Sindikat Penipuan Leasing, Satu Bulan Ajukan ...",'https://jakarta.tribunnews.com/2024/09/19/sin...,'Laporan wartawan TribunJakarta.com Yusuf Bach...,19/09/2024 18.43,'id,19/09/2024 18.43,'Yusuf Bachtiar,'https://asset-2.tstatic.net/jakarta/foto/bank...,,'neutral,'Pelaku ini melakukan pembiayaan pembelian ken...,"'Yusuf Bachtiar TRIBUNJAKARTACOM,MEDAN,SATRIA,...","(Person :Dedi ,Quote : Pelaku ini melakukan pe...",5250000,Adira,,


In [41]:
# Clean extra characters from all columns in the DataFrame
for column in df.columns:
    if df[column].dtype == 'object':  # Check if the column is of string type
        df[column] = df[column].str.strip("'")  # Remove extra characters

# Filter relevant columns
df_filtered = df[['body', 'sentiment']].dropna()


In [42]:
df.head()

Unnamed: 0,original_id,source_name,title,url,body,date_published,language,date_modified,author_list,images,description,sentiment,emotions,entities,quotations,prValues,clipping,label,category
0,5404c9d2fd24852afa122f2cc01cb3acba3c5d05b682d4...,kabargayo.com,Kredit sepeda motor bisa terbayar jika Anda me...,https://www.kabargayo.com/2024/09/19/kredit-se...,"Jakarta, VIVA – Pembayaran kredit sepeda motor...",19/09/2024 22.32,id,19/09/2024 22.32,Aldi Hadad,https://i1.wp.com/thumb.viva.co.id/media/front...,,positive,,"Hari Pembayaran Berbayar atau Harcilnas 2024,P...",,5250000,Adira,,
1,e1e3f8d68b58568e8217b7562d48de634fceb0d8371356...,viva.co.id,Kredit Motor Bisa Lunas Jika Bayar Cicilan Tep...,https://www.viva.co.id/otomotif/tips/1753596-k...,"Jakarta, VIVA – Cicilan kredit motor yang seri...",19/09/2024 22.30,id,19/09/2024 22.30,Krisna Wicaksono,https://thumb.viva.co.id/media/frontend/thumbs...,,positive,,"Harinya Cicilan Lunas,2024,PT Adira Dinamika M...",,5250000,Adira,,
2,dca74b8fa4eabf60cebfa7b811ecb385872a0fd301eaf5...,kabarmegapolitan.pikiran-rakyat.com,Adira Finance Umumkan Pemenang HARCILNAS 2024:...,https://kabarmegapolitan.pikiran-rakyat.com/bi...,KABARMEGAPOLITAN.com - PT Adira Dinamika Multi...,19/09/2024 21.45,id,19/09/2024 21.45,Yuliansyah,https://assets.pikiran-rakyat.com/www/network/...,,positive,HARCILNAS merupakan wujud apresiasi kami kepad...,"PT Adira Dinamika,Cicilan Lunas HARCILNAS,12 p...","(Person :Tania Endah Budhi ,Quote : HARCILNAS ...",5250000,Adira,,
3,56c73e2a6d254a17a5cc21dee7ed0b4660c3af70c093b2...,banggairaya.id,"Dapatkan Promo Menarik, Yamaha Prima Motor Ram...",https://banggairaya.id/dapatkan-promo-menarik-...,BANGGAI RAYA- Yamaha Prima Motor ramaikan pame...,19/09/2024 19.45,id,19/09/2024 19.45,Chikal Connect,https://i0.wp.com/banggairaya.id/wp-content/up...,,neutral,,"RAYA- Yamaha Prima Motor,Banggai Goverment Exp...",,5250000,Adira,,
4,1cd6c6db60224b6ee5f49cd5d6c62cd2850d9f3255721d...,jakarta.tribunnews.com,"Sindikat Penipuan Leasing, Satu Bulan Ajukan K...",https://jakarta.tribunnews.com/2024/09/19/sind...,Laporan wartawan TribunJakarta.com Yusuf Bacht...,19/09/2024 18.43,id,19/09/2024 18.43,Yusuf Bachtiar,https://asset-2.tstatic.net/jakarta/foto/bank/...,,neutral,Pelaku ini melakukan pembiayaan pembelian kend...,"Yusuf Bachtiar TRIBUNJAKARTACOM,MEDAN,SATRIA,T...","(Person :Dedi ,Quote : Pelaku ini melakukan pe...",5250000,Adira,,


In [43]:
# Balance dataset classes
class_counts = df_filtered['sentiment'].value_counts()
min_class = class_counts.min()
df_balanced = df_filtered.groupby('sentiment').apply(lambda x: x.sample(min_class)).reset_index(drop=True)

In [44]:
# Ensure balanced classes during split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_filtered['body'].tolist(), df_filtered['sentiment'].tolist(),
    test_size=0.2, random_state=42, stratify=df_filtered['sentiment']
)

In [45]:

# Load tokenizer and tokenize texts
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


In [46]:
# Map sentiment labels to integers
label_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
train_labels = [label_mapping[label] for label in train_labels]
val_labels = [label_mapping[label] for label in val_labels]

In [47]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [48]:
# Create datasets
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

In [49]:
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [50]:
# Load XLNet model for sequence classification
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=3)

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Adjust model's loss function to include class weights
model.config.problem_type = "single_label_classification"
model.config.class_weights = class_weights.tolist()

In [52]:
from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    return {'eval_accuracy': acc}


In [53]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    fp16=True,
)



In [54]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Include custom metrics
)


OutOfMemoryError: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 14.23 GiB is allocated by PyTorch, and 26.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [18]:
import torch

torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

In [None]:
# Train the model
trainer.train()

  0%|          | 0/1800 [00:00<?, ?it/s]

Initializing global attention on CLS token...


OutOfMemoryError: CUDA out of memory. Tried to allocate 290.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 13.89 GiB is allocated by PyTorch, and 240.49 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [23]:
print(f"Number of training examples: {len(train_dataset)}")
print(f"Batch size: {training_args.per_device_train_batch_size}")


Number of training examples: 7200
Batch size: 16


In [24]:
# Save the trained model and tokenizer
model.save_pretrained('./xlnet_model2')
tokenizer.save_pretrained('./xlnet_model2')

('./longformer_model7\\tokenizer_config.json',
 './longformer_model7\\special_tokens_map.json',
 './longformer_model7\\vocab.json',
 './longformer_model7\\merges.txt',
 './longformer_model7\\added_tokens.json')

In [25]:
# Evaluate the model
results = trainer.evaluate()

  0%|          | 0/57 [00:00<?, ?it/s]

In [26]:
print(results)

{'eval_accuracy': 0.7784564131038312, 'eval_loss': 0.583918571472168, 'eval_runtime': 200.6016, 'eval_samples_per_second': 8.978, 'eval_steps_per_second': 0.284, 'epoch': 4.0}


In [27]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# After training, get predictions on the validation dataset
predictions = trainer.predict(val_dataset)

# Get the predicted logits
logits = predictions.predictions

# Convert logits to predicted class labels
predicted_labels = np.argmax(logits, axis=1)

# Get the true labels
true_labels = val_labels

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted')
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')

# Display metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))


  0%|          | 0/57 [00:00<?, ?it/s]

Accuracy: 0.7785
F1 Score: 0.7788
Precision: 0.7794
Recall: 0.7785

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.72      0.70       338
           1       0.80      0.79      0.80       836
           2       0.80      0.80      0.80       627

    accuracy                           0.78      1801
   macro avg       0.76      0.77      0.76      1801
weighted avg       0.78      0.78      0.78      1801



In [28]:
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda
