<a href="https://colab.research.google.com/github/lulumulum86/Transformer_Model/blob/main/Tugas4_Transformer_Matkul_Deep_Learning_Lanjut_Pertemuan_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---
# **Tugas 4 : Transformer Matkul Deep Learning Lanjut Pertemuan 10**
---
# **Nama  : Lu'luah Nafisah Ulum**  
# **NIM     : 41236607**  
# **Kelas : TI-2023-KIP-C1**  
---

# **1. PERSIAPAN LINGKUNGAN (ENVIRONMENT SETUP)**

In [1]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from google.colab import drive
import os

# Cek ketersediaan GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Status Perangkat: {device}")
if device.type == 'cuda':
    print(f"GPU Terdeteksi: {torch.cuda.get_device_name(0)}")
else:
    print("PERINGATAN: GPU tidak terdeteksi. Harap ubah Runtime Type ke T4 GPU.")



Status Perangkat: cuda
GPU Terdeteksi: Tesla T4


# **2. INTEGRASI GOOGLE DRIVE & LOAD DATA**

In [2]:
# Mounting Google Drive untuk mengakses dataset sesuai path yang diberikan.
drive.mount('/content/drive')

# Definisi Path File
path_training = '/content/drive/MyDrive/Colab Notebooks/Model_Transformer_DLL/Training.csv'
path_testing  = '/content/drive/MyDrive/Colab Notebooks/Model_Transformer_DLL/Testing.csv'

# Membaca Data Training
try:
    df_train = pd.read_csv(path_training)
    print("\n[INFO] Data Training Berhasil Dimuat:")
    print(df_train.head())
    print(f"Total Data Training: {len(df_train)}")
    print(f"Distribusi Label:\n{df_train['Label'].value_counts()}")
except FileNotFoundError:
    print(f"[ERROR] File tidak ditemukan di: {path_training}")

# Membaca Data Testing
try:
    df_test = pd.read_csv(path_testing)
    print("\n[INFO] Data Testing Berhasil Dimuat:")
    print(df_test.head())
    print(f"Total Data Testing: {len(df_test)}")
except FileNotFoundError:
    print(f"[ERROR] File tidak ditemukan di: {path_testing}")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

[INFO] Data Training Berhasil Dimuat:
                                                Text     Label
0            It works as described, no major issues.   Neutral
1               The experience was average, not bad.   Neutral
2            Very poor quality, not worth the money.  Negative
3  The item arrived on time and in perfect condit...  Positive
4              The product is okay, nothing special.   Neutral
Total Data Training: 300
Distribusi Label:
Label
Neutral     100
Negative    100
Positive    100
Name: count, dtype: int64

[INFO] Data Testing Berhasil Dimuat:
                                                Text
0      Excellent customer service and great product.
1      Excellent customer service and great product.
2  The product exceeded my expectations and works...
3                 Item arrived late and was damaged.
4                 Item arriv

# **3. DATA PREPROCESSING**

In [3]:
# Transformasi label teks menjadi format numerik agar bisa diproses oleh model.
# Mapping Label: Negative -> 0, Neutral -> 1, Positive -> 2
label_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
inverse_label_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}

# Terapkan mapping pada data training
# Kita asumsikan kolom bernama 'Label' dan 'Text' sesuai sample
df_train['label_code'] = df_train['Label'].map(label_map)

# Pengecekan data kosong
if df_train['label_code'].isnull().any():
    print("[WARNING] Ada label yang tidak dikenali atau kosong. Dropping NaN...")
    df_train = df_train.dropna(subset=['label_code'])

# Ubah tipe data label menjadi integer
df_train['label_code'] = df_train['label_code'].astype(int)

# Pisahkan teks dan label untuk training
train_texts = df_train['Text'].tolist()
train_labels = df_train['label_code'].tolist()
test_texts_source = df_test['Text'].tolist() # Data uji asli (tanpa label)


# **4. TOKENISASI (TOKENIZATION)**

In [5]:
# Menggunakan DistilBERT tokenizer karena ringan dan cepat untuk T4 GPU
# Model: distilbert-base-uncased
MODEL_NAME = "distilbert-base-uncased"
print(f"\n[INFO] Memuat Tokenizer: {MODEL_NAME}...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Fungsi untuk encoding data
def preprocess_function(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Tokenisasi data training
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
# Tokenisasi data testing (untuk prediksi nanti)
test_encodings = tokenizer(test_texts_source, truncation=True, padding=True, max_length=128)

# Membuat Class Dataset PyTorch Custom
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Membuat objek dataset
train_dataset = SentimentDataset(train_encodings, train_labels)
# Untuk testing dataset, kita tidak punya label
predict_dataset = SentimentDataset(test_encodings)




[INFO] Memuat Tokenizer: distilbert-base-uncased...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# **5. MEMBANGUN MODEL TRANSFORMER**

In [6]:
# Menggunakan AutoModelForSequenceClassification
# num_labels=3 karena ada (Negative, Neutral, Positive)

print(f"\n[INFO] Menginisialisasi Model {MODEL_NAME} untuk Klasifikasi...")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
model.to(device) # Pindahkan model ke GPU




[INFO] Menginisialisasi Model distilbert-base-uncased untuk Klasifikasi...


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# **6. TRAINING (FINE-TUNING)**

In [7]:
training_args = TrainingArguments(
    output_dir='./results',          # folder output
    num_train_epochs=4,              # jumlah epoch (data sedikit, 4 epoch cukup)
    per_device_train_batch_size=16,  # batch size
    per_device_eval_batch_size=16,
    warmup_steps=50,                 # warmup learning rate
    weight_decay=0.01,               # regularisasi
    logging_dir='./logs',            # log directory
    logging_steps=10,
    learning_rate=2e-5,              # learning rate standar untuk BERT
    save_strategy="epoch"
)

# Inisialisasi Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

print("\n[INFO] Memulai Proses Training...")
trainer.train()
print("[INFO] Training Selesai.")

# Simpan model yang sudah dilatih
model_save_path = "/content/drive/MyDrive/Colab Notebooks/Model_Transformer_DLL/saved_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"[INFO] Model disimpan di: {model_save_path}")




[INFO] Memulai Proses Training...


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
10,1.0983
20,1.0742
30,1.0043
40,0.8209
50,0.5066
60,0.2301
70,0.1177


[INFO] Training Selesai.
[INFO] Model disimpan di: /content/drive/MyDrive/Colab Notebooks/Model_Transformer_DLL/saved_model


# **7. PREDIKSI PADA DATA TESTING (UNLABELED)**

In [8]:
# Menggunakan model yang sudah dilatih untuk melabeli 400 data testing.
print("\n[INFO] Melakukan Prediksi pada Data Testing...")

# Prediksi menggunakan Trainer
predictions = trainer.predict(predict_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

# Konversi kode label kembali ke teks (0->Negative, dst)
predicted_labels = [inverse_label_map[p] for p in preds]

# Masukkan hasil prediksi ke DataFrame
df_test['Predicted_Label'] = predicted_labels




[INFO] Melakukan Prediksi pada Data Testing...


# **8. MENAMPILKAN DAN MENYIMPAN HASIL**

In [9]:
print("\n[INFO] Contoh 10 Hasil Prediksi Teratas:")
print(df_test[['Text', 'Predicted_Label']].head(10))

# Simpan hasil ke CSV baru di Drive
output_path = '/content/drive/MyDrive/Colab Notebooks/Model_Transformer_DLL/Testing_Labeled_Result.csv'
df_test.to_csv(output_path, index=False)

print(f"\n[SUCCESS] Hasil pelabelan otomatis telah disimpan di: {output_path}")



[INFO] Contoh 10 Hasil Prediksi Teratas:
                                                Text Predicted_Label
0      Excellent customer service and great product.        Positive
1      Excellent customer service and great product.        Positive
2  The product exceeded my expectations and works...        Positive
3                 Item arrived late and was damaged.        Negative
4                 Item arrived late and was damaged.        Negative
5            Very poor quality, not worth the money.        Negative
6          Amazing quality, totally worth the price.        Positive
7           Customer service was unhelpful and rude.        Negative
8  I am disappointed, it did not meet my expectat...        Negative
9  Delivery was on time, but packaging could be b...         Neutral

[SUCCESS] Hasil pelabelan otomatis telah disimpan di: /content/drive/MyDrive/Colab Notebooks/Model_Transformer_DLL/Testing_Labeled_Result.csv
