In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
import torch_directml

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory of the saved checkpoint; passed to from_pretrained() for both the
# tokenizer and the model.
MODEL_PATH = 'model/80/model_penilaian'
# Device handle returned by torch_directml; the model and the batch tensors are
# moved onto it with .to(...).
DEVICE = torch_directml.device()
# Token length every text is padded/truncated to (forwarded to CustomDataset).
MAX_LEN = 512
# Number of samples per DataLoader batch.
BATCH_SIZE = 3
# Training hyperparameters. NOTE(review): not referenced by the cells visible in
# this notebook section, which only evaluates - confirm whether they are still needed.
EPOCHS = 10
LEARNING_RATE = 1e-5

In [3]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per index.

    Every sample bundles the tokenizer output with the sample's label(s) and
    its document name, so predictions can be matched back to documents later.

    Args:
        texts: Indexable collection of raw texts; each entry goes through ``str()``.
        labels: Indexable collection of labels aligned with ``texts``; each entry
            is converted to a ``torch.long`` tensor.
        nama_dokumen: Indexable collection of document names aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, nama_dokumen, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts
        self.labels = labels
        self.nama_dokumen = nama_dokumen

    def __len__(self):
        """Number of samples, i.e. the number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the sample at position ``item`` as a dict.

        Keys: ``text`` (str), ``input_ids`` and ``attention_mask`` (the
        tokenizer tensors collapsed to 1-D), ``labels`` (``torch.long`` tensor)
        and ``nama_dokumen`` (passed through unchanged).
        """
        raw_text = str(self.texts[item])
        target = self.labels[item]
        doc_name = self.nama_dokumen[item]

        # Fixed-length encoding: always pad to max_len and cut anything longer.
        encode_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(raw_text, **encode_options)

        # The tokenizer returns batched tensors; collapse them to one dimension
        # so the DataLoader can stack samples into a batch itself.
        sample = {
            'text': raw_text,
            'input_ids': encoded['input_ids'].reshape(-1),
            'attention_mask': encoded['attention_mask'].reshape(-1),
            'labels': torch.tensor(target, dtype=torch.long),
            'nama_dokumen': doc_name,
        }
        return sample

# Load the CSV and turn its categorical score columns into numeric labels
def load_data(file_path):
    """Read the scoring CSV and encode its six score columns.

    Args:
        file_path: Path of a CSV containing the columns ``summary``,
            ``nama_dokumen`` and the six ``nilai_*`` columns listed below.

    Returns:
        A 5-tuple ``(texts, labels, label_encoders, label_columns, nama_dokumen)``:
        ``texts`` and ``nama_dokumen`` are plain lists taken from the CSV,
        ``labels`` is a 2-D array with one encoded column per score column,
        ``label_encoders`` maps each score column name to the ``LabelEncoder``
        fitted on it, and ``label_columns`` is the ordered list of score
        column names.
    """
    frame = pd.read_csv(file_path)
    texts = frame['summary'].tolist()
    nama_dokumen = frame['nama_dokumen'].tolist()

    label_columns = [
        'nilai_isi_disertasi',
        'nilai_penguasaan_materi_dan_metode_penelitian',
        'nilai_kontribusi_hasil_penelitian_bagi_ilmu_pengetahuan',
        'nilai_kontribusi_hasil_penelitian_bagi_masyarakat',
        'nilai_wawasan_pengetahuan_konsep_ilmu_komputer',
        'nilai_kemampuan_untuk_menjawab_pertanyaan'
    ]

    # One encoder per column: each column gets its own category-to-number
    # mapping, kept so predictions can be decoded back later.
    label_encoders = {}
    encoded_columns = []
    for column in label_columns:
        encoder = LabelEncoder()
        encoded_columns.append(encoder.fit_transform(frame[column]))
        label_encoders[column] = encoder

    # Place the encoded columns side by side: one row per document.
    labels = np.column_stack(encoded_columns)

    return texts, labels, label_encoders, label_columns, nama_dokumen

# Train the model
def train_model(model, train_data_loader, val_data_loader, epochs, optimizer, device):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    The model output is treated as one block of logits per label column: the
    logits are split into ``labels.shape[1]`` equally wide slices and a
    cross-entropy loss is computed per slice; the per-column losses are summed.
    The slice width is derived from the tensor shapes (18 logits and 6 label
    columns give the width of 3 that used to be hard-coded).

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` returning
            an object with a ``logits`` attribute; must support ``train()``.
        train_data_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors) that supports ``len()``.
        val_data_loader: Loader handed to ``evaluate_model`` after every epoch.
        epochs: Number of passes over ``train_data_loader``.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        device: Device the batch tensors are moved to.

    Returns:
        None. The average training loss and the validation loss are printed
        once per epoch.

    Raises:
        ValueError: If the number of logits is not a multiple of the number of
            label columns (or there are no label columns).
    """
    criterion = torch.nn.CrossEntropyLoss()
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_data_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits

            # Derive the number of classes per label column from the shapes
            # instead of assuming 3.
            num_targets = labels.shape[1]
            if num_targets == 0 or logits.shape[1] % num_targets != 0:
                raise ValueError(
                    f"Cannot split {logits.shape[1]} logits evenly across "
                    f"{num_targets} label columns"
                )
            num_classes = logits.shape[1] // num_targets

            loss = sum(
                criterion(logits[:, i * num_classes:(i + 1) * num_classes], labels[:, i])
                for i in range(num_targets)
            )
            total_loss += loss.item()

            # Clear gradients *before* backward so anything left over from
            # earlier work on this model (e.g. a previous notebook cell) cannot
            # leak into this update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_data_loader)
        print(f"Average train loss: {avg_train_loss}")

        val_loss = evaluate_model(model, val_data_loader, device)
        print(f"Validation loss: {val_loss}")

# Evaluate the model
def evaluate_model(model, data_loader, device):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    The model output is treated as one block of logits per label column: the
    logits are split into ``labels.shape[1]`` equally wide slices, a
    cross-entropy loss is computed per slice and the per-column losses are
    summed. The slice width is derived from the tensor shapes (18 logits and
    6 label columns give the width of 3 that used to be hard-coded).

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` returning
            an object with a ``logits`` attribute; must support ``eval()``.
        data_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors) that supports ``len()``.
        device: Device the batch tensors are moved to.

    Returns:
        float: Sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: If the number of logits is not a multiple of the number of
            label columns (or there are no label columns).
        ZeroDivisionError: If ``data_loader`` contains no batches.
    """
    model.eval()
    criterion = torch.nn.CrossEntropyLoss()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits

            # Derive the number of classes per label column from the shapes
            # instead of assuming 3.
            num_targets = labels.shape[1]
            if num_targets == 0 or logits.shape[1] % num_targets != 0:
                raise ValueError(
                    f"Cannot split {logits.shape[1]} logits evenly across "
                    f"{num_targets} label columns"
                )
            num_classes = logits.shape[1] // num_targets

            loss = sum(
                criterion(logits[:, i * num_classes:(i + 1) * num_classes], labels[:, i])
                for i in range(num_targets)
            )
            total_loss += loss.item()

    return total_loss / len(data_loader)

# Run inference and collect predictions, true labels and document names
def predict(model, data_loader, device):
    """Predict a class for every label column of every sample.

    The logits of each sample are reshaped to ``(label columns, classes)`` and
    the arg-max class is taken per label column. The grouping is derived from
    the batch shapes (18 logits and 6 label columns give the ``6 x 3`` layout
    that used to be hard-coded).

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` returning
            an object with a ``logits`` attribute; must support ``eval()``.
        data_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask``, ``labels`` and ``nama_dokumen``).
        device: Device the input tensors are moved to.

    Returns:
        A 3-tuple ``(predictions, actual_labels, nama_dokumen_list)``:
        two arrays with one row per sample and one column per label column,
        plus the document names in the same row order.

    Raises:
        ValueError: If the number of logits is not a multiple of the number of
            label columns (or there are no label columns).
    """
    model.eval()
    predictions = []
    actual_labels = []
    nama_dokumen_list = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Labels are only collected, never fed to the model, so they are
            # not moved to the device.
            labels = batch['labels']
            nama_dokumen = batch['nama_dokumen']

            logits = model(input_ids, attention_mask=attention_mask).logits

            num_targets = labels.shape[1]
            if num_targets == 0 or logits.shape[1] % num_targets != 0:
                raise ValueError(
                    f"Cannot split {logits.shape[1]} logits evenly across "
                    f"{num_targets} label columns"
                )
            num_classes = logits.shape[1] // num_targets

            grouped = logits.reshape(-1, num_targets, num_classes)
            preds = torch.argmax(grouped, dim=2).cpu().numpy()
            predictions.extend(preds)
            actual_labels.extend(labels.numpy())
            nama_dokumen_list.extend(nama_dokumen)

    return np.array(predictions), np.array(actual_labels), nama_dokumen_list

def save_model(model, tokenizer, output_dir):
    """Save ``model`` and ``tokenizer`` into ``output_dir``.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        output_dir: Target directory; created (including parents) if missing.

    Returns:
        None. A confirmation message is printed once both saves succeed.
    """
    # exist_ok=True creates the directory only when needed, without the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Model first, then the tokenizer, into the same directory.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"Model dan tokenizer telah disimpan di {output_dir}")

In [4]:
# Restore the tokenizer and the classifier from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# The classification head has 18 outputs; the rest of this notebook reads them
# as 6 score columns with 3 classes each.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=18,
)
model = model.to(DEVICE)

In [None]:
# CSV holding the summaries and their scores.
# NOTE(review): the original cell loaded this same file as both the "train" and
# the "test" set - confirm that evaluating on this file is intended.
DATA_CSV_PATH = 'data/penilaian-data/80/final-data-penilaian.csv'

# Parse and label-encode the file once instead of twice; both loads read the
# same path, so one call yields the same texts, labels and encoders.
test_texts, test_labels, label_encoders, label_columns, test_nama_dokumen = load_data(DATA_CSV_PATH)

# Keep the train_* names bound (as independent copies) in case other cells
# still refer to them.
train_texts = list(test_texts)
train_labels = test_labels.copy()
train_nama_dokumen = list(test_nama_dokumen)

test_dataset = CustomDataset(test_texts, test_labels, test_nama_dokumen, tokenizer, MAX_LEN)
# No shuffling: predictions must stay in the same order as the CSV rows.
test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [6]:
# Destination of the decoded predictions.
OUTPUT_CSV_PATH = 'data/output-bert-penilaian/pred-80.csv'

# Evaluate on test set
test_loss = evaluate_model(model, test_data_loader, DEVICE)
print(f"Test loss: {test_loss}")

# Make predictions on test set
predictions, actual_labels, nama_dokumen_list = predict(model, test_data_loader, DEVICE)

# Calculate metrics. flatten() pools every score column into one long vector,
# so the scores below are computed over all (document, column) pairs at once.
flat_actual = actual_labels.flatten()
flat_predicted = predictions.flatten()
f1 = f1_score(flat_actual, flat_predicted, average='weighted')
accuracy = accuracy_score(flat_actual, flat_predicted)

print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

# Convert predictions back to original labels, one encoder per score column
original_predictions = {
    col: label_encoders[col].inverse_transform(predictions[:, i])
    for i, col in enumerate(label_columns)
}

# Create DataFrame with original labels, document name first
results_df = pd.DataFrame(original_predictions)
results_df['nama_dokumen'] = nama_dokumen_list
results_df = results_df[['nama_dokumen'] + list(label_columns)]

# Make sure the output folder exists so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(OUTPUT_CSV_PATH), exist_ok=True)
results_df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"Predictions saved to '{OUTPUT_CSV_PATH}'")


Test loss: 3.8355111360549925
F1 Score: 0.8263962673191882
Accuracy: 0.8277777777777777
Predictions saved to 'data/output-bert-penilaian/pred-80.csv'
