# Membuat summary BERT

In [1]:
import torch
import pandas as pd
from transformers import EncoderDecoderModel, BertTokenizer
import torch_directml
import warnings
from tqdm import tqdm

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use torch_directml for GPU acceleration. `device` is the module-level target
# that the functions in this cell move the model and the input tensors to.
device = torch_directml.device()

def load_bert_model(model_path):
    """Load an encoder-decoder model and its BERT tokenizer from one location.

    Args:
        model_path: Path or identifier handed to both ``from_pretrained`` calls.

    Returns:
        tuple: ``(model, tokenizer)``, with the model already moved to the
        module-level ``device``.
    """
    seq2seq = EncoderDecoderModel.from_pretrained(model_path)
    bert_tokenizer = BertTokenizer.from_pretrained(model_path)
    return seq2seq.to(device), bert_tokenizer

@torch.no_grad()
def generate_bert_summary(text, model, tokenizer):
    """Generate a summary of ``text`` with beam search.

    The text is tokenized to a fixed length of 512 tokens (truncated and padded),
    moved to the module-level ``device``, and decoded from the first sequence
    returned by ``model.generate``.

    Args:
        text: Input text to summarize.
        model: Model exposing ``generate`` (see ``load_bert_model``).
        tokenizer: Tokenizer used both to encode ``text`` and to decode the result.

    Returns:
        str: The decoded summary with special tokens removed.
    """
    encoded = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    summary_ids = model.generate(
        encoded["input_ids"],
        # The input is padded to 512 tokens, so the mask must be forwarded
        # explicitly; otherwise ignoring the padding depends on the model
        # config happening to define a pad token id.
        attention_mask=encoded.get("attention_mask"),
        num_beams=8,
        max_length=256,
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def process_section(section, model, tokenizer, df):
    """Summarize the ``kalimat`` column of ``df`` row by row for one section.

    Args:
        section: Section name; used in progress messages and as the name of
            the summary column in the returned frame.
        model: Model forwarded to ``generate_bert_summary``.
        tokenizer: Tokenizer forwarded to ``generate_bert_summary``.
        df: DataFrame providing the ``kalimat`` and ``nama_dokumen`` columns.

    Returns:
        pandas.DataFrame: Two columns, ``nama_dokumen`` and ``section``.
    """
    print(f"Processing {section}...")

    progress = tqdm(df.iterrows(), total=len(df), desc=f"Summarizing {section}")
    summaries = [
        generate_bert_summary(record['kalimat'], model, tokenizer)
        for _, record in progress
    ]

    result = pd.DataFrame({'nama_dokumen': df['nama_dokumen'], section: summaries})

    print(f"Completed {section}")
    return result

In [3]:
def main():
    """Summarize every section with its own model and write the combined CSVs.

    For each section name, ``data/final-data/<section>.csv`` is summarized with
    the model stored at ``model/80/model_<section>``. The per-section summaries
    are placed side by side and written to ``output-bert.csv``; a ``summary``
    column joining all section summaries with spaces is then written, together
    with ``nama_dokumen``, to ``merged-summary.csv``.

    Raises:
        ValueError: If a section CSV does not list the same documents, in the
            same order, as the first section.
    """
    import gc
    import os

    sections = ['latarbelakang', 'rumusanmasalah', 'tujuanpenelitian', 'rangkumanpenelitianterkait', 'metodologipenelitian']
    output_dir = 'data/penilaian-data/80'

    # Read all inputs up front (cheap) so a misaligned file is caught before
    # the long model runs rather than after them.
    frames = {section: pd.read_csv(f'data/final-data/{section}.csv') for section in sections}

    # The per-section results are combined by row position below, so every
    # section must list the same documents in the same order. Without this
    # check a reordered CSV would silently attach summaries to the wrong document.
    reference_docs = frames[sections[0]]['nama_dokumen'].reset_index(drop=True)
    for section in sections[1:]:
        section_docs = frames[section]['nama_dokumen'].reset_index(drop=True)
        if not section_docs.equals(reference_docs):
            raise ValueError(
                f"'nama_dokumen' in data/final-data/{section}.csv does not match "
                f"data/final-data/{sections[0]}.csv (same documents in the same order are required)"
            )

    results = []
    for section in sections:
        model, tokenizer = load_bert_model(f"model/80/model_{section}")

        result = process_section(section, model, tokenizer, frames[section])
        results.append(result)

        # Release the model before loading the next one. empty_cache() only
        # concerns the CUDA caching allocator, so the references are also
        # dropped and collected explicitly for the DirectML device used here.
        del model
        del tokenizer
        gc.collect()
        torch.cuda.empty_cache()

    # Combine the results of all sections side by side.
    output_df = pd.concat(results, axis=1)
    output_df = output_df.loc[:, ~output_df.columns.duplicated()]  # drop the repeated 'nama_dokumen' columns

    # Make sure the target directory exists before writing.
    os.makedirs(output_dir, exist_ok=True)

    # Save output-bert.csv (one column per section).
    output_df.to_csv('data/penilaian-data/80/output-bert.csv', index=False)

    # Build the final summary by joining all section summaries with spaces.
    output_df['summary'] = output_df[sections].agg(' '.join, axis=1)

    # Save merged-summary.csv (document name and final summary only).
    final_output_df = output_df[['nama_dokumen', 'summary']]
    final_output_df.to_csv('data/penilaian-data/80/merged-summary.csv', index=False)

    print("All sections processed and saved.")

# Run the whole summarization pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Processing latarbelakang...


Summarizing latarbelakang: 100%|██████████| 30/30 [08:55<00:00, 17.86s/it]


Completed latarbelakang
Processing rumusanmasalah...


Summarizing rumusanmasalah: 100%|██████████| 30/30 [02:44<00:00,  5.50s/it]


Completed rumusanmasalah
Processing tujuanpenelitian...


Summarizing tujuanpenelitian: 100%|██████████| 30/30 [02:12<00:00,  4.40s/it]


Completed tujuanpenelitian
Processing rangkumanpenelitianterkait...


Summarizing rangkumanpenelitianterkait: 100%|██████████| 30/30 [02:33<00:00,  5.11s/it]


Completed rangkumanpenelitianterkait
Processing metodologipenelitian...


Summarizing metodologipenelitian: 100%|██████████| 30/30 [08:57<00:00, 17.91s/it]

Completed metodologipenelitian
All sections processed and saved.





# Menggabungkan data penilaian dan summary BERT

In [4]:
import pandas as pd

def merge_assessment_data(excel_path, csv_path, output_path):
    """Outer-join an Excel sheet and a CSV file on ``nama_dokumen`` and save it.

    Document names present in only one of the two inputs are printed, the
    merged table is written to ``output_path`` as CSV, and the row counts of
    both inputs and of the merged result are reported.

    Args:
        excel_path: Path of the Excel file to read.
        csv_path: Path of the CSV file to read.
        output_path: Destination path of the merged CSV.
    """
    excel_df = pd.read_excel(excel_path)
    csv_df = pd.read_csv(csv_path)

    # Join both tables on the document name, keeping rows from either side.
    merged_df = excel_df.merge(csv_df, on='nama_dokumen', how='outer')

    # Rows whose document name has no counterpart in the other input.
    excel_only = excel_df.loc[~excel_df['nama_dokumen'].isin(csv_df['nama_dokumen'])]
    csv_only = csv_df.loc[~csv_df['nama_dokumen'].isin(excel_df['nama_dokumen'])]

    reports = (
        ("Data dari Excel yang tidak cocok:", excel_only),
        ("Data dari CSV yang tidak cocok:", csv_only),
    )
    for heading, leftovers in reports:
        if not leftovers.empty:
            print(heading)
            print(leftovers['nama_dokumen'])

    # Write the merged table to a new CSV file.
    merged_df.to_csv(output_path, index=False)
    print(f"File hasil penggabungan telah disimpan di: {output_path}")

    # Report the row counts of the inputs and of the merged result.
    print(f"Jumlah baris dalam file Excel: {len(excel_df)}")
    print(f"Jumlah baris dalam file CSV: {len(csv_df)}")
    print(f"Jumlah baris dalam file hasil penggabungan: {len(merged_df)}")

# Run the merge: assessment spreadsheet + merged BERT summaries -> final assessment CSV.
excel_path = 'data/penilaian-xlsx/data-penilaian.xlsx'
csv_path = 'data/penilaian-data/80/merged-summary.csv'
output_path = 'data/penilaian-data/80/final-data-penilaian.csv'

merge_assessment_data(excel_path, csv_path, output_path)

File hasil penggabungan telah disimpan di: data/penilaian-data/80/final-data-penilaian.csv
Jumlah baris dalam file Excel: 30
Jumlah baris dalam file CSV: 30
Jumlah baris dalam file hasil penggabungan: 30


# Split data

In [5]:
import pandas as pd

def split_dataset(input_file, output_dir, val_docs, test_docs):
    """Split a CSV into train/val/test files by document name.

    Rows whose ``nama_dokumen`` is in ``val_docs`` go to ``val.csv``, rows in
    ``test_docs`` go to ``test.csv`` (validation wins if a name is in both),
    and all remaining rows go to ``train.csv``. The three files are written to
    ``output_dir``, which is created if it does not exist, and the size of
    each split is printed.

    Args:
        input_file: Path of the CSV to split; must contain ``nama_dokumen``.
        output_dir: Directory that receives ``train.csv``, ``val.csv`` and
            ``test.csv``.
        val_docs: Document names assigned to the validation set.
        test_docs: Document names assigned to the test set.
    """
    import os

    df = pd.read_csv(input_file)
    doc_names = df['nama_dokumen']

    val_docs = list(val_docs)
    test_docs = list(test_docs)

    # Boolean masks instead of a temporary 'set' column, so an input column
    # that happens to be called 'set' is neither overwritten nor dropped.
    val_mask = doc_names.isin(val_docs)
    test_mask = doc_names.isin(test_docs) & ~val_mask
    train_mask = ~(val_mask | test_mask)

    # A misspelled document name would otherwise silently end up in train.
    # print() is used because this notebook suppresses warnings globally.
    known_docs = set(doc_names)
    for label, requested in (("validation", val_docs), ("test", test_docs)):
        missing = [doc for doc in requested if doc not in known_docs]
        if missing:
            print(f"Warning: {label} documents not found in {input_file}: {missing}")

    train_df = df[train_mask]
    val_df = df[val_mask]
    test_df = df[test_mask]

    # Create the target directory on first use instead of failing in to_csv.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    train_df.to_csv(f"{output_dir}/train.csv", index=False)
    val_df.to_csv(f"{output_dir}/val.csv", index=False)
    test_df.to_csv(f"{output_dir}/test.csv", index=False)

    print(f"Train set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")
    print(f"Test set size: {len(test_df)}")

# Define the documents held out for validation and for testing.
val_docs = [
    'Utami Lestari_Kualifikasi.txt',
    'Kualifikasi Witta Listiya Ningrum.txt',
    'Kualifikasi_Remigius.txt'
]

test_docs = [
    'Robert_Kualifikasi.txt',
    'MetaMeysawati_KUALIFIKASI(99216026).txt',
    'Kualifikasi_Rama Dian Syah.txt'
]

# Run the split on the final assessment data.
input_file = 'data/penilaian-data/80/final-data-penilaian.csv'
output_dir = 'data/model-data-penilaian/80'

split_dataset(input_file, output_dir, val_docs, test_docs)

Train set size: 24
Validation set size: 3
Test set size: 3


: 