In [1]:
from modules.preprocessing import read_pdf, encode_text, load_tokenizer
from modules.model import load_indobert_model, summarize_text
from rouge_score import rouge_scorer

# Fungsi evaluasi menggunakan ROUGE
def evaluate_summary(summary, reference):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, summary)
    return scores

# Fungsi utama pipeline
def summarization_pipeline(file_path):
    # Langkah 1: Membaca dokumen PDF
    text = read_pdf(file_path)
    
    # Langkah 2: Memuat tokenizer dan model
    tokenizer = load_tokenizer()
    model = load_indobert_model()

    # Langkah 3: Melakukan embedding
    input_ids, token_type_ids, attention_mask = encode_text(text, tokenizer)
    inputs = {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask
    }
    
    # Langkah 4: Menghasilkan ringkasan
    summarization_output = summarize_text(inputs, model)

    # Konversi tensor ke list token IDs
    summarization_output = summarization_output.tolist()

    # Konversi tensor output model menjadi string
    summary_text = tokenizer.decode(summarization_output[0], skip_special_tokens=True)
    
    # Langkah 5: Evaluasi hasil ringkasan (misal: dengan ringkasan referensi)
    reference_summary = "..."  # Ringkasan yang benar
    rouge_scores = evaluate_summary(summary_text, text)

    return summarization_output, rouge_scores


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_path = 'clean-data/Ike Putri Kusumawijaya (99216004).pdf'  # Sesuaikan dengan file yang diunggah

summary, rouge = summarization_pipeline(file_path)

print("Ringkasan:", summary)
print("ROUGE Scores:", rouge)



TypeError: The current model class (BertModel) is not compatible with `.generate()`, as it doesn't have a language model head. Please use one of the following classes instead: {'BertLMHeadModel'}

In [10]:
import pdfplumber
from transformers import AutoTokenizer, AutoModel, pipeline

# Import libraries for text preprocessing
import re
import torch

In [2]:
import pdfplumber
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

# Fungsi untuk membersihkan teks
def clean_text(text):
    # Hapus karakter non-alfabet dan simbol khusus
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Hapus spasi ganda atau lebih
    text = re.sub(r'\s+', ' ', text)
    # Lowercase
    text = text.lower()
    # Hapus stopwords
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text.strip()

# Fungsi untuk mengekstrak teks dari PDF
def extract_text_from_pdf(pdf_path):
    extracted_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                # Bersihkan teks dari halaman
                cleaned_text = clean_text(page_text)
                extracted_text += cleaned_text + "\n"
    return extracted_text

# Path ke file PDF
pdf_file_path = 'clean-data/Ike Putri Kusumawijaya (99216004).pdf'

# Ekstrak dan bersihkan teks dari PDF
cleaned_pdf_text = extract_text_from_pdf(pdf_file_path)

# Output teks yang sudah dibersihkan
print(cleaned_pdf_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


abstrak mendeteksi perilaku anomali cepat otomatis lingkungan ramai meningkatkan keselamatan mencegah risiko menjamin respon cepat deteksi anomali sistem pengawasan keselamatan keamanan pencegahan bencana deteksi anomali menemukan anomali cepat otomatis sistem pemantauan cerdas manajemen kerumunan efektif tujuan penelitian mengembangkan metode identifikasi pergerakan anomali kerumunan berbasis generatif generative adversarial network gan pemrosesan video realtime konsep penelitian kamera pengawas membandingkan masukkan train gan disimpan server video diambil kamera real time hasil perbandingan video menghasilkan behaviour analysis pola gerakan anomali kerumunan outputnya dataset identifikasi gerakan anomali kerumunan ii
1 pendahuluan 11 latar bertambahnya populasi keragaman aktivitas manusia adegan keramaian dunia nyata komposisi penonton heterogen warna kulit usia bahasa budaya menghadirkan administratif penyelenggara lokal berfokus manajemen acara efisien otoritas administratif pedul

In [11]:
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

In [5]:
summarizer = pipeline("summarization", model="indobenchmark/indobert-base-p1")

def summarize_text(text, max_length=500, min_length=50):
    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']


The model 'BertModel' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].


In [None]:
summary = summarize_text(cleaned_pdf_text)
print("Summarized Text:\n", summary)