In [4]:
!pip install gradio
!pip install transformers
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [5]:
import gradio as gr
from transformers import pipeline
import docx

# Build the Hugging Face summarization pipeline backed by the T5 checkpoint.
summarizer = pipeline(
    task='summarization',
    model='t5-large',
    tokenizer='t5-large',
)

def read_txt(file):
    """Read an uploaded plain-text file and return its content as a string.

    Args:
        file: Either an object exposing the file path as ``file.name`` or the
            path itself as a plain string.

    Returns:
        The decoded text of the file.
    """
    # Fall back to the argument itself when it has no ``name`` attribute (plain path).
    path = getattr(file, 'name', file)
    # Decode explicitly instead of relying on the platform default encoding.
    # 'utf-8-sig' also strips a leading byte-order mark if one is present, and
    # errors='replace' keeps one undecodable byte from aborting the whole request.
    with open(path, 'r', encoding='utf-8-sig', errors='replace') as f:
        return f.read()

def read_docx(file):
    """Return the text of a .docx upload, with its paragraphs joined by newlines.

    Args:
        file: Object whose ``name`` attribute is the path of the document.

    Returns:
        A single string containing the text of every paragraph, in order.
    """
    document = docx.Document(file.name)
    return '\n'.join([paragraph.text for paragraph in document.paragraphs])

def summarize_file(file):
    """Summarize an uploaded .txt or .docx file with the T5 pipeline.

    Args:
        file: The uploaded file object (its path is read from ``file.name``),
            or ``None`` when nothing was uploaded.

    Returns:
        The generated summary, or a human-readable message when no file was
        given, the file type is unsupported, or the file contains no text.
    """
    # The handler can be invoked without an upload; avoid an AttributeError.
    if file is None:
        return "No file uploaded. Please upload a .txt or .docx file."

    # Compare extensions case-insensitively so "REPORT.TXT" / "Notes.Docx" are accepted.
    file_name = file.name.lower()
    if file_name.endswith('.txt'):
        text = read_txt(file)
    elif file_name.endswith('.docx'):
        text = read_docx(file)
    else:
        return "Unsupported file type. Please upload a .txt or .docx file."

    # Do not ask the model to summarize an empty prompt.
    if not text.strip():
        return "The uploaded file contains no text to summarize."

    # Prepend the T5 task prefix and summarize the whole document in one call.
    # NOTE(review): the document is passed without truncation or chunking, so long
    # inputs may exceed what the model handles well - confirm this is intended.
    prefix = "summarize: "
    text = prefix + text
    summary = summarizer(text, max_length=10000, min_length=100, do_sample=False)[0]['summary_text']

    return summary

# Gradio interface: one file upload as input, the summary shown as plain text.
iface = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(),
    outputs="text",
    title="T5 Document Summarization",
    description="Upload a .txt or .docx file for summarization using T5",
)

iface.launch()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://23f3cdd4b783d310a5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [7]:
import gradio as gr
from transformers import pipeline
import docx
import math

# Create the Hugging Face summarization pipeline using the T5 model and tokenizer.
summarizer = pipeline(
    task='summarization',
    model='t5-large',
    tokenizer='t5-large',
)

def read_txt(file):
    """Read an uploaded plain-text file and return its content as a string.

    Args:
        file: Either an object exposing the file path as ``file.name`` or the
            path itself as a plain string.

    Returns:
        The decoded text of the file.
    """
    # Fall back to the argument itself when it has no ``name`` attribute (plain path).
    path = getattr(file, 'name', file)
    # Decode explicitly instead of relying on the platform default encoding.
    # 'utf-8-sig' also strips a leading byte-order mark if one is present, and
    # errors='replace' keeps one undecodable byte from aborting the whole request.
    with open(path, 'r', encoding='utf-8-sig', errors='replace') as f:
        return f.read()

def read_docx(file):
    """Extract the paragraph text from a .docx upload.

    Args:
        file: Object whose ``name`` attribute is the path of the document.

    Returns:
        The text of all paragraphs, in document order, separated by newlines.
    """
    paragraphs = docx.Document(file.name).paragraphs
    return '\n'.join(p.text for p in paragraphs)

def split_text(text, max_chunk_size=2096):
    """Split text into smaller pieces (chunking).

    The text is split on whitespace and regrouped into consecutive runs of at
    most ``max_chunk_size`` words; each run is re-joined with single spaces.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of whitespace-separated words per chunk.

    Returns:
        A list of chunk strings (empty when ``text`` contains no words).
    """
    word_list = text.split()
    pieces = []
    for start in range(0, len(word_list), max_chunk_size):
        window = word_list[start:start + max_chunk_size]
        pieces.append(' '.join(window))
    return pieces

def _pack_words_by_tokens(words, token_counts, token_budget):
    """Greedily group consecutive words into chunks of at most ``token_budget`` tokens.

    Returns a list of ``(chunk_text, chunk_token_count)`` tuples. A single word
    that is larger than the budget still gets a chunk of its own.
    """
    packed = []
    current_words, current_tokens = [], 0
    for word, cost in zip(words, token_counts):
        # Close the running chunk before it would overflow the budget.
        if current_words and current_tokens + cost > token_budget:
            packed.append((' '.join(current_words), current_tokens))
            current_words, current_tokens = [], 0
        current_words.append(word)
        current_tokens += cost
    if current_words:
        packed.append((' '.join(current_words), current_tokens))
    return packed


def summarize_file(file):
    """Summarize an uploaded .txt or .docx file chunk by chunk with the T5 pipeline.

    The document is split into chunks that fit the per-chunk token limit, each
    chunk is summarized separately, and the partial summaries are joined with
    spaces.

    Args:
        file: The uploaded file object (its path is read from ``file.name``),
            or ``None`` when nothing was uploaded.

    Returns:
        The combined summary, or a human-readable message when no file was
        given, the file type is unsupported, or the file contains no text.
    """
    # The handler can be invoked without an upload; avoid an AttributeError.
    if file is None:
        return "No file uploaded. Please upload a .txt or .docx file."

    # Compare extensions case-insensitively so "REPORT.TXT" / "Notes.Docx" are accepted.
    file_name = file.name.lower()
    if file_name.endswith('.txt'):
        text = read_txt(file)
    elif file_name.endswith('.docx'):
        text = read_docx(file)
    else:
        return "Unsupported file type. Please upload a .txt or .docx file."

    words = text.split()
    if not words:
        return "The uploaded file contains no text to summarize."

    # Make sure each chunk is not too long for the model. The limit is expressed
    # in tokens, so measure the words with the pipeline's own tokenizer instead of
    # counting whitespace-separated words (one word can produce several tokens).
    max_token_per_chunk = 512  # Allowed token limit per model input
    reserved_tokens = 16       # Headroom for the task prefix and special tokens
    token_budget = max_token_per_chunk - reserved_tokens

    # One batched tokenizer call; per-word counts are summed as an estimate of the
    # chunk length, and truncation=True below guards against any remaining overflow.
    encoded = summarizer.tokenizer(words, add_special_tokens=False)
    token_counts = [len(ids) for ids in encoded['input_ids']]

    # Add the prefix for the T5 model to every chunk.
    prefix = "summarize: "
    summarized_chunks = []
    for chunk, chunk_tokens in _pack_words_by_tokens(words, token_counts, token_budget):
        # Never request a summary longer than its source chunk: a short trailing
        # chunk would otherwise be forced to produce at least 100 tokens.
        max_len = max(2, min(1024, chunk_tokens))
        min_len = min(100, max_len // 2)
        # Summarize each chunk of the text.
        summary = summarizer(
            prefix + chunk,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True,
        )[0]['summary_text']
        summarized_chunks.append(summary)

    # Combine the per-chunk summaries into the final result.
    full_summary = ' '.join(summarized_chunks)

    return full_summary

# Gradio interface: upload a file, get the combined summary back as text.
iface = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(),
    outputs="text",
    title="T5 Document Summarization",
    description="Upload a .txt or .docx file for summarization using T5",
)

iface.launch()




Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://d2e38c6c42a5799086.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


