<a href="https://colab.research.google.com/github/navin2065/Text-Summarization/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install transformers ipywidgets PyPDF2 python-docx -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m41.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Import libraries
import base64
import io
import time

import docx
import ipywidgets as widgets
import PyPDF2
# IPython spells this class "Javascript" (lowercase "s"); "JavaScript" does not exist.
from IPython.display import Javascript, clear_output, display
from transformers import BartTokenizer, pipeline


In [None]:
# Initialize the summarization pipeline and tokenizer.
# Both use the same "facebook/bart-large-cnn" checkpoint: `summarizer` produces
# the summaries, while `tokenizer` is used on its own by preprocess_text and
# split_text_into_chunks to count tokens.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Create GUI components

# Single-file upload; changes to its value are handled by on_upload_change.
upload_button = widgets.FileUpload(
    accept='.txt,.pdf,.doc,.docx',  # Accept .txt, .pdf, .doc, and .docx files
    multiple=False,
    description="Upload File (TXT/PDF/DOC)"
)
# Text to summarize; filled by hand or by a successful upload. The 50-10000
# character range in the placeholder is the one preprocess_text enforces.
text_input = widgets.Textarea(
    value='',
    placeholder='Paste text here or upload a file (50-10000 chars)',
    description='Input Text:',
    layout={'width': '1000px', 'height': '400px'}
)
# Shows the summary or an error message; created disabled and only written
# to by the handlers below.
summary_output = widgets.Textarea(
    value='',
    placeholder='Summary will appear here',
    description='Summary:',
    layout={'width': '1000px', 'height': '400px'},
    disabled=True
)
# Triggers summarize_text.
summarize_button = widgets.Button(
    description="Summarize",
    button_style='success',
    tooltip='Click to summarize the text'
)
# Triggers download_summary; starts disabled and is enabled by summarize_text
# once a summary has been produced.
download_button = widgets.Button(
    description="Download Summary",
    button_style='info',
    tooltip='Click to download the summary as a text file',
    disabled=True
)
# Requested maximum summary length, read by summarize_text as max_len.
max_length_slider = widgets.IntSlider(
    value=130,
    min=1,
    max=1000,
    step=10,
    description='Max Length:',
    layout={'width': '600px'}
)
# Requested minimum summary length, read by summarize_text as min_len.
# summarize_text raises max_len to min_len + 10 when max_len <= min_len.
min_length_slider = widgets.IntSlider(
    value=30,
    min=1,
    max=500,
    step=5,
    description='Min Length:',
    layout={'width': '600px'}
)
# Status line updated while summarization runs and cleared afterwards.
progress_label = widgets.HTML(
    value='',
    layout={'width': '600px'}
)

# Validate raw input text and hand back a cleaned copy or an error message
def preprocess_text(text):
    """Strip *text* and check it against the size limits.

    Returns a ``(text, error)`` pair. On success it is
    ``(stripped_text, None)``; on rejection it is ``(None, message)``.
    Input is rejected when the stripped text has fewer than 50 or more than
    10000 characters, or when the tokenizer yields fewer than 10 token ids
    for it.
    """
    cleaned = text.strip()
    char_count = len(cleaned)

    # Cheap character-length checks come first so the tokenizer is only
    # invoked for input that is already within bounds.
    if char_count < 50:
        return None, "Error: Input text is too short (minimum 50 characters)."
    if char_count > 10000:
        return None, "Error: Input text is too long (maximum 10000 characters)."

    encoded = tokenizer(cleaned, truncation=True, max_length=1024, return_tensors="pt")
    if encoded["input_ids"].shape[1] < 10:
        return None, "Error: Input text is too short for summarization (minimum 10 tokens)."

    return cleaned, None

# Function to split text into chunks for summarization
def split_text_into_chunks(text, max_tokens=500, *, count_tokens=None):
    """Group the sentences of *text* into chunks of about *max_tokens* tokens.

    The text is split on ``'. '``; consecutive sentences are packed into the
    same chunk until adding the next one would exceed *max_tokens*. A single
    sentence that is larger than *max_tokens* on its own is not split further
    and becomes a chunk by itself.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk.
        count_tokens: Optional callable ``str -> int`` used to measure a
            sentence. Defaults to counting the ids produced by the
            module-level ``tokenizer`` (truncated at 1024).

    Returns:
        A list of non-empty chunk strings, in document order. Empty or
        whitespace-only input yields an empty list.
    """
    if count_tokens is None:
        def count_tokens(sentence):
            encoded = tokenizer(sentence, truncation=True, max_length=1024, return_tensors="pt")
            return encoded["input_ids"].shape[1]

    pieces = text.split('. ')
    last_index = len(pieces) - 1

    chunks = []
    current_chunk = []
    current_token_count = 0

    for index, piece in enumerate(pieces):
        sentence = piece.strip()
        if not sentence:
            continue
        # split('. ') removed the period from every piece except the last,
        # so restore it there; the last piece keeps the text's own ending
        # (no doubled "." and no "." tacked onto "?" or "!").
        if index != last_index:
            sentence += '.'

        token_count = count_tokens(sentence)

        # Only flush when there is something to flush; otherwise an oversized
        # first sentence would emit an empty chunk.
        if current_chunk and current_token_count + token_count > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_token_count = 0

        current_chunk.append(sentence)
        current_token_count += token_count

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Function to summarize long documents by chunking with retry logic
def summarize_long_document(text, max_len, min_len):
    """Summarize *text* chunk by chunk and join the partial summaries.

    The text is split with ``split_text_into_chunks``; each chunk is passed
    to ``summarizer`` with a share of the requested length budget, and
    ``progress_label`` is updated before every attempt.

    Args:
        text: The text to summarize.
        max_len: Requested maximum length for the whole summary.
        min_len: Requested minimum length for the whole summary.

    Returns:
        The chunk summaries joined by single spaces, or a string starting
        with ``"Error"`` if a chunk still fails on its last attempt. An
        input that produces no chunks returns ``''``.
    """
    chunks = split_text_into_chunks(text)
    total_chunks = len(chunks)
    if total_chunks == 0:
        return ''

    max_attempts = 2
    # The length budget is shared evenly between chunks; it does not depend
    # on the individual chunk, so compute it once.
    chunk_max_len = max_len // total_chunks + 10
    chunk_min_len = max(1, min_len // total_chunks)

    summaries = []
    for i, chunk in enumerate(chunks):
        for attempt in range(1, max_attempts + 1):
            progress_label.value = f"<i>Summarizing chunk {i+1}/{total_chunks}, attempt {attempt}/{max_attempts}...</i>"
            try:
                summary = summarizer(
                    chunk,
                    max_length=chunk_max_len,
                    min_length=chunk_min_len,
                    do_sample=False,
                    truncation=True
                )[0]['summary_text']
            except Exception as e:
                if attempt == max_attempts:
                    return f"Error during summarization of chunk {i+1}: {str(e)}"
                continue
            # The call above is synchronous, so by the time it returns the
            # summary exists; keep it however long it took rather than
            # discarding finished work and running the chunk again.
            summaries.append(summary)
            break

    return ' '.join(summaries)

# Return (file_name, raw_bytes) for the first file held by the upload widget
def _first_uploaded_file():
    value = upload_button.value
    if isinstance(value, dict):
        # ipywidgets 7.x: {file_name: {'metadata': ..., 'content': bytes}}
        file_name, file_info = next(iter(value.items()))
    else:
        # ipywidgets 8.x: a tuple of dict-like entries carrying 'name' and
        # 'content' (a memoryview).
        file_info = value[0]
        file_name = file_info['name']
    # bytes() is a no-op copy for bytes and materializes a memoryview.
    return file_name, bytes(file_info['content'])


# Forget the current upload so the same file can be selected again
def _reset_upload():
    value = upload_button.value
    if isinstance(value, dict):
        value.clear()
    else:
        upload_button.value = ()


# Extract plain text from an uploaded file, or return None if unsupported
def _extract_text(file_name, file_content):
    lowered = file_name.lower()
    if lowered.endswith('.txt'):
        return file_content.decode('utf-8')
    if lowered.endswith('.pdf'):
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        # Treat a page with no extractable text as empty instead of failing.
        return ''.join((page.extract_text() or '') + "\n" for page in pdf_reader.pages)
    if lowered.endswith(('.doc', '.docx')):
        # NOTE(review): python-docx is documented for .docx only; a legacy
        # binary .doc will most likely raise here and be reported by the
        # caller as a read error. Confirm whether .doc should stay accepted.
        doc = docx.Document(io.BytesIO(file_content))
        return ''.join(para.text + "\n" for para in doc.paragraphs)
    return None


# Function to handle file upload (supporting .txt, .pdf, .doc, and .docx)
def on_upload_change(change):
    """Load the uploaded file's text into ``text_input``.

    Called when the upload widget's value changes. The first uploaded file
    is decoded according to its extension, validated with
    ``preprocess_text`` and, if accepted, placed in ``text_input``. On any
    failure the message is written to ``summary_output`` and the upload is
    reset.

    Args:
        change: The change notification from the widget (unused; the
            current widget value is read directly).
    """
    if not upload_button.value:
        return

    try:
        file_name, file_content = _first_uploaded_file()
        content = _extract_text(file_name, file_content)
        if content is None:
            summary_output.value = "Error: Unsupported file format. Please upload a .txt, .pdf, or .doc/.docx file."
            _reset_upload()
            return

        processed_text, error = preprocess_text(content)
        if error:
            summary_output.value = error
            _reset_upload()
        else:
            text_input.value = processed_text
    except Exception as e:
        # Boundary handler for a UI callback: surface the problem to the
        # user instead of letting it disappear in the widget event loop.
        summary_output.value = f"Error reading file: {str(e)}"
        _reset_upload()

# Function to handle download
def download_summary(b):
    """Offer the current summary to the browser as ``summary.txt``.

    Does nothing when ``summary_output`` is empty or holds an error message
    (text starting with ``"Error"``). Otherwise the summary is base64-encoded
    into a ``data:`` URL and a short script clicks a temporary download link.

    Args:
        b: The clicked button (unused).
    """
    # IPython's class is spelled "Javascript"; import it here so the handler
    # does not depend on how the name was imported at file level.
    from IPython.display import Javascript

    summary = summary_output.value
    if not summary or summary.startswith("Error"):
        return

    b64_string = base64.b64encode(summary.encode('utf-8')).decode('utf-8')
    js_download = f"""
        var link = document.createElement('a');
        link.setAttribute('download', 'summary.txt');
        link.setAttribute('href', 'data:text/plain;base64,{b64_string}');
        document.body.appendChild(link);
        link.click();
        link.remove();
        """
    display(Javascript(js_download))

# Function to perform summarization
def summarize_text(b):
    """Summarize the contents of ``text_input`` into ``summary_output``.

    Both buttons are disabled while the work runs. The input is validated
    with ``preprocess_text``; on success the slider values are passed to
    ``summarize_long_document`` and the result (summary or error text) is
    shown. The download button is enabled only for a non-empty result that
    is not an error message. The Summarize button and the progress label are
    always restored on exit, including when an unexpected exception occurs.

    Args:
        b: The clicked button (unused).
    """
    summarize_button.disabled = True
    download_button.disabled = True
    progress_label.value = "<i>Preparing to summarize... Please wait.</i>"
    summary_output.value = ""

    try:
        input_text, error = preprocess_text(text_input.value)
        if error:
            summary_output.value = error
            return

        max_len = max_length_slider.value
        min_len = min_length_slider.value
        # The two sliders are independent, so make sure the upper bound
        # really is above the lower one.
        if max_len <= min_len:
            max_len = min_len + 10

        summary = summarize_long_document(input_text, max_len, min_len)
        summary_output.value = summary
        # summarize_long_document reports failures as text starting with
        # "Error"; only a real, non-empty summary can be downloaded.
        download_button.disabled = not summary or summary.startswith("Error")
    except Exception as e:
        summary_output.value = f"Error during summarization: {str(e)}"
        download_button.disabled = True
    finally:
        # Always give the UI back, otherwise one failure would leave the
        # Summarize button permanently disabled.
        summarize_button.disabled = False
        progress_label.value = ""

# Render all widgets of the summarizer GUI
def display_gui():
    """Display every widget, one after another, in top-to-bottom order."""
    ordered_widgets = (
        upload_button,
        text_input,
        max_length_slider,
        min_length_slider,
        summarize_button,
        download_button,
        progress_label,
        summary_output,
    )
    display(*ordered_widgets)

# Link button and upload handlers.
# Button clicks call the handler with the button itself; the upload widget
# notifies on_upload_change only for changes to its 'value' trait.
summarize_button.on_click(summarize_text)
download_button.on_click(download_summary)
upload_button.observe(on_upload_change, names='value')

# Display the GUI (runs as soon as this cell is executed)
display_gui()

Device set to use cpu


FileUpload(value={}, accept='.txt,.pdf,.doc,.docx', description='Upload File (TXT/PDF/DOC)')

Textarea(value='', description='Input Text:', layout=Layout(height='400px', width='1000px'), placeholder='Past…

IntSlider(value=130, description='Max Length:', layout=Layout(width='600px'), max=1000, min=1, step=10)

IntSlider(value=30, description='Min Length:', layout=Layout(width='600px'), max=500, min=1, step=5)

Button(button_style='success', description='Summarize', style=ButtonStyle(), tooltip='Click to summarize the t…

Button(button_style='info', description='Download Summary', disabled=True, style=ButtonStyle(), tooltip='Click…

HTML(value='', layout=Layout(width='600px'))

Textarea(value='', description='Summary:', disabled=True, layout=Layout(height='400px', width='1000px'), place…