#### Text Summarization from PDF and normal text

In [None]:
%pip install transformers
%pip install sentencepiece
%pip install langchain
%pip install streamlit

In [None]:
%pip install accelerate

In [None]:
import streamlit as st 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.chains.summarize import load_summarize_chain
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
import torch
import base64

# LLM is a custom model trained on the LaMini dataset. It is a T5 model trained on the summarization task. 

In [None]:
# Model and tokenizer loading.
# `checkpoint` is the model identifier passed to both from_pretrained() calls below.
checkpoint = "MBZUAI/LaMini-Flan-T5-248M"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
# Weights are requested as float32. device_map='auto' presumably delegates device
# placement to the `accelerate` package installed in an earlier cell -- TODO confirm.
base_model = T5ForConditionalGeneration.from_pretrained(checkpoint, device_map='auto', torch_dtype=torch.float32)

# LaMini-Flan-T5-248M install


In [None]:
#file loader and preprocessing
def file_preprocessing(file, chunk_size=200, chunk_overlap=50):
    """Load a PDF and return its text as a single concatenated string.

    The PDF is loaded with ``PyPDFLoader``, split into chunks with
    ``RecursiveCharacterTextSplitter`` and the chunks' ``page_content`` values
    are joined back together without any separator.

    Args:
        file: Path of the PDF file to load.
        chunk_size: ``chunk_size`` passed to the text splitter (default 200,
            the value previously hard-coded here).
        chunk_overlap: ``chunk_overlap`` passed to the text splitter
            (default 50, the value previously hard-coded here).
            NOTE(review): because the chunks are concatenated afterwards, a
            non-zero overlap presumably repeats the overlapping text in the
            result -- confirm whether 0 would be a better value for callers.

    Returns:
        The concatenated chunk text; an empty string when the splitter
        produces no chunks.
    """
    pages = PyPDFLoader(file).load_and_split()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(pages)
    # join() builds the result in one pass instead of re-copying the string
    # on every chunk; the per-chunk debug print was dropped.
    return "".join(chunk.page_content for chunk in chunks)

In [None]:
@st.cache_data
def displayPDF(file):
    """Show the PDF stored at ``file`` inline in the Streamlit page.

    The file is read as bytes, base64-encoded and embedded in an ``<iframe>``
    through a ``data:`` URI, which is then rendered with ``st.markdown``.
    Nothing is returned.

    NOTE(review): the function is wrapped in ``st.cache_data`` and its only
    argument is the path; presumably a different file saved under the same
    path could be served from the cache -- confirm.
    """
    # Read the whole file from disk in binary mode.
    with open(file, "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()
    encoded_pdf = base64.b64encode(raw_bytes).decode('utf-8')

    # Build the HTML snippet that embeds the encoded document.
    iframe_html = (
        '<iframe src="data:application/pdf;base64,'
        + encoded_pdf
        + '" width="100%" height="600" type="application/pdf"></iframe>'
    )

    # Raw HTML has to be explicitly allowed for the iframe to be rendered.
    st.markdown(iframe_html, unsafe_allow_html=True)

In [None]:
# Streamlit page setup: request the "wide" page layout.
st.set_page_config(layout="wide")

#### Pipeline for pdf summarization

In [None]:
#LLM pipeline
def llm_pipeline(filepath):
    """Summarize the PDF located at ``filepath``.

    A 'summarization' pipeline is built from the module-level ``base_model``
    and ``tokenizer`` (max_length 500, min_length 50), the PDF text is
    extracted with ``file_preprocessing`` and the first summary is returned.

    NOTE: a later cell in this file defines another function that is also
    named ``llm_pipeline``; once that cell runs it replaces this definition.

    Args:
        filepath: Path of the PDF file to summarize.

    Returns:
        The ``'summary_text'`` of the first pipeline result.
    """
    summarizer = pipeline(
        'summarization',
        model=base_model,
        tokenizer=tokenizer,
        max_length=500,
        min_length=50,
    )
    document_text = file_preprocessing(filepath)
    outputs = summarizer(document_text)
    return outputs[0]['summary_text']

#### Pipeline for normal text summarization

In [None]:
#LLM pipeline
def llm_pipeline(input_text):
    """Summarize raw text, or the contents of a PDF when given a PDF path.

    This definition rebinds the name of the earlier ``llm_pipeline(filepath)``
    in this file, so it is the one that the Streamlit code calls with a file
    path. To keep that call working, an argument that names an existing
    ``.pdf`` file is first converted to text with ``file_preprocessing``;
    any other string is summarized as-is.

    Args:
        input_text: Either the text to summarize or the path of an existing
            PDF file.

    Returns:
        The ``'summary_text'`` of the first pipeline result.
    """
    import os  # local import: only needed for the PDF-path check below

    # Without this branch a path argument would be summarized as the literal
    # path string instead of the document it points to.
    if (
        isinstance(input_text, str)
        and input_text.lower().endswith(".pdf")
        and os.path.isfile(input_text)
    ):
        input_text = file_preprocessing(input_text)

    pipe_sum = pipeline(
        'summarization',
        model=base_model,
        tokenizer=tokenizer,
        max_length=500,
        min_length=50,
    )
    result = pipe_sum(input_text)
    return result[0]['summary_text']

##### Summary using normal text

In [None]:
# Sample news passage used as the input of the plain-text summarization example.
text = """
With the national polls around six months away, the Awami League and the BNP are going to hold programmes and counter-programmes today and tomorrow to show off their strength on the streets.

The BNP will today march from Gabtoli to Jatrabari from 10:00am to 4:00pm in the capital and bring out processions in other cities and districts to realise its one-point demand for polls under a non-party interim government.

Sources in the BNP said instructions were given to all Dhaka north and South city units to ensure huge gatherings.

To counter BNP's programme, ruling Awami league will bring out what it termed "a march for peace and development" in the capital.

At 3:00pm today, Dhaka city (South) AL will bring out a procession from Institution of Engineers, Bangladesh, that will end at Dhanmondi 32. AL General Secretary Obaidul Quader will join the programme.

The unit is planning to gather over a lakh people in the event.
"""

In [None]:
# Number of whitespace-separated words in the sample text.
len(text.split())

In [None]:
# Summarize the sample text with llm_pipeline.
summary = llm_pipeline(text)

In [None]:
# Number of whitespace-separated words in the generated summary.
len(summary.split())

##### Summary from PDF file

In [None]:
import os

# Streamlit UI: upload a PDF, store it under data/, then show the document
# next to its generated summary.
st.title("Document Summarization App using Language Model")

uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])

if uploaded_file is not None:
    if st.button("Summarize"):
        col1, col2 = st.columns(2)
        # The upload name comes from the client: keep only its last path
        # component so the file cannot be written outside data/.
        safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
        # open() below fails when the target directory does not exist yet.
        os.makedirs("data", exist_ok=True)
        filepath = "data/" + safe_name
        with open(filepath, "wb") as temp_file:
            # getvalue() returns the whole buffer regardless of the current
            # read position of the uploaded stream.
            temp_file.write(uploaded_file.getvalue())
        with col1:
            st.info("Uploaded File")
            pdf_view = displayPDF(filepath)

        with col2:
            summary = llm_pipeline(filepath)
            st.info("Summarization Complete")
            st.success(summary)