In [None]:
# --- Installation Commands ---
!pip install PyPDF2 streamlit transformers torch pyngrok
!npm install localtunnel

# --- Imports ---
import PyPDF2
import streamlit as st
from transformers import pipeline
import textwrap
import os
from pyngrok import ngrok

# --- Setting Up ngrok Auth Token ---
# The token is a credential and must not live in the notebook source, where it
# is exposed to anyone who can read the file. It is read from the environment
# instead (in Colab: set os.environ["NGROK_AUTHTOKEN"] in a private cell).
# NOTE(review): any token that was previously committed in this cell should be
# revoked in the ngrok dashboard.
_ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN")
if not _ngrok_auth_token:
    raise RuntimeError(
        "ngrok auth token not found: set the NGROK_AUTHTOKEN environment "
        "variable before running this cell."
    )
ngrok.set_auth_token(_ngrok_auth_token)

# --- Define the PDFSummarizer Class ---
class PDFSummarizer:
    """Summarize PDF documents and pull named entities out of their text.

    The summarization pipeline is built once in ``__init__``. The NER pipeline
    is built on the first call to ``extract_key_info`` and reused afterwards,
    so repeated calls do not reload the model.
    """

    def __init__(self):
        # Initializing the summarization pipeline
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        # NER pipeline; created lazily by _get_ner() because it is only needed
        # when key information is requested.
        self._ner = None

    def _get_ner(self):
        """Return the NER pipeline, creating it on first use."""
        # getattr keeps this safe for instances created without __init__.
        if getattr(self, "_ner", None) is None:
            self._ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
        return self._ner

    def extract_text_from_pdf(self, pdf_file):
        """Extract text from a PDF file.

        Args:
            pdf_file: Anything ``PyPDF2.PdfReader`` accepts (a path or a
                binary file-like object).

        Returns:
            The text of all pages that yielded text, joined with newlines.
            Pages without extractable text are skipped; the result is ``""``
            when no page yields text.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        # Join with a separator so the last word of one page is not fused with
        # the first word of the next; empty/None page results are dropped.
        return "\n".join(page_text for page_text in page_texts if page_text)

    def chunk_text(self, text, max_chunk_size=1024):
        """Split text into smaller chunks for processing.

        Args:
            text: The text to split.
            max_chunk_size: Maximum chunk width in characters (not tokens).

        Returns:
            A list of chunks split at whitespace. Runs of whitespace,
            including newlines, are collapsed by ``textwrap.wrap``. A single
            word longer than ``max_chunk_size`` is kept whole, so that chunk
            may exceed the limit. Empty or whitespace-only text gives ``[]``.
        """
        return textwrap.wrap(text, max_chunk_size, break_long_words=False)

    def summarize_text(self, text, max_length=130, min_length=30):
        """Generate a summary from text.

        The text is split with ``chunk_text`` and each chunk is summarized
        separately; the partial summaries are joined with single spaces.

        Args:
            text: The text to summarize.
            max_length: ``max_length`` passed to the summarization pipeline
                for each chunk.
            min_length: ``min_length`` passed to the summarization pipeline
                for each chunk.

        Returns:
            The combined summary, or ``""`` when the text has no content.
        """
        summaries = [
            self.summarizer(
                chunk, max_length=max_length, min_length=min_length, do_sample=False
            )[0]['summary_text']
            for chunk in self.chunk_text(text)
        ]
        return " ".join(summaries)

    def extract_key_info(self, text):
        """Extract key information using Named Entity Recognition (NER).

        Args:
            text: The text to analyze.

        Returns:
            A dict mapping each tag found under the pipeline's ``'entity'``
            key to the list of matching ``'word'`` values, in order of
            appearance (duplicates kept). ``{}`` when nothing is found.
        """
        ner = self._get_ner()
        grouped_entities = {}
        # Run NER chunk by chunk: a whole document passed in one call can
        # exceed the model's input window. Chunks break at whitespace, so no
        # word is split across two calls.
        for chunk in self.chunk_text(text):
            for entity in ner(chunk):
                grouped_entities.setdefault(entity['entity'], []).append(entity['word'])
        return grouped_entities

# --- Defining the Main Streamlit App ---
@st.cache_resource
def _load_summarizer():
    """Build the PDFSummarizer once and share it across Streamlit reruns.

    Streamlit re-executes the script on every interaction; without caching,
    each rerun would construct a new PDFSummarizer and reload its models.
    """
    return PDFSummarizer()


def main():
    """Render the Streamlit page: upload a PDF, then show text, summary and entities."""
    st.title("PDF Summarizer and Information Extractor")
    summarizer = _load_summarizer()
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

    # Nothing to do until the user has uploaded a file.
    if uploaded_file is None:
        return

    with st.spinner("Processing PDF..."):
        text = summarizer.extract_text_from_pdf(uploaded_file)

        # E.g. a scanned/image-only PDF: there is nothing to summarize, so say
        # so instead of rendering empty sections.
        if not text.strip():
            st.warning("No extractable text was found in this PDF.")
            return

        with st.expander("Show Original Text"):
            st.text(text)

        summary = summarizer.summarize_text(text)
        st.subheader("Summary")
        st.write(summary)

        st.subheader("Key Information")
        key_info = summarizer.extract_key_info(text)
        for entity_type, entities in key_info.items():
            # De-duplicate, and sort so the listing is stable between reruns
            # (plain set iteration order is not).
            st.write(f"{entity_type}: ", ", ".join(sorted(set(entities))))

# --- Writing the Complete App to a File (app.py) ---
# This string is the source of the app that `streamlit run app.py` executes.
# It is a raw string so that backslash escapes in the embedded source (such as
# "\n") are written to app.py verbatim instead of being interpreted here.
app_code = r'''
import PyPDF2
import streamlit as st
from transformers import pipeline
import textwrap

class PDFSummarizer:
    """Summarize PDF text and tag named entities with Hugging Face pipelines."""

    def __init__(self):
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        # NER pipeline; created on first use by _get_ner().
        self._ner = None

    def _get_ner(self):
        """Return the NER pipeline, creating it on first use."""
        if getattr(self, "_ner", None) is None:
            self._ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
        return self._ner

    def extract_text_from_pdf(self, pdf_file):
        """Return the text of all pages that yield text, joined with newlines."""
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        # Separate pages so words at a page boundary are not fused together.
        return "\n".join(page_text for page_text in page_texts if page_text)

    def chunk_text(self, text, max_chunk_size=1024):
        """Split text at whitespace into chunks of at most max_chunk_size characters."""
        return textwrap.wrap(text, max_chunk_size, break_long_words=False)

    def summarize_text(self, text, max_length=130, min_length=30):
        """Summarize each chunk of text and join the partial summaries."""
        summaries = [
            self.summarizer(
                chunk, max_length=max_length, min_length=min_length, do_sample=False
            )[0]['summary_text']
            for chunk in self.chunk_text(text)
        ]
        return " ".join(summaries)

    def extract_key_info(self, text):
        """Group the words tagged by the NER pipeline by their entity tag."""
        ner = self._get_ner()
        grouped_entities = {}
        # Run NER per chunk: a whole document in one call can exceed the
        # model's input window.
        for chunk in self.chunk_text(text):
            for entity in ner(chunk):
                grouped_entities.setdefault(entity['entity'], []).append(entity['word'])
        return grouped_entities

@st.cache_resource
def _load_summarizer():
    """Build the PDFSummarizer once; Streamlit reruns reuse the same instance."""
    return PDFSummarizer()

def main():
    st.title("PDF Summarizer and Information Extractor")
    summarizer = _load_summarizer()
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

    if uploaded_file is None:
        return

    with st.spinner("Processing PDF..."):
        text = summarizer.extract_text_from_pdf(uploaded_file)
        if not text.strip():
            st.warning("No extractable text was found in this PDF.")
            return
        with st.expander("Show Original Text"):
            st.text(text)
        summary = summarizer.summarize_text(text)
        st.subheader("Summary")
        st.write(summary)
        st.subheader("Key Information")
        key_info = summarizer.extract_key_info(text)
        for entity_type, entities in key_info.items():
            st.write(f"{entity_type}: ", ", ".join(sorted(set(entities))))

if __name__ == "__main__":
    main()
'''

# Explicit encoding so the written file does not depend on the platform default.
with open('app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

# --- Runing the Streamlit App ---
# The following command starts the Streamlit server and directs its logs to a file.
!streamlit run app.py &>/content/logs.txt &

# --- Connecting ngrok to expose the app ---
public_url = ngrok.connect(addr='8501')
print(f"Access your app at: {public_url}")


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nv