In [1]:
import streamlit as st
import pdfplumber
import tiktoken

def extract_text_from_pdf(pdf_file):
    """Return the text of a PDF with a banner line in front of each page.

    Args:
        pdf_file: The PDF source handed straight to ``pdfplumber.open``.

    Returns:
        A single string in which every page that yielded text contributes a
        ``--- Page N ---`` banner followed by that page's text and a newline.
        Pages without extractable text are left out (their page number is
        still consumed), so an empty string means nothing was extracted.
    """
    chunks = []
    with pdfplumber.open(pdf_file) as document:
        for page_number, page in enumerate(document.pages, start=1):
            content = page.extract_text()
            if not content:
                # Nothing usable on this page; skip the banner as well.
                continue
            chunks.append(f'\n--- Page {page_number} ---\n')
            chunks.append(content + '\n')
    return ''.join(chunks)

def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count the tokens that *string* occupies under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to load.

    Returns:
        The number of tokens produced by encoding ``string``.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # By default ``encode`` raises ValueError when the text contains a
    # special-token literal such as "<|endoftext|>". The input here is
    # arbitrary document text, so such sequences are encoded as ordinary
    # text instead of aborting the count.
    return len(encoding.encode(string, disallowed_special=()))

# Page heading
st.title("📄 PDF to Text Extractor with Token Counter")

# Upload widget restricted to the "pdf" file type
uploaded_pdf = st.file_uploader("Upload your PDF file here", type=["pdf"])

if uploaded_pdf:
    st.subheader("📋 Extracted Text from PDF")

    # Pull the text out of the uploaded document
    extracted_text = extract_text_from_pdf(uploaded_pdf)

    if not extracted_text:
        # Extraction produced an empty string: nothing to show or count
        st.warning("No text could be extracted from this PDF.")
    else:
        # Token total for the whole extracted text
        token_count = num_tokens_from_string(extracted_text)

        # Show the text itself
        st.text_area(label="Extracted Text", value=extracted_text, height=400)

        # Show the token total
        st.metric("📏 Total Tokens", f"{token_count} tokens")

        # Offer the extracted text as a plain-text download
        st.download_button(
            label="💾 Download Extracted Text",
            data=extracted_text,
            file_name='extracted_text.txt',
            mime='text/plain',
        )


2024-12-17 11:15:36.434 
  Warning: to view this Streamlit app on a browser, run it with the following
  command:

    streamlit run /opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]
