In [2]:
import PyPDF2 
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [30]:
def extract_text_from_pdf(pdf_paths):
    combined_text = ""
    for pdf_path in pdf_paths:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            num_pages = reader(len(reader.pages))
            for page_num in range(reader(len(reader.pages))):
                page = reader.getPage(page_num)
                combined_text += page.extractText()
    return combined_text

In [31]:
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    
    # Removing special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Removing numeric characters
    text = re.sub(r'\b\d+\b', '', text)
    
    # Removing URLs or Email Addresses
    text = re.sub(r'\b(?:https?://|www\.)\S+\b', '', text)
    text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '', text)
    
    # Handling Line Breaks and Page Numbers
    text = re.sub(r'\n|\r|\t', ' ', text)
    text = re.sub(r'\b\d+\b', '', text)
    
    # Removing Headers and Footers
    text = re.sub(r'^.*[Hh]eaders?[^$]*|.*[Ff]ooters?[^$]*', '', text)
    return text

In [32]:
def segment_text_based_on_themes(text, num_topics=5):
    # Vectorize text
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform([text])

    # Apply Latent Dirichlet Allocation (LDA)
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(X)

    # Get the most representative words for each topic
    feature_names = vectorizer.get_feature_names_out()
    segments = []
    for topic_idx, topic in enumerate(lda.components_):
        top_words_idx = topic.argsort()[:-10-1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        segments.append(', '.join(top_words))

    return segments

In [33]:
def generate_daily_readings(segments, num_days):
    # Generate daily readings by distributing segments across days
    readings_per_day = len(segments) // num_days
    daily_readings = [segments[i:i+readings_per_day] for i in range(0, len(segments), readings_per_day)]
    return daily_readings

In [34]:
def main():
    pdf_paths = ["/Users/meenakshigopalakrishnan/EcoTrackr-Backend/AncientEgypt/encyclo.pdf", "/Users/meenakshigopalakrishnan/EcoTrackr-Backend/AncientEgypt/spqr.pdf","/Users/meenakshigopalakrishnan/EcoTrackr-Backend/AncientEgypt/THE TWELVE CAESARS.pdf", "/Users/meenakshigopalakrishnan/EcoTrackr-Backend/AncientEgypt/The_Roman_Triumph_Mary_Beard.pdf"]  # List of PDF paths
    text = extract_text_from_pdf(pdf_paths)
    processed_text = preprocess_text(text)
    segments = segment_text_based_on_themes(processed_text)
    num_days = 7  # Number of days for daily readings
    daily_readings = generate_daily_readings(segments, num_days)
    return daily_readings

In [35]:
daily_readings = main()

# Get the first segment
example_segment = daily_readings[0][0]  

# Print out the example segment
print("Example Segment:")
print(example_segment)

TypeError: 'PdfReader' object is not callable

In [28]:
import spacy


In [29]:
nlp = spacy.load("en_core_web_sm")

def tokenize_text(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    return tokens