In [None]:
%%capture
!pip install pdfplumber mistralai

In [None]:
import re
import time
import math
import os
import pdfplumber
from mistralai import Mistral

def clean_text(text):
    """Normalize text extracted from a PDF page into one compact line.

    Steps, in order:
      1. replace hyphens and en/em dashes with a space;
      2. drop bullet glyphs and periods, except a period that sits between
         two digits, so decimals and section numbers ("3.83", "1.1") survive;
      3. collapse every run of whitespace (newlines included) into a single
         space and trim both ends.

    Args:
        text: Raw page text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned single-line string, or "" when there is nothing to clean.
    """
    if not text:
        return ""
    text = re.sub(r'[-–—]', ' ', text)  # Replace dashes with a space
    # A period is removed when it is NOT flanked by digits on both sides;
    # this still strips sentence dots and table-of-contents dot leaders.
    text = re.sub(r'[•▪●·‣]|(?<!\d)\.|\.(?!\d)', '', text)
    # One whitespace pass at the end covers both the original newlines and the
    # gaps opened by the removals above.
    return re.sub(r'\s+', ' ', text).strip()

def extract_text_from_pdf(pdf_path, start_page, end_page):
    """Return the cleaned text of pages ``start_page`` .. ``end_page - 1`` of a PDF.

    ``start_page`` is a zero-based index and ``end_page`` is exclusive; it is
    capped at the number of pages in the document. Pages whose extraction
    yields nothing are skipped. Every other page is passed through
    ``clean_text`` and followed by a blank line; the combined result is
    stripped before being returned.
    """
    pieces = []
    with pdfplumber.open(pdf_path) as pdf:
        last = min(end_page, len(pdf.pages))  # never read past the final page
        for index in range(start_page, last):
            raw = pdf.pages[index].extract_text()
            if not raw:
                continue
            pieces.append(clean_text(raw) + "\n\n")
    return "".join(pieces).strip()

def stream_text(text, delay=0.02):
    """Print ``text`` one word at a time to give a "typing" effect.

    Line breaks in ``text`` are kept. Within a line the words are re-split on
    whitespace, so indentation and repeated spaces collapse and every word is
    followed by exactly one space.

    Args:
        text: The string to print.
        delay: Seconds to pause after each word. Zero or a negative value
            disables the pause (useful for tests and batch runs).
    """
    for line in text.split("\n"):  # Split by lines
        for word in line.split():  # Split each line into words
            print(word, end=" ", flush=True)
            if delay > 0:  # time.sleep() rejects negative values
                time.sleep(delay)
        print()  # Ensure new line after each sentence/paragraph

# API key: read from the environment only, so no secret is stored in the notebook.
# NOTE(review): a literal key used to be the fallback here; treat it as leaked and revoke it.
api_key = os.getenv("MISTRAL_API_KEY", "").strip()
if not api_key:
    raise RuntimeError(
        "MISTRAL_API_KEY is not set. Set it in the environment "
        "(e.g. os.environ['MISTRAL_API_KEY'] = ...) before running this cell."
    )
agent_id = "ag:0303e266:20250227:untitled-agent:d181a2b1"  # Replace with your real agent ID

# PDF File
pdf_path = "/content/stemnotes.pdf"  # Change this to your actual PDF file path

# Count total pages
with pdfplumber.open(pdf_path) as pdf:
    total_pages = len(pdf.pages)

# Number of pages sent to the agent per request
chunk_size = 10
num_chunks = math.ceil(total_pages / chunk_size)

# Summarize the PDF chunk by chunk: extract the pages, send them to the agent,
# then report timing and stream the returned notes.
with Mistral(api_key=api_key) as mistral:
    for chunk in range(num_chunks):
        start_page = chunk * chunk_size
        # Clamp to the document length so the last chunk reports its real final page.
        end_page = min(start_page + chunk_size, total_pages)

        # Extract and clean text
        document_content = extract_text_from_pdf(pdf_path, start_page, end_page)
        if not document_content:
            # No extractable text (e.g. scanned pages): don't spend an API call on an empty prompt.
            print(f"\n⚠️ Skipping pages {start_page+1} - {end_page}: no extractable text")
            continue

        # Start time for tracking
        start_time = time.time()

        # Call Mistral API
        res = mistral.agents.complete(
            messages=[
                {
                    "role": "user",
                    "content": f"generate notes from this content don't miss any point:\n\n{document_content}"
                }
            ],
            agent_id=agent_id,
            stream=False
        )

        # End time for tracking
        end_time = time.time()

        # Extract summary. Named `summary_text` (not `clean_summary`) so it cannot
        # shadow the clean_summary() function defined in later cells.
        summary_text = res.choices[0].message.content or ""

        # Print execution time
        print(f"\n⏳ **Processing Pages {start_page+1} - {end_page}:**")
        print(f"⏳ **Start Time:** {start_time}")
        print(f"⏳ **End Time:** {end_time}")
        print(f"⏳ **Execution Time:** {end_time - start_time:.2f} seconds\n")

        # Stream the text word by word while preserving line breaks
        print(f"📌 **Summary for Pages {start_page+1} - {end_page}:**\n")
        stream_text(summary_text)



⏳ **Processing Pages 1 - 10:**
⏳ **Start Time:** 1740656644.4696386
⏳ **End Time:** 1740656670.2455604
⏳ **Execution Time:** 25.78 seconds

📌 **Summary for Pages 1 - 10:**

**Notes from Talia Gafoor's Semester 1 Report Card 2024-2025** 

**Student Information** 
- Grade: 9th 
- GPA: 3.83 
- Graduation Year: 2028 
- Student ID: 217496 
- Address: 776 Greenleaf Avenue, Glencoe, IL 60022 
- Date of Birth: 05/06/2010 
- Reporting Period: Aug 26 - Jan 01 
- Advisor: Mr. David Scarisbrick 

**Courses and Teachers** 
1. **English 9: Foundations of Critical Reading and Expression** - Dr. Katelyn Hartke 
- Grade: A 
- Comments: 
- Making steady progress in writing and analytical skills. 
- Actively participates in discussions and takes feedback constructively. 
- Needs to improve supporting ideas with specific textual evidence and analysis. 

2. **Geometry** - Ms. Emily Hansen 
- Grade: A 
- Comments: 
- Earned 94% in homework, quizzes, and tests, and 99% on the semester exam. 
- Actively enga

In [None]:
import re
import time
import math
import os
import pdfplumber
from mistralai import Mistral

def clean_text(text):
    """Normalize text extracted from a PDF page into one compact line.

    Steps, in order:
      1. replace hyphens and en/em dashes with a space;
      2. drop bullet glyphs and periods, except a period that sits between
         two digits, so decimals and section numbers ("3.83", "1.1") survive;
      3. collapse every run of whitespace (newlines included) into a single
         space and trim both ends.

    Args:
        text: Raw page text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned single-line string, or "" when there is nothing to clean.
    """
    if not text:
        return ""
    text = re.sub(r'[-–—]', ' ', text)  # Replace dashes with a space
    # A period is removed when it is NOT flanked by digits on both sides;
    # this still strips sentence dots and table-of-contents dot leaders.
    text = re.sub(r'[•▪●·‣]|(?<!\d)\.|\.(?!\d)', '', text)
    # One whitespace pass at the end covers both the original newlines and the
    # gaps opened by the removals above.
    return re.sub(r'\s+', ' ', text).strip()

def extract_text_from_pdf(pdf_path, start_page, end_page):
    """Return the cleaned text of pages ``start_page`` .. ``end_page - 1`` of a PDF.

    ``start_page`` is a zero-based index and ``end_page`` is exclusive; it is
    capped at the number of pages in the document. Pages whose extraction
    yields nothing are skipped. Every other page is passed through
    ``clean_text`` and followed by a blank line; the combined result is
    stripped before being returned.
    """
    pieces = []
    with pdfplumber.open(pdf_path) as pdf:
        last = min(end_page, len(pdf.pages))  # never read past the final page
        for index in range(start_page, last):
            raw = pdf.pages[index].extract_text()
            if not raw:
                continue
            pieces.append(clean_text(raw) + "\n\n")
    return "".join(pieces).strip()

def clean_summary(text):
    """Strip boilerplate headers and intro sentences from a model summary.

    Header phrases are removed only when they make up an entire line, so a
    longer heading that merely starts with the same words is left intact
    instead of being cut in half. Intro sentences are removed wherever they
    occur. All phrases are matched literally and case-insensitively.

    Args:
        text: The raw summary. ``None`` or an empty string is accepted.

    Returns:
        The summary without the boilerplate, stripped of outer whitespace.
    """
    if not text:
        return ""

    # Removed only as a standalone line.
    header_lines = [
        r"### Key Points from the Document",
    ]
    # Removed anywhere in the text.
    intro_phrases = [
        r"Based on the provided document, here are the key points:",
        r"The main highlights are as follows:",
        r"Here is a summary of the document:",
        r"In summary, the key takeaways are:",
    ]

    for header in header_lines:
        whole_line = r"^[ \t]*" + re.escape(header) + r"[ \t]*$"
        text = re.sub(whole_line, "", text, flags=re.IGNORECASE | re.MULTILINE)

    for phrase in intro_phrases:
        # re.escape keeps punctuation in the phrase from acting as regex syntax.
        text = re.sub(re.escape(phrase) + r"[ \t]*", "", text, flags=re.IGNORECASE)

    return text.strip()

def stream_text(text, delay=0.02):
    """Print ``text`` one word at a time to give a "typing" effect.

    Line breaks in ``text`` are kept. Within a line the words are re-split on
    whitespace, so indentation and repeated spaces collapse and every word is
    followed by exactly one space.

    Args:
        text: The string to print.
        delay: Seconds to pause after each word. Zero or a negative value
            disables the pause (useful for tests and batch runs).
    """
    for line in text.split("\n"):  # Split by lines
        for word in line.split():  # Split each line into words
            print(word, end=" ", flush=True)
            if delay > 0:  # time.sleep() rejects negative values
                time.sleep(delay)
        print()  # Ensure new line after each sentence/paragraph

# API key: read from the environment only, so no secret is stored in the notebook.
# NOTE(review): a literal key used to be the fallback here; treat it as leaked and revoke it.
api_key = os.getenv("MISTRAL_API_KEY", "").strip()
if not api_key:
    raise RuntimeError(
        "MISTRAL_API_KEY is not set. Set it in the environment "
        "(e.g. os.environ['MISTRAL_API_KEY'] = ...) before running this cell."
    )
agent_id = "ag:0303e266:20250227:untitled-agent:d181a2b1"  # Replace with your real agent ID

# PDF File
pdf_path = "/content/stemnotes.pdf"  # Change this to your actual PDF file path

# Count total pages
with pdfplumber.open(pdf_path) as pdf:
    total_pages = len(pdf.pages)

# Process in chunks of 20 pages
chunk_size = 20
num_chunks = math.ceil(total_pages / chunk_size)

# Summarize the PDF chunk by chunk: extract the pages, send them to the agent,
# strip boilerplate from the reply, then report timing and stream the result.
with Mistral(api_key=api_key) as mistral:
    for chunk in range(num_chunks):
        start_page = chunk * chunk_size
        # Clamp to the document length so the last chunk reports its real final page.
        end_page = min(start_page + chunk_size, total_pages)

        # Extract and clean text
        document_content = extract_text_from_pdf(pdf_path, start_page, end_page)
        if not document_content:
            # No extractable text (e.g. scanned pages): don't spend an API call on an empty prompt.
            print(f"\n⚠️ Skipping pages {start_page+1} - {end_page}: no extractable text")
            continue

        # Start time for tracking
        start_time = time.time()

        # Call Mistral API
        res = mistral.agents.complete(
            messages=[
                {
                    "role": "user",
                    "content": f"Summarize the key points from the following content in simple format:\n\n{document_content}"
                }
            ],
            agent_id=agent_id,
            stream=False
        )

        # End time for tracking
        end_time = time.time()

        # Extract and clean summary; `or ""` guards against a reply with no content.
        raw_summary = res.choices[0].message.content or ""
        clean_summary_text = clean_summary(raw_summary)  # Remove unnecessary text

        # Print execution time
        print(f"\n⏳ **Processing Pages {start_page+1} - {end_page}:**")
        print(f"⏳ **Start Time:** {start_time}")
        print(f"⏳ **End Time:** {end_time}")
        print(f"⏳ **Execution Time:** {end_time - start_time:.2f} seconds\n")

        # Stream the cleaned summary
        print(f"📌 **Summary for Pages {start_page+1} - {end_page}:**\n")
        stream_text(clean_summary_text)



⏳ **Processing Pages 1 - 20:**
⏳ **Start Time:** 1740635778.848977
⏳ **End Time:** 1740635858.1119647
⏳ **Execution Time:** 79.26 seconds

📌 **Summary for Pages 1 - 20:**

### Summary of Key Points from Lecture Notes on Probability, Statistics, and Linear Algebra 

#### **Contents** 
1. **Data Exploration** 
- **1.1 Snowfall Data** 
- **1.2 Data Mining** 
- **1.3 Exercises** 

2. **Basic Notions from Probability Theory** 
- **2.1 Talking the Talk** 
- **2.2 Axiomatic Definition of Probability** 
- **2.3 Computing Probabilities for Subsets** 
- **2.4 Some Consequences of the Definition** 
- **2.5 That’s All There Is to Probability** 
- **2.6 Exercises** 

3. **Conditional Probability** 
- **3.1 The Definition of Conditional Probability** 
- **3.2 Independent Events** 
- **3.3 Bayes' Theorem** 
- **3.4 Decomposing a Subset to Compute Probabilities** 
- **3.5 More Linear Algebra** 
- **3.6 An Iterated Form of Bayes’ Theorem** 
- **3.7 Exercises** 

4. **Linear Transformations** 
- **4.1 

In [None]:
import re
import time
import math
import os
import pdfplumber
from mistralai import Mistral

def clean_text(text):
    """Normalize text extracted from a PDF page into one compact line.

    Steps, in order:
      1. replace hyphens and en/em dashes with a space;
      2. drop bullet glyphs and periods, except a period that sits between
         two digits, so decimals and section numbers ("3.83", "1.1") survive;
      3. collapse every run of whitespace (newlines included) into a single
         space and trim both ends.

    Args:
        text: Raw page text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned single-line string, or "" when there is nothing to clean.
    """
    if not text:
        return ""
    text = re.sub(r'[-–—]', ' ', text)  # Replace dashes with a space
    # A period is removed when it is NOT flanked by digits on both sides;
    # this still strips sentence dots and table-of-contents dot leaders.
    text = re.sub(r'[•▪●·‣]|(?<!\d)\.|\.(?!\d)', '', text)
    # One whitespace pass at the end covers both the original newlines and the
    # gaps opened by the removals above.
    return re.sub(r'\s+', ' ', text).strip()

def extract_text_from_pdf(pdf_path, start_page, end_page):
    """Return the cleaned text of pages ``start_page`` .. ``end_page - 1`` of a PDF.

    ``start_page`` is a zero-based index and ``end_page`` is exclusive; it is
    capped at the number of pages in the document. Pages whose extraction
    yields nothing are skipped. Every other page is passed through
    ``clean_text`` and followed by a blank line; the combined result is
    stripped before being returned.
    """
    pieces = []
    with pdfplumber.open(pdf_path) as pdf:
        last = min(end_page, len(pdf.pages))  # never read past the final page
        for index in range(start_page, last):
            raw = pdf.pages[index].extract_text()
            if not raw:
                continue
            pieces.append(clean_text(raw) + "\n\n")
    return "".join(pieces).strip()

def clean_summary(text):
    """Strip boilerplate headers and intro sentences from a model summary.

    Header phrases ("### Summary", "### Conclusion") are removed only when
    they make up an entire line. Removing them as plain substrings would turn
    a heading such as "### Summary of Key Sections:" into the fragment
    "of Key Sections:". Intro sentences are removed wherever they occur.
    All phrases are matched literally and case-insensitively.

    Args:
        text: The raw summary. ``None`` or an empty string is accepted.

    Returns:
        The summary without the boilerplate, stripped of outer whitespace.
    """
    if not text:
        return ""

    # Removed only as a standalone line.
    header_lines = [
        r"### Summary",
        r"### Conclusion",
    ]
    # Removed anywhere in the text.
    intro_phrases = [
        r"Based on the provided document, here are the key points:",
        r"The main highlights are as follows:",
        r"Here is a summary of the document:",
        r"In summary, the key takeaways are:",
        r"Certainly! Here's a summary of the key points from the provided section: ",
    ]

    for header in header_lines:
        whole_line = r"^[ \t]*" + re.escape(header) + r"[ \t]*$"
        text = re.sub(whole_line, "", text, flags=re.IGNORECASE | re.MULTILINE)

    for phrase in intro_phrases:
        # Trailing blanks are matched optionally rather than literally, so the
        # phrase is also recognised when a newline follows it directly.
        literal = re.escape(phrase.strip())
        text = re.sub(literal + r"[ \t]*", "", text, flags=re.IGNORECASE)

    return text.strip()

def stream_text(text, delay=0.05):
    """Print ``text`` one word at a time to give a "typing" effect.

    Line breaks in ``text`` are kept. Within a line the words are re-split on
    whitespace, so indentation and repeated spaces collapse and every word is
    followed by exactly one space.

    Args:
        text: The string to print.
        delay: Seconds to pause after each word. Zero or a negative value
            disables the pause (useful for tests and batch runs).
    """
    for line in text.split("\n"):  # Split by lines
        for word in line.split():  # Split each line into words
            print(word, end=" ", flush=True)
            if delay > 0:  # time.sleep() rejects negative values
                time.sleep(delay)
        print()  # Ensure new line after each sentence/paragraph

# API key: read from the environment only, so no secret is stored in the notebook.
# NOTE(review): a literal key used to be the fallback here; treat it as leaked and revoke it.
api_key = os.getenv("MISTRAL_API_KEY", "").strip()
if not api_key:
    raise RuntimeError(
        "MISTRAL_API_KEY is not set. Set it in the environment "
        "(e.g. os.environ['MISTRAL_API_KEY'] = ...) before running this cell."
    )
agent_id = "ag:0303e266:20250227:agent:cbd038c5"  # Replace with your real agent ID

# PDF File
pdf_path = "/content/stemnotes.pdf"  # Change this to your actual PDF file path

# Count total pages
with pdfplumber.open(pdf_path) as pdf:
    total_pages = len(pdf.pages)

# Process in chunks of 20 pages
chunk_size = 20
num_chunks = math.ceil(total_pages / chunk_size)

# Rolling context: how many previous chunk summaries are fed back to the agent.
# Must be >= 1 (a value of 0 would make the slices below select everything).
context_window = 1
previous_summaries = []

with Mistral(api_key=api_key) as mistral:
    for chunk in range(num_chunks):
        start_page = chunk * chunk_size
        # Clamp to the document length so the last chunk covers only real pages.
        end_page = min(start_page + chunk_size, total_pages)

        # Extract and clean text
        document_content = extract_text_from_pdf(pdf_path, start_page, end_page)
        if not document_content:
            # No extractable text (e.g. scanned pages): don't spend an API call on an empty prompt.
            print(f"\n⚠️ Skipping pages {start_page+1} - {end_page}: no extractable text")
            continue

        # Start time for tracking
        start_time = time.time()

        # Summaries of the most recent `context_window` chunks
        context_summary = "\n".join(previous_summaries[-context_window:])

        # Build the prompt flush-left (an indented triple-quoted string would
        # leak its indentation into the request), and leave out the
        # "previous section" part when there is no previous summary yet.
        prompt_parts = []
        if context_summary:
            prompt_parts.append(
                f"Here is a brief summary of the previous section:\n{context_summary}"
            )
        prompt_parts.append(f"Now summarize this content:\n{document_content}")
        prompt = "\n\n".join(prompt_parts)

        # Call Mistral API
        res = mistral.agents.complete(
            messages=[{"role": "user", "content": prompt}],
            agent_id=agent_id,
            stream=False
        )

        # End time for tracking
        end_time = time.time()

        # Extract and clean summary; `or ""` guards against a reply with no content.
        raw_summary = res.choices[0].message.content or ""
        clean_summary_text = clean_summary(raw_summary)

        # Update rolling context and drop entries that fell out of the window,
        # so the list does not grow with the number of chunks.
        previous_summaries.append(clean_summary_text)
        del previous_summaries[:-context_window]

        stream_text(clean_summary_text)


The content provided is a detailed outline of lecture notes on Probability, Statistics, and Linear Algebra, authored by Cliff Taubes from the Department of Mathematics at Harvard University. The notes cover a wide range of topics, from data exploration and basic probability theory to more advanced concepts such as conditional probability, random variables, statistical inverse problems, and applications in biology and genetics. 

of Key Sections: 

1. **Data Exploration**: 
- **Snowfall Data**: Analysis of snowfall totals in Boston from 1890 to 2001. 
- **Data Mining**: Techniques for comparing datasets, including means, standard deviations, and rank sum tests. 
- **Exercises**: Practical problems related to data collection and analysis. 

2. **Basic Notions from Probability Theory**: 
- **Sample Space and Events**: Definitions and examples of sample spaces and events. 
- **Axiomatic Definition of Probability**: Rules and conditions for defining probability functions. 
- **Computing Pro

KeyboardInterrupt: 

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the raw text of every page of a PDF, one newline after each page.

    No cleaning is applied. A page for which extraction yields nothing
    contributes only its newline instead of raising ``TypeError``.

    NOTE: this definition shares its name with the three-argument, page-ranged
    helper defined in earlier cells; running this cell replaces that helper
    in the kernel.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts as a single string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""`: extract_text() may return None for a page without text.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)
# Pull the text of every page of the notes PDF into a single string.
text=extract_text_from_pdf("/content/stemnotes.pdf")

In [None]:
# Character count of `text` as extracted (assuming the cells are run in order).
len(text)

364164

In [None]:
def clean_text(text):
    """Normalize text extracted from a PDF page into one compact line.

    Steps, in order:
      1. replace hyphens and en/em dashes with a space;
      2. drop bullet glyphs and periods, except a period that sits between
         two digits, so decimals and section numbers ("3.83", "1.1") survive;
      3. collapse every run of whitespace (newlines included) into a single
         space and trim both ends.

    Args:
        text: Raw page text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned single-line string, or "" when there is nothing to clean.
    """
    if not text:
        return ""
    text = re.sub(r'[-–—]', ' ', text)  # Replace dashes with a space
    # A period is removed when it is NOT flanked by digits on both sides;
    # this still strips sentence dots and table-of-contents dot leaders.
    text = re.sub(r'[•▪●·‣]|(?<!\d)\.|\.(?!\d)', '', text)
    # One whitespace pass at the end covers both the original newlines and the
    # gaps opened by the removals above.
    return re.sub(r'\s+', ' ', text).strip()
# Rebinds `text` to its cleaned form; the raw extraction is no longer available
# under this name unless the extraction cell is run again.
text=clean_text(text)

In [None]:
# Character count of `text` after clean_text (assuming the cells are run in order).
len(text)

343828

In [None]:
# Preview only: the cleaned text is several hundred thousand characters long,
# far too much to dump into the notebook output.
text[:2000]

'LECTURE NOTES ON PROBABILITY, STATISTICS AND LINEAR ALGEBRA C H Taubes Department of Mathematics Harvard University Cambridge, MA 02138 Spring,2010 CONTENTS 1 DataExploration 2 11 Snowfalldata 3 12 Datamining 3 13 Exercises 6 2 Basicnotionsfromprobabilitytheory 7 21 Talkingthetalk 7 22 Axiomaticdefinitionofprobability 9 23 Computingprobabilitiesforsubsets 11 24 Someconsequencesofthedefinition 12 25 That’sallthereistoprobability 13 26 Exercises 13 3 Conditionalprobability 16 31 Thedefinitionofconditionalprobability 16 32 Independentevents 17 33 Bayestheorem 18 34 Decomposingasubsettocomputeprobabilities 19 35 Morelinearalgebra 22 36 AniteratedformofBayes’theorem 22 37 Exercises 23 4 Lineartransformations 25 41 Proteinmolecules 25 42 Proteinfolding 26 5 Howmatrixproductsarise 27 51 Genomics 27 52 Howbacteriafindfood 28 53 Growthofnervesinadevelopingembryo 29 54 Enzymedynamics 29 55 Exercises 29 6 Randomvariables 31 61 Thedefinitionofarandomvariable 31 62 Probabilityforarandomvariable 32