In [117]:
import re
import os
# NOTE(review): machine-specific absolute path. Relative paths used elsewhere in
# this notebook (e.g. "../Dataset/...") resolve against this working directory.
os.chdir(r"D:\Industry\Projects\machine learning chatbot\Notebooks")
import pytesseract
from pdf2image import convert_from_path
import fitz  # PyMuPDF
import pytesseract  # repeated import of the module already imported above
from PIL import Image
import io
import pandas as pd
import cv2
import numpy as np
# Point pytesseract at the Tesseract executable (machine-specific path).
pytesseract.pytesseract.tesseract_cmd = r'D:\Program Files\Tesseract-OCR\tesseract.exe'

def clean_text(text):
    r"""Flatten extracted PDF text into one whitespace-normalised line.

    Steps, in order:
      1. Replace literal ``\n`` sequences (a backslash followed by ``n``)
         with a space.
      2. Collapse every run of whitespace, including real newlines, to a
         single space so footers broken across lines become matchable.
      3. Remove "Slide X of Y" and "Lecture X of Y" footers.
      4. Collapse the double spaces left behind by step 3.

    Args:
        text: Raw text of a chunk (str).

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    text = text.replace('\\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'Slide \d+ of \d+', '', text)
    text = re.sub(r'Lecture \d+ of \d+', '', text)
    # Removing a footer from the middle of a sentence leaves two adjacent spaces.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()


def find_pdfs(root_dir):
    """Recursively collect the paths of all PDF files below ``root_dir``.

    The extension check is case-insensitive. Paths are returned in the order
    ``os.walk`` yields them.

    Args:
        root_dir: Directory to search.

    Returns:
        List of paths, each built by joining the containing directory and the
        file name.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_dir)
        for name in names
        if name.lower().endswith('.pdf')
    ]


def chunk_text(text, max_words=16000):
    """Group blank-line-separated segments of ``text`` into word-bounded chunks.

    The text is split on runs of two or more newlines. A leading "PAGE <n>"
    marker (case-insensitive) is stripped from each segment. Segments are then
    accumulated, in order, into chunks whose total word count does not exceed
    ``max_words``; segments inside a chunk are joined with a blank line.

    Segments that are empty after stripping are skipped, so empty or
    whitespace-only input yields an empty list rather than a single empty chunk.

    A single segment longer than ``max_words`` is never split; it becomes a
    chunk of its own that exceeds the limit.

    Args:
        text: Full document text with segments separated by blank lines.
        max_words: Upper bound on whitespace-delimited words per chunk.

    Returns:
        List of chunk strings, possibly empty.
    """
    chunks = []
    current_chunk = []
    current_word_count = 0

    for segment in re.split(r'\n{2,}', text.strip()):
        clean_segment = re.sub(r'^PAGE \d+\s*', '', segment.strip(), flags=re.IGNORECASE)
        word_count = len(clean_segment.split())
        if word_count == 0:
            # Nothing left once the page marker is removed; contributes no text.
            continue

        # Flush only when there is something to flush, so an oversized first
        # segment does not produce an empty chunk.
        if current_chunk and current_word_count + word_count > max_words:
            chunks.append("\n\n".join(current_chunk))
            current_chunk = []
            current_word_count = 0

        current_chunk.append(clean_segment)
        current_word_count += word_count

    if current_chunk:
        chunks.append("\n\n".join(current_chunk))

    return chunks


def extract_text_with_ocr_fallback(pdf_path):
    """Extract text page by page from a PDF, using OCR for pages with no text.

    For each page the embedded text is read with ``page.get_text()``. If that
    is empty or whitespace-only, the page is rendered to an image (no zoom or
    dpi argument is passed to ``get_pixmap``) and passed to Tesseract with the
    English language model.

    OCR failures are reported with ``print`` and the page is skipped, so the
    returned list can be shorter than the number of pages in the document.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        List of strings, one per page that produced text or an OCR result,
        in page order.
    """
    results = []
    # Context manager guarantees the document is closed, including when a
    # page raises, so the file handle is not leaked across many PDFs.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text = page.get_text()
            if text.strip():
                # Text extracted successfully
                results.append(text)
                continue

            # No text found, fallback to OCR of rendered page image
            pix = page.get_pixmap()
            img = np.array(Image.open(io.BytesIO(pix.tobytes())))
            try:
                results.append(pytesseract.image_to_string(img, lang='eng'))
            except PermissionError as e:
                print(f"Permission error when running Tesseract on {pdf_path}, page {page_num}: {e}")
            except Exception as e:
                # Best-effort: report and move on so one bad page does not abort the whole PDF.
                print(f"Other error on {pdf_path}, page {page_num}: {e}")

    return results

In [95]:
pdfs = find_pdfs("../Dataset/Not_garb")
# One row per discovered PDF with an empty "text" placeholder. Built in a single
# constructor call: concatenating inside a loop copies the frame on every
# iteration (quadratic) and starts from an empty DataFrame.
df = pd.DataFrame(
    {"pdfs": pdfs, "text": [None] * len(pdfs)},
    columns=["pdfs", "text"],
)

In [116]:
df_chunks_data = []

for pdf in pdfs:
    pages = extract_text_with_ocr_fallback(pdf)  # list of per-page strings
    full_text = "\n\n".join(pages)  # keep page separation for chunking
    # One record per chunk, each tagged with the PDF it came from.
    df_chunks_data.extend(
        {"pdfs": pdf, "text": chunk} for chunk in chunk_text(full_text)
    )

# Explicit columns keep the frame's schema stable even when no chunks were
# produced; without them an empty list yields a column-less DataFrame and
# any later df_chunks["text"] access raises KeyError.
df_chunks = pd.DataFrame(df_chunks_data, columns=["pdfs", "text"])

df_chunks

Unnamed: 0,pdfs,text
0,../Dataset/Not_garb\andrew-ng-machine-learning...,Draft Version\n\nMACHINE\nLEARNING\n\nTechnica...
1,../Dataset/Not_garb\andrew-ng-machine-learning...,37 How to decide whether to use all your data ...
2,../Dataset/Not_garb\C1M1.pdf,Andrew Ng\nIntroduction to \nDeep Learning\nWe...
3,../Dataset/Not_garb\C1M2.pdf,Basics of Neural \nNetwork Programming\nBinary...
4,../Dataset/Not_garb\C1M3.pdf,deeplearning.ai\nOne hidden layer\nNeural Netw...
...,...,...
141,../Dataset/Not_garb\pdfs\5_learning3.pdf,Lecture 4: Machine learning III\nCS221 / Autum...
142,../Dataset/Not_garb\pdfs\6_search1.pdf,Lecture 5: Search I\nCS221 / Autumn 2015 / Lia...
143,../Dataset/Not_garb\pdfs\7_search2.pdf,Lecture 6: Search II\nCS221 / Autumn 2015 / Li...
144,../Dataset/Not_garb\pdfs\8_mdp1.pdf,Lecture 7: MDPs I\nCS221 / Autumn 2015 / Liang...


In [118]:
# Normalise every chunk's text; the function is passed directly instead of
# being wrapped in a lambda.
df_chunks['text'] = df_chunks['text'].map(clean_text)
# Show the first cleaned chunk via a single label-based lookup (no chained indexing).
df_chunks.loc[0, "text"]

'Draft Version MACHINE LEARNING Technical Strategy for Al Engineers, In the Era of Deep Learning Machine Learning Yearning is a deeplearning.ai project. © 2018 Andrew Ng. All Rights Reserved. Page 2 Machine Learning Yearning-Draft Andrew Ng Deeplearning.AI Table of Contents 1 Why Machine Learning Strategy 2 How to use this book to help your team 3 Prerequisites and Notation 4 Scale drives machine learning progress 5 Your development and test sets 6 Your dev and test sets should come from the same distribution 7 How large do the dev/test sets need to be? 8 Establish a single-number evaluation metric for your team to optimize 9 Optimizing and satisficing metrics 10 Having a dev set and metric speeds up iterations 11 When to change dev/test sets and metrics 12 Takeaways: Setting up development and test sets 13 Build your first system quickly, then iterate 14 Error analysis: Look at dev set examples to evaluate ideas 15 Evaluating multiple ideas in parallel during error analysis 16 Cleanin

In [119]:
import pickle

# Write the chunk DataFrame to disk as a pickle.
# Pickle files can execute code when loaded: only unpickle files you trust.
with open("../Dataset/dataset.pkl", "wb") as pickle_file:
    pickle.dump(df_chunks, pickle_file)