# ETL Pipeline

In [None]:
#!pip install langchain
#!pip install pypdf
#!pip install html2text
#!pip install pylatexenc
#!pip install python-dotenv

In [None]:
import os
import sys
import langchain
from langchain.document_loaders import PyPDFLoader
from dotenv import load_dotenv
import json
from pylatexenc.latexencode import UnicodeToLatexEncoder
import re

## Setup textbook directory and file configurations

In [None]:
# Pull settings from a local .env file (if one exists) into the environment.
load_dotenv()

# Folder holding the source PDFs; the conversion step reads
# '<TEXTBOOK_DIRECTORY>/<book name>.pdf' for every configured textbook.
TEXTBOOK_DIRECTORY = os.getenv('TEXTBOOK_DIRECTORY')
# Folder that receives the converted chapters, one sub-folder per textbook.
DATA_DIRECTORY = os.getenv('DATA_DIRECTORY')

# os.getenv returns None for an unset variable; without this check the paths
# built later would silently become 'None/...'. Fail early with a clear message.
_missing_settings = [
    name
    for name, value in (
        ('TEXTBOOK_DIRECTORY', TEXTBOOK_DIRECTORY),
        ('DATA_DIRECTORY', DATA_DIRECTORY),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        'Missing required environment variable(s): '
        + ', '.join(_missing_settings)
        + ' (set them in the environment or in a .env file)'
    )

# Textbook configuration: keys are textbook names, and each value carries a
# 'chapters' list describing the page range of every chapter.
# JSON is read explicitly as UTF-8 so the result does not depend on the locale.
with open('./textbook_config.json', encoding='utf-8') as f:
    textbook_config = json.load(f)

print("Textbooks: ", textbook_config.keys())

## Convert textbooks into plain text

In [None]:
def _clean_latex(text):
    """Tidy the raw output of the Unicode-to-LaTeX encoder.

    Args:
        text: LaTeX string produced by ``UnicodeToLatexEncoder`` with
            ``replacement_latex_protection='braces-all'``, i.e. every
            substituted character is wrapped in braces.

    Returns:
        The string with ``\\ensuremath{...}`` wrappers removed, typographic
        apostrophes and ligatures turned back into plain characters, and
        bullet markers dropped.
    """
    # Remove all instances of \ensuremath{}, keeping only what they wrap.
    text = re.sub(r'\\ensuremath\{(.+?)\}', r'\1', text)

    # \textquoteright is the right single quote / apostrophe ("don't").
    # The plain-text equivalent is "'"; a backtick is the *opening* quote in
    # LaTeX and would corrupt contractions and possessives.
    text = re.sub(r'\{\\textquoteright\}', "'", text)

    # PDF ligature glyphs come back as braced letter groups; unwrap them.
    # Longer alternatives are listed first only for readability -- the closing
    # brace already forces a full match.
    text = re.sub(r'\{(ffi|ffl|ff|fi|fl)\}', r'\1', text)

    # Bullet markers carry no information in the plain-text output.
    text = re.sub(r'\{\\textbullet\}', '', text)
    return text


# The encoder is configured identically for every textbook, so build it once.
# 'braces-all' wraps each replacement in braces; 'ignore' drops characters
# that have no known LaTeX equivalent.
encoder = UnicodeToLatexEncoder(
    replacement_latex_protection='braces-all',
    unknown_char_policy='ignore'
)

for book_name, book_info in textbook_config.items():

    if book_name == 'Deep Learning':  # Skip Deep Learning textbook due to formatting issues
        continue

    loader = PyPDFLoader(f'{TEXTBOOK_DIRECTORY}/{book_name}.pdf')
    pages = loader.load()

    # One output folder per textbook; create it once rather than per chapter.
    book_output_directory = f'{DATA_DIRECTORY}/{book_name}'
    os.makedirs(book_output_directory, exist_ok=True)

    for chapter in book_info['chapters']:
        # Each entry maps a chapter identifier to a [first_page, last_page]
        # pair; the -1 below treats first_page as 1-based and last_page as
        # inclusive.
        for chapter_number, page_range in chapter.items():
            chapter_text = '\n'.join(
                pages[i].page_content
                for i in range(page_range[0] - 1, page_range[1])
            )

            # Convert the unicode characters to LaTeX, then fix up the
            # conversions that do not suit plain-text output.
            chapter_content_latex = _clean_latex(encoder.unicode_to_latex(chapter_text))

            # Write explicitly as UTF-8 so the files are identical across
            # platforms regardless of the locale's default encoding.
            with open(f'{book_output_directory}/{chapter_number}.txt', 'w', encoding='utf-8') as f:
                f.write(chapter_content_latex)

    print(f'Processed {book_name}')