In [1]:
import copy
import os
import re
import sys

from dotenv import load_dotenv
load_dotenv()

from openai import OpenAI

import json
import numpy as np
import pandas as pd

from tqdm import tqdm

# Loading functions from the scripts
"""
Mostafa:
I used the new structured output for question generation.
It's a beta version, but it works on my end (10/23/2024).
https://platform.openai.com/docs/guides/structured-outputs/structured-outputs

For answer generation, I had some issues, so I used the standard API.

Please upgrade before running this notebook: pip install --upgrade openai
"""

# Add the notebook's parent directory to the Python path so that the
# `scripts` package can be imported. The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from scripts.chunking import process_latex_files
from scripts.embedding import get_embeddings, fixed_knn_retrieval
from scripts.prompts import gen_questions, gen_questions_s, gen_answer


## 0. Setting API and Models

In [2]:
"""
Mostafa: 
I suggest using 'gpt-4o' for production runs, but it is more expensive.
For embeddings, I recommend 'text-embedding-3-large.' We only need to run it once, but it also costs more.

# https://openai.com/api/pricing/
# https://openai.com/index/new-embedding-models-and-api-updates/
# https://platform.openai.com/docs/guides/embeddings/embedding-models
"""

# The key is read from the OPENAI_API_KEY environment variable; do not paste
# a real key into the notebook.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

production_mode = True
chunk_by_section = True
chpt_for_quest_answ = 108
author_for_quest_answ = "garikipati"

# A plain truthiness test guarantees the model settings are always defined,
# even if production_mode is ever set to something other than True/False.
if production_mode:
    llm_model_questions = "gpt-4o"
    llm_model_answers = llm_model_questions     # option to run different model
    embedding_size = "large"                    # small or large
else:
    # NOTE(review): these values are currently identical to the production
    # ones; change them here to use cheaper models for test runs.
    llm_model_questions = "gpt-4o"
    llm_model_answers = llm_model_questions     # option to run different model
    embedding_size = "large"                    # small or large

embedding_model = f"text-embedding-3-{embedding_size}"  # NOTE: this must be the same for all embeddings.
author_for_quest_answ = author_for_quest_answ.lower()

# Setting path for root data folder
main_dir = f'../data/{author_for_quest_answ}_latex_Q_then_A_use_context'

# exist_ok=True makes this safe to re-run and avoids the check-then-create race.
os.makedirs(main_dir, exist_ok=True)


## 1. Generating Context Embedding Space

In [3]:
"""
Mostafa: 
I used fixed size chunks (512) with a 25% overlap.
Make sure environment_sensitive is set to False for fixed size.

We should embed all chapters to generate the embedding space. For the demo, I only included two chapters.
Please update the paths in latex_file_paths.
"""

# Chapter files that make up the retrieval corpus. The Hughes chapters are
# listed once and reused, since the Garikipati corpus also includes them.
hughes_chapter_paths = [
    f'../data/FEM_Hughes_LaTeX_Textbook/chapter{n}.tex'
    for n in (1, 2, 3, 4, 7, 8, 9, 10)
]
garikipati_lecture_paths = [
    f'../data/FEM_Garikipati_lectures/chapter{n}.tex'
    for n in range(101, 109)
]

author_key = author_for_quest_answ.lower()
if author_key == "hughes":
    latex_file_paths = list(hughes_chapter_paths)
elif author_key == "garikipati":
    latex_file_paths = hughes_chapter_paths + garikipati_lecture_paths
else:
    # Fail here rather than later with a NameError (or, worse, silently
    # reusing latex_file_paths left over from a previous run of this cell).
    raise ValueError(f"Unsupported author_for_quest_answ: {author_for_quest_answ!r}")

tokens_per_chunk = 4096                         # was 512
token_overlap = int(0.25 * tokens_per_chunk)    # 25% overlap
environment_sensitive = False                   # If False, equations can split between two chunks, but chunk lengths remain fixed.

#------------------------------------------------------------------------
# The cache file name encodes the chunking settings, so changing them
# produces a new file instead of reusing a stale one.
if chunk_by_section:
    embedding_space_file_name = f'{main_dir}/{author_for_quest_answ}_latex_embedding_space_by_sections_tpc{tokens_per_chunk}_{embedding_size}.json'
    token_overlap = 0
else:
    embedding_space_file_name = f'{main_dir}/{author_for_quest_answ}_latex_embedding_space_tpc{tokens_per_chunk}_o{token_overlap}_{embedding_size}.json'
print(f"embedding space filename: {embedding_space_file_name}")

space = {}  # NOTE(review): not referenced in the visible cells; kept in case a later cell reads it.
if not os.path.exists(embedding_space_file_name):
    # No cache yet: chunk the LaTeX sources and embed every chunk.
    chunks = process_latex_files(latex_file_paths,
                                 tokens_per_chunk,
                                 token_overlap,
                                 environment_sensitive,
                                 chunk_by_section = chunk_by_section)

    chunk_length = []
    char_length = []
    # Each chunk is printed once here; the former print(chunks) dumped the
    # same text a second time and was dropped.
    for chunk in chunks:
        # Word count is computed outside the f-string: reusing double quotes
        # inside a double-quoted f-string is a SyntaxError before Python 3.12.
        word_count = len(chunk.split(' '))
        print(f"chunk word length: {word_count}, chunk char length: {len(chunk)}, chunk = {chunk}")
        chunk_length.append(word_count)
        char_length.append(len(chunk))
    print(f"max chunk length in words = {np.max(chunk_length)}")
    print(f"max chunk length in char = {np.max(char_length)}")

    # using api
    embedding_space = get_embeddings(client, chunks, model=embedding_model)

    # save
    with open(embedding_space_file_name, 'w') as json_file:
        json.dump({'embedding_model': embedding_model, 'chunks': chunks, 'embedding_space': embedding_space}, json_file)

    print("saved")
else:
    # load the cached chunks and embeddings
    with open(embedding_space_file_name, 'r') as json_file:
        loaded_data = json.load(json_file)

    chunks = loaded_data['chunks']
    embedding_space = loaded_data['embedding_space']
    print("loaded")

# Both branches end with array-typed chunks/embeddings so later cells can
# index chunks with the array of indices returned by the retrieval step.
chunks = np.array(chunks)
embedding_space = np.array(embedding_space)
print("Space size:", embedding_space.shape)


embedding space filename: ../data/garikipati_latex_Q_then_A_use_context/garikipati_latex_embedding_space_by_sections_tpc4096_large.json
loaded
Space size: (221, 3072)


## 2. Generating Questions and Their Embeddings

In [4]:
"""
Mostafa: 
For generating questions, we want larger chunks with a bit of overlap.
The following values are just for this demo, so please adjust them as needed.

I only ran Chapter One.
"""


chapter = chpt_for_quest_answ

# Source file for the chapter that questions are generated from.
if author_for_quest_answ == "hughes":
    latex_file_path = f'../data/FEM_Hughes_LaTeX_Textbook/chapter{chapter}.tex'
elif author_for_quest_answ == "garikipati":
    latex_file_path = f'../data/FEM_Garikipati_lectures/chapter{chapter}.tex'
else:
    # Fail here instead of hitting a NameError (or a stale latex_file_path
    # from a previous run) further down.
    raise ValueError(f"Unsupported author_for_quest_answ: {author_for_quest_answ!r}")

max_questions = 40                             # max number of questions per chunk

tokens_per_chunk = 1536
token_overlap = int(0.2 * tokens_per_chunk)   # 20% overlap
environment_sensitive = True                  # If True, equations won't be split between chunks, which may result in chunks larger than the specified tokens_per_chunk

#------------------------------------------------------------------------
def embed_all_q(questions):
    """Embed every generated question and attach the vector to its record.

    Walks ``questions['data']``, collects the ``'question'`` text of every
    entry in each item's ``'questions'`` list, sends them all to
    ``get_embeddings`` in one call, and stores the i-th returned embedding
    under the ``'embedding'`` key of the i-th question record.

    The input dictionary is modified in place and also returned.
    """
    # Flatten once so texts and records are guaranteed to stay in the same order.
    question_records = [
        record
        for block in questions['data']
        for record in block['questions']
    ]
    question_texts = [record['question'] for record in question_records]

    # using api
    vectors = get_embeddings(client, question_texts, model = embedding_model)

    # add them to data
    for position, record in enumerate(question_records):
        record['embedding'] = vectors[position]

    print('Questions are embedded')
    return questions



#------------------------------------------------------------------------
# The cache file name encodes the generation settings used for this chapter.
if chunk_by_section:
    questions_file_name = f"{main_dir}/{author_for_quest_answ}_ch{chapter}_Qs_n{max_questions}_by_sections_tpc{tokens_per_chunk}.json"
    token_overlap = 0
else:
    questions_file_name = f"{main_dir}/{author_for_quest_answ}_ch{chapter}_Qs_n{max_questions}_tpc{tokens_per_chunk}_o{token_overlap}.json"

if not os.path.exists(questions_file_name):
    question_chunks = process_latex_files(latex_file_path, tokens_per_chunk, token_overlap, environment_sensitive, chunk_by_section=chunk_by_section)

    if not production_mode:
        question_chunks = question_chunks[0:7] # for testing small batch

    for question_chunk in question_chunks:
        # Single quotes inside the double-quoted f-string keep this valid on
        # Python versions older than 3.12.
        print(f"chunk word length: {len(question_chunk.split(' '))}, chunk char length: {len(question_chunk)}, chunk = {question_chunk}")

    # Main data structure; 'info' records the settings used for generation.
    questions = {
        'info': {
            'tokens_per_chunk': tokens_per_chunk,
            'token_overlap': token_overlap,
            'environment_sensitive': environment_sensitive,
            'max_questions': max_questions,
            'embedding_model': embedding_model,
            'llm_model_questions': llm_model_questions,
            'llm_model_answers': llm_model_answers
        },
        'data': [],
    }

    ## step 1: generate questions
    # gen_questions_s is used here; gen_questions (also imported) takes the
    # same arguments and can be swapped in if needed.
    for question_chunk in tqdm(question_chunks, desc="Generating Questions"):
        q_for_chunk = gen_questions_s(client, question_chunk, max_questions, model=llm_model_questions)
        questions['data'].append({'chunk': question_chunk, 'questions': q_for_chunk})
    print('Questions are generated')

    ## step 2: embedding all questions at once
    questions = embed_all_q(questions)

    with open(questions_file_name, 'w') as json_file:
        json.dump(questions, json_file, indent=4)
    print('saved', questions_file_name)

else:
    with open(questions_file_name, 'r') as json_file:
        questions = json.load(json_file)

    print('loaded', questions_file_name)

# Question embeddings must come from the same model as the context embedding
# space, otherwise nearest-neighbour retrieval is meaningless.
# NOTE(review): the re-embedded questions are only kept in memory; the cached
# file is not rewritten, so this re-embedding repeats on every run until the
# file is regenerated.
if questions['info']['embedding_model'] != embedding_model:
    print("embedding model mismatch. re-embedding questions")
    questions = embed_all_q(questions)
    questions['info']['embedding_model'] = embedding_model



chunk word length: 1401, chunk char length: 7542, chunk = Okay. So let's get on. So what, what we're aiming to do in this segment is get some sense of the stability of the equations that we need to look at. Now if we want to look at stability we have to first understand we must first understand the stability of the time exact case, all right? Because that is the, the sort of behavior, the sort of response we are aspiring towards for our system, right? For our algorithmic system. Okay? So, in terms of stability, let's first understand the time exact case. Right, now, we've derived the single degree of freedom, modal equations, for a partic, for an arbitrary mode L. Okay? Now, everything we do holds for every mode, right? The, the, the, our, our analysis holds for any mode, because we're really working for an arbitrary mode. With that in mind, I can afford, I believe, to drop the explicit, use of the modal index, L. Okay? All right, so I'm going to drop that, right? So, from now on, the 

Generating Questions: 100%|██████████| 27/27 [05:17<00:00, 11.77s/it]


Questions are generated
Questions are embedded
saved ../data/garikipati_latex_Q_then_A_use_context/garikipati_ch108_Qs_n40_by_sections_tpc1536.json


## 3. Context Retrieval and Generating Answers 

In [6]:
"""
Mostafa: 
Since we answer each question separately, this process is slow.
We might want to consider using the batch API for this.
"""

top_k = 10   # number of retrieved closest contexts

#------------------------------------------------------------------------
if chunk_by_section:
    questions_answers_file_name = f"{main_dir}/{author_for_quest_answ}_ch{chapter}_QAs_n{max_questions}_topk{top_k}_by_sections.json"
else:
    questions_answers_file_name = f"{main_dir}/{author_for_quest_answ}_ch{chapter}_QAs_n{max_questions}_topk{top_k}_tpc{tokens_per_chunk}_o{token_overlap}.json"

if not os.path.exists(questions_answers_file_name):

    # Deep copy: dict.copy() is shallow, so writing 'context' / 'answer' into
    # the nested question records would also modify `questions`.
    questions_answers = copy.deepcopy(questions)

    # step 1) finding top_k context from the book embedding and adding them to each question
    for item in questions_answers['data']:
        for sub_item in item['questions']:
            ind = fixed_knn_retrieval(sub_item['embedding'], embedding_space, top_k)
            # Concatenate the retrieved chunks into one labelled context string.
            sub_item['context'] = ''.join(
                f'\n\n Additional context {i}: {chunk}'
                for i, chunk in enumerate(chunks[ind])
            )
    print('top_k context added')

    # step 2) generating answers (slow)  (should we try batch API?)
    # The prompt context is the chunk the question was generated from,
    # followed by the retrieved chunks.
    for item in tqdm(questions_answers['data'], desc="Answering Questions"):
        question_chunk = item['chunk']
        for sub_item in item['questions']:
            question = sub_item['question']
            context = question_chunk + sub_item['context']
            sub_item['answer'] = gen_answer(client, question, context, model = llm_model_answers)
    print('Questions are answered')

    with open(questions_answers_file_name, 'w') as json_file:
        json.dump(questions_answers, json_file, indent=4)
    print('saved', questions_answers_file_name)

else:
    with open(questions_answers_file_name, 'r') as json_file:
        questions_answers = json.load(json_file)

    print('loaded', questions_answers_file_name)


loaded ../data/garikipati_latex_Q_then_A_use_context/garikipati_ch108_QAs_n40_topk10_by_sections.json


## Save a CSV file

In [7]:
"""
Mostafa: 
I think it's better to work with JSON/DataFrame in the code, but for reviewing QAs, CSV is easier to work with
"""

csv_file_name = f"{main_dir}/{author_for_quest_answ}_ch{chapter}_QAs_n{max_questions}.csv"
# ----------------------------------

# Flatten the nested QA structure into one record per question. Each record
# carries the chunk the question was generated from plus every field of the
# question except its embedding vector.
data = []
for item in questions_answers['data']:
    question_chunk = item['chunk']
    for sub_item in item['questions']:
        kept_fields = {k: v for k, v in sub_item.items() if k != 'embedding'}
        data.append({'question_chunk': question_chunk, **kept_fields})

# Fixed column order for the exported CSV.
review_columns = ['question_chunk','context','coverage','question','answer']
df = pd.DataFrame(data)[review_columns]
df.to_csv(csv_file_name)
df

Unnamed: 0,question_chunk,context,coverage,question,answer
0,"Okay. So let's get on. So what, what we're aim...",\n\n Additional context 0: All right. What we'...,80,What is the significance of understanding the ...,Understanding the stability of the time exact ...
1,"Okay. So let's get on. So what, what we're aim...",\n\n Additional context 0: All right. What we'...,85,Explain the role of spatial discretization in ...,Spatial discretization plays a crucial role in...
2,"Okay. So let's get on. So what, what we're aim...",\n\n Additional context 0: The Galerkin method...,90,How does the eigenvalue $\\lambda^h$ relate to...,"In the context of finite element analysis, the..."
3,"Okay. So let's get on. So what, what we're aim...",\n\n Additional context 0: All right we are re...,75,Describe the exact solution for the single deg...,The exact solution for the single degree of fr...
4,"Okay. So let's get on. So what, what we're aim...",\n\n Additional context 0: All right. What we'...,70,Why is it important to consider the homogeneou...,The importance of considering the homogeneous ...
...,...,...,...,...,...
379,So if you've got as far as this particular vid...,\n\n Additional context 0: So if you've got as...,85,What are some of the key features of the Deal....,Deal.II and FEniCS are both open-source platfo...
380,So if you've got as far as this particular vid...,\n\n Additional context 0: So if you've got as...,70,How do the resources provided by Open Michigan...,The resources provided by Open Michigan and si...
381,So if you've got as far as this particular vid...,\n\n Additional context 0: The main constituen...,65,What is the significance of having access to a...,Answer: NOT ENOUGH INFO.
382,So if you've got as far as this particular vid...,\n\n Additional context 0: So if you've got as...,80,How can the integration of computational scien...,Answer: The integration of computational scien...


## Print and review

In [8]:
import textwrap

def print_wrapped(text, wrap_length=160):
    """Print ``text`` re-flowed by ``textwrap.fill`` to ``wrap_length`` columns."""
    print(textwrap.fill(text, width=wrap_length))

In [9]:
i = 9  # try different QAs

# Pull the selected row once instead of re-indexing the frame for every field.
selected_row = df.iloc[i]

# Heading / column pairs, printed in this order.
review_sections = (
    ('Q:', 'question'),
    ('A:', 'answer'),
    ('\nChunk used for Q generation:', 'question_chunk'),
)
for heading, column in review_sections:
    print(heading)
    print_wrapped(selected_row[column])

# The context string is split on its 'Additional context' label so that each
# retrieved chunk is printed as its own wrapped paragraph.
print('\nRetrieved context:')
for item in selected_row['context'].split('Additional context'):
    print_wrapped(item)
    print()

Q:
Explain the significance of assuming a homogeneous case in the stability analysis of finite element equations.
A:
The significance of assuming a homogeneous case in the stability analysis of finite element equations is to expose the fundamental characteristic of the
equations being analyzed. By considering the homogeneous case, where external forces or sources (such as heat supply or mass influx) are set to zero, the
analysis focuses on the intrinsic behavior of the system. This allows for a clearer understanding of the natural tendency of the solution, which, in the context
provided, is to decay or remain stable over time without external influences. This fundamental characteristic is crucial for determining what the exact behavior
of the algorithmic equations should aim to represent, ensuring that the numerical methods used in the finite element analysis accurately capture the inherent
stability properties of the system.

Chunk used for Q generation:
Okay. So let's get on. So what