In [1]:
from rag_tools.document_parsing import documentparser
from rag_tools.markdown_element import MarkdownElementNodeParser_mod

In [2]:
import glob
import pickle
import itertools
import nest_asyncio
from dotenv import load_dotenv
import os

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.extractors import (SummaryExtractor,TitleExtractor)
from llama_index.core.schema import BaseNode, TextNode, Document
from llama_index.core.node_parser import LlamaParseJsonNodeParser

In [3]:
# Read key/value pairs from a local .env file into the process environment.
load_dotenv()

# Patch asyncio so event loops can be re-entered from inside the notebook.
nest_asyncio.apply()

# API credentials come from the environment; each is None when the variable is unset.
openai_api_key, llama_cloud_api_key = (
    os.environ.get(var_name)
    for var_name in ("OPENAI_API_KEY", "LLAMA_CLOUD_API_KEY")
)

### Parse documents

In [None]:
from document_parsing import documentparser

In [None]:
# Collect the paths of every PDF in the lecture-slides folder.
# NOTE(review): glob results are not guaranteed to be ordered; wrap in sorted()
# if a stable file order matters downstream.
files = glob.glob("GSU - CIS 3260 - Fall 2023/Lecture Slides/*.pdf")  

In [None]:
# Peek at the first two matched paths.
files[:2]

['GSU - CIS 3260 - Fall 2023/Lecture Slides/Week 4.pdf',
 'GSU - CIS 3260 - Fall 2023/Lecture Slides/Week 3.pdf']

In [None]:
# Run the project's document_to_markdown helper on the first two PDFs,
# tagging them with doc_type "lecture slide".
# NOTE(review): the log lines below suggest this submits one remote parsing job
# per file — confirm cost/latency before re-running.
md = documentparser.document_to_markdown(doc = files[:2],doc_type="lecture slide")

Started parsing the file under job_id fb64c4ef-9ec7-4bb2-b606-bcd59d77e33a
Started parsing the file under job_id 4b824308-2ca0-4b1a-8a7f-befd4877af14


### Load Parsed Document 

In [52]:
# Scratch check: strip the ".pkl" extension from a parsed-file path.
# `os` is already imported in the notebook's top import cell, so it is not re-imported here.
file_path = 'GSU - CIS 3260 - Fall 2023/Lecture Slides Parsed/Week 14.pkl'

# Index the (root, ext) pair instead of unpacking into `_`: IPython uses `_` for
# the previous cell output, and assigning to it disables that shortcut.
base_path = os.path.splitext(file_path)[0]

In [53]:
# Show the path with its file extension removed.
base_path

'GSU - CIS 3260 - Fall 2023/Lecture Slides Parsed/Week 14'

In [4]:
# Load every pickled parse result from the parsed-slides folder.
# `files` and `loaded_files` stay index-aligned: loaded_files[k] came from files[k].
files = glob.glob("GSU - CIS 3260 - Fall 2023/Lecture Slides Parsed/*.pkl")
loaded_files = []
for pkl_path in files:
    # SECURITY: pickle.load can execute arbitrary code; only load pickles that
    # were produced by this project.
    # The context manager closes each file handle as soon as it has been read
    # (the bare open() used previously left every handle open).
    with open(pkl_path, "rb") as pkl_file:
        loaded_files.append(pickle.load(pkl_file))

# Keep only the first two loaded entries for the rest of the notebook.
md = loaded_files[:2]

In [7]:
# Display the loaded entries.
# NOTE(review): this echoes the full text of every document; consider previewing
# a slice instead to keep the notebook output small.
md

[[Document(id_='720eb6d8-4f68-4a81-a717-8c42d2eb139c', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='# Objectives for class 14\n\n- Review IA4-IA7\n- Review Quiz 3 – Quiz 5\n- Final Exam Schedule\n- Group Project Schedule\n---\n# Individual Assignment 4 - 2\n\n- Enter a letter grade A/a, B/b, C/c, D/d, and then displays its corresponding numeric value 90, 80, 70, 60.\n\n## Sample Run\n```\nEnter a letter grade: B\nThe numeric value for grade B is 80\n```\n\n```python\nletter = input("Enter a letter grade: ")\n\nif letter in \'Aa\':\n    print("The numeric value for grade A is 90")\nelif letter in \'Bb\':\n    print("The numeric value for grade B is 80")\nelif letter in \'Cc\':\n    print("The numeric value for grade C is 70")\nelif letter in \'Dd\':\n    print("The numeric value for grade D is 60")\nelse:\n    print(letter, "is an invalid grade")\n```\n\n- Letter in “Aa”\n- Multiple way if statements\n---\n# Individ

### Create Chunks using LlamaIndex

In [None]:
def chunk(md):
    """Split parsed documents into base nodes and objects, then add LLM metadata.

    Each item of ``md`` is run through ``MarkdownElementNodeParser_mod``
    (backed by ``gpt-4``) to produce nodes, which are then separated into
    base nodes and objects. Afterwards a ``SummaryExtractor`` is run over all
    base nodes and a ``TitleExtractor`` over all base nodes plus objects.

    Args:
        md: Iterable whose items are each passed unchanged to
            ``get_nodes_from_documents`` (in this notebook: one list of
            ``Document`` objects per source file).

    Returns:
        tuple[list, list]: ``(base_nodes_OA, objects_OA)``. Both lists hold
        one entry per item of ``md``, in the same order; each entry is the
        base-node / object list for that item.

    Notes:
        * This function is synchronous on purpose so it can be called as
          ``chunk(md)``. The async extractors are driven with
          ``asyncio.run``, which inside a notebook relies on
          ``nest_asyncio.apply()`` having been called (done in the setup cell).
        * The extractors' return values are not captured; the metadata is
          presumably written onto the node objects in place — TODO confirm
          against the installed llama_index version.
    """
    import asyncio

    node_parser_OPENAI = MarkdownElementNodeParser_mod(
        llm=OpenAI(model="gpt-4"), num_workers=8,
    )
    base_nodes_OA = []
    objects_OA = []

    for doc in md:
        pdoc_OA = node_parser_OPENAI.get_nodes_from_documents(doc)
        base, obj = node_parser_OPENAI.get_nodes_and_objects(pdoc_OA)
        base_nodes_OA.append(base)
        objects_OA.append(obj)

    # Flatten the per-document lists so each extractor runs once over everything.
    base_nodes_OA_list = list(itertools.chain.from_iterable(base_nodes_OA))
    objects_OA_list = list(itertools.chain.from_iterable(objects_OA))

    async def _extract_metadata():
        # Summaries only for base nodes; titles for base nodes and objects.
        await SummaryExtractor().acall(base_nodes_OA_list)
        await TitleExtractor().acall(base_nodes_OA_list + objects_OA_list)

    # `await` is only legal inside a coroutine, so wrap the async calls and
    # run them to completion here.
    asyncio.run(_extract_metadata())

    return base_nodes_OA, objects_OA

In [None]:
# Chunk the loaded documents; both results are lists with one entry per document in `md`.
base_nodes_OA, objects_OA = chunk(md)

### Serialize

In [9]:
from rag_tools.serialize_doc import serialize_

In [13]:
# Serialize the base nodes and the object nodes separately using the
# project's serialize_ helpers.
# NOTE(review): the output format is defined in rag_tools.serialize_doc — not visible here.
text_chunk = serialize_.serialize_text_chunk(base_nodes_OA)
obj_chunk = serialize_.serialize_object_chunk(objects_OA)

In [20]:
# Merge the serialized text chunks and object chunks into one structure.
# NOTE(review): judging by the output of cf[1], each entry is a list of
# metadata dicts for one document — confirm against combine_chunk.
cf = serialize_.combine_chunk(text_chunk,obj_chunk)

In [24]:
# Inspect the combined chunks at index 1 (the second entry).
cf[1]

[{'section_summary': 'In this section, the objectives for class 7 include learning how to solve mathematics problems using functions in the math module, representing and processing strings and characters, encoding characters using ASCII and Unicode, using the `ord` and `chr` functions, representing special characters using escape sequences, testing substrings using `in` and `not in` operators, comparing strings, and using string functions such as `min`, `max`, and `len`. Additionally, students will learn how to solve math problems using Python built-in functions like `max`, `min`, `round`, `abs`, and `pow`, as well as using math functions for more complex mathematical operations.',
  'document_title': '"Mastering String Manipulation and Character Encoding in Python: A Comprehensive Guide to Diversity, Uniqueness, Unicode, ASCII, and Built-in Functions"',
  'start_char_idx': 0,
  'end_char_idx': 1071},
 {'section_summary': 'The table provides a list of mathematical functions, their desc

In [27]:
# Build document-level records from the combined chunks, the loaded documents
# and the paths of the first two pickle files.
# NOTE(review): md and files[:2] must refer to the same files in the same order.
doc_full = serialize_.serialize_doc(cf,md,files[:2])

In [33]:
# Inspect the first document-level record.
doc_full[0]

{'filename': 'GSU - CIS 3260 - Fall 2023/Lecture Slides Parsed/Week 14.pkl',
 'title': 'Lecture Slides Parsed/Week 14',
 'content': '# Objectives for class 14\n\n- Review IA4-IA7\n- Review Quiz 3 – Quiz 5\n- Final Exam Schedule\n- Group Project Schedule\n---\n# Individual Assignment 4 - 2\n\n- Enter a letter grade A/a, B/b, C/c, D/d, and then displays its corresponding numeric value 90, 80, 70, 60.\n\n## Sample Run\n```\nEnter a letter grade: B\nThe numeric value for grade B is 80\n```\n\n```python\nletter = input("Enter a letter grade: ")\n\nif letter in \'Aa\':\n    print("The numeric value for grade A is 90")\nelif letter in \'Bb\':\n    print("The numeric value for grade B is 80")\nelif letter in \'Cc\':\n    print("The numeric value for grade C is 70")\nelif letter in \'Dd\':\n    print("The numeric value for grade D is 60")\nelse:\n    print(letter, "is an invalid grade")\n```\n\n- Letter in “Aa”\n- Multiple way if statements\n---\n# Individual Assignment 4 - 3\n\n- Display numbe

In [35]:
# Build the per-chunk strings from the combined chunks and document records.
# NOTE(review): going by the helper's name and the output below, these are the
# texts intended for the embedding model — confirm.
content = serialize_.content_for_embedding(cf,doc_full)

In [55]:
# Inspect the generated strings for the first document.
content[0]

['section_summary: In this section, the key topics covered include reviewing previous assignments and quizzes, discussing the final exam schedule, and group project schedule. \n\nThe individual assignments focus on different tasks such as converting letter grades to numeric values, determining the number of days in a month, displaying the length and first character of a string, deciding a major and year based on input characters, and converting an ISBN-9 to an ISBN-10 without using a function.\n\nOverall, the section emphasizes problem-solving skills, conditional statements, string manipulation, and mathematical calculations.\n\ndocument_title: Python Programming Concepts and Techniques: ISBN Conversion, String Analysis, and Rating Frequency Count\n\ncontent: # Objectives for class 14\n\n- Review IA4-IA7\n- Review Quiz 3 – Quiz 5\n- Final Exam Schedule\n- Group Project Schedule\n---\n# Individual Assignment 4 - 2\n\n- Enter a letter grade A/a, B/b, C/c, D/d, and then displays its corre

In [51]:
# print() is used here (instead of a bare expression) so the embedded newlines
# of this one entry are rendered rather than shown as "\n" escapes.
print(content[0][6])

section_summary: The table provides a breakdown of points distribution across different programming and software development topics for a test or assignment. Each topic has points allocated for multiple choice questions, fill in the blanks, and problem-solving tasks. The total points for the test or assignment is 100.,
with the following columns:
- Topic: Topics include loops, string concepts, function creation, 1D and 2D lists, and software development.
- Multiple Choices: Points allocated for multiple choice questions.
- Fill In blanks: Points allocated for fill in the blanks.
- Problem Solving: Points allocated for problem-solving tasks.
- Total: Total points for each topic.


document_title: Python Programming Concepts and Techniques: ISBN Conversion, String Analysis, and Rating Frequency Count

content: |                                      | Multiple Choices | Fill In blanks | Problem Solving | Total |
|--------------------------------------|------------------|----------------|-

### Send to LLM/Embedding Model

In [None]:
import rag_tools.utils

In [None]:
# Embed the first document's base nodes with a default-configured OpenAIEmbedding model.
# NOTE(review): the displayed result appears to map node ids to embedding
# vectors; the full vectors are echoed, so consider assigning the result instead.
rag_tools.utils.embed_nodes(base_nodes_OA[0],
                            embed_model= OpenAIEmbedding())

{'9371e839-7327-4c8d-b877-2a49fd17d242': [0.012948758900165558,
  0.0022825654596090317,
  0.01547309197485447,
  -0.04823964089155197,
  -0.031059956178069115,
  0.0195689108222723,
  -0.02312430739402771,
  -0.024717126041650772,
  -0.031230615451931953,
  -0.01135594118386507,
  0.027732104063034058,
  0.00610106298699975,
  -0.0314297191798687,
  0.043859388679265976,
  -0.004362473264336586,
  -0.010218213312327862,
  0.004860228858888149,
  -0.005628195125609636,
  -0.007565886713564396,
  -0.006161504425108433,
  0.003628283506259322,
  0.0005253100534901023,
  -0.01124216802418232,
  -0.0067836991511285305,
  -0.008348074741661549,
  -0.0029420917853713036,
  -0.015771744772791862,
  -0.040645308792591095,
  0.01779121160507202,
  -0.0013963825767859817,
  0.010566642507910728,
  -0.018274744972586632,
  -0.025001557543873787,
  -0.03174259141087532,
  -0.012628773227334023,
  -0.018175194039940834,
  -0.004860228858888149,
  -0.02554197795689106,
  0.005283321253955364,
  -0.0

In [None]:
from llama_index.core.schema import BaseNode, ImageNode, MetadataMode


In [None]:
# Show the content of the first base node of the first document, rendered with
# MetadataMode.EMBED (node text plus the metadata included for embedding).
base_nodes_OA[0][0].get_content(metadata_mode=MetadataMode.EMBED)

'[Excerpt from document]\nsection_summary: In this section, the key topics covered include reviewing previous assignments and quizzes, discussing the final exam schedule, and group project schedule. \n\nThe individual assignments in this section focus on various tasks such as converting letter grades to numeric values, determining the number of days in a given month, calculating the length of a string and displaying its first and last characters, and deciding a major and year based on user input. \n\nAdditionally, there is an assignment that involves converting a 9-digit ISBN to a 10-digit ISBN following a specific checksum calculation formula. \n\nOverall, the section covers a range of programming tasks and concepts related to conditional statements, string manipulation, and mathematical calculations.\ndocument_title: Python Programming Concepts and Techniques: ISBN Conversion, String Analysis, and Ratings Frequency Analysis\ncontent: # Objectives for class 14\n\n- Review IA4-IA7\n- R

In [None]:
# Same view for the first object node of the first document.
objects_OA[0][0].get_content(metadata_mode=MetadataMode.EMBED)

'[Excerpt from document]\ndocument_title: Python Programming Concepts and Techniques: ISBN Conversion, String Analysis, and Ratings Frequency Analysis\nExcerpt:\n-----\nThe table provides a distribution of ratings from 0 to 5, along with their respective counts.,\nwith the following columns:\n- Rating: This column lists the ratings from 0 to 5.\n- Count: This column shows the count of each rating.\n\n-----'

In [None]:
# Print the EMBED-mode content of the first base node so its newlines are rendered.
print(base_nodes_OA[0][0].get_content(metadata_mode=MetadataMode.EMBED))

[Excerpt from document]
section_summary: In this section, the key topics covered include reviewing previous assignments and quizzes, discussing the final exam schedule, and group project schedule. 

The individual assignments in this section focus on various tasks such as converting letter grades to numeric values, determining the number of days in a given month, calculating the length of a string and displaying its first and last characters, and deciding a major and year based on user input. 

Additionally, there is an assignment that involves converting a 9-digit ISBN to a 10-digit ISBN following a specific checksum calculation formula. 

Overall, the section covers a range of programming tasks and concepts related to conditional statements, string manipulation, and mathematical calculations.
document_title: Python Programming Concepts and Techniques: ISBN Conversion, String Analysis, and Ratings Frequency Analysis
content: # Objectives for class 14

- Review IA4-IA7
- Review Quiz 3 