In [88]:
import os
import re
import openai
import pandas as pd
from langchain.chat_models import ChatOpenAI
import matplotlib.pyplot as plt
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import Table, NarrativeText, Text
from typing import List
from langchain.hub import pull
from langchain.chains import create_extraction_chain_pydantic
import json



In [None]:
def extract_pdf_elements_with_tables(pdf_path: str, strategy: str = "fast"):
    """Partition a PDF file into ``unstructured`` elements.

    Args:
        pdf_path: Path of the PDF file to parse.
        strategy: Partitioning strategy forwarded to ``partition_pdf``.
            Defaults to ``"fast"``, the value this notebook has always used.
            NOTE(review): the unstructured documentation ties
            ``infer_table_structure`` (and ``metadata.text_as_html``) to
            ``strategy="hi_res"`` -- pass that when table HTML is required and
            confirm against the installed version.

    Returns:
        Whatever ``partition_pdf`` returns for the file (iterated as a
        sequence of elements by the callers in this notebook).
    """
    return partition_pdf(
        filename=pdf_path,
        strategy=strategy,
        infer_table_structure=True,
    )

def get_pdf_text_with_tables(pdf_path: str):
    """Extract a PDF as an ordered list of single-key dicts.

    Each element produced by ``extract_pdf_elements_with_tables`` becomes
    either ``{"table": ...}`` (for ``Table`` elements) or ``{"text": ...}``
    (for everything else), in document order.

    Args:
        pdf_path: Path of the PDF file to parse.

    Returns:
        list[dict]: One dict per element. The value is always a string, so
        callers can safely take ``len()`` of it.
    """
    elements = extract_pdf_elements_with_tables(pdf_path)
    extracted_content = []

    for element in elements:
        if isinstance(element, Table):
            # ``text_as_html`` may be missing/None when table structure was not
            # inferred; fall back to the table's plain text instead of storing
            # None (which would break len() in downstream code).
            table_html = getattr(element.metadata, "text_as_html", None)
            extracted_content.append({"table": table_html or element.text or ""})
        else:
            extracted_content.append({"text": element.text or ""})

    return extracted_content

def extract_student_handbook_pdf(pdf_path: str = '/Users/bruce/Desktop/SCHOOL CHATBOT/data/student_handbook.pdf'):
    """Extract the student handbook PDF and report its total character count.

    Args:
        pdf_path: Location of the handbook PDF. Defaults to the path this
            notebook was originally written against; pass a different path to
            run the notebook on another machine.

    Returns:
        list[dict]: The ``{"text": ...}`` / ``{"table": ...}`` entries produced
        by ``get_pdf_text_with_tables``.

    Side effects:
        Prints the combined length of all text and table entries.
    """
    extracted_data = get_pdf_text_with_tables(pdf_path)

    # ``or ""`` keeps the count robust if an entry's value is None.
    total_characters = sum(
        len(item.get(key) or "")
        for item in extracted_data
        for key in ("text", "table")
    )
    print(f"Total number of characters: {total_characters}")

    return extracted_data
    

def clean_text(text):
    """Normalize a block of text extracted from the handbook PDF.

    Steps, in order:
      1. Remove page footers of the form ``12 | P a g e``.
      2. Remove bullet characters (``•``).
      3. Remove list numbering such as ``1.`` -- only at the start of a line,
         so decimals ("3.5") and sentence-ending numbers ("in 1913.") survive.
      4. Collapse runs of newlines to one newline, then any run of two or more
         whitespace characters to a single space.
      5. Strip leading/trailing whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        str: The cleaned text.
    """
    text = re.sub(r'\d+\s*\|\s*P\s*a\s*g\s*e', '', text)
    text = text.replace('•', '')
    # Anchor to line start and require whitespace/end-of-line after the dot so
    # that only list markers are removed, never numbers inside a sentence.
    text = re.sub(r'^[ \t]*\d+\.(?=\s|$)', '', text, flags=re.MULTILINE)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()

# Run the PDF extraction once; every entry is {"text": ...} or {"table": ...}
extracted_data = extract_student_handbook_pdf()

# Build the cleaned list: text entries go through clean_text, table entries
# are carried over untouched, and entries with neither key are dropped.
cleaned_data = [
    {"text": clean_text(entry['text'])} if 'text' in entry else {"table": entry['table']}
    for entry in extracted_data
    if 'text' in entry or 'table' in entry
]

# Preview cleaned text and tables (first 500 entries)
#for i, item in enumerate(cleaned_data[:500]):
    #print(f"Entry {i+1}: {item}")
    
#print(cleaned_data[:200])

def split_text_into_paragraphs(text_list):
    """Flatten the ``'text'`` entries of ``text_list`` into a list of paragraphs.

    Each entry that has a ``'text'`` key is split on blank lines (``"\\n\\n"``);
    every piece is whitespace-stripped and empty pieces are discarded. Entries
    without a ``'text'`` key (e.g. tables) are ignored.

    Args:
        text_list: Iterable of dicts, as produced by the cleaning step.

    Returns:
        list[str]: All non-empty paragraphs, in input order.
    """
    all_paragraphs = []

    for entry in text_list:
        if 'text' not in entry:
            continue
        for piece in entry['text'].split("\n\n"):
            trimmed = piece.strip()
            if trimmed:
                all_paragraphs.append(trimmed)

    return all_paragraphs

# Turn the cleaned entries into a flat list of paragraphs
paragraphs = split_text_into_paragraphs(cleaned_data)

# Print the first 50 paragraphs, numbered from 1
for number, paragraph in enumerate(paragraphs[:50], start=1):
    print(f"Paragraph {number}: {paragraph}")




In [31]:
len(paragraphs)

1349

In [32]:
type(paragraphs)

list

In [67]:
# Read the API key from the environment rather than hardcoding it in the
# notebook, and fail early with a clear message when it is missing.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai_api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
llm = ChatOpenAI(model="gpt-4-1106-preview", openai_api_key=openai_api_key)

In [None]:
# Pull the object published as "wfh/proposal-indexing" via langchain.hub.pull;
# it is piped into the chat model to build `runnable`.
obj = pull("wfh/proposal-indexing")

In [71]:
# Create a runnable that combines the prompt and the model.
# It is invoked with a dict of the form {"input": <text>} by get_propositions.
runnable = obj | llm

In [123]:
def get_propositions(text):
    """Ask the LLM pipeline for the propositions contained in ``text``.

    The model's reply is expected to be JSON, optionally wrapped in a Markdown
    code fence (```` ```json ... ``` ````) and possibly surrounded by prose.

    Args:
        text: The paragraph to decompose.

    Returns:
        The value decoded from the model's JSON reply (the caller treats it as
        a list of proposition strings), or ``[]`` if the reply is not valid
        JSON.
    """
    result = runnable.invoke({"input": text})
    raw_content = result.content.strip()

    # Pull out the fenced payload if there is one. ``str.strip("```json")``
    # would strip a *set of characters* (`, j, s, o, n) from both ends rather
    # than the fence itself, and cannot cope with text around the fence.
    # Greedy match so the payload runs up to the last closing fence.
    fenced = re.search(r"```(?:json)?\s*(.*)```", raw_content, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        raw_content = fenced.group(1).strip()

    try:
        propositions = json.loads(raw_content)
    except json.JSONDecodeError as e:
        # Best effort: report and skip this paragraph instead of aborting the run.
        print("Error decoding JSON:", e)
        propositions = []

    return propositions

# Extract propositions from the paragraphs, starting at START_INDEX.
# The offset is defined once so the slice and the progress message cannot
# drift apart.
START_INDEX = 694

student_conduct_propositions = []
for index, paragraph in enumerate(paragraphs[START_INDEX:], start=START_INDEX):
    student_conduct_propositions.extend(get_propositions(paragraph))
    print(f"Done with {index}")


Done with 694
Done with 695
Done with 696
Done with 697
Done with 698
Done with 699
Done with 700
Done with 701
Done with 702
Done with 703
Done with 704
Done with 705
Done with 706
Done with 707
Done with 708
Done with 709
Done with 710
Done with 711
Done with 712
Done with 713
Done with 714
Done with 715
Done with 716
Done with 717
Done with 718
Done with 719
Done with 720
Done with 721
Done with 722
Done with 723
Done with 724
Done with 725
Done with 726
Done with 727
Done with 728
Done with 729
Done with 730
Done with 731
Done with 732
Done with 733
Done with 734
Done with 735
Done with 736
Done with 737
Done with 738
Done with 739
Done with 740
Done with 741
Done with 742
Done with 743
Done with 744
Error decoding JSON: Expecting value: line 1 column 1 (char 0)
Done with 745
Done with 746
Done with 747
Done with 748
Done with 749
Done with 750
Done with 751
Done with 752
Done with 753
Done with 754
Done with 755
Done with 756
Done with 757
Done with 758
Done with 759
Done with 760

In [124]:
len(student_conduct_propositions)

3720

In [None]:
# Persist the extracted propositions so they can be reloaded without
# calling the LLM again.
with open('prep_batch.json', mode='w') as out_handle:
    json.dump(student_conduct_propositions, out_handle)

In [None]:
# Read the saved propositions back from disk into `loaded_list`.
with open('prep_batch.json', mode='r') as in_handle:
    loaded_list = json.load(in_handle)

In [125]:
print(len(loaded_list))

3090


In [126]:
from agentic_chunker import AgenticChunker

In [127]:
# Instantiate the chunker with its default settings; propositions are fed to it
# through ac.add_propositions(...).
ac = AgenticChunker()

In [128]:
# Feed the in-memory proposition list to the chunker (note: this uses
# `student_conduct_propositions`, not the `loaded_list` read from prep_batch.json).
# NOTE(review): the recorded run of this cell stopped with an OpenAI
# RateLimitError (429, insufficient_quota), so not every proposition was added.
ac.add_propositions(student_conduct_propositions)


Adding: 'No organization/club shall intake members prior to meeting the college/university's requirements.'
No chunks, creating a new one
Created new chunk (cb0f2): College Organization Membership Rules

Adding: 'No organization/club shall intake members prior to meeting the requirements of its national organization.'
Chunk Found (cb0f2), adding to: College Organization Membership Rules

Adding: 'No intake activity should be held without the presence of the advisor(s).'
Chunk Found (cb0f2), adding to: Membership Intake Regulations for Educational Organizations

Adding: 'HAZING in any form is prohibited.'
Chunk Found (cb0f2), adding to: Educational Organization Membership Intake Rules & Advisor Requirements

Adding: 'Physical abuse is prohibited.'
Chunk Found (cb0f2), adding to: Membership Intake & Hazing Policies in Educational Organizations

Adding: 'Mental abuse is prohibited.'
No chunks found
Created new chunk (806a0): Legal Status of Abuse

Adding: 'Physical and mental abuse will 

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [133]:
ac.pretty_print_chunks()


You have 26 chunks

Chunk #0
Chunk ID: cb0f2
Summary: This chunk details the comprehensive guidelines and regulations for membership intake processes, hazing policies, and event scheduling for student organizations at Georgia State University, emphasizing the mandatory compliance with SEAL's procedures and the university's student conduct codes.
Propositions:
    -No organization/club shall intake members prior to meeting the college/university's requirements.
    -No organization/club shall intake members prior to meeting the requirements of its national organization.
    -No intake activity should be held without the presence of the advisor(s).
    -HAZING in any form is prohibited.
    -Physical abuse is prohibited.
    -Physical and mental abuse will not be tolerated.
    -A list of intended candidates must be filed with SEAL.
    -The list of intended candidates must be filed the following day after any informational meetings.
    -All students interested in joining a student org

In [131]:
# Retrieve the chunks in 'dict' form. In the recorded output this is a dict
# keyed by chunk id whose values hold 'chunk_id' and 'propositions'.
chunks = ac.get_chunks(get_type='dict')

In [132]:
chunks


{'cb0f2': {'chunk_id': 'cb0f2',
  'propositions': ["No organization/club shall intake members prior to meeting the college/university's requirements.",
   'No organization/club shall intake members prior to meeting the requirements of its national organization.',
   'No intake activity should be held without the presence of the advisor(s).',
   'HAZING in any form is prohibited.',
   'Physical abuse is prohibited.',
   'Physical and mental abuse will not be tolerated.',
   'A list of intended candidates must be filed with SEAL.',
   'The list of intended candidates must be filed the following day after any informational meetings.',
   'All students interested in joining a student organization must be authorized at Georgia State University (GSU).',
   'All students interested in joining a student organization at Georgia State University (GSU) must take the Anti-Hazing certification course.',
   'The Anti-Hazing certification course is sponsored by the Student Engagement and Leadership C

In [97]:
type(chunks)


list

In [134]:
# Write the chunk structure to chunk_data.json (pretty-printed, 4-space indent)
with open("chunk_data.json", mode="w") as chunk_file:
    json.dump(chunks, chunk_file, indent=4)

In [98]:
len(chunks)

17

In [105]:
# `chunks` is a dict keyed by chunk id when obtained with get_type='dict', so
# `chunks[10]` would raise KeyError on a clean run. Index positionally over the
# values instead; a list-typed `chunks` is used as-is.
chunk_values = list(chunks.values()) if isinstance(chunks, dict) else chunks
chunk_values[10]


"I encourage you to embrace the principles from Jim Collins' book 'Good to Great' as you begin this academic year. Jim Collins emphasizes the importance of disciplined people. Jim Collins emphasizes the importance of disciplined thought. Jim Collins emphasizes the importance of disciplined action. Jim Collins suggests that greatness is largely a matter of conscious choice. Jim Collins suggests that greatness is largely a matter of discipline."