In [48]:
import os
import docx
import json
import re
import time
from dotenv import load_dotenv
from litellm import completion
from pydantic import BaseModel, ValidationError
from typing import List

load_dotenv()

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


def extract_text_from_docx(file_path):
    doc = docx.Document(file_path)
    text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
    return text


v1 = extract_text_from_docx("data/v1.docx")
v2 = extract_text_from_docx("data/v2.docx")

# Count tokens (rough estimate)
v1_tokens = len(v1.split()) / 0.75
v2_tokens = len(v2.split()) / 0.75
print(f"v1 tokens: {v1_tokens}, v2 tokens: {v2_tokens}")


def batch_text(text, max_tokens=10000):
    sentences = text.replace("\n", " ").split(".")
    current_batch = []
    current_token_count = 0
    batches = []

    for sentence in sentences:
        sentence_tokens = len(sentence.split()) / 0.75
        if current_token_count + sentence_tokens > max_tokens and current_batch:
            batches.append(" ".join(current_batch) + ".")
            current_batch = []
            current_token_count = 0
        current_batch.append(sentence)
        current_token_count += sentence_tokens

    if current_batch:
        batches.append(" ".join(current_batch) + ".")

    return batches


# Create batches for v1 and v2
v1_batches = batch_text(v1)
v2_batches = batch_text(v2)


# Define a Pydantic model to enforce the JSON structure
class Section(BaseModel):
    title: str
    Purpose: str


class Sections(BaseModel):
    sections: List[Section]


def extract_titles(batch_text: str) -> List[Section]:
    prompt = """
    Analyze the following text from Apple's Terms and Conditions. For each section you find:
    1. Extract the section title.
    2. Create a brief 1 sentence description of the section's main purpose.
    
    Return the results as a JSON list of dictionaries, where each dictionary has exactly two keys: "title" and "Purpose". For example:
    [
      {{"title": "Section Title 1", "Purpose": "Description for section 1."}},
      {{"title": "Section Title 2", "Purpose": "Description for section 2."}}
    ]
    
    Respond with only the JSON array.
    
    Text to analyze:
    {text}
    """

    try:
        response = completion(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt.format(text=batch_text)}],
            response_format=Sections,
        )

        return [
            Section(**item)
            for item in json.loads(response.choices[0].message.content)["sections"]
        ]

    except (json.JSONDecodeError, ValidationError) as e:
        print(f"Error processing batch: {str(e)}")
        print("Response content was:")
        print(response.choices[0].message.content)
        return []

    except Exception as e:
        print(f"Error processing batch: {str(e)}")
        print("Response content was:")
        print(response.choices[0].message.content)
        return []


def process_batches(batches: List[str]) -> List[Section]:
    """
    Processes a list of text batches to extract sections using the extract_titles function.

    Args:
        batches (List[str]): A list of text batches to process.

    Returns:
        List[Section]: A combined list of Section objects extracted from all batches.
    """
    all_titles = []
    for i, batch in enumerate(batches):
        print(f"Processing batch {i+1} of {len(batches)}")
        titles = extract_titles(batch)
        if not titles:
            print(f"Error occurred while processing batch {i+1}. Stopping processing.")
            break
        print(f"Titles for batch {i+1}:\n{titles}")
        all_titles.extend(titles)
    return all_titles


# Process v1_batches
all_v1_sections = process_batches(v1_batches)
print("All extracted sections from v1_batches:")
for section in all_v1_sections:
    print(section.model_dump())

all_v2_sections = process_batches(v2_batches)
print("All extracted sections from v2_batches:")
for section in all_v2_sections:
    print(section.model_dump())

v1 tokens: 20532.0, v2 tokens: 11181.333333333334
Processing batch 1 of 3
Titles for batch 1:
[Section(title='TERMS AND CONDITIONS', Purpose='Sets the legal framework governing the use of the iTunes Store and related services.'), Section(title='PAYMENTS, TAXES, AND REFUND POLICY', Purpose='Outlines payment responsibilities, taxes, and conditions for order cancellations and refunds.'), Section(title='1-Click', Purpose='Explains the 1-Click purchasing feature for quick transactions on the Stores.'), Section(title='GIFT CERTIFICATES, ITUNES CARDS AND CODES, ALLOWANCES, AND CONTENT CODES', Purpose='Details the terms regarding the use of gift certificates, iTunes cards, and codes, including redeemability and limitations.'), Section(title='GIFTS', Purpose='States the conditions for purchasing and redeeming gifts from the Stores.'), Section(title='IMPORTANT SAFETY INFORMATION', Purpose='Provides guidelines for safe use of products to prevent strain and health issues.'), Section(title='PRE-ORD

In [49]:
all_v1_sections

[Section(title='TERMS AND CONDITIONS', Purpose='Sets the legal framework governing the use of the iTunes Store and related services.'),
 Section(title='PAYMENTS, TAXES, AND REFUND POLICY', Purpose='Outlines payment responsibilities, taxes, and conditions for order cancellations and refunds.'),
 Section(title='1-Click', Purpose='Explains the 1-Click purchasing feature for quick transactions on the Stores.'),
 Section(title='GIFT CERTIFICATES, ITUNES CARDS AND CODES, ALLOWANCES, AND CONTENT CODES', Purpose='Details the terms regarding the use of gift certificates, iTunes cards, and codes, including redeemability and limitations.'),
 Section(title='GIFTS', Purpose='States the conditions for purchasing and redeeming gifts from the Stores.'),
 Section(title='IMPORTANT SAFETY INFORMATION', Purpose='Provides guidelines for safe use of products to prevent strain and health issues.'),
 Section(title='PRE-ORDERS', Purpose='Describes the process and conditions surrounding pre-ordering items from 

In [50]:
# save the extracted sections to a JSON file
def save_sections(sections, filename):
    with open(filename,
              'w') as f:
        json.dump([section.dict() for section in sections], f, indent=2)

save_sections(all_v1_sections, 'data/output/v1_sections.json')
save_sections(all_v2_sections, 'data/v2_sections.json')



/var/folders/l7/x3prn1_x3hb6nk9gkfb45jpw0000gn/T/ipykernel_41184/3090131029.py:5: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  json.dump([section.dict() for section in sections], f, indent=2)


In [51]:
# load the extracted sections from the JSON file
def load_sections(filename):
    with open(filename, 'r') as f:
        sections = json.load(f)
    return [Section(**section) for section in sections]

load_sections('data/v1_sections.json')

[Section(title='TERMS AND CONDITIONS', Purpose='Sets the legal framework governing the use of the iTunes Store and related services.'),
 Section(title='PAYMENTS, TAXES, AND REFUND POLICY', Purpose='Outlines payment responsibilities, taxes, and conditions for order cancellations and refunds.'),
 Section(title='1-Click', Purpose='Explains the 1-Click purchasing feature for quick transactions on the Stores.'),
 Section(title='GIFT CERTIFICATES, ITUNES CARDS AND CODES, ALLOWANCES, AND CONTENT CODES', Purpose='Details the terms regarding the use of gift certificates, iTunes cards, and codes, including redeemability and limitations.'),
 Section(title='GIFTS', Purpose='States the conditions for purchasing and redeeming gifts from the Stores.'),
 Section(title='IMPORTANT SAFETY INFORMATION', Purpose='Provides guidelines for safe use of products to prevent strain and health issues.'),
 Section(title='PRE-ORDERS', Purpose='Describes the process and conditions surrounding pre-ordering items from 