In [15]:
def _build_specialization_entries(documents: list) -> list:
    """Flatten specialization documents into one entry per specialization.

    Only documents whose ``page_content`` has a ``"specializations"`` key are
    used. For each year in ``page_content["year_details"]`` and each item of
    that year carrying a ``"specialization"`` key, one entry of the form
    ``{"page_content": {...}, "metadata": <document metadata>}`` is produced.

    Args:
        documents: Parsed JSON documents, each a dict with ``"page_content"``
            and ``"metadata"`` keys.

    Returns:
        The flattened entries, in document / year / item order.

    Raises:
        KeyError: If a selected document lacks ``metadata``, ``school``,
            ``school_type``, ``field`` or ``year_details``.
    """
    entries = []
    for document in documents:
        page_content = document["page_content"]
        if "specializations" not in page_content:
            continue

        metadata = document["metadata"]
        school = page_content["school"]
        school_type = page_content["school_type"]
        field = page_content["field"]

        for year, year_items in page_content["year_details"].items():
            # The price lives on the first item of the year that describes the
            # program intake; it is shared by every specialization of the year.
            intake_info = next(
                (item for item in year_items if "program_intake" in item), {}
            )
            price = intake_info.get("price", "")

            for item in year_items:
                if "specialization" not in item:
                    continue
                # Key order is kept stable so the saved JSON layout is unchanged.
                entries.append(
                    {
                        "page_content": {
                            "specialization": item["specialization"],
                            "school": school,
                            "school_type": school_type,
                            "price": price,
                            "field": field,
                            "campus": item.get("campus", ""),
                            "intake": item.get("intake", []),
                            "year": year,
                            "language": item.get("language", ""),
                            "alternance": item.get("alternance", ""),
                        },
                        "metadata": metadata,
                    }
                )
    return entries


def create_specialization_documents(
    input_path: str = "../chroma_documents/chroma_json_documents.json",
    output_path: str = "../specialisation_data/specialization_chroma_documents.json",
) -> None:
    """Extract specializations and save them as a JSON list of documents.

    Reads the JSON documents at ``input_path``, flattens them into one entry
    per specialization and writes the result to ``output_path``, printing
    progress along the way.

    Args:
        input_path: JSON file holding the list of source documents.
        output_path: Destination JSON file. Its parent directory is created
            if it does not exist yet.

    Returns:
        None. The result is written to disk only, so calling this as the last
        statement of a notebook cell does not echo the whole list.
    """
    print("ðŸ”„ Creating specialization documents...")
    print("   - Loading chroma documents...")

    import json
    import os

    # Load chroma documents
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    print(f"   - Loaded {len(data)} documents")

    specialization_chroma_documents = _build_specialization_entries(data)

    print(
        f"   - Processed {len(specialization_chroma_documents)} specialization entries"
    )
    print("   - Saving specialization documents...")

    # Make sure the target directory exists so the processing done above is
    # not lost to a FileNotFoundError at save time.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(specialization_chroma_documents, f, indent=4, ensure_ascii=False)

In [16]:
# Run the extraction: prints progress and writes the specialization JSON file.
create_specialization_documents()

ðŸ”„ Creating specialization documents...
   - Loading chroma documents...
   - Loaded 1380 documents
   - Processed 1630 specialization entries
   - Saving specialization documents...
