In [17]:
from typing import List, Optional, Literal
from pydantic import BaseModel, Field
import os
import pandas as pd
import uuid
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langgraph.graph import StateGraph, END
from typing import TypedDict, Any


In [18]:
class FlashCardData(BaseModel):
    # Payload carried by GeneratedQuiz.flashcard. main() spreads model_dump() of
    # this object into one flashcards.csv row alongside the owning quiz_id.
    # Field meanings are inferred from their names — confirm against the DB import schema.
    word_hanzi: str  # presumably the vocabulary word in Chinese characters
    pinyin: str  # presumably the romanization of word_hanzi
    definition_en: str  # _en / _vn pairs mirror the prompt's "English & Vietnamese" support languages
    definition_vn: str
    example_sentence: str
    example_pinyin: str
    example_meaning_en: str
    example_meaning_vn: str
    extra_info_md: Optional[str] = None  # optional; presumably Markdown given the _md suffix

class QuizOptionData(BaseModel):
    # One entry of GeneratedQuiz.options. main() writes one options.csv row per
    # option, tagged with the owning quiz_id.
    text: str
    is_correct: bool  # nothing in this model enforces how many options per quiz are marked correct
    order: int  # presumably the display position — TODO confirm 0- vs 1-based

class MatchPairData(BaseModel):
    # One entry of GeneratedQuiz.pairs. main() writes one pairs.csv row per pair,
    # tagged with the owning quiz_id.
    left_text: str  # presumably the two sides the learner has to match — confirm with the frontend
    right_text: str
    order: int  # presumably the display position — TODO confirm 0- vs 1-based

class SentenceQuizData(BaseModel):
    # Payload carried by GeneratedQuiz.sentence. main() writes it as one
    # sentences.csv row tagged with the owning quiz_id.
    chinese_sentence: str
    pinyin_sentence: str
    # Optional: stays None when no fill-in answer is supplied.
    correct_answer: Optional[str] = Field(None, description="The word to fill in the blank")
    # default_factory=list gives every instance its own empty list when the field is omitted.
    distractors: List[str] = Field(default_factory=list, description="Wrong options for the blank")

# --- Core Models ---

class GeneratedQuiz(BaseModel):
    # One quiz item inside a GeneratedSection. main() writes the common fields
    # to quizzes.csv and the type-specific payload to its own CSV.
    quiz_type: Literal['flashcard', 'single_choice', 'matching', 'sentence', 'text_response', 'speaking']
    question_en: str
    question_vn: str
    explanation_en: Optional[str] = None
    explanation_vn: Optional[str] = None
    order: int

    # Polymorphic fields: only ONE of these should be filled, chosen by quiz_type.
    # The rule is stated in each Field description (not just in this comment)
    # because descriptions are part of the JSON schema that the output parser
    # sends to the LLM as format instructions; a Python comment never reaches it.
    # 'text_response' and 'speaking' have no payload field, so all four stay None.
    flashcard: Optional[FlashCardData] = Field(
        None, description="Fill ONLY when quiz_type is 'flashcard'; otherwise leave null"
    )
    options: Optional[List[QuizOptionData]] = Field(
        None, description="Answer options. Fill ONLY when quiz_type is 'single_choice'; otherwise leave null"
    )
    pairs: Optional[List[MatchPairData]] = Field(
        None, description="Pairs to match. Fill ONLY when quiz_type is 'matching'; otherwise leave null"
    )
    sentence: Optional[SentenceQuizData] = Field(
        None, description="Fill-in-the-blank data. Fill ONLY when quiz_type is 'sentence'; otherwise leave null"
    )

class GeneratedSection(BaseModel):
    # One section of a lesson. main() writes the scalar fields to sections.csv
    # under a freshly generated UUID and links every quiz to that UUID.
    section_type: Literal['vocabulary', 'grammar', 'reading', 'practice', 'listening']
    title_en: str
    title_vn: str
    # Optional theory text; the system prompt requires it for grammar sections.
    content_md: Optional[str] = Field(None, description="Markdown theory for grammar/reading")
    order: int  # presumably the position of the section within the lesson — TODO confirm base
    # Required (no default): a section without a "quizzes" key fails validation.
    quizzes: List[GeneratedQuiz] = Field(..., description="List of quizzes in this section")

class GeneratedLessonContent(BaseModel):
    """Output for a single Lesson"""
    # Root model handed to PydanticOutputParser in generate_lesson_content; the
    # LLM reply is parsed into this type and main() iterates over `sections`.
    sections: List[GeneratedSection]

In [None]:
# 1. Configure the LLM
# Credentials must not be hardcoded in a notebook: load OPENAI_API_KEY from a
# local .env file (or the process environment) and let ChatOpenAI pick it up
# from there. load_dotenv() is a no-op when no .env file exists.
load_dotenv()
llm = ChatOpenAI(model="gpt-4o", temperature=0.7)  # GPT-4o is recommended for complex JSON

# 2. Define State
class GeneratorState(TypedDict):
    """State dict passed through the LangGraph workflow for one lesson.

    main() seeds it with the course and lesson rows; generate_lesson_content
    returns either ``generated_content`` or ``errors`` to update it.
    """
    course_info: dict  # course row converted with .to_dict() in main()
    lesson_info: dict  # lesson row converted with .to_dict() in main()
    generated_content: Any  # None initially; the parsed GeneratedLessonContent on success
    errors: list  # empty initially; [str(exception)] when generation fails

# 3. Define the Generation Node
def generate_lesson_content(state: GeneratorState):
    """Graph node: ask the LLM for one lesson's sections and quizzes.

    Reads ``course_info`` and ``lesson_info`` from ``state``, renders the
    system/user prompt together with the parser's format instructions, and
    parses the reply into a ``GeneratedLessonContent``.

    Returns:
        ``{"generated_content": <GeneratedLessonContent>}`` on success, or
        ``{"errors": [<message>]}`` when the LLM call or the parsing raises
        (the error is also printed).
    """
    course_meta, lesson_meta = state['course_info'], state['lesson_info']
    output_parser = PydanticOutputParser(pydantic_object=GeneratedLessonContent)

    system_template = """
    You are an expert Chinese HSK Curriculum Developer.
    Your goal is to generate concrete lesson content (Sections -> Quizzes) based on the provided metadata.
    
    GUIDELINES:
    - Language Level: {hsk_level}
    - Support Languages: English & Vietnamese
    - Structure: Create distinct sections (Vocabulary, Grammar, Practice).
    - Vocabulary: Must use 'flashcard' quiz type.
    - Grammar: Must provide markdown explanations (`content_md`) and then 'single_choice' or 'sentence' quizzes.
    - JSON: You must strictly follow the output schema.
    """

    user_template = """
    GENERATE CONTENT FOR:
    Course: {course_name}
    Lesson Title: {lesson_title}
    Objectives: {objectives}
    Description: {description}
    """

    chat_messages = [
        ("system", system_template),
        ("user", user_template),
        ("user", "Format Instructions: {format_instructions}"),
    ]
    template_values = {
        "hsk_level": course_meta.get('hsk_level', 'HSK1'),
        "course_name": course_meta.get('name'),
        "lesson_title": lesson_meta.get('title'),
        "objectives": lesson_meta.get('learning_objectives'),
        "description": lesson_meta.get('description_en'),
        "format_instructions": output_parser.get_format_instructions(),
    }
    # Prompt rendering stays outside the try block: only the LLM call and the
    # parsing are treated as recoverable failures.
    prompt_value = ChatPromptTemplate.from_messages(chat_messages).invoke(template_values)

    try:
        llm_reply = llm.invoke(prompt_value)
        lesson_content = output_parser.parse(llm_reply.content)
    except Exception as exc:
        print(f"Error generating {lesson_meta.get('title')}: {exc}")
        return {"errors": [str(exc)]}
    return {"generated_content": lesson_content}

# 4. Build Graph
# Single-node graph: entry point -> "generator" -> END, with GeneratorState as the state type.
workflow = StateGraph(GeneratorState)
# Register generate_lesson_content under the node name "generator".
workflow.add_node("generator", generate_lesson_content)
workflow.set_entry_point("generator")
workflow.add_edge("generator", END)
# `app` is the compiled graph; main() calls app.invoke(...) once per lesson row.
app = workflow.compile()

# 5. Main Execution Logic
def _flatten_lesson(content, lesson_key, tables):
    """Append the CSV rows for one generated lesson to the row lists in ``tables``."""
    for sec in content.sections:
        sec_id = str(uuid.uuid4())  # Temporary ID for linking

        tables["sections"].append({
            "id": sec_id,
            "lesson_key": lesson_key,  # Link to Lesson
            "section_type": sec.section_type,
            "title_en": sec.title_en,
            "title_vn": sec.title_vn,
            "content_md": sec.content_md,
            "order": sec.order
        })

        for quiz in sec.quizzes:
            quiz_id = str(uuid.uuid4())

            tables["quizzes"].append({
                "id": quiz_id,
                "section_id": sec_id,  # Link to Section
                "quiz_type": quiz.quiz_type,
                "question_en": quiz.question_en,
                "question_vn": quiz.question_vn,
                "explanation_en": quiz.explanation_en,
                "explanation_vn": quiz.explanation_vn,
                "order": quiz.order
            })

            # Only the first populated payload is exported (same precedence as
            # the polymorphic-field order on GeneratedQuiz).
            if quiz.flashcard:
                tables["flashcards"].append({"quiz_id": quiz_id, **quiz.flashcard.model_dump()})
            elif quiz.options:
                tables["options"].extend({"quiz_id": quiz_id, **opt.model_dump()} for opt in quiz.options)
            elif quiz.pairs:
                tables["pairs"].extend({"quiz_id": quiz_id, **pair.model_dump()} for pair in quiz.pairs)
            elif quiz.sentence:
                tables["sentences"].append({"quiz_id": quiz_id, **quiz.sentence.model_dump()})


def _export_tables(tables, output_dir):
    """Write every row list in ``tables`` to ``<output_dir>/<name>.csv``."""
    # Explicit column lists keep the header row even when a table has no rows
    # (e.g. no 'matching' quiz was generated), so downstream imports still see
    # the expected columns. The order matches the row dicts built above.
    columns = {
        "sections": ["id", "lesson_key", "section_type", "title_en", "title_vn", "content_md", "order"],
        "quizzes": ["id", "section_id", "quiz_type", "question_en", "question_vn",
                    "explanation_en", "explanation_vn", "order"],
        "flashcards": ["quiz_id", *FlashCardData.model_fields],
        "options": ["quiz_id", *QuizOptionData.model_fields],
        "pairs": ["quiz_id", *MatchPairData.model_fields],
        "sentences": ["quiz_id", *SentenceQuizData.model_fields],
    }
    os.makedirs(output_dir, exist_ok=True)
    for name, rows in tables.items():
        pd.DataFrame(rows, columns=columns[name]).to_csv(
            os.path.join(output_dir, f"{name}.csv"), index=False
        )


def main(courses_csv='../courses_data/courses.csv',
         lessons_csv='../courses_data/lessons.csv',
         output_dir='output_data'):
    """Generate content for every lesson and export it as six linked CSV files.

    Args:
        courses_csv: Path to the courses table; needs a ``name`` column.
        lessons_csv: Path to the lessons table; needs ``title`` and ``course`` columns.
        output_dir: Directory that receives sections/quizzes/flashcards/options/
            pairs/sentences ``.csv`` (created if missing).

    Lessons whose course cannot be found, or whose generation reports errors,
    are skipped and listed in the final summary instead of aborting the run or
    being dropped silently.
    """
    # Load inputs
    courses_df = pd.read_csv(courses_csv)
    lessons_df = pd.read_csv(lessons_csv)

    # Row accumulators for CSV export, keyed by output file stem
    tables = {name: [] for name in
              ("sections", "quizzes", "flashcards", "options", "pairs", "sentences")}
    skipped = []

    for _, lesson_row in lessons_df.iterrows():
        # We use the Lesson Title to link sections back to the lesson in the DB import stage
        lesson_key = lesson_row['title']
        print(f"Generating content for: {lesson_key}...")

        # Get parent course data; a lesson naming an unknown course is skipped
        # rather than crashing the whole batch with an IndexError.
        course_matches = courses_df[courses_df['name'] == lesson_row['course']]
        if course_matches.empty:
            print(f"Skipping {lesson_key}: no course named {lesson_row['course']!r} in {courses_csv}")
            skipped.append(lesson_key)
            continue
        course_row = course_matches.iloc[0]

        # Run Graph
        result = app.invoke({
            "course_info": course_row.to_dict(),
            "lesson_info": lesson_row.to_dict(),
            "generated_content": None,
            "errors": []
        })

        if result.get('errors') or result.get('generated_content') is None:
            skipped.append(lesson_key)
            continue

        _flatten_lesson(result['generated_content'], lesson_key, tables)

    # Export to CSV
    _export_tables(tables, output_dir)

    if skipped:
        print(f"CSVs written to {output_dir}/, but {len(skipped)} lesson(s) were skipped: {skipped}")
    else:
        print(f"All CSVs generated successfully in {output_dir}/")

In [20]:
# Run the whole pipeline when this cell executes as the top-level module.
# main() calls app.invoke() once per lesson row and then writes the CSVs.
if __name__ == "__main__":
    main()

Generating content for: Hello - Ni Hao...
Generating content for: Thank you - Xie Xie...
All CSVs generated successfully in output_data/
