In [193]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [194]:
import adalflow as adal
from adalflow.datasets.types import BaseData
from adalflow.core.model_client import ModelClient
from adalflow.components.model_client import AzureAIClient
from dataclasses import dataclass, field
from typing import Dict, Any, Optional, Literal, Union, Tuple, Callable

In [195]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams, LLMTestCase
from deepeval.models import AzureOpenAIModel

In [196]:
from dotenv import load_dotenv
load_dotenv()

True

# 0. Set up model config

In [197]:
# We use DeepEval for G-Eval evaluation
evaluation_model = AzureOpenAIModel(
    model_name="gpt-4o",
    temperature=0.0,
    top_p=0
)

In [198]:
# We use Adalflow clients for student and teacher model
student_model = {
    "model_client": AzureAIClient(
        api_key=os.getenv("AZURE_API_KEY"),
        api_version="2024-08-01-preview",
        azure_endpoint=os.getenv("AZURE_4O_MINI_BASE_URL")
    ),
    "model_kwargs": {
        "model": "gpt-4o-mini",
        "temperature": 0,
        "max_tokens": 4000,
        "top_p": 0
    },
}
# teacher
teacher_model = {
    "model_client": AzureAIClient(
        api_key=os.getenv("AZURE_API_KEY"),
        api_version="2024-08-01-preview",
        azure_endpoint=os.getenv("AZURE_4O_BASE_URL")
    ),
    "model_kwargs": {
        "model": "gpt-4o",
        "temperature": 0.9,
        "max_tokens": 4000,
        "top_p": 0.99,
    },
}

# 1. Create Adalflow Dataclasses

In [199]:
### DATA CLASSES ###
@dataclass
class MovieData(BaseData):
    __doc__ = """A dataclass for representing movie plots."""
    plot_without_ending: str = field(
        metadata={"desc": "plot before ending"},
        default=None,
    )
    ending: str = field(
        metadata={"desc": "ending"},
        default=None,
    )
    
    __input_fields__ = ["plot_without_ending"]
    __output_fields__ = ["ending"]

    @classmethod
    def from_dict(cls, data: Dict[str, object]):
        data = {
            "id": str(data["Title"]),
            "plot_without_ending": str(data["plot_without_ending"]),
            "ending": str(data["ending"]),
        }
        return super().from_dict(data)

@dataclass
class MovieDataSimple(BaseData):
    __doc__ = """A dataclass for representing movie plots."""
    plot_without_ending: str = field(
        metadata={"desc": "plot before ending"},
        default=None,
    )
    ending: str = field(
        metadata={"desc": "ending"},
        default=None,
    )
    
    __input_fields__ = ["plot_without_ending"]
    __output_fields__ = ["ending"]


# 2. Train-Val-Test Split

In [200]:
def adalflow_train_val_test_split(
    df,
    test_size=50,
    val_size=25,
    random_state=0
):
    df_trainval, df_test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state
    )
    df_train, df_val = train_test_split(
        df,
        test_size=val_size,
        random_state=random_state
    )
    
    records_train = df_train.to_dict(orient='records')
    records_val = df_val.to_dict(orient='records')    
    records_test = df_test.to_dict(orient='records')
    
    train_dataset = [MovieData.from_dict(record) for record in records_train]
    val_dataset = [MovieData.from_dict(record) for record in records_val]
    test_dataset = [MovieData.from_dict(record) for record in records_test]

    return train_dataset, val_dataset, test_dataset    


# 3. LLM-as-Judge Evaluator

In [201]:
### LLM Judge Evaluator ###
class MovieEndingLLMJudge():
    def __init__(
        self,
        df_criteria,
        evaluation_model,
        output_type: Literal["bool", "float"] = "float",
        use_cache: bool = True,
    ):
        #super().__init__()
        self.__name__ = "geval"

        metrics = [
            GEval(
                name=criterion,
                evaluation_steps=df_criteria.loc[criterion],
                evaluation_params=[
                    LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT
                ],
                model=evaluation_model,
                async_mode=False
            ) for criterion in df_criteria.index
        ]
        self.metrics = metrics

    
    def __call__(
        self,
        plot_without_ending: str,
        ending: str
    ) -> Union[bool, float]:

        # convert to deepeval test case
        test_case = LLMTestCase(
            input = plot_without_ending,
            actual_output = ending
        )

        # run g_eval
        scores = []
        for metric in self.metrics:
            try:
                metric_score = metric.measure(
                    test_case,
                    _show_indicator=False
                )
            except Exception as e:
                metric_score = 0
                
            scores.append(metric_score)

        return float(np.mean(scores))

# 4. Set up task pipeline (i.e. generate movie ending from plot)

In [202]:
class GenerateMovieEnding(adal.Component):

    def __init__(self, 
        template,
        instructions,
        model_client: adal.ModelClient, 
        model_kwargs: Dict
    ):
        super().__init__()

        self.data_class = MovieDataSimple

        prompt_kwargs = {
            "instructions": adal.Parameter(
                data=instructions,
                role_desc="Instructions to generate movie",
                requires_opt=True,
                param_type=adal.ParameterType.PROMPT,
            ),
            "few_shot_demos": adal.Parameter(
                data=None,
                requires_opt=True,
                role_desc="Few shot examples to help the model",
                param_type=adal.ParameterType.DEMOS,
            ),
        }

        self.llm = adal.Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            prompt_kwargs=prompt_kwargs,
            template=template,
            use_cache=True,
        )

    def bicall(
        self, 
        plot_without_ending: str, 
        id: Optional[str] = None
    ) -> Union[adal.GeneratorOutput, adal.Parameter]:
        output = self.llm(
            prompt_kwargs={
                "plot_without_ending": plot_without_ending
            }, 
            id=id
        )
        return output

# 5. Create training pipeline

In [203]:
class MovieEndingOptAdal(adal.AdalComponent):
    def __init__(
        self,
        template,
        eval_fn_desc,
        df_criteria,
        instructions,
        model_client: adal.ModelClient,
        model_kwargs: Dict,
        teacher_model_config: Dict,
        evaluation_model,
        backward_engine_model_config: Dict,
        text_optimizer_model_config: Dict,
    ):
        task = GenerateMovieEnding(
            template,
            instructions,
            model_client, 
            model_kwargs
        )

        eval_fn = MovieEndingLLMJudge(
            df_criteria=df_criteria,
            evaluation_model=evaluation_model,
        )
        loss_fn = adal.EvalFnToTextLoss(
            eval_fn=eval_fn,
            eval_fn_desc=eval_fn_desc
        )
        
        super().__init__(
            task=task,
            eval_fn=eval_fn,
            loss_fn=loss_fn,
            backward_engine_model_config=backward_engine_model_config,
            text_optimizer_model_config=text_optimizer_model_config,
            teacher_model_config=teacher_model_config,
        )

    def prepare_task(self, sample: MovieDataSimple):
        return self.task.call, {
            "plot_without_ending": sample.plot_without_ending, 
            "id": sample.id
        }

    def prepare_eval(
        self, sample: MovieDataSimple, output_obj: adal.GeneratorOutput
    ) -> float:
        out_string = output_obj.data
        return self.eval_fn, {
            "plot_without_ending": sample.plot_without_ending, 
            "ending": out_string
        }

    def prepare_loss(
        self, sample: MovieDataSimple, pred: adal.Parameter, *args, **kwargs
    ) -> Tuple[Callable[..., Any], Dict]:

        pred.eval_input = pred.full_response.data
        
        plot_without_ending_param = adal.Parameter(
            name="plot without ending",
            data=sample.plot_without_ending,
            eval_input=sample.plot_without_ending,
            requires_opt=False,
        )
        
        return self.loss_fn, {"kwargs": {
            "plot_without_ending": plot_without_ending_param,
            "ending": pred
        }, "id": sample.id}

def train(
    template,
    eval_fn_desc,
    df_criteria,
    instructions,
    train_dataset,
    val_dataset,
    test_dataset,
    model_client,
    model_kwargs,
    teacher_model,
    evaluation_model,
    train_batch_size=4,
    raw_shots=1,
    bootstrap_shots=1,
    max_steps=12,
    num_workers=4,
    strategy="constrained",
    optimization_order="sequential",
    debug=False,
    ckpt_path='ckpts/'
):

    adal_component = MovieEndingOptAdal(
        template=template,
        eval_fn_desc=eval_fn_desc,
        df_criteria=df_criteria,
        instructions=instructions,
        model_client=model_client,
        model_kwargs=model_kwargs,
        teacher_model_config=teacher_model,
        evaluation_model=evaluation_model,
        text_optimizer_model_config=teacher_model,
        backward_engine_model_config=teacher_model,
    )
    trainer = adal.Trainer(
        train_batch_size=train_batch_size,
        adaltask=adal_component,
        strategy=strategy,
        max_steps=max_steps,
        num_workers=num_workers,
        raw_shots=raw_shots,
        bootstrap_shots=bootstrap_shots,
        debug=debug,
        weighted_sampling=True,
        optimization_order=optimization_order,
        exclude_input_fields_from_bootstrap_demos=False,
        ckpt_path=ckpt_path
    )

    trainer.fit(
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        test_dataset=test_dataset,
        debug=debug,
    )

# 6. Run the training pipeline

In [204]:
df_plot = pd.read_csv("data/wiki_movie_plots_base.csv").iloc[:,1:][['Title', 'plot_without_ending', 'ending']]

train_dataset, val_dataset, test_dataset = adalflow_train_val_test_split(
    df_plot,
    test_size=50, # 50 test rows
    val_size=25, # 25 val rows, so 108-50-25=33 train rows
    random_state=0
)

In [205]:
df_criteria = pd.read_csv("data/movie_ending_criteria.csv")
df_criteria['evaluation_steps'] = df_criteria['evaluation_steps'].str.split("\n")
df_criteria = df_criteria.set_index("criteria")['evaluation_steps']

template = open("prompts/template.txt").read()
instructions = "Continue the movie plot by writing the ending."
eval_fn_desc = open("prompts/eval_fn_desc.txt").read()

In [206]:

# Test first if the LLM judge works
llm_judge = MovieEndingLLMJudge(
    df_criteria,
    evaluation_model = evaluation_model
)
llm_judge(test_dataset[0].plot_without_ending, test_dataset[0].ending)

0.8166666666666668

In [207]:
task_pipeline = GenerateMovieEnding(
    template,
    instructions,
    model_client = student_model["model_client"],
    model_kwargs = student_model["model_kwargs"]
)

# Test if the task pipeline works
task_pipeline(test_dataset[0].plot_without_ending)

GeneratorOutput(id=None, data="As the bar mitzvah approaches, tensions rise within the Jordan family. Trish, feeling the weight of her past and the lies she has told her children, decides to confront Bill, who has returned to Florida, demanding he take responsibility for his actions and the pain he has caused. In a heated confrontation, Trish reveals the truth about his past to Timmy and Chloe, who are devastated but ultimately find strength in their mother's honesty. Meanwhile, Joy grapples with her grief over Allen's suicide, channeling her emotions into a heartfelt speech at Timmy's bar mitzvah, where she emphasizes the importance of truth and resilience. The ceremony becomes a cathartic moment for the family, as they come together to support one another, acknowledging their scars while vowing to break the cycle of trauma. In the final scene, the sisters share a quiet moment on the porch, reflecting on their tumultuous lives, but with a newfound sense of hope and determination to fo

In [None]:
# Run the training pipeline
train(
    template,
    eval_fn_desc,
    df_criteria,
    instructions,
    train_dataset,
    val_dataset,
    test_dataset,
    student_model['model_client'],
    student_model['model_kwargs'],
    teacher_model,
    evaluation_model,
    train_batch_size=4,
    raw_shots=1,
    bootstrap_shots=1,
    max_steps=12,
    num_workers=4,
    strategy="constrained",
    optimization_order="sequential",
    debug=False,
)

raw_shots: 1, bootstrap_shots: 1
BootstrapFewShot: ['llm.few_shot_demos']
[36m2025-03-28 14:41:10 - [adal.py:852:configure_text_optimizer_helper] - Text optimizer configured for 1 parameters. names: [('llm.instructions', 'Continue the movie plot by writing the ending.')][0m
Configuring teacher generator.
Configuring teacher generator for Generator(
  model_kwargs={'model': 'gpt-4o', 'temperature': 0.9, 'max_tokens': 4000, 'top_p': 0.99}, trainable_prompt_kwargs=[]
  (prompt): template: {{instructions}}
  
  Your response should consist of exactly one paragraph.
  
  Plot so far:
  {{plot_without_ending}}
  
  {% if few_shot_demos is not none %}
  Here are some examples:
  {{few_shot_demos}}
  {% endif %}, prompt_kwargs: {'instructions': 'Continue the movie plot by writing the ending.', 'few_shot_demos': 'None'}, prompt_variables: ['plot_without_ending', 'few_shot_demos', 'instructions']
  (model_client): AzureAIClient()
)
Teacher generator set: Generator(
  model_kwargs={'model': 'gp


Loading Data: 100%|██████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 20883.81it/s][A
Predicting: step(0): 0.7667 across 1 samples, Max potential: 0.9907:   4%|▎      | 1/25 [00:12<04:51, 12.14s/it]