<a target="_blank" href="https://colab.research.google.com/github/okareo-ai/okareo-python-sdk/blob/main/notebooks/intent_class_synthetic/generating_train_scenarios.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>



## Generate Training Scenarios

In [None]:
import os

# Prefer the OKAREO_API_KEY environment variable so a real key never has to be
# saved inside the notebook; fall back to the placeholder, which can still be
# edited by hand when no environment variable is set.
OKAREO_API_KEY = os.environ.get("OKAREO_API_KEY", "YOUR_API_KEY")

In [None]:
# Install the okareo package, which provides the `okareo` module imported below.
%pip install okareo 

We start with the human-generated training data stored in `data/training.csv`, which we upload to Okareo as the seed Scenario Set.

In [None]:
# Load libraries
from okareo import Okareo
import os
import tempfile
import json
import pandas as pd

# Initialize Okareo client
okareo = Okareo(OKAREO_API_KEY)

# Load the seed data; each CSV row becomes one dict keyed by column name
data = pd.read_csv("data/training.csv")
rows = data.to_dict(orient="records")

# Stage the rows as a .jsonl file (one JSON object per line) in the system temp dir
temp_dir = tempfile.gettempdir()
file_path = os.path.join(temp_dir, "seed_data_sample.jsonl")
try:
    # Write-only access is enough here; the file is never read back in this cell
    with open(file_path, "w", encoding="utf-8") as file:
        for row in rows:
            file.write(json.dumps(row) + '\n')

    # Create scenario set with seed data file
    source_scenario = okareo.upload_scenario_set(file_path=file_path, scenario_name="Blog Training Set")
    print(source_scenario.app_link)
finally:
    # Always clean up the tmp file, even when writing or uploading fails
    if os.path.exists(file_path):
        os.remove(file_path)

From the seed Scenario Set, we create a new Scenario Set using Okareo's Rephrasing generator.

In [None]:
from okareo_api_client.models import ScenarioType

# Arguments for the rephrasing run; the seed scenario set uploaded in the
# previous step is passed as the generation source.
rephrase_request = dict(
    source_scenario=source_scenario,
    name="Blog - train - rephrase",
    number_examples=3,
    generation_type=ScenarioType.REPHRASE_INVARIANT,
)
rephrased_scenario = okareo.generate_scenarios(**rephrase_request)

# Print the link to the generated scenario set
print(rephrased_scenario.app_link)

In [None]:
# Fetch the rephrased data points and persist them as input/result pairs in a csv file
dps = okareo.get_scenario_data_points(rephrased_scenario.scenario_id)
formatted = []
for dp in dps:
    formatted.append({"input": dp.input_, "result": dp.result})
rephrased_frame = pd.DataFrame(formatted)
rephrased_frame.to_csv("data/rephrased.csv", index=False)

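Now we repeat the same two steps (generate a Scenario Set, then save its data points to a CSV file) for the other generators: common misspellings, common contractions, and conditional.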

In [None]:
# Arguments for the common-misspellings run, sourced from the seed scenario set
spelling_request = dict(
    source_scenario=source_scenario,
    name="Blog - train - spelling",
    number_examples=3,
    generation_type=ScenarioType.COMMON_MISSPELLINGS,
)
spelling_scenario = okareo.generate_scenarios(**spelling_request)

# Print the link to the generated scenario set
print(spelling_scenario.app_link)

In [None]:
# Fetch the misspelling data points and persist them as input/result pairs in a csv file
dps = okareo.get_scenario_data_points(spelling_scenario.scenario_id)
formatted = []
for dp in dps:
    formatted.append({"input": dp.input_, "result": dp.result})
spelling_frame = pd.DataFrame(formatted)
spelling_frame.to_csv("data/spelling.csv", index=False)

In [None]:
# Arguments for the common-contractions run, sourced from the seed scenario set
contr_request = dict(
    source_scenario=source_scenario,
    name="Blog - train - contractions",
    number_examples=3,
    generation_type=ScenarioType.COMMON_CONTRACTIONS,
)
contr_scenario = okareo.generate_scenarios(**contr_request)

# Print the link to the generated scenario set
print(contr_scenario.app_link)

In [None]:
# Fetch the contraction data points and persist them as input/result pairs in a csv file
dps = okareo.get_scenario_data_points(contr_scenario.scenario_id)
formatted = []
for dp in dps:
    formatted.append({"input": dp.input_, "result": dp.result})
contr_frame = pd.DataFrame(formatted)
contr_frame.to_csv("data/contr.csv", index=False)

In [None]:
# Arguments for the conditional run, sourced from the seed scenario set
cond_request = dict(
    source_scenario=source_scenario,
    name="Blog - train - conditional",
    number_examples=3,
    generation_type=ScenarioType.CONDITIONAL,
)
cond_scenario = okareo.generate_scenarios(**cond_request)

# Print the link to the generated scenario set
print(cond_scenario.app_link)

In [None]:
# Fetch the conditional data points and persist them as input/result pairs in a csv file
dps = okareo.get_scenario_data_points(cond_scenario.scenario_id)
formatted = []
for dp in dps:
    formatted.append({"input": dp.input_, "result": dp.result})
cond_frame = pd.DataFrame(formatted)
cond_frame.to_csv("data/cond.csv", index=False)