In [1]:
from simulation_modules import chatbot_sim as cs
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
import json
from pprint import pprint

In [14]:
# Load the client-name dataset; each record looks like {'gender': gender, 'name': name}.
cli_manager = cs.InputManager("simulation_modules/client_names.jsonl") # loads the entire dataset

# Tunable values are named so the comments cannot drift away from the code
# (the previous comment said "20" while the code requested 80 names).
SAMPLE_SIZE = 80     # number of names requested from the dataset
TEST_FRACTION = 0.2  # share of the sample held out as the test set
SPLIT_SEED = 42      # fixed seed so the train/test split is repeatable
N_CHAT_FLUXES = 12   # conversations generated per client; NOTE(review): presumably len(bot.default_chat_flux) -- confirm

# NOTE(review): get_sample is described as random; whether it can be seeded is not
# visible from this notebook, so the sampled names may differ between runs.
cli_data = cli_manager.get_sample(absolute=SAMPLE_SIZE) # retrieve SAMPLE_SIZE random names
train, test = train_test_split(cli_data, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
print("Train size:", len(train), "Test size:", len(test), ". Expected total conversations (all possible fluxes): ", len(train)*N_CHAT_FLUXES)

Train size: 64 Test size: 16 . Expected total conversations (all possible fluxes):  768


In [15]:
# Inspect the held-out test clients (16 records for the split above, so a full dump stays readable).
pprint(test)

[{'gender': 'f', 'name': 'Valeria Velásquez'},
 {'gender': 'f', 'name': 'Guadalupe Zárate'},
 {'gender': 'm', 'name': 'José Antonio Fernández'},
 {'gender': 'm', 'name': 'Fernando Díaz'},
 {'gender': 'f', 'name': 'Julia Campos'},
 {'gender': 'f', 'name': 'Karina Rodríguez'},
 {'gender': 'f', 'name': 'Lucía Guzmán'},
 {'gender': 'f', 'name': 'Dulce Villaseñor'},
 {'gender': 'f', 'name': 'Marina Espinoza'},
 {'gender': 'm', 'name': 'Ciro Bustamante'},
 {'gender': 'm', 'name': 'Diego Martínez'},
 {'gender': 'm', 'name': 'Gonzalo Esteban Morales'},
 {'gender': 'f', 'name': 'Graciela Becerra'},
 {'gender': 'm', 'name': 'Jaime Moreno'},
 {'gender': 'f', 'name': 'Luciana Paola Hernández'},
 {'gender': 'f', 'name': 'Beatriz Cordero'}]


In [16]:
# Start of the date range passed to generate_random_entries.
s_date = datetime(2024, 1, 1)

# End of the date range passed to generate_random_entries.
# NOTE(review): datetime.now() makes the range depend on when the notebook is run;
# pin a fixed date here if the generated entries must be reproducible.
e_date = datetime.now()

agent_names = ["Raúl"]
bank_name = "Banco Azteca"

dates_generator = cs.DatesGenerator()

# Train entries are generated in "random_day" mode and test entries in "current_day" mode.
# NOTE(review): presumably "current_day" anchors every test entry to today's date
# (the stored test output shows a single date) -- confirm in chatbot_sim.
train_entries_generator = cs.RandomEntryGenerator("random_day", agent_names, train, bank_name, dates_generator)
test_entries_generator = cs.RandomEntryGenerator("current_day", agent_names, test, bank_name, dates_generator)

train_entries = train_entries_generator.generate_random_entries(s_date, e_date)
test_entries = test_entries_generator.generate_random_entries(s_date, e_date)

# Preview only the first entries: dumping every entry floods the cell output
# without adding information, since all entries share the same keys.
pprint(test_entries[:2])

[{'amount_pesos': 2354,
  'ask_for_payment_day': 'jueves',
  'bank_name': 'Banco Azteca',
  'client_name': {'gender': 'f', 'name': 'Valeria Velásquez'},
  'current_date': '11 de diciembre de 2024',
  'days_late': 41,
  'name_of_the_agent': 'Raúl',
  'system_current_date_time': 'miércoles 2024-12-11 04:10 PM',
  'system_current_datetime_object': datetime.datetime(2024, 12, 11, 16, 10, 15, 414364),
  'system_tomorrow_date': 'jueves 2024-12-12',
  'tomorrow_date': '12 de diciembre de 2024'},
 {'amount_pesos': 27076,
  'ask_for_payment_day': 'jueves',
  'bank_name': 'Banco Azteca',
  'client_name': {'gender': 'f', 'name': 'Guadalupe Zárate'},
  'current_date': '11 de diciembre de 2024',
  'days_late': 42,
  'name_of_the_agent': 'Raúl',
  'system_current_date_time': 'miércoles 2024-12-11 04:10 PM',
  'system_current_datetime_object': datetime.datetime(2024, 12, 11, 16, 10, 15, 414364),
  'system_tomorrow_date': 'jueves 2024-12-12',
  'tomorrow_date': '12 de diciembre de 2024'},
 {'amount_pe

In [17]:
# Create an instance of the AztecaBot and print its default chat fluxes.
# The printed value is a list of boolean lists.
# NOTE(review): presumably each inner list selects the conversation steps of one flux -- confirm in chatbot_sim.
bot = cs.AztecaBot()
print(bot.default_chat_flux)

[[True, True, True, False, False, False, False, False, False, False, False], [True, True, False, False, False, False, False, False, False, False, False], [True, True, False, True, False, False, False, False, False, False, False], [True, False, False, False, True, True, False, False, False, False, False], [True, False, False, False, True, False, False, False, False, False, False], [True, False, False, False, True, False, True, False, False, False, False], [True, False, False, False, False, False, False, True, True, False, False], [True, False, False, False, False, False, False, True, False, False, False], [True, False, False, False, False, False, False, True, False, True, False], [True, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, True], [False, False, False, False, False, False, False, False, False, False, False]]


Create the training dataset for gpt-4o-mini fine-tuning

In [18]:
# Build the training dataset from train_entries, using the bot and its default
# chat fluxes, and save it to train_azteca_v4.jsonl.
cs.FineTuningDataset.create_dataset(
    entries=train_entries,
    chatbot=bot,
    chat_flux=bot.default_chat_flux,
    generate_weights=True, #all weights have the value of 1 by default
    filename="train_azteca_v4.jsonl"
)

Dataset created successfully and saved in train_azteca_v4.jsonl


Create the test dataset for gpt-4o-mini fine-tuning

In [19]:
# Build the test dataset from test_entries, using the bot and its default
# chat fluxes, and save it to test_azteca_v4.jsonl.
# NOTE(review): generate_weights is not passed here, so create_dataset's own
# default applies -- confirm that this is intended for the test file.
cs.FineTuningDataset.create_dataset(
    entries=test_entries,
    chatbot=bot,
    chat_flux=bot.default_chat_flux,
    filename="test_azteca_v4.jsonl"
)

Dataset created successfully and saved in test_azteca_v4.jsonl


### Analyze Assistant Responses Patterns

Count how many responses in the dataset match each 'assistant' pattern. The more responses a given assistant pattern has, the more likely the model is to learn that specific assistant response quickly.

In [20]:
# Analyze the training file against the bot's assistant patterns.
# NOTE(review): the result appears to map each pattern name to its number of
# occurrences (see the printed statistics) -- confirm in chatbot_sim.
patterns = bot.assistant_patterns
pattern_statistics = cs.FineTuningDataset.analyze_dataset_patterns("train_azteca_v4.jsonl", patterns)

In [21]:
# Show the per-pattern statistics computed for the training file.
print(f"Pattern statistics: {pattern_statistics}")

Pattern statistics: {'greeting': 768, 'primary_info': 640, 'amount_reconfirmation': 576, 'final_reconfirmation': 384, 'ask_for_tomorrow_pay': 384, 'contact_you_later': 256, 'first_attempt_agreement': 448, 'second_attempt_agreement': 256, 'ask_for_line_holder': 128, 'ask_for_callback': 64, 'wrong_person': 64}


### Configure the weights of the Assistant Responses
Configure the weights used in the OpenAI chat-completion fine-tuning format: 0 for content the model no longer needs to be trained on; 1 for content the model still needs to learn. For example, the 'greeting' pattern appears in 100% of the conversations, so it is expected that the model learns it quickly and its weight can be set to 0 for part of the dataset.

In [22]:
# Per-pattern thresholds: the fraction of the dataset after which the weight of
# an assistant pattern is set to 0.
assistant_thresholds_config = {
    # After 50% of the dataset, set weight to 0 (50% of the dataset gets weight = 0).
    "greeting": 0.5,
    # After 70% of the dataset, set weight to 0 (30% of the dataset gets weight = 0).
    "primary_info": 0.7,
    # Threshold 1: after 100% of the dataset, set weight to 0
    # (0% of the dataset gets weight = 0).
    **dict.fromkeys(
        (
            "amount_reconfirmation",
            "final_reconfirmation",
            "ask_for_tomorrow_pay",
            "contact_you_later",
            "first_attempt_agreement",
            "second_attempt_agreement",
            "ask_for_line_holder",
            "ask_for_callback",
            "wrong_person",
        ),
        1,
    ),
}
# Same assignment as in the pattern-analysis cell; repeated here, presumably so
# this configuration cell can be run on its own.
patterns = bot.assistant_patterns

In [15]:
# Apply the per-pattern thresholds to the training dataset and save the
# re-weighted copy to train_azteca_weight.jsonl.
# The input is the train_azteca_v4.jsonl file created and analyzed by this
# notebook; the cell previously read train_azteca_v1.jsonl, a file this
# notebook never writes (leftover from an earlier run).
cs.FineTuningDataset.adjust_weights_by_line(
    input_file="train_azteca_v4.jsonl",
    output_file="train_azteca_weight.jsonl",
    thresholds=assistant_thresholds_config,
    patterns=patterns
)

Dataset processed successfully. Saved to train_azteca_weight.jsonl.
