In [3]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Fixed seed for NumPy's global RNG so the shuffles in this notebook are repeatable
SEED = 42
np.random.seed(SEED)

In [4]:
# Load the CSV into a DataFrame, then display how many rows it contains
data = pd.read_csv('../data/CRM_data_no_comments.csv')
data.shape[0]

412

In [5]:
# Removing backticks (`) from the 'context' column in the DataFrame [Optional]
# The vectorized .str accessor replaces the per-row lambda; regex=False makes
# the match a plain literal, and missing values stay NaN here instead of
# raising AttributeError.
data['context'] = data['context'].str.replace('`', '', regex=False)
data.head()

Unnamed: 0,question,context,solution,topic,train_test_ind,solution_no_comments
0,Highest Average Score for Products,Pandas DataFrame df containing transaction dat...,def highest_avg_score_for_products(df):\n #...,article_recommendation,Train,def highest_avg_score_for_products(df):\n a...
1,Can you find the most common product category ...,Pandas DataFrame df containing transaction dat...,"def most_common_product_category_in_city(df, c...",article_recommendation,Train,"def most_common_product_category_in_city(df, c..."
2,What is the most common product category purch...,Pandas DataFrame df containing transaction dat...,"def most_common_product_category_in_city(df, c...",article_recommendation,Test,"def most_common_product_category_in_city(df, c..."
3,I want to find the top 3 customers who have pu...,Pandas DataFrame df containing transaction dat...,"def top_n_customers_and_products(df, n=3):\n ...",article_recommendation,Train,"def top_n_customers_and_products(df, n=3):\n ..."
4,Can you provide me with the top 5 customers wh...,Pandas DataFrame df containing transaction dat...,"def top_n_customers_and_products(df, n=5):\n ...",article_recommendation,Test,"def top_n_customers_and_products(df, n=5):\n ..."


In [6]:
# Splitting data into train, validation and test sets

n_rows = len(data)
shuffled_indices = np.random.permutation(n_rows)  # Shuffle the row positions

# Define the proportions for each set
train_prop = 0.8
val_prop = 0.1
test_prop = 0.1

# The test set simply takes whatever rows remain after train and validation,
# so test_prop is only honoured if the three proportions stay consistent.
assert np.isclose(train_prop + val_prop + test_prop, 1.0), \
    "train_prop + val_prop + test_prop must sum to 1"

# Calculate the sizes of each set; int() truncates, so the test set absorbs
# any rows left over by rounding.
num_train = int(n_rows * train_prop)
num_val = int(n_rows * val_prop)
num_test = n_rows - num_train - num_val

# Split the shuffled indices into three contiguous, non-overlapping slices
train_indices = shuffled_indices[:num_train]
val_indices = shuffled_indices[num_train:num_train + num_val]
test_indices = shuffled_indices[num_train + num_val:]

# Create the subsets using the positional indices
train_df = data.iloc[train_indices]
val_df = data.iloc[val_indices]
test_df = data.iloc[test_indices]

# Sanity checks: every row lands in exactly one subset
assert len(test_df) == num_test
assert len(train_df) + len(val_df) + len(test_df) == n_rows

In [7]:
# Define a message template
# NOTE: this text becomes part of every system message, so it is kept exactly
# as written (including the trailing space after "English,").
system_message = """
You are a Python function generator. Users will ask you questions in English, 
and you will produce a Python function as answer to the question based on the provided CONTEXT.

CONTEXT:
{context}
"""

def create_messages(sample):
    """
    Build a chat-style record from one dataset row.

    Backticks are stripped from each field before it is placed in a message.

    Args:
        sample (pd.Series): A row providing string values under the keys 'context', 'question',
            and 'solution_no_comments'. Any object supporting ``sample[key]`` lookup works.

    Returns:
        dict: A dictionary with a single "messages" key holding three messages in order:
              the system message (``system_message`` with the context filled in), the user
              message (the question), and the assistant message (the solution without comments).
    """
    def strip_backticks(text):
        # Same cleanup for all three fields.
        return text.replace('`', '')

    return {
        "messages": [
            {"role": "system", "content": system_message.format(context=strip_backticks(sample["context"]))},
            {"role": "user", "content": strip_backticks(sample["question"])},
            {"role": "assistant", "content": strip_backticks(sample["solution_no_comments"])}
        ]
    }

# Turn every row of each split into a {"messages": [...]} record
train_dataset, val_dataset, test_dataset = (
    frame.apply(create_messages, axis=1)
    for frame in (train_df, val_df, test_df)
)

# Shuffle each split (train, then val, then test) and renumber the index from 0
random_train_dataset, random_val_dataset, random_test_dataset = (
    records.sample(frac=1).reset_index(drop=True)
    for records in (train_dataset, val_dataset, test_dataset)
)

# Write each shuffled split to its JSON file as a list of records
outputs = {
    "../data/train_CRM_data.json": random_train_dataset,
    "../data/val_CRM_data.json": random_val_dataset,
    "../data/test_CRM_data.json": random_test_dataset,
}
for path, records in outputs.items():
    records.to_json(path, orient="records")