In [18]:
import re
import pandas as pd
import numpy as np

### Data Source 1 ###

data = pd.read_json("sanitized-mbpp.json")

def extract_function_info(row):
    match = re.search(r'def\s+([\w_]+)\((.*?)\):([\s\S]*?)\n', row.code)
    if match:
        function_name = match.group(1)
        arguments = match.group(2).split(',')
        return "Given function arguments " + ','.join(arguments)

data['context'] = data.apply(extract_function_info, axis=1)

system_message = """You are an text to PYTHON function translator. Users will ask you questions in English and you will generate a PYTHON function based on the provided CONTEXT.
CONTEXT:
{context}"""

def create_conversation(sample):
    return {
        "messages": [
            {"role": "system", "content": system_message.format(context=sample["context"])},
            {"role": "user", "content": sample["prompt"]},
            {"role": "assistant", "content": sample["code"]}
        ]
    }

dataset = data.apply(create_conversation, axis=1)
dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)
train_idx = int(len(dataset)*0.8)
train = dataset[:train_idx]
test = dataset[train_idx:]

### Data Source 2 ###

with open("english_python_data.txt", "r") as f:
    file_lines = f.readlines()

dps = []
current_question = None
current_solution = []

for line in file_lines:
    if line.startswith("#"):
        if current_question is not None:
            dps.append({"question": current_question, "solution": ''.join(current_solution)})
        current_question = line[1:].strip()
        current_solution = []
    else:
        current_solution.append(line)

if current_question is not None:
    dps.append({"question": current_question, "solution": ''.join(current_solution)})

python_problems_df = pd.DataFrame(dps)

msk = np.random.rand(len(python_problems_df)) < 0.8

train_df = python_problems_df[msk]
val_df = python_problems_df[~msk]

def alt_extract_function_info(row):
    match = re.search(r'def\s+([\w_]+)\((.*?)\):([\s\S]*?)\n', row.solution)
    if match:
        function_name = match.group(1)
        arguments = match.group(2).split(',')
        return "Given function arguments " + ','.join(arguments)

train_df['context'] = train_df.apply(alt_extract_function_info, axis=1)
val_df['context'] = val_df.apply(alt_extract_function_info, axis=1)

system_message = """You are an text to PYTHON function translator. Users will ask you questions in English and you will generate a PYTHON function based on the provided CONTEXT.
CONTEXT:
{context}"""

def create_conversation(sample):
    return {
        "messages": [
            {"role": "system", "content": system_message.format(context=sample["context"])},
            {"role": "user", "content": sample["question"]},
            {"role": "assistant", "content": sample["solution"]}
        ]
    }

train_dataset = train_df.apply(create_conversation, axis=1)
val_dataset = val_df.apply(create_conversation, axis=1)

concatenated_train = pd.concat([train, train_dataset], axis=0)
concatenated_test = pd.concat([test, val_dataset], axis=0)

randomized_concatenated_train = concatenated_train.sample(frac=1).reset_index(drop=True)
randomized_concatenated_test = concatenated_test.sample(frac=1).reset_index(drop=True)

randomized_concatenated_train.to_json("train_dataset.json", orient="records")
randomized_concatenated_test.to_json("test_dataset.json", orient="records")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['context'] = train_df.apply(alt_extract_function_info, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df['context'] = val_df.apply(alt_extract_function_info, axis=1)


In [19]:
len(randomized_concatenated_train), len(randomized_concatenated_test)

(4309, 1076)