In [None]:
import os
import json
import random

def split_jsonl_for_training(directory, seed=42, train_size=50):
    """Split every raw ``*.json`` task file under ``directory`` into train/test JSONL files.

    The directory tree is walked recursively. A file is processed when its name
    ends with ``.json`` and contains neither ``"train"`` nor ``"test"``. The
    list stored under its top-level ``"examples"`` key is split into a randomly
    sampled training part of ``train_size`` items (all items when there are not
    more than ``train_size``) and a test part holding the remaining items in
    their original order. Both parts are written next to the source file as
    ``<name>.train.jsonl`` and ``<name>.test.jsonl``, one JSON value per line.
    The number of examples of each processed file is printed.

    Args:
        directory: Root directory to search.
        seed: Seed for the sampler. The sampler is re-created with this seed
            for every file, so a file's split does not depend on which other
            files were processed before it.
        train_size: Maximum number of examples placed in the training split.

    Raises:
        json.JSONDecodeError: If a matching file does not contain valid JSON.
        ValueError: If ``train_size`` is negative and a file has examples to sample.
    """
    suffix = ".json"

    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith(suffix) or "train" in file or "test" in file:
                continue

            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # A top-level list/scalar has no "examples" key to read; report it
            # instead of failing with an AttributeError on .get().
            if not isinstance(data, dict):
                print(f"Skipping {file_path}: top-level JSON value is not an object.")
                continue

            examples = data.get("examples", [])

            # Private generator: yields the same sample as seeding the global
            # `random` module would, without disturbing the global RNG state.
            rng = random.Random(seed)
            if len(examples) > train_size:
                train_indices = rng.sample(range(len(examples)), train_size)
            else:
                train_indices = list(range(len(examples)))

            # Set membership keeps the test-split construction linear.
            train_index_set = set(train_indices)
            train_data = [examples[i] for i in train_indices]
            test_data = [
                example
                for i, example in enumerate(examples)
                if i not in train_index_set
            ]

            print(len(examples))

            # Strip only the trailing extension; str.replace would also remove
            # any ".json" occurring earlier in the file name.
            base_file_name = file[:-len(suffix)]
            train_file_path = os.path.join(root, f"{base_file_name}.train.jsonl")
            test_file_path = os.path.join(root, f"{base_file_name}.test.jsonl")

            with open(train_file_path, 'w', encoding='utf-8') as f:
                for item in train_data:
                    f.write(json.dumps(item) + "\n")

            with open(test_file_path, 'w', encoding='utf-8') as f:
                for item in test_data:
                    f.write(json.dumps(item) + "\n")



In [None]:

# Split every matching raw task file under the BBHI directory into
# <name>.train.jsonl / <name>.test.jsonl, written next to each source file
# (default seed and train_size).
directory = "BBH_preprocess/BBHI" 
split_jsonl_for_training(directory)

In [None]:
import os
import json

def process_train_jsonl(directory, N_values=(3, 9)):
    """Write N-shot example text files from every ``*.train.jsonl`` file under ``directory``.

    The directory tree is walked recursively. For each file whose name ends
    with ``.train.jsonl``, every line is parsed as JSON and the
    ``("input", "target")`` pair of each record containing an ``"input"`` key
    is collected in file order. For each ``N`` in ``N_values`` the first ``N``
    pairs (fewer if the file has fewer) are rendered as::

        [Question]
        <input>

        [Answer]
        <target>

    blocks, each followed by a blank line, and written next to the source file
    as ``<name>.<N>-shot_example.txt``.

    Malformed lines never abort processing: invalid JSON, records that are not
    JSON objects, and records with ``"input"`` but no ``"target"`` are reported
    with ``print`` and skipped. Blank lines and records without ``"input"`` are
    skipped silently.

    Args:
        directory: Root directory to search.
        N_values: Iterable of shot counts; one output file is written per value.
    """
    suffix = ".train.jsonl"

    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith(suffix):
                continue

            file_path = os.path.join(root, file)
            # Strip only the trailing suffix, not every occurrence of it.
            base_file_name = file[:-len(suffix)]

            user_contents = []
            with open(file_path, 'r', encoding='utf-8') as f:
                for line_number, line in enumerate(f, 1):
                    if not line.strip():
                        # Blank (e.g. trailing) lines carry no record.
                        continue
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError:
                        print(f"Json error in file {file_path} at line {line_number}.")
                        continue
                    # Valid JSON that is not an object (number, string, list)
                    # cannot hold the keys read below.
                    if not isinstance(data, dict):
                        print(f"Skipping non-object record in file {file_path} at line {line_number}.")
                        continue
                    if "input" not in data:
                        continue
                    try:
                        user_contents.append((data["input"], data["target"]))
                    except KeyError as e:
                        print(f"Key error in file {file_path} at line {line_number}. Key: {e}")

            for N in N_values:
                # max(N, 0) keeps a negative N meaning "no examples" instead of
                # letting the slice count from the end of the list.
                examples_content = "".join(
                    f"[Question]\n{question}\n\n[Answer]\n{target}\n\n"
                    for question, target in user_contents[:max(N, 0)]
                )
                output_path = os.path.join(root, f"{base_file_name}.{N}-shot_example.txt")
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(examples_content)


# Build the <name>.<N>-shot_example.txt files from the *.train.jsonl files
# under the BBHI directory, using the function's default N_values.
directory = "BBH_preprocess/BBHI"
process_train_jsonl(directory)