In [1]:
import os
import json
import openai
import random
def read_jsonl_file(filepath):
    """Parse a JSON Lines file into a list of decoded records.

    Args:
        filepath: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        A list holding one decoded JSON value per non-blank line, in the
        order the lines appear in the file.
    """
    records = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                # Blank or whitespace-only lines carry no record.
                continue
            records.append(json.loads(stripped))
    return records

def record_tasks_and_jsonl(root_folder):
    """Collect the JSONL records stored in every directory below ``root_folder``.

    Every directory found while walking ``root_folder`` (at any depth) is
    treated as a task. The records of all ``*.jsonl`` files directly inside
    that directory are concatenated into one list.

    Directories and files are visited in sorted order, so the returned
    mapping and the record order inside each task are the same on every run
    and every platform (``os.walk`` / ``os.listdir`` give no ordering
    guarantee on their own).

    Args:
        root_folder: Directory whose sub-directories are scanned. JSONL files
            placed directly in ``root_folder`` itself are not read.

    Returns:
        A dict mapping each directory's base name to the list of records read
        from its JSONL files. A directory without JSONL files maps to an
        empty list.

    Note:
        Tasks are keyed by base name only, so when two directories at
        different depths share a name, the one visited later replaces the
        earlier entry.
    """
    tasks_dict = {}
    for root, dirs, _files in os.walk(root_folder):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()
        for directory in dirs:
            task_path = os.path.join(root, directory)
            jsonl_files = sorted(
                name for name in os.listdir(task_path) if name.endswith('.jsonl')
            )
            task_data = []
            for jsonl_file in jsonl_files:
                file_path = os.path.join(task_path, jsonl_file)
                task_data.extend(read_jsonl_file(file_path))
            tasks_dict[directory] = task_data
    return tasks_dict

def split_data(data, train_ratio=0.8, seed=None):
    """Shuffle ``data`` and split it into training and testing lists.

    The shuffle is applied to a copy, so the caller's sequence is left
    untouched.

    Args:
        data: Sequence of examples to split.
        train_ratio: Fraction of the examples placed in the training list;
            the cut index is ``int(len(data) * train_ratio)``.
        seed: Optional seed. When given, a dedicated ``random.Random(seed)``
            performs the shuffle so the same split is produced on every call;
            when ``None`` the module-level ``random`` generator is used.

    Returns:
        A ``(train, test)`` tuple of lists that together contain every
        element of ``data`` exactly once.
    """
    shuffled = list(data)  # copy: never reorder the caller's list
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)
    split_point = int(len(shuffled) * train_ratio)
    return shuffled[:split_point], shuffled[split_point:]

In [2]:
# Use the current working directory as the root folder. Every directory found
# below it is treated as a task, keyed by its directory name, and mapped to
# the records read from the JSONL files it contains.
root_folder = os.getcwd()
tasks_info = record_tasks_and_jsonl(root_folder)

In [3]:
# Seed the module-level RNG before any shuffling so that re-running this cell
# on the same tasks_info produces the same train/test partition.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

# Apply the train-test split for each task
train_test_split = {}
for task, data in tasks_info.items():
    train, test = split_data(data)
    train_test_split[task] = {'train': train, 'test': test}

# Print summary information about the splits
for task, splits in train_test_split.items():
    print(f"Task: {task}")
    print(f"Training set size: {len(splits['train'])}")
    print(f"Testing set size: {len(splits['test'])}")
    print()  # Print a newline for better readability

Task: ethos-national_origin
Training set size: 476
Testing set size: 119

Task: glue-cola
Training set size: 4300
Testing set size: 1075

Task: anli
Training set size: 4128
Testing set size: 1032

Task: lama-google_re
Training set size: 5016
Testing set size: 1254

Task: yelp_polarity
Training set size: 30528
Testing set size: 7632

Task: rotten_tomatoes
Training set size: 4392
Testing set size: 1098

Task: blimp-anaphor_number_agreement
Training set size: 928
Testing set size: 232

Task: sick
Training set size: 2108
Testing set size: 527

Task: tweet_eval-irony
Training set size: 3948
Testing set size: 987

Task: wino_grande
Training set size: 5196
Testing set size: 1299

Task: glue-sst2
Training set size: 3616
Testing set size: 904

Task: sciq
Training set size: 3676
Testing set size: 919

Task: trec
Training set size: 4492
Testing set size: 1123

Task: health_fact
Training set size: 4984
Testing set size: 1246

Task: superglue-rte
Training set size: 1236
Testing set size: 309

Task:

In [4]:
# Persist the per-task train/test splits as indented JSON.
with open('task_data_splits.json', mode='w', encoding='utf-8') as splits_sink:
    json.dump(obj=train_test_split, fp=splits_sink, indent=4)

In [1]:
import os
import json
import openai
import random
# Load the per-task train/test splits from task_data_splits.json.
with open('task_data_splits.json', mode='r', encoding='utf-8') as splits_source:
    train_test_split = json.load(fp=splits_source)

# Names of the tasks selected from the loaded splits.
selected_task_names = [
    'superglue-cb',
    'tweet_eval-stance_hillary',
    'ethos-national_origin',
    'blimp-anaphor_number_agreement',
    'superglue-rte',
    'crows_pairs',
    'quartz-with_knowledge',
    'sick',
    'glue-sst2',
    'sciq',
]

In [2]:
# OpenAI client used by create_embedding.
from openai import OpenAI

# Read the API key from the OPENAI_API_KEY environment variable instead of
# keeping a key literal in the notebook: a literal is either a leaked secret
# or, when blanked out as "", makes every request fail authentication.
client = OpenAI(
  api_key=os.environ.get("OPENAI_API_KEY"),
)
def create_embedding(text):
    """Request an embedding for ``text`` through the module-level ``client``.

    Args:
        text: Value passed as ``input`` to ``client.embeddings.create``.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data``, or an empty list if the request raised an exception.
        Failures are reported on stdout (first 50 characters of the error
        message) rather than re-raised, so callers should treat an empty
        list as "no embedding available".
    """
    try:
        embedding = client.embeddings.create(
            model="text-embedding-3-large",
            input=text,
            encoding_format="float"
        )
        return embedding.data[0].embedding
    except Exception as e:
        # Exception objects cannot be sliced; convert to text before
        # truncating, otherwise the handler itself raises TypeError and hides
        # the original error.
        print(f"Error in embedding creation: {str(e)[:50]}")
        return []

# Fail before any embedding request is made if a selected task is absent from
# the loaded splits; otherwise the KeyError would only surface part-way
# through the loop, after earlier tasks were already embedded and before
# anything is written to disk.
missing_tasks = [task for task in selected_task_names if task not in train_test_split]
if missing_tasks:
    raise KeyError(f"Tasks missing from train_test_split: {missing_tasks}")

# Initialize results dictionary: one 'train' and one 'test' list per task
results = {task: {'train': [], 'test': []} for task in selected_task_names}

# Iterate over selected tasks and process each entry
for task in selected_task_names:
    for dataset_type in ['train', 'test']:
        for entry in train_test_split[task][dataset_type]:
            # The text that gets embedded is the input followed by the output.
            combined_text = f"{entry['input']} {entry['output']}"
            # create_embedding returns [] when the request fails, so a stored
            # empty 'embedding' marks an entry that could not be embedded.
            embedding = create_embedding(combined_text)
            results[task][dataset_type].append({
                'input': entry['input'],
                'output': entry['output'],
                'options': entry['options'],
                'combined_text': combined_text,
                'embedding': embedding
            })

# Save results to a JSON file (explicit UTF-8, matching the other file I/O here)
with open('embedding_results.json', 'w', encoding='utf-8') as outfile:
    json.dump(results, outfile, indent=4)
