In [18]:
import os
import json
import glob
from collections import defaultdict
from tqdm import tqdm
import re

def post_process(model_name, extract_func,
                 results_dir='./results/', output_dir='./processed_results/'):
    """Run ``extract_func`` over every matching result file and save the output.

    Every ``*.json`` file in ``results_dir`` whose name contains ``model_name``
    is loaded, passed to ``extract_func``, and the returned object is written
    as JSON into ``output_dir``. The output file keeps the input's base name,
    with any occurrence of ``'results'`` replaced by ``'processed_results'``.

    Args:
        model_name: Substring that a result file name must contain. It is
            interpolated into a glob pattern, so glob wildcards in it are
            interpreted as wildcards.
        extract_func: Callable taking the parsed JSON content of one file and
            returning a JSON-serialisable object.
        results_dir: Directory searched for input files.
        output_dir: Directory the processed files are written to. It is
            created when at least one input file matches.

    Returns:
        None. Progress is reported with ``print``.
    """
    pattern = os.path.join(results_dir, f'*{model_name}*.json')

    # Sort so the processing (and log) order is the same on every run;
    # glob.glob itself returns files in arbitrary order.
    file_paths = sorted(glob.glob(pattern))

    # Without this the first write fails with FileNotFoundError on a fresh
    # checkout where the output directory has not been created by hand.
    if file_paths:
        os.makedirs(output_dir, exist_ok=True)

    for file_path in file_paths:
        print(f"Processing {file_path}...")

        # JSON is read as UTF-8 explicitly rather than with the platform
        # default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        text_triples = extract_func(data)

        base_name = os.path.basename(file_path)
        new_file_name = base_name.replace('results', 'processed_results')
        new_file_path = os.path.join(output_dir, new_file_name)

        with open(new_file_path, 'w', encoding='utf-8') as f:
            json.dump(text_triples, f, indent=6)

        print(f"Processed data saved to {new_file_path}")
        print()

## gpt-3.5-turbo-instruct

In [19]:
import re
import json
from collections import defaultdict
from tqdm import tqdm

def extract_triples_gpt35(data):
    """Extract 3-element triples from raw model responses.

    For each ``text -> response`` pair in ``data``, every substring of the
    response that looks like a JSON array of arrays (``[[ ... ]]``, matched
    non-greedily) is parsed with ``json.loads``. Each element of a parsed
    array that is itself a list of exactly three items is kept as a triple.

    Args:
        data: Mapping from input text to the raw response string.

    Returns:
        A ``defaultdict(list)`` mapping each text to its list of triples.
        Texts with no parsable triples map to an empty list.
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        triple_list = []

        # A missing / non-string response (e.g. None) carries no triples;
        # record an empty list instead of crashing inside re.findall.
        if isinstance(triples_str, str):
            # Use regex to find all JSON array-like structures
            matches = re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL)
        else:
            matches = []

        for match in matches:
            try:
                triples_raw = json.loads(match)
            except json.JSONDecodeError:
                # The regex is only a heuristic; skip spans that are not valid JSON.
                continue
            for triple in triples_raw:
                # Only real lists count as triples. This skips scalars (on
                # which len() would raise TypeError) and rejects strings or
                # dicts that merely happen to have length 3.
                if isinstance(triple, list) and len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples


In [20]:
# Post-process every result file whose name contains 'gpt-3.5' using the
# gpt-3.5 triple extractor.
post_process(model_name='gpt-3.5', extract_func=extract_triples_gpt35)

Processing ./results/nyt10m_rand_500_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 500/500 [00:00<00:00, 125412.75it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-3.5-turbo-instruct_1.json
Processing ./results/docred_rand_200_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 200/200 [00:00<00:00, 51391.34it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-3.5-turbo-instruct_1.json
Processing ./results/cdr_rand_200_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 200/200 [00:00<00:00, 56058.59it/s]

Processed data saved to ./processed_results/cdr_rand_200_gpt-3.5-turbo-instruct_1.json



