In [191]:
import os
import json
import glob
from collections import defaultdict
from tqdm import tqdm
import re

def post_process(model_name, extract_func):
    """Run ``extract_func`` over every matching results file and save the output.

    Every ``./results/*{model_name}*.json`` file is loaded, handed to
    ``extract_func``, and the returned object is written as JSON into
    ``./processed_results/`` under the same base name (any ``results``
    substring inside the file name is replaced by ``processed_results``).

    Args:
        model_name: Substring a file name must contain to be processed. It is
            interpolated into a glob pattern, so a short name such as
            ``gpt-4`` also selects files of ``gpt-4-1106-preview``.
        extract_func: Callable receiving the parsed JSON content of one
            results file and returning a JSON-serialisable object.

    Returns:
        None. Progress is printed and the results are written to disk.
    """
    # Directory where your files are stored
    directory = './results/'
    output_directory = './processed_results/'

    # Pattern to match the files of interest
    pattern = directory + f'*{model_name}*.json'

    # The output folder is not guaranteed to exist (e.g. on a fresh checkout);
    # without this the first open(..., 'w') below fails with FileNotFoundError.
    os.makedirs(output_directory, exist_ok=True)

    # Process each file
    for file_path in glob.glob(pattern):
        print(f"Processing {file_path}...")

        # Read the file (JSON is UTF-8; do not depend on the platform default)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Extract triples
        text_triples = extract_func(data)

        # Define new file name for processed results
        base_name = os.path.basename(file_path)
        new_file_name = base_name.replace('results', 'processed_results')
        new_file_path = os.path.join(output_directory, new_file_name)

        # Write the processed data to a new file
        with open(new_file_path, 'w', encoding='utf-8') as f:
            json.dump(text_triples, f, indent=6)

        print(f"Processed data saved to {new_file_path}")
        print()

### GPT Family: gpt-3.5-turbo-instruct & gpt-4 & text-davinci-003

In [192]:
import re
import json
from collections import defaultdict
from tqdm import tqdm

def extract_triples_gpt(data):
    """Extract 3-element JSON lists ("triples") from raw model completions.

    Each completion is scanned twice:

    1. for flat bracketed spans (``[ ... ]`` up to the first closing bracket),
       each parsed as one candidate triple;
    2. for nested spans (``[[ ... ]]``), each parsed as a list of candidate
       triples.

    A candidate is kept only if it is a list of exactly three elements. Spans
    that are not valid JSON are ignored.

    Args:
        data: Mapping from an input text to the completion string produced
            for it.

    Returns:
        A ``defaultdict(list)`` mapping every input text to the list of triples
        found in its completion (an empty list when nothing was found).
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        # str(triple) -> how many times that triple has been encountered so far.
        seen_counts = {}
        triple_list = []

        # Pass 1: flat arrays. The pattern is non-greedy, so every match ends
        # at the first ']' that follows its opening '['.
        for match in re.findall(r'\[.*?\]', triples_str, re.DOTALL):
            try:
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Not valid JSON (e.g. the '[[...]' head of a nested array).
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            seen_counts[key] = seen_counts.get(key, 0) + 1

        # Pass 2: nested arrays. Pass 1 already picks up the inner arrays of a
        # nested structure (all but the one glued to the outer '['), so an
        # inner triple whose running count is exactly 1 is skipped once here
        # to avoid adding it a second time.
        for match in re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL):
            try:
                triples = json.loads(match)
            except json.JSONDecodeError:
                continue
            for triple in triples:
                # Valid JSON may still hold scalars between the inner arrays
                # (e.g. [[...], 7, [...]]); len() on those raised TypeError and
                # aborted the whole file, and they can never be triples.
                if not isinstance(triple, list):
                    continue
                key = str(triple)
                previous = seen_counts.get(key, 0)
                seen_counts[key] = previous + 1
                if previous == 1:
                    continue
                if len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples


In [193]:
# Post-process every results file of the GPT family. Each entry is matched as
# a substring of the file name, so 'gpt-4' already selects the 'gpt-4-1106'
# files and the last entry processes them a second time.
for gpt_model_name in ('gpt-3.5-turbo', 'gpt-4', 'davinci', 'gpt-4-1106'):
    post_process(gpt_model_name, extract_triples_gpt)


Processing ./results/nyt10m_rand_500_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 500/500 [00:00<00:00, 28885.60it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-3.5-turbo-instruct_1.json

Processing ./results/nyt10m_rand_500_gpt-3.5-turbo-1106_1.json...


100%|██████████| 500/500 [00:00<00:00, 45519.01it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-3.5-turbo-1106_1.json

Processing ./results/wiki80_rand_800_gpt-3.5-turbo-1106_1.json...


100%|██████████| 800/800 [00:00<00:00, 58369.75it/s]


Processed data saved to ./processed_results/wiki80_rand_800_gpt-3.5-turbo-1106_1.json

Processing ./results/docred_rand_200_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 200/200 [00:00<00:00, 10013.26it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-3.5-turbo-instruct_1.json

Processing ./results/cdr_rand_200_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 200/200 [00:00<00:00, 10366.67it/s]


Processed data saved to ./processed_results/cdr_rand_200_gpt-3.5-turbo-instruct_1.json

Processing ./results/tacred_rand_800_gpt-3.5-turbo-1106_1.json...


100%|██████████| 800/800 [00:00<00:00, 13554.17it/s]


Processed data saved to ./processed_results/tacred_rand_800_gpt-3.5-turbo-1106_1.json

Processing ./results/tacred_rand_800_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 800/800 [00:00<00:00, 33845.50it/s]


Processed data saved to ./processed_results/tacred_rand_800_gpt-3.5-turbo-instruct_1.json

Processing ./results/wiki80_rand_800_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 800/800 [00:00<00:00, 38253.93it/s]


Processed data saved to ./processed_results/wiki80_rand_800_gpt-3.5-turbo-instruct_1.json

Processing ./results/docred_rand_200_gpt-3.5-turbo-1106_1.json...


100%|██████████| 200/200 [00:00<00:00, 12046.71it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-3.5-turbo-1106_1.json

Processing ./results/cdr_rand_200_gpt-3.5-turbo-1106_1.json...


100%|██████████| 200/200 [00:00<00:00, 14472.35it/s]


Processed data saved to ./processed_results/cdr_rand_200_gpt-3.5-turbo-1106_1.json

Processing ./results/wiki20m_rand_500_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 500/500 [00:00<00:00, 36115.45it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-3.5-turbo-instruct_1.json

Processing ./results/wiki20m_rand_500_gpt-3.5-turbo-1106_1.json...


100%|██████████| 500/500 [00:00<00:00, 63563.54it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-3.5-turbo-1106_1.json

Processing ./results/wiki20m_rand_500_gpt-3.5-turbo-1106_54.json...


100%|██████████| 500/500 [00:00<00:00, 61307.69it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-3.5-turbo-1106_54.json

Processing ./results/wiki20m_rand_500_gpt-3.5-turbo-1106_64.json...


100%|██████████| 500/500 [00:00<00:00, 61706.35it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-3.5-turbo-1106_64.json

Processing ./results/wiki20m_rand_500_gpt-3.5-turbo-1106_74.json...


100%|██████████| 500/500 [00:00<00:00, 66582.60it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-3.5-turbo-1106_74.json

Processing ./results/wiki20m_rand_500_gpt-3.5-turbo-1106_84.json...


100%|██████████| 500/500 [00:00<00:00, 63551.99it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-3.5-turbo-1106_84.json

Processing ./results/nyt10m_rand_500_gpt-4_1.json...


100%|██████████| 500/500 [00:00<00:00, 34730.26it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-4_1.json

Processing ./results/docred_rand_200_gpt-4-1106-preview_1.json...


100%|██████████| 200/200 [00:00<00:00, 8192.00it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-4-1106-preview_1.json

Processing ./results/docred_rand_200_gpt-4_1.json...


100%|██████████| 200/200 [00:00<00:00, 10205.00it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-4_1.json

Processing ./results/cdr_rand_200_gpt-4_1.json...


100%|██████████| 200/200 [00:00<00:00, 12365.28it/s]


Processed data saved to ./processed_results/cdr_rand_200_gpt-4_1.json

Processing ./results/nyt10m_rand_500_gpt-4-1106-preview_1.json...


100%|██████████| 500/500 [00:00<00:00, 33932.30it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-4-1106-preview_1.json

Processing ./results/wiki80_rand_800_gpt-4_1.json...


100%|██████████| 800/800 [00:00<00:00, 43681.57it/s]


Processed data saved to ./processed_results/wiki80_rand_800_gpt-4_1.json

Processing ./results/tacred_rand_800_gpt-4_1.json...


100%|██████████| 800/800 [00:00<00:00, 40340.03it/s]


Processed data saved to ./processed_results/tacred_rand_800_gpt-4_1.json

Processing ./results/cdr_rand_200_gpt-4-1106-preview_1.json...


100%|██████████| 200/200 [00:00<00:00, 9860.60it/s]


Processed data saved to ./processed_results/cdr_rand_200_gpt-4-1106-preview_1.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_1.json...


100%|██████████| 500/500 [00:00<00:00, 39754.93it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_1.json

Processing ./results/wiki20m_rand_500_gpt-4_1.json...


100%|██████████| 500/500 [00:00<00:00, 45459.81it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4_1.json

Processing ./results/tacred_rand_800_gpt-4-1106-preview_1.json...


100%|██████████| 800/800 [00:00<00:00, 39927.69it/s]


Processed data saved to ./processed_results/tacred_rand_800_gpt-4-1106-preview_1.json

Processing ./results/wiki80_rand_800_gpt-4-1106-preview_1.json...


100%|██████████| 800/800 [00:00<00:00, 40591.35it/s]


Processed data saved to ./processed_results/wiki80_rand_800_gpt-4-1106-preview_1.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_54.json...


100%|██████████| 500/500 [00:00<00:00, 40618.08it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_54.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_64.json...


100%|██████████| 500/500 [00:00<00:00, 40126.13it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_64.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_74.json...


100%|██████████| 500/500 [00:00<00:00, 38863.50it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_74.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_84.json...


100%|██████████| 500/500 [00:00<00:00, 39366.87it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_84.json

Processing ./results/nyt10m_rand_500_text-davinci-003_1.json...


100%|██████████| 500/500 [00:00<00:00, 36684.66it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_text-davinci-003_1.json

Processing ./results/docred_rand_200_text-davinci-003_1.json...


100%|██████████| 200/200 [00:00<00:00, 11625.66it/s]


Processed data saved to ./processed_results/docred_rand_200_text-davinci-003_1.json

Processing ./results/cdr_rand_200_text-davinci-003_1.json...


100%|██████████| 200/200 [00:00<00:00, 14048.68it/s]


Processed data saved to ./processed_results/cdr_rand_200_text-davinci-003_1.json

Processing ./results/tacred_rand_800_text-davinci-003_1.json...


100%|██████████| 800/800 [00:00<00:00, 40774.84it/s]


Processed data saved to ./processed_results/tacred_rand_800_text-davinci-003_1.json

Processing ./results/wiki80_rand_800_text-davinci-003_1.json...


100%|██████████| 800/800 [00:00<00:00, 40659.72it/s]


Processed data saved to ./processed_results/wiki80_rand_800_text-davinci-003_1.json

Processing ./results/wiki20m_rand_500_text-davinci-003_1.json...


100%|██████████| 500/500 [00:00<00:00, 11605.90it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_text-davinci-003_1.json

Processing ./results/docred_rand_200_gpt-4-1106-preview_1.json...


100%|██████████| 200/200 [00:00<00:00, 8560.07it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-4-1106-preview_1.json

Processing ./results/nyt10m_rand_500_gpt-4-1106-preview_1.json...


100%|██████████| 500/500 [00:00<00:00, 32263.88it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-4-1106-preview_1.json

Processing ./results/cdr_rand_200_gpt-4-1106-preview_1.json...


100%|██████████| 200/200 [00:00<00:00, 9831.71it/s]


Processed data saved to ./processed_results/cdr_rand_200_gpt-4-1106-preview_1.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_1.json...


100%|██████████| 500/500 [00:00<00:00, 41324.01it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_1.json

Processing ./results/tacred_rand_800_gpt-4-1106-preview_1.json...


100%|██████████| 800/800 [00:00<00:00, 38924.00it/s]


Processed data saved to ./processed_results/tacred_rand_800_gpt-4-1106-preview_1.json

Processing ./results/wiki80_rand_800_gpt-4-1106-preview_1.json...


100%|██████████| 800/800 [00:00<00:00, 42454.08it/s]


Processed data saved to ./processed_results/wiki80_rand_800_gpt-4-1106-preview_1.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_54.json...


100%|██████████| 500/500 [00:00<00:00, 40142.26it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_54.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_64.json...


100%|██████████| 500/500 [00:00<00:00, 39713.52it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_64.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_74.json...


100%|██████████| 500/500 [00:00<00:00, 40241.62it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_74.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_84.json...


100%|██████████| 500/500 [00:00<00:00, 40198.43it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_84.json



### LLAMA Family: Vicuna-7B, Vicuna-33B, LLAMA-2-7B, LLAMA-2-70B, Wizard-70B

In [194]:
def extract_triples_vicuna(data):
    """Extract 3-element JSON lists ("triples") from Vicuna-style outputs.

    Only the text after the first ``ASSISTANT:`` marker (preceded by a
    newline) and before any further occurrence of it is parsed. If the marker
    is missing, the whole string is parsed instead of raising ``IndexError``.

    The selected text is scanned twice: for flat bracketed spans
    (``[ ... ]``), each parsed as one candidate triple, and for nested spans
    (``[[ ... ]]``), each parsed as a list of candidate triples. A candidate is
    kept only if it is a list of exactly three elements; spans that are not
    valid JSON are ignored.

    Args:
        data: Mapping from an input text to the raw output string produced
            for it.

    Returns:
        A ``defaultdict(list)`` mapping every input text to the list of triples
        found in its output (an empty list when nothing was found).
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        # Keep only the assistant's reply. Presumably the marker terminates an
        # echoed prompt, so when it is absent there is nothing to strip and the
        # whole string is treated as the reply -- TODO confirm against the
        # generation script.
        segments = triples_str.split("\nASSISTANT:")
        reply = segments[1] if len(segments) > 1 else triples_str

        # str(triple) -> how many times that triple has been encountered so far.
        seen_counts = {}
        triple_list = []

        # Pass 1: flat arrays. The pattern is non-greedy, so every match ends
        # at the first ']' that follows its opening '['.
        for match in re.findall(r'\[.*?\]', reply, re.DOTALL):
            try:
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Not valid JSON (e.g. the '[[...]' head of a nested array).
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            seen_counts[key] = seen_counts.get(key, 0) + 1

        # Pass 2: nested arrays. Pass 1 already picks up the inner arrays of a
        # nested structure (all but the one glued to the outer '['), so an
        # inner triple whose running count is exactly 1 is skipped once here
        # to avoid adding it a second time.
        for match in re.findall(r'\[\[.*?\]\]', reply, re.DOTALL):
            try:
                triples = json.loads(match)
            except json.JSONDecodeError:
                continue
            for triple in triples:
                # Scalars between the inner arrays made len() raise TypeError
                # and aborted the whole file; they can never be triples.
                if not isinstance(triple, list):
                    continue
                key = str(triple)
                previous = seen_counts.get(key, 0)
                seen_counts[key] = previous + 1
                if previous == 1:
                    continue
                if len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples

In [195]:
# Post-process every ./results/ file whose name contains 'vicuna'.
post_process('vicuna', extract_triples_vicuna)

Processing ./results/tacred_rand_800_vicuna-1.5-7b_1.json...


100%|██████████| 800/800 [00:00<00:00, 51329.23it/s]


Processed data saved to ./processed_results/tacred_rand_800_vicuna-1.5-7b_1.json

Processing ./results/nyt10m_rand_500_vicuna-1.3-33b_1.json...


100%|██████████| 500/500 [00:00<00:00, 33163.37it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_vicuna-1.3-33b_1.json

Processing ./results/tacred_rand_800_vicuna-1.3-33b_1.json...


100%|██████████| 800/800 [00:00<00:00, 14583.68it/s]


Processed data saved to ./processed_results/tacred_rand_800_vicuna-1.3-33b_1.json

Processing ./results/wiki80_rand_800_vicuna-1.3-33b_1.json...


100%|██████████| 800/800 [00:00<00:00, 15844.30it/s]


Processed data saved to ./processed_results/wiki80_rand_800_vicuna-1.3-33b_1.json

Processing ./results/cdr_rand_200_vicuna-1.3-33b_1.json...


100%|██████████| 200/200 [00:00<00:00, 10099.94it/s]


Processed data saved to ./processed_results/cdr_rand_200_vicuna-1.3-33b_1.json

Processing ./results/cdr_rand_200_vicuna-1.5-7b_1.json...


100%|██████████| 200/200 [00:00<00:00, 6288.41it/s]


Processed data saved to ./processed_results/cdr_rand_200_vicuna-1.5-7b_1.json

Processing ./results/docred_rand_200_vicuna-1.3-33b_1.json...


100%|██████████| 200/200 [00:00<00:00, 5486.87it/s]


Processed data saved to ./processed_results/docred_rand_200_vicuna-1.3-33b_1.json

Processing ./results/docred_rand_200_vicuna-1.5-7b_1.json...


100%|██████████| 200/200 [00:00<00:00, 5721.05it/s]


Processed data saved to ./processed_results/docred_rand_200_vicuna-1.5-7b_1.json

Processing ./results/nyt10m_rand_500_vicuna-1.5-7b_1.json...


100%|██████████| 500/500 [00:00<00:00, 28522.98it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_vicuna-1.5-7b_1.json

Processing ./results/wiki20m_rand_500_vicuna-1.5-7b_1.json...


100%|██████████| 500/500 [00:00<00:00, 18707.53it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_vicuna-1.5-7b_1.json

Processing ./results/wiki20m_rand_500_vicuna-1.3-33b_1.json...


100%|██████████| 500/500 [00:00<00:00, 11921.33it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_vicuna-1.3-33b_1.json

Processing ./results/wiki80_rand_800_vicuna-1.5-7b_1.json...


100%|██████████| 800/800 [00:00<00:00, 24444.82it/s]


Processed data saved to ./processed_results/wiki80_rand_800_vicuna-1.5-7b_1.json

Processing ./results/wiki20m_rand_500_vicuna-1.3-33b_54.json...


100%|██████████| 500/500 [00:00<00:00, 27540.84it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_vicuna-1.3-33b_54.json

Processing ./results/wiki20m_rand_500_vicuna-1.3-33b_64.json...


100%|██████████| 500/500 [00:00<00:00, 27852.47it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_vicuna-1.3-33b_64.json

Processing ./results/wiki20m_rand_500_vicuna-1.3-33b_74.json...


100%|██████████| 500/500 [00:00<00:00, 28340.27it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_vicuna-1.3-33b_74.json

Processing ./results/wiki20m_rand_500_vicuna-1.3-33b_84.json...


100%|██████████| 500/500 [00:00<00:00, 28199.66it/s]

Processed data saved to ./processed_results/wiki20m_rand_500_vicuna-1.3-33b_84.json






### LLAMA

In [196]:
def extract_triples_llama(data):
    """Extract 3-element JSON lists ("triples") from LLaMA-style outputs.

    Only the text after the first ``[/INST]`` marker and before any further
    occurrence of it is parsed. If the marker is missing, the whole string is
    parsed instead of raising ``IndexError``.

    The selected text is scanned twice: for flat bracketed spans
    (``[ ... ]``), each parsed as one candidate triple, and for nested spans
    (``[[ ... ]]``), each parsed as a list of candidate triples. A candidate is
    kept only if it is a list of exactly three elements; spans that are not
    valid JSON are ignored.

    Args:
        data: Mapping from an input text to the raw output string produced
            for it.

    Returns:
        A ``defaultdict(list)`` mapping every input text to the list of triples
        found in its output (an empty list when nothing was found).
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        # Keep only the model's reply. Presumably the marker terminates an
        # echoed prompt, so when it is absent there is nothing to strip and the
        # whole string is treated as the reply -- TODO confirm against the
        # generation script.
        segments = triples_str.split("[/INST]")
        reply = segments[1] if len(segments) > 1 else triples_str

        # str(triple) -> how many times that triple has been encountered so far.
        seen_counts = {}
        triple_list = []

        # Pass 1: flat arrays. The pattern is non-greedy, so every match ends
        # at the first ']' that follows its opening '['.
        for match in re.findall(r'\[.*?\]', reply, re.DOTALL):
            try:
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Not valid JSON (e.g. the '[[...]' head of a nested array).
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            seen_counts[key] = seen_counts.get(key, 0) + 1

        # Pass 2: nested arrays. Pass 1 already picks up the inner arrays of a
        # nested structure (all but the one glued to the outer '['), so an
        # inner triple whose running count is exactly 1 is skipped once here
        # to avoid adding it a second time.
        for match in re.findall(r'\[\[.*?\]\]', reply, re.DOTALL):
            try:
                triples = json.loads(match)
            except json.JSONDecodeError:
                continue
            for triple in triples:
                # Scalars between the inner arrays made len() raise TypeError
                # and aborted the whole file; they can never be triples.
                if not isinstance(triple, list):
                    continue
                key = str(triple)
                previous = seen_counts.get(key, 0)
                seen_counts[key] = previous + 1
                if previous == 1:
                    continue
                if len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples

In [197]:
# Post-process every ./results/ file whose name contains 'llama'.
post_process('llama', extract_triples_llama)

Processing ./results/tacred_rand_800_llama-2-70b_1.json...


100%|██████████| 800/800 [00:00<00:00, 14228.05it/s]


Processed data saved to ./processed_results/tacred_rand_800_llama-2-70b_1.json

Processing ./results/docred_rand_200_llama-2-70b_1.json...


100%|██████████| 200/200 [00:00<00:00, 5196.69it/s]


Processed data saved to ./processed_results/docred_rand_200_llama-2-70b_1.json

Processing ./results/wiki20m_rand_500_llama-2-70b_1.json...


100%|██████████| 500/500 [00:00<00:00, 15812.76it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_llama-2-70b_1.json

Processing ./results/wiki80_rand_800_llama-2-70b_1.json...


100%|██████████| 800/800 [00:00<00:00, 27320.67it/s]


Processed data saved to ./processed_results/wiki80_rand_800_llama-2-70b_1.json

Processing ./results/docred_rand_200_llama-2-7b_1.json...


100%|██████████| 200/200 [00:00<00:00, 32192.06it/s]


Processed data saved to ./processed_results/docred_rand_200_llama-2-7b_1.json

Processing ./results/cdr_rand_200_llama-2-70b_1.json...


100%|██████████| 200/200 [00:00<00:00, 10004.30it/s]


Processed data saved to ./processed_results/cdr_rand_200_llama-2-70b_1.json

Processing ./results/cdr_rand_200_llama-2-7b_1.json...


100%|██████████| 200/200 [00:00<00:00, 5128.33it/s]


Processed data saved to ./processed_results/cdr_rand_200_llama-2-7b_1.json

Processing ./results/nyt10m_rand_500_llama-2-70b_1.json...


100%|██████████| 500/500 [00:00<00:00, 14341.76it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_llama-2-70b_1.json

Processing ./results/tacred_rand_800_llama-2-7b_1.json...


100%|██████████| 800/800 [00:00<00:00, 24251.72it/s]


Processed data saved to ./processed_results/tacred_rand_800_llama-2-7b_1.json

Processing ./results/wiki20m_rand_500_llama-2-7b_1.json...


100%|██████████| 500/500 [00:00<00:00, 23118.03it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_llama-2-7b_1.json

Processing ./results/wiki80_rand_800_llama-2-7b_1.json...


100%|██████████| 800/800 [00:00<00:00, 37186.84it/s]


Processed data saved to ./processed_results/wiki80_rand_800_llama-2-7b_1.json

Processing ./results/nyt10m_rand_500_llama-2-7b_44.json...


100%|██████████| 500/500 [00:00<00:00, 4707.82it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_llama-2-7b_44.json

Processing ./results/wiki20m_rand_500_llama-2-70b_54_.json...


100%|██████████| 500/500 [00:00<00:00, 22739.27it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_llama-2-70b_54_.json

Processing ./results/wiki20m_rand_500_llama-2-70b_54.json...


100%|██████████| 500/500 [00:00<00:00, 15498.75it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_llama-2-70b_54.json

Processing ./results/wiki20m_rand_500_llama-2-70b_74.json...


100%|██████████| 500/500 [00:00<00:00, 25576.58it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_llama-2-70b_74.json

Processing ./results/wiki20m_rand_500_llama-2-70b_64.json...


100%|██████████| 500/500 [00:00<00:00, 30523.12it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_llama-2-70b_64.json

Processing ./results/wiki20m_rand_500_llama-2-70b_84.json...


100%|██████████| 500/500 [00:00<00:00, 24695.33it/s]

Processed data saved to ./processed_results/wiki20m_rand_500_llama-2-70b_84.json






### Wizard-70B

In [198]:
# Files whose name contains 'wizard' are parsed with the Vicuna extractor, i.e.
# the reply is taken from after the "\nASSISTANT:" marker.
# NOTE(review): presumably WizardLM outputs use the same prompt template -- confirm.
post_process('wizard', extract_triples_vicuna)

Processing ./results/docred_rand_200_wizardlm-70b_1.json...


100%|██████████| 200/200 [00:00<00:00, 20326.66it/s]


Processed data saved to ./processed_results/docred_rand_200_wizardlm-70b_1.json

Processing ./results/wiki80_rand_800_wizardlm-70b_1.json...


100%|██████████| 800/800 [00:00<00:00, 33787.23it/s]


Processed data saved to ./processed_results/wiki80_rand_800_wizardlm-70b_1.json

Processing ./results/tacred_rand_800_wizardlm-70b_1.json...


100%|██████████| 800/800 [00:00<00:00, 31073.52it/s]


Processed data saved to ./processed_results/tacred_rand_800_wizardlm-70b_1.json

Processing ./results/nyt10m_rand_500_wizardlm-70b_1.json...


100%|██████████| 500/500 [00:00<00:00, 34864.79it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_wizardlm-70b_1.json

Processing ./results/cdr_rand_200_wizardlm-70b_1.json...


100%|██████████| 200/200 [00:00<00:00, 20228.14it/s]


Processed data saved to ./processed_results/cdr_rand_200_wizardlm-70b_1.json

Processing ./results/wiki20m_rand_500_wizardlm-70b_1.json...


100%|██████████| 500/500 [00:00<00:00, 58745.40it/s]

Processed data saved to ./processed_results/wiki20m_rand_500_wizardlm-70b_1.json






### Mistral

In [199]:
def extract_triples_mistral(data):
    """Extract 3-element JSON lists ("triples") from Mistral-style outputs.

    Only the text after the first ``[/INST]`` marker and before any further
    occurrence of it is parsed. If the marker is missing, the whole string is
    parsed instead of raising ``IndexError``.

    The selected text is scanned twice: for flat bracketed spans
    (``[ ... ]``), each parsed as one candidate triple, and for nested spans
    (``[[ ... ]]``), each parsed as a list of candidate triples. A candidate is
    kept only if it is a list of exactly three elements; spans that are not
    valid JSON are ignored.

    Args:
        data: Mapping from an input text to the raw output string produced
            for it.

    Returns:
        A ``defaultdict(list)`` mapping every input text to the list of triples
        found in its output (an empty list when nothing was found).
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        # Keep only the model's reply. Presumably the marker terminates an
        # echoed prompt, so when it is absent there is nothing to strip and the
        # whole string is treated as the reply -- TODO confirm against the
        # generation script.
        segments = triples_str.split("[/INST]")
        reply = segments[1] if len(segments) > 1 else triples_str

        # str(triple) -> how many times that triple has been encountered so far.
        seen_counts = {}
        triple_list = []

        # Pass 1: flat arrays. The pattern is non-greedy, so every match ends
        # at the first ']' that follows its opening '['.
        for match in re.findall(r'\[.*?\]', reply, re.DOTALL):
            try:
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Not valid JSON (e.g. the '[[...]' head of a nested array).
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            seen_counts[key] = seen_counts.get(key, 0) + 1

        # Pass 2: nested arrays. Pass 1 already picks up the inner arrays of a
        # nested structure (all but the one glued to the outer '['), so an
        # inner triple whose running count is exactly 1 is skipped once here
        # to avoid adding it a second time.
        for match in re.findall(r'\[\[.*?\]\]', reply, re.DOTALL):
            try:
                triples = json.loads(match)
            except json.JSONDecodeError:
                continue
            for triple in triples:
                # Scalars between the inner arrays made len() raise TypeError
                # and aborted the whole file; they can never be triples.
                if not isinstance(triple, list):
                    continue
                key = str(triple)
                previous = seen_counts.get(key, 0)
                seen_counts[key] = previous + 1
                if previous == 1:
                    continue
                if len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples

In [200]:
# Post-process every ./results/ file whose name contains 'mistral'.
post_process('mistral', extract_triples_mistral)

Processing ./results/nyt10m_rand_500_mistral_1.json...


100%|██████████| 500/500 [00:00<00:00, 31093.79it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_mistral_1.json

Processing ./results/cdr_rand_200_mistral_1.json...


100%|██████████| 200/200 [00:00<00:00, 10405.25it/s]


Processed data saved to ./processed_results/cdr_rand_200_mistral_1.json

Processing ./results/docred_rand_200_mistral_1.json...


100%|██████████| 200/200 [00:00<00:00, 12486.76it/s]


Processed data saved to ./processed_results/docred_rand_200_mistral_1.json

Processing ./results/tacred_rand_800_mistral_1.json...


100%|██████████| 800/800 [00:00<00:00, 42496.56it/s]


Processed data saved to ./processed_results/tacred_rand_800_mistral_1.json

Processing ./results/wiki20m_rand_500_mistral_1.json...


100%|██████████| 500/500 [00:00<00:00, 60483.72it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_mistral_1.json

Processing ./results/wiki80_rand_800_mistral_1.json...


100%|██████████| 800/800 [00:00<00:00, 67202.95it/s]

Processed data saved to ./processed_results/wiki80_rand_800_mistral_1.json






### GALACTICA

In [201]:
def extract_triples_galactica(data):
    """Extract 3-element JSON lists ("triples") from Galactica outputs.

    The whole output string is parsed (no prompt marker is stripped). It is
    scanned twice: for flat bracketed spans (``[ ... ]``), each parsed as one
    candidate triple, and for nested spans (``[[ ... ]]``), each parsed as a
    list of candidate triples. A candidate is kept only if it is a list of
    exactly three elements; spans that are not valid JSON are ignored.

    Args:
        data: Mapping from an input text to the raw output string produced
            for it.

    Returns:
        A ``defaultdict(list)`` mapping every input text to the list of triples
        found in its output (an empty list when nothing was found).
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        # str(triple) -> how many times that triple has been encountered so far.
        seen_counts = {}
        triple_list = []

        # Pass 1: flat arrays. The pattern is non-greedy, so every match ends
        # at the first ']' that follows its opening '['.
        for match in re.findall(r'\[.*?\]', triples_str, re.DOTALL):
            try:
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Not valid JSON (e.g. the '[[...]' head of a nested array).
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            seen_counts[key] = seen_counts.get(key, 0) + 1

        # Pass 2: nested arrays. Pass 1 already picks up the inner arrays of a
        # nested structure (all but the one glued to the outer '['), so an
        # inner triple whose running count is exactly 1 is skipped once here
        # to avoid adding it a second time.
        for match in re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL):
            try:
                triples = json.loads(match)
            except json.JSONDecodeError:
                continue
            for triple in triples:
                # Scalars between the inner arrays made len() raise TypeError
                # and aborted the whole file; they can never be triples.
                if not isinstance(triple, list):
                    continue
                key = str(triple)
                previous = seen_counts.get(key, 0)
                seen_counts[key] = previous + 1
                if previous == 1:
                    continue
                if len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples

In [202]:
# Post-process every ./results/ file whose name contains 'galactica'.
post_process('galactica', extract_triples_galactica)

Processing ./results/wiki80_rand_800_galactica-30b_1.json...


100%|██████████| 800/800 [00:00<00:00, 31276.27it/s]


Processed data saved to ./processed_results/wiki80_rand_800_galactica-30b_1.json

Processing ./results/tacred_rand_800_galactica-30b_1.json...


100%|██████████| 800/800 [00:00<00:00, 23826.37it/s]


Processed data saved to ./processed_results/tacred_rand_800_galactica-30b_1.json

Processing ./results/wiki20m_rand_500_galactica-30b_1.json...


100%|██████████| 500/500 [00:00<00:00, 30907.28it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_galactica-30b_1.json

Processing ./results/docred_rand_200_galactica-30b_1.json...


100%|██████████| 200/200 [00:00<00:00, 579724.12it/s]


Processed data saved to ./processed_results/docred_rand_200_galactica-30b_1.json

Processing ./results/cdr_rand_200_galactica-30b_1.json...


100%|██████████| 200/200 [00:00<00:00, 283974.54it/s]


Processed data saved to ./processed_results/cdr_rand_200_galactica-30b_1.json

Processing ./results/nyt10m_rand_500_galactica-30b_1.json...


100%|██████████| 500/500 [00:00<00:00, 23600.36it/s]

Processed data saved to ./processed_results/nyt10m_rand_500_galactica-30b_1.json






### Zephyr

In [203]:
def extract_triples_zephyr(data):
    """Extract 3-element triples from raw Zephyr generations.

    Args:
        data: Mapping of input text -> raw generation string. The answer is
            taken to be the segment that follows the first ``<|assistant|>``
            turn marker (surrounded by newlines). When the marker is missing
            the whole string is parsed instead of raising ``IndexError``.

    Returns:
        ``defaultdict(list)`` mapping every input text to the list of JSON
        arrays of length 3 found in the answer (possibly empty).
    """
    marker = "\n<|assistant|>\n"
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        # Same segment the original `split(marker)[1]` selected, but tolerant
        # of generations that do not contain the marker at all.
        parts = triples_str.split(marker)
        if len(parts) > 1:
            triples_str = parts[1]

        repeat_map = {}  # str(parsed item) -> number of times it was seen
        matches_1 = re.findall(r'\[.*?\]', triples_str, re.DOTALL)
        matches_2 = re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL)
        triple_list = []

        # Pass 1: flat spans running from a '[' to the nearest ']'.
        for match in matches_1:
            try:
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Span is not valid JSON (e.g. an unbalanced '[[...]').
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            repeat_map[key] = repeat_map.get(key, 0) + 1

        # Pass 2: nested '[[...]]' spans, i.e. a list of candidate triples.
        for match in matches_2:
            try:
                triples = json.loads(match)
            except json.JSONDecodeError:
                continue
            for triple in triples:
                key = str(triple)
                seen = repeat_map.get(key, 0)
                repeat_map[key] = seen + 1
                if seen == 1:
                    # Seen exactly once before (typically by pass 1):
                    # skip so the same triple is not appended twice.
                    continue
                # Only real JSON arrays count; scalars used to crash on len()
                # and 3-character strings / 3-key dicts were wrongly accepted.
                if isinstance(triple, list) and len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples

In [204]:
# Run the Zephyr extractor over every results file whose name contains
# 'zephyr'; post_process writes one JSON per input to ./processed_results/.
post_process(model_name='zephyr', extract_func=extract_triples_zephyr)

Processing ./results/tacred_rand_800_zephyr-7b-beta_1.json...


100%|██████████| 800/800 [00:00<00:00, 28055.08it/s]


Processed data saved to ./processed_results/tacred_rand_800_zephyr-7b-beta_1.json

Processing ./results/docred_rand_200_zephyr-7b-beta_1.json...


100%|██████████| 200/200 [00:00<00:00, 9053.30it/s]


Processed data saved to ./processed_results/docred_rand_200_zephyr-7b-beta_1.json

Processing ./results/cdr_rand_200_zephyr-7b-beta_1.json...


100%|██████████| 200/200 [00:00<00:00, 6556.62it/s]


Processed data saved to ./processed_results/cdr_rand_200_zephyr-7b-beta_1.json

Processing ./results/nyt10m_rand_500_zephyr-7b-beta_1.json...


100%|██████████| 500/500 [00:00<00:00, 21434.07it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_zephyr-7b-beta_1.json

Processing ./results/wiki20m_rand_500_zephyr-7b-beta_1.json...


100%|██████████| 500/500 [00:00<00:00, 29816.20it/s]

Processed data saved to ./processed_results/wiki20m_rand_500_zephyr-7b-beta_1.json

Processing ./results/wiki80_rand_800_zephyr-7b-beta_1.json...



100%|██████████| 800/800 [00:00<00:00, 35002.85it/s]

Processed data saved to ./processed_results/wiki80_rand_800_zephyr-7b-beta_1.json






### openchat

In [205]:
def extract_triples_openchat(data):
    """Extract 3-element triples from raw OpenChat generations.

    Args:
        data: Mapping of input text -> raw generation string. The answer is
            taken to be the segment that follows the first
            ``GPT4 Correct Assistant:`` marker. When the marker is missing the
            whole string is parsed instead of raising ``IndexError``.

    Returns:
        ``defaultdict(list)`` mapping every input text to the list of JSON
        arrays of length 3 found in the answer (possibly empty).
    """
    marker = "GPT4 Correct Assistant:"
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        # Same segment the original `split(marker)[1]` selected, but tolerant
        # of generations that do not contain the marker at all.
        parts = triples_str.split(marker)
        if len(parts) > 1:
            triples_str = parts[1]

        repeat_map = {}  # str(parsed item) -> number of times it was seen
        matches_1 = re.findall(r'\[.*?\]', triples_str, re.DOTALL)
        matches_2 = re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL)
        triple_list = []

        # Pass 1: flat spans running from a '[' to the nearest ']'.
        for match in matches_1:
            try:
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Span is not valid JSON (e.g. an unbalanced '[[...]').
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            repeat_map[key] = repeat_map.get(key, 0) + 1

        # Pass 2: nested '[[...]]' spans, i.e. a list of candidate triples.
        for match in matches_2:
            try:
                triples = json.loads(match)
            except json.JSONDecodeError:
                continue
            for triple in triples:
                key = str(triple)
                seen = repeat_map.get(key, 0)
                repeat_map[key] = seen + 1
                if seen == 1:
                    # Seen exactly once before (typically by pass 1):
                    # skip so the same triple is not appended twice.
                    continue
                # Only real JSON arrays count; scalars used to crash on len()
                # and 3-character strings / 3-key dicts were wrongly accepted.
                if isinstance(triple, list) and len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples

In [206]:
# Run the OpenChat extractor over every results file whose name contains
# 'openchat'; post_process writes one JSON per input to ./processed_results/.
post_process(model_name='openchat', extract_func=extract_triples_openchat)

Processing ./results/wiki80_rand_800_openchat_1.json...


100%|██████████| 800/800 [00:00<00:00, 41204.45it/s]


Processed data saved to ./processed_results/wiki80_rand_800_openchat_1.json

Processing ./results/tacred_rand_800_openchat_1.json...


100%|██████████| 800/800 [00:00<00:00, 14167.97it/s]


Processed data saved to ./processed_results/tacred_rand_800_openchat_1.json

Processing ./results/nyt10m_rand_500_openchat_1.json...


100%|██████████| 500/500 [00:00<00:00, 33160.22it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_openchat_1.json

Processing ./results/cdr_rand_200_openchat_1.json...


100%|██████████| 200/200 [00:00<00:00, 15682.87it/s]


Processed data saved to ./processed_results/cdr_rand_200_openchat_1.json

Processing ./results/docred_rand_200_openchat_1.json...


100%|██████████| 200/200 [00:00<00:00, 11230.33it/s]


Processed data saved to ./processed_results/docred_rand_200_openchat_1.json

Processing ./results/wiki20m_rand_500_openchat_1.json...


100%|██████████| 500/500 [00:00<00:00, 40292.65it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_openchat_1.json

Processing ./results/wiki20m_rand_500_openchat_54.json...


100%|██████████| 500/500 [00:00<00:00, 40652.72it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_openchat_54.json

Processing ./results/wiki20m_rand_500_openchat_64.json...


100%|██████████| 500/500 [00:00<00:00, 39271.77it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_openchat_64.json

Processing ./results/wiki20m_rand_500_openchat_74.json...


100%|██████████| 500/500 [00:00<00:00, 38385.47it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_openchat_74.json

Processing ./results/wiki20m_rand_500_openchat_84.json...


100%|██████████| 500/500 [00:00<00:00, 39358.74it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_openchat_84.json



In [207]:
import re
import json
from collections import defaultdict

def extract_triples(data):
    """Generic triple extractor for generations that need no prompt stripping.

    Args:
        data: Mapping of input text -> raw generation string.

    Returns:
        ``defaultdict(list)`` mapping every input text to the list of JSON
        arrays of length 3 found in the generation (possibly empty).

    Bracketed spans that are not valid JSON are skipped, as in the
    model-specific extractors. Previously any such span (including the
    unbalanced first span of every nested ``[[..],[..]]`` answer) raised
    ``json.JSONDecodeError`` and aborted the whole run.
    """
    text_triples = defaultdict(list)
    for text, triples_str in data.items():
        # To reuse this for chat models, first keep only the answer part:
        # triples_str = triples_str.split("\nASSISTANT:")[1] (For Vicuna)
        # triples_str = triples_str.split("[/INST]")[1] (For LLaMA & Mistral)
        # triples_str = triples_str.split("GPT4 Correct Assistant:")[1] (For OpenChat)
        repeat_map = {}  # str(parsed item) -> number of times it was seen
        matches_1 = re.findall(r'\[.*?\]', triples_str, re.DOTALL)
        matches_2 = re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL)
        triple_list = []

        # Pass 1: flat spans running from a '[' to the nearest ']'.
        for match in matches_1:
            try:
                triple = json.loads(match)
            except json.JSONDecodeError:
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            repeat_map[key] = repeat_map.get(key, 0) + 1

        # Pass 2: nested '[[...]]' spans, i.e. a list of candidate triples.
        for match in matches_2:
            try:
                triples = json.loads(match)
            except json.JSONDecodeError:
                continue
            for triple in triples:
                key = str(triple)
                seen = repeat_map.get(key, 0)
                repeat_map[key] = seen + 1
                if seen == 1:
                    # Seen exactly once before (typically by pass 1):
                    # skip so the same triple is not appended twice.
                    continue
                # Only real JSON arrays count; scalars would crash on len().
                if isinstance(triple, list) and len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples


## semi-open

In [208]:
import re
import json
from collections import defaultdict
from tqdm import tqdm

def extract_triples_semi(data):
    """Extract triples from semi-open generations and trim entity suffixes.

    Args:
        data: Mapping of input text -> raw generation string.

    Returns:
        ``defaultdict(list)`` mapping every input text to a list of
        ``(subject, predicate, object)`` tuples. For string subjects and
        objects only the part before the first ':' is kept (presumably an
        "entity:type" annotation -- confirm against the prompt format).
        Non-string entities are passed through unchanged; previously they
        raised ``TypeError`` on the ':' membership test.
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        repeat_map = {}  # str(parsed item) -> number of times it was seen
        # Turn single quotes into double quotes so Python-style lists
        # become parseable by json.loads.
        triples_str = triples_str.replace("\'", '\"')
        matches_1 = re.findall(r'\[.*?\]', triples_str, re.DOTALL)
        matches_2 = re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL)
        triple_list = []

        # Pass 1: flat spans running from a '[' to the nearest ']'.
        for match in matches_1:
            try:
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Span is not valid JSON (e.g. an unbalanced '[[...]').
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            repeat_map[key] = repeat_map.get(key, 0) + 1

        # Pass 2: nested '[[...]]' spans, i.e. a list of candidate triples.
        for match in matches_2:
            try:
                triples = json.loads(match)
            except json.JSONDecodeError:
                continue
            for triple in triples:
                key = str(triple)
                seen = repeat_map.get(key, 0)
                repeat_map[key] = seen + 1
                if seen == 1:
                    # Seen exactly once before (typically by pass 1):
                    # skip so the same triple is not appended twice.
                    continue
                # Only real JSON arrays count; scalars would crash on len().
                if isinstance(triple, list) and len(triple) == 3:
                    triple_list.append(triple)

        triple_list_ = []
        for subject_, predicate_, object_ in triple_list:
            # split(":")[0] returns the string unchanged when no ':' exists.
            if isinstance(subject_, str):
                subject_ = subject_.split(":")[0]
            if isinstance(object_, str):
                object_ = object_.split(":")[0]
            triple_list_.append((subject_, predicate_, object_))

        text_triples[text] = triple_list_
    return text_triples


In [209]:
# Run the semi-open extractor over every results file whose name contains
# 'semi'; post_process writes one JSON per input to ./processed_results/.
post_process(model_name='semi', extract_func=extract_triples_semi)

Processing ./results/nyt10m_gpt-3.5_semi_1.json...


100%|██████████| 1/1 [00:00<00:00, 9279.43it/s]


Processed data saved to ./processed_results/nyt10m_gpt-3.5_semi_1.json

Processing ./results/nyt10m_rand_500_gpt-3.5_semi_1.json...


100%|██████████| 500/500 [00:00<00:00, 30943.31it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-3.5_semi_1.json

Processing ./results/cdr_rand_200_gpt-3.5_semi_1.json...


100%|██████████| 200/200 [00:00<00:00, 22504.05it/s]

Processed data saved to ./processed_results/cdr_rand_200_gpt-3.5_semi_1.json






## closed

In [2]:
import json

# Load the 'closed' GPT-3.5 results for nyt10m. The path is relative, matching
# the ./results/ layout post_process uses, rather than a machine-specific
# absolute home-directory path (run the notebook from the repository root).
with open('./results/nyt10m_gpt-3.5_closed_1.json', 'r') as f:
    data = json.load(f)

In [211]:
from collections import defaultdict

# Normalise the closed-setting triples loaded into `data` and save them.
results = defaultdict(list)

# NOTE(review): a text whose triple list is empty never gets a key in
# `results` -- confirm downstream scoring expects that.
for text, triples in data.items():
    for subject_, relation, object_ in triples:
        # Drop one leading space from the relation label. startswith() is also
        # safe for an empty relation, where relation[0] raised IndexError.
        if relation.startswith(' '):
            relation = relation[1:]
        results[text].append([subject_, relation, object_])

# Relative output path, matching the ./processed_results/ directory that
# post_process writes to, instead of a machine-specific absolute path.
with open('./processed_results/nyt10m_gpt-3.5_closed_1.json', 'w') as f:
    json.dump(results, f, indent=6)
        