In [128]:
import os
import json
import glob
from collections import defaultdict
from tqdm import tqdm
import re

def post_process(model_name, extract_func):
    """Run ``extract_func`` over every results file of a model and save the output.

    Each file matching ``./results/*{model_name}*.json`` is loaded as JSON,
    passed to ``extract_func``, and the returned object is written as JSON to
    ``./processed_results/`` under the file's base name (any ``results``
    substring in that name is replaced by ``processed_results``).

    Args:
        model_name: Substring used to select result files by name.
        extract_func: Callable that receives the loaded JSON data and returns
            a JSON-serialisable object.
    """
    # Directory where your files are stored
    directory = './results/'
    output_directory = './processed_results/'

    # Pattern to match the files of interest
    pattern = directory + f'*{model_name}*.json'

    # Process each file
    for file_path in glob.glob(pattern):
        print(f"Processing {file_path}...")

        # Read the file
        with open(file_path, 'r') as f:
            data = json.load(f)

        # Extract triples
        text_triples = extract_func(data)

        # Define new file name for processed results
        base_name = os.path.basename(file_path)
        new_file_name = base_name.replace('results', 'processed_results')
        new_file_path = os.path.join(output_directory, new_file_name)

        # Make sure the output directory exists; without this, open(..., 'w')
        # raises FileNotFoundError the first time the notebook is run.
        os.makedirs(output_directory, exist_ok=True)

        # Write the processed data to a new file
        with open(new_file_path, 'w') as f:
            json.dump(text_triples, f, indent=6)

        print(f"Processed data saved to {new_file_path}")
        print()

### GPT Family: gpt-3.5-turbo-instruct & gpt-4 & text-davinci-003

In [129]:
import re
import json
from collections import defaultdict
from tqdm import tqdm

def extract_triples_gpt(data):
    """Extract 3-element JSON arrays (triples) from raw model output strings.

    Each output string is scanned twice: once for individual bracketed arrays
    and once for list-of-lists blocks. The first inner array of a list-of-lists
    cannot be parsed by the flat scan (its match starts at the outer bracket),
    so the nested scan recovers it; an occurrence counter keeps arrays that the
    flat scan already produced from being added a second time.

    Args:
        data: Mapping from input text to the raw output string for that text.

    Returns:
        A ``defaultdict(list)`` mapping every key of ``data`` to the list of
        parsed arrays that have exactly three elements (flat-scan hits first,
        followed by the arrays recovered by the nested scan).
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        # Number of times each parsed array has been seen, keyed by str(array).
        seen_counts = {}
        flat_matches = re.findall(r'\[.*?\]', triples_str, re.DOTALL)
        # Allow whitespace between the outer and inner brackets so that
        # pretty-printed lists ("[\n  [..],\n  [..]\n]") are matched as well;
        # otherwise their first triple would be lost.
        nested_matches = re.findall(r'\[\s*\[.*?\]\s*\]', triples_str, re.DOTALL)
        triple_list = []

        for match in flat_matches:
            try:
                # Attempt to parse each match as JSON
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            seen_counts[key] = seen_counts.get(key, 0) + 1

        for match in nested_matches:
            try:
                # Attempt to parse each match as JSON
                triples = json.loads(match)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
            for triple in triples:
                # Only arrays can be triples; numbers/null would make len()
                # raise TypeError and 3-character strings would slip through.
                if not isinstance(triple, list):
                    continue
                key = str(triple)
                previous = seen_counts.get(key, 0)
                seen_counts[key] = previous + 1
                if previous == 1:
                    # Seen exactly once before (normally by the flat scan):
                    # this is the same occurrence, so do not add it again.
                    continue
                if len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples


In [130]:
# Apply the GPT extractor to the result files of each GPT-family model.
post_process(model_name='gpt-3.5-turbo', extract_func=extract_triples_gpt)
post_process(model_name='gpt-4', extract_func=extract_triples_gpt)
post_process(model_name='davinci', extract_func=extract_triples_gpt)
# NOTE(review): the '*gpt-4*' file pattern of the call above already matches
# the gpt-4-1106 files, so this call processes those files a second time.
post_process(model_name='gpt-4-1106', extract_func=extract_triples_gpt)


Processing ./results/nyt10m_rand_500_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 500/500 [00:00<00:00, 25982.83it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-3.5-turbo-instruct_1.json

Processing ./results/nyt10m_rand_500_gpt-3.5-turbo-1106_1.json...


100%|██████████| 500/500 [00:00<00:00, 41845.96it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-3.5-turbo-1106_1.json

Processing ./results/wiki80_rand_800_gpt-3.5-turbo-1106_1.json...


100%|██████████| 800/800 [00:00<00:00, 63801.40it/s]


Processed data saved to ./processed_results/wiki80_rand_800_gpt-3.5-turbo-1106_1.json

Processing ./results/docred_rand_200_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 200/200 [00:00<00:00, 9922.41it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-3.5-turbo-instruct_1.json

Processing ./results/cdr_rand_200_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 200/200 [00:00<00:00, 11006.22it/s]


Processed data saved to ./processed_results/cdr_rand_200_gpt-3.5-turbo-instruct_1.json

Processing ./results/tacred_rand_800_gpt-3.5-turbo-1106_1.json...


100%|██████████| 800/800 [00:00<00:00, 50990.70it/s]


Processed data saved to ./processed_results/tacred_rand_800_gpt-3.5-turbo-1106_1.json

Processing ./results/tacred_rand_800_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 800/800 [00:00<00:00, 34216.87it/s]


Processed data saved to ./processed_results/tacred_rand_800_gpt-3.5-turbo-instruct_1.json

Processing ./results/wiki80_rand_800_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 800/800 [00:00<00:00, 38028.48it/s]


Processed data saved to ./processed_results/wiki80_rand_800_gpt-3.5-turbo-instruct_1.json

Processing ./results/docred_rand_200_gpt-3.5-turbo-1106_1.json...


100%|██████████| 200/200 [00:00<00:00, 11971.75it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-3.5-turbo-1106_1.json

Processing ./results/cdr_rand_200_gpt-3.5-turbo-1106_1.json...


100%|██████████| 200/200 [00:00<00:00, 9599.60it/s]


Processed data saved to ./processed_results/cdr_rand_200_gpt-3.5-turbo-1106_1.json

Processing ./results/wiki20m_rand_500_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 500/500 [00:00<00:00, 35597.44it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-3.5-turbo-instruct_1.json

Processing ./results/wiki20m_rand_500_gpt-3.5-turbo-1106_1.json...


100%|██████████| 500/500 [00:00<00:00, 62566.07it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-3.5-turbo-1106_1.json

Processing ./results/nyt10m_rand_500_gpt-4_1.json...


100%|██████████| 500/500 [00:00<00:00, 35119.35it/s]

Processed data saved to ./processed_results/nyt10m_rand_500_gpt-4_1.json

Processing ./results/docred_rand_200_gpt-4-1106-preview_1.json...



100%|██████████| 200/200 [00:00<00:00, 8118.98it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-4-1106-preview_1.json

Processing ./results/docred_rand_200_gpt-4_1.json...


100%|██████████| 200/200 [00:00<00:00, 10198.67it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-4_1.json

Processing ./results/cdr_rand_200_gpt-4_1.json...


100%|██████████| 200/200 [00:00<00:00, 12559.86it/s]


Processed data saved to ./processed_results/cdr_rand_200_gpt-4_1.json

Processing ./results/nyt10m_rand_500_gpt-4-1106-preview_1.json...


100%|██████████| 500/500 [00:00<00:00, 32924.39it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-4-1106-preview_1.json

Processing ./results/wiki80_rand_800_gpt-4_1.json...


100%|██████████| 800/800 [00:00<00:00, 44210.49it/s]


Processed data saved to ./processed_results/wiki80_rand_800_gpt-4_1.json

Processing ./results/tacred_rand_800_gpt-4_1.json...


100%|██████████| 800/800 [00:00<00:00, 41756.18it/s]


Processed data saved to ./processed_results/tacred_rand_800_gpt-4_1.json

Processing ./results/cdr_rand_200_gpt-4-1106-preview_1.json...


100%|██████████| 200/200 [00:00<00:00, 9778.98it/s]


Processed data saved to ./processed_results/cdr_rand_200_gpt-4-1106-preview_1.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_1.json...


100%|██████████| 500/500 [00:00<00:00, 41048.19it/s]

Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_1.json

Processing ./results/wiki20m_rand_500_gpt-4_1.json...



100%|██████████| 500/500 [00:00<00:00, 45768.36it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4_1.json

Processing ./results/tacred_rand_800_gpt-4-1106-preview_1.json...


100%|██████████| 800/800 [00:00<00:00, 8897.38it/s]


Processed data saved to ./processed_results/tacred_rand_800_gpt-4-1106-preview_1.json

Processing ./results/wiki80_rand_800_gpt-4-1106-preview_1.json...


100%|██████████| 800/800 [00:00<00:00, 41654.06it/s]


Processed data saved to ./processed_results/wiki80_rand_800_gpt-4-1106-preview_1.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_54.json...


100%|██████████| 500/500 [00:00<00:00, 37975.37it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_54.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_64.json...


100%|██████████| 500/500 [00:00<00:00, 39829.68it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_64.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_74.json...


100%|██████████| 500/500 [00:00<00:00, 39722.55it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_74.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_84.json...


100%|██████████| 161/161 [00:00<00:00, 40569.72it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_84.json

Processing ./results/nyt10m_rand_500_text-davinci-003_1.json...


100%|██████████| 500/500 [00:00<00:00, 36605.26it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_text-davinci-003_1.json

Processing ./results/docred_rand_200_text-davinci-003_1.json...


100%|██████████| 200/200 [00:00<00:00, 11759.13it/s]


Processed data saved to ./processed_results/docred_rand_200_text-davinci-003_1.json

Processing ./results/cdr_rand_200_text-davinci-003_1.json...


100%|██████████| 200/200 [00:00<00:00, 14375.87it/s]


Processed data saved to ./processed_results/cdr_rand_200_text-davinci-003_1.json

Processing ./results/tacred_rand_800_text-davinci-003_1.json...


100%|██████████| 800/800 [00:00<00:00, 39764.45it/s]


Processed data saved to ./processed_results/tacred_rand_800_text-davinci-003_1.json

Processing ./results/wiki80_rand_800_text-davinci-003_1.json...


100%|██████████| 800/800 [00:00<00:00, 43316.16it/s]


Processed data saved to ./processed_results/wiki80_rand_800_text-davinci-003_1.json

Processing ./results/wiki20m_rand_500_text-davinci-003_1.json...


100%|██████████| 500/500 [00:00<00:00, 45337.95it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_text-davinci-003_1.json

Processing ./results/docred_rand_200_gpt-4-1106-preview_1.json...


100%|██████████| 200/200 [00:00<00:00, 8289.88it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-4-1106-preview_1.json

Processing ./results/nyt10m_rand_500_gpt-4-1106-preview_1.json...


100%|██████████| 500/500 [00:00<00:00, 32353.47it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-4-1106-preview_1.json

Processing ./results/cdr_rand_200_gpt-4-1106-preview_1.json...


100%|██████████| 200/200 [00:00<00:00, 9562.39it/s]


Processed data saved to ./processed_results/cdr_rand_200_gpt-4-1106-preview_1.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_1.json...


100%|██████████| 500/500 [00:00<00:00, 39518.20it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_1.json

Processing ./results/tacred_rand_800_gpt-4-1106-preview_1.json...


100%|██████████| 800/800 [00:00<00:00, 39528.35it/s]


Processed data saved to ./processed_results/tacred_rand_800_gpt-4-1106-preview_1.json

Processing ./results/wiki80_rand_800_gpt-4-1106-preview_1.json...


100%|██████████| 800/800 [00:00<00:00, 40704.60it/s]


Processed data saved to ./processed_results/wiki80_rand_800_gpt-4-1106-preview_1.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_54.json...


100%|██████████| 500/500 [00:00<00:00, 40416.90it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_54.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_64.json...


100%|██████████| 500/500 [00:00<00:00, 40735.64it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_64.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_74.json...


100%|██████████| 500/500 [00:00<00:00, 39834.22it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_74.json

Processing ./results/wiki20m_rand_500_gpt-4-1106-preview_84.json...


100%|██████████| 161/161 [00:00<00:00, 40257.72it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4-1106-preview_84.json



### LLAMA Family: Vicuna-7B, Vicuna-33B, LLAMA-2-7B, LLAMA-2-70B, Wizard-70B

In [131]:
def extract_triples_vicuna(data):
    """Extract 3-element JSON arrays (triples) from the assistant part of each output.

    Only the segment that follows the first marker (a newline followed by
    ``ASSISTANT:``) is parsed; if the marker occurs again, the segment ends
    there. That segment is scanned twice: once for individual bracketed arrays
    and once for list-of-lists blocks, whose first inner array the flat scan
    cannot parse. An occurrence counter keeps arrays already produced by the
    flat scan from being added a second time.

    Args:
        data: Mapping from input text to the raw output string for that text.

    Returns:
        A ``defaultdict(list)`` mapping every key of ``data`` to the list of
        parsed arrays that have exactly three elements.

    Raises:
        IndexError: If an output string does not contain the marker.
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        response = triples_str.split("\nASSISTANT:")[1]
        # Number of times each parsed array has been seen, keyed by str(array).
        seen_counts = {}
        flat_matches = re.findall(r'\[.*?\]', response, re.DOTALL)
        # Allow whitespace between the outer and inner brackets so that
        # pretty-printed lists are matched too; otherwise their first triple
        # would be lost.
        nested_matches = re.findall(r'\[\s*\[.*?\]\s*\]', response, re.DOTALL)
        triple_list = []

        for match in flat_matches:
            try:
                # Attempt to parse each match as JSON
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            seen_counts[key] = seen_counts.get(key, 0) + 1

        for match in nested_matches:
            try:
                # Attempt to parse each match as JSON
                triples = json.loads(match)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
            for triple in triples:
                # Only arrays can be triples; numbers/null would make len()
                # raise TypeError and 3-character strings would slip through.
                if not isinstance(triple, list):
                    continue
                key = str(triple)
                previous = seen_counts.get(key, 0)
                seen_counts[key] = previous + 1
                if previous == 1:
                    # Same occurrence the flat scan already added.
                    continue
                if len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples

In [132]:
# Apply the Vicuna extractor to every results file whose name contains 'vicuna'.
post_process(model_name='vicuna', extract_func=extract_triples_vicuna)

Processing ./results/tacred_rand_800_vicuna-1.5-7b_1.json...


100%|██████████| 800/800 [00:00<00:00, 49391.97it/s]


Processed data saved to ./processed_results/tacred_rand_800_vicuna-1.5-7b_1.json

Processing ./results/nyt10m_rand_500_vicuna-1.3-33b_1.json...


100%|██████████| 500/500 [00:00<00:00, 33257.51it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_vicuna-1.3-33b_1.json

Processing ./results/tacred_rand_800_vicuna-1.3-33b_1.json...


100%|██████████| 800/800 [00:00<00:00, 36634.28it/s]


Processed data saved to ./processed_results/tacred_rand_800_vicuna-1.3-33b_1.json

Processing ./results/wiki80_rand_800_vicuna-1.3-33b_1.json...


100%|██████████| 800/800 [00:00<00:00, 42100.39it/s]


Processed data saved to ./processed_results/wiki80_rand_800_vicuna-1.3-33b_1.json

Processing ./results/cdr_rand_200_vicuna-1.3-33b_1.json...


100%|██████████| 200/200 [00:00<00:00, 13351.49it/s]


Processed data saved to ./processed_results/cdr_rand_200_vicuna-1.3-33b_1.json

Processing ./results/cdr_rand_200_vicuna-1.5-7b_1.json...


100%|██████████| 200/200 [00:00<00:00, 13691.22it/s]


Processed data saved to ./processed_results/cdr_rand_200_vicuna-1.5-7b_1.json

Processing ./results/docred_rand_200_vicuna-1.3-33b_1.json...


100%|██████████| 200/200 [00:00<00:00, 11924.78it/s]


Processed data saved to ./processed_results/docred_rand_200_vicuna-1.3-33b_1.json

Processing ./results/docred_rand_200_vicuna-1.5-7b_1.json...


100%|██████████| 200/200 [00:00<00:00, 12927.03it/s]


Processed data saved to ./processed_results/docred_rand_200_vicuna-1.5-7b_1.json

Processing ./results/nyt10m_rand_500_vicuna-1.5-7b_1.json...


100%|██████████| 500/500 [00:00<00:00, 39408.30it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_vicuna-1.5-7b_1.json

Processing ./results/wiki20m_rand_500_vicuna-1.5-7b_1.json...


100%|██████████| 500/500 [00:00<00:00, 46749.86it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_vicuna-1.5-7b_1.json

Processing ./results/wiki20m_rand_500_vicuna-1.3-33b_1.json...


100%|██████████| 500/500 [00:00<00:00, 39918.38it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_vicuna-1.3-33b_1.json

Processing ./results/wiki80_rand_800_vicuna-1.5-7b_1.json...


100%|██████████| 800/800 [00:00<00:00, 53051.32it/s]

Processed data saved to ./processed_results/wiki80_rand_800_vicuna-1.5-7b_1.json






### LLAMA

In [133]:
def extract_triples_llama(data):
    """Extract 3-element JSON arrays (triples) from the text after ``[/INST]``.

    Only the segment that follows the first ``[/INST]`` marker is parsed; if
    the marker occurs again, the segment ends there. That segment is scanned
    twice: once for individual bracketed arrays and once for list-of-lists
    blocks, whose first inner array the flat scan cannot parse. An occurrence
    counter keeps arrays already produced by the flat scan from being added a
    second time.

    Args:
        data: Mapping from input text to the raw output string for that text.

    Returns:
        A ``defaultdict(list)`` mapping every key of ``data`` to the list of
        parsed arrays that have exactly three elements.

    Raises:
        IndexError: If an output string does not contain ``[/INST]``.
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        response = triples_str.split("[/INST]")[1]
        # Number of times each parsed array has been seen, keyed by str(array).
        seen_counts = {}
        flat_matches = re.findall(r'\[.*?\]', response, re.DOTALL)
        # Allow whitespace between the outer and inner brackets so that
        # pretty-printed lists are matched too; otherwise their first triple
        # would be lost.
        nested_matches = re.findall(r'\[\s*\[.*?\]\s*\]', response, re.DOTALL)
        triple_list = []

        for match in flat_matches:
            try:
                # Attempt to parse each match as JSON
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            seen_counts[key] = seen_counts.get(key, 0) + 1

        for match in nested_matches:
            try:
                # Attempt to parse each match as JSON
                triples = json.loads(match)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
            for triple in triples:
                # Only arrays can be triples; numbers/null would make len()
                # raise TypeError and 3-character strings would slip through.
                if not isinstance(triple, list):
                    continue
                key = str(triple)
                previous = seen_counts.get(key, 0)
                seen_counts[key] = previous + 1
                if previous == 1:
                    # Same occurrence the flat scan already added.
                    continue
                if len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples

In [134]:
# Apply the LLaMA extractor to every results file whose name contains 'llama'.
post_process(model_name='llama', extract_func=extract_triples_llama)

Processing ./results/tacred_rand_800_llama-2-70b_1.json...


100%|██████████| 800/800 [00:00<00:00, 37180.66it/s]


Processed data saved to ./processed_results/tacred_rand_800_llama-2-70b_1.json

Processing ./results/docred_rand_200_llama-2-70b_1.json...


100%|██████████| 200/200 [00:00<00:00, 10963.35it/s]


Processed data saved to ./processed_results/docred_rand_200_llama-2-70b_1.json

Processing ./results/wiki20m_rand_500_llama-2-70b_1.json...


100%|██████████| 500/500 [00:00<00:00, 9980.97it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_llama-2-70b_1.json

Processing ./results/wiki80_rand_800_llama-2-70b_1.json...


100%|██████████| 800/800 [00:00<00:00, 39540.93it/s]


Processed data saved to ./processed_results/wiki80_rand_800_llama-2-70b_1.json

Processing ./results/docred_rand_200_llama-2-7b_1.json...


100%|██████████| 200/200 [00:00<00:00, 44875.66it/s]


Processed data saved to ./processed_results/docred_rand_200_llama-2-7b_1.json

Processing ./results/cdr_rand_200_llama-2-70b_1.json...


100%|██████████| 200/200 [00:00<00:00, 14009.73it/s]


Processed data saved to ./processed_results/cdr_rand_200_llama-2-70b_1.json

Processing ./results/cdr_rand_200_llama-2-7b_1.json...


100%|██████████| 200/200 [00:00<00:00, 18463.28it/s]


Processed data saved to ./processed_results/cdr_rand_200_llama-2-7b_1.json

Processing ./results/nyt10m_rand_500_llama-2-70b_1.json...


100%|██████████| 500/500 [00:00<00:00, 31883.24it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_llama-2-70b_1.json

Processing ./results/tacred_rand_800_llama-2-7b_1.json...


100%|██████████| 800/800 [00:00<00:00, 47283.74it/s]


Processed data saved to ./processed_results/tacred_rand_800_llama-2-7b_1.json

Processing ./results/wiki20m_rand_500_llama-2-7b_1.json...


100%|██████████| 500/500 [00:00<00:00, 48272.53it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_llama-2-7b_1.json

Processing ./results/wiki80_rand_800_llama-2-7b_1.json...


100%|██████████| 800/800 [00:00<00:00, 57574.52it/s]


Processed data saved to ./processed_results/wiki80_rand_800_llama-2-7b_1.json

Processing ./results/wiki20m_rand_500_llama-2-70b_54.json...


100%|██████████| 500/500 [00:00<00:00, 34930.99it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_llama-2-70b_54.json

Processing ./results/nyt10m_rand_500_llama-2-7b_44.json...


100%|██████████| 500/500 [00:00<00:00, 40960.80it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_llama-2-7b_44.json

Processing ./results/wiki20m_rand_500_llama-2-70b_74.json...


100%|██████████| 121/121 [00:00<00:00, 36680.46it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_llama-2-70b_74.json

Processing ./results/wiki20m_rand_500_llama-2-70b_64.json...


100%|██████████| 81/81 [00:00<00:00, 37186.80it/s]

Processed data saved to ./processed_results/wiki20m_rand_500_llama-2-70b_64.json






### Wizard-70B

In [135]:
# The 'wizard' result files are processed with the Vicuna extractor, which
# parses only the text after the "\nASSISTANT:" marker.
post_process(model_name='wizard', extract_func=extract_triples_vicuna)

Processing ./results/docred_rand_200_wizardlm-70b_1.json...


100%|██████████| 200/200 [00:00<00:00, 29078.65it/s]

Processed data saved to ./processed_results/docred_rand_200_wizardlm-70b_1.json

Processing ./results/wiki80_rand_800_wizardlm-70b_1.json...



100%|██████████| 800/800 [00:00<00:00, 50792.33it/s]


Processed data saved to ./processed_results/wiki80_rand_800_wizardlm-70b_1.json

Processing ./results/tacred_rand_800_wizardlm-70b_1.json...


100%|██████████| 800/800 [00:00<00:00, 47500.61it/s]


Processed data saved to ./processed_results/tacred_rand_800_wizardlm-70b_1.json

Processing ./results/nyt10m_rand_500_wizardlm-70b_1.json...


100%|██████████| 500/500 [00:00<00:00, 41042.57it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_wizardlm-70b_1.json

Processing ./results/cdr_rand_200_wizardlm-70b_1.json...


100%|██████████| 200/200 [00:00<00:00, 20608.30it/s]


Processed data saved to ./processed_results/cdr_rand_200_wizardlm-70b_1.json

Processing ./results/wiki20m_rand_500_wizardlm-70b_1.json...


100%|██████████| 500/500 [00:00<00:00, 56536.15it/s]

Processed data saved to ./processed_results/wiki20m_rand_500_wizardlm-70b_1.json






### Mistral

In [136]:
def extract_triples_mistral(data):
    """Extract 3-element JSON arrays (triples) from the text after ``[/INST]``.

    Only the segment that follows the first ``[/INST]`` marker is parsed; if
    the marker occurs again, the segment ends there. That segment is scanned
    twice: once for individual bracketed arrays and once for list-of-lists
    blocks, whose first inner array the flat scan cannot parse. An occurrence
    counter keeps arrays already produced by the flat scan from being added a
    second time.

    Args:
        data: Mapping from input text to the raw output string for that text.

    Returns:
        A ``defaultdict(list)`` mapping every key of ``data`` to the list of
        parsed arrays that have exactly three elements.

    Raises:
        IndexError: If an output string does not contain ``[/INST]``.
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        response = triples_str.split("[/INST]")[1]
        # Number of times each parsed array has been seen, keyed by str(array).
        seen_counts = {}
        flat_matches = re.findall(r'\[.*?\]', response, re.DOTALL)
        # Allow whitespace between the outer and inner brackets so that
        # pretty-printed lists are matched too; otherwise their first triple
        # would be lost.
        nested_matches = re.findall(r'\[\s*\[.*?\]\s*\]', response, re.DOTALL)
        triple_list = []

        for match in flat_matches:
            try:
                # Attempt to parse each match as JSON
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            seen_counts[key] = seen_counts.get(key, 0) + 1

        for match in nested_matches:
            try:
                # Attempt to parse each match as JSON
                triples = json.loads(match)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
            for triple in triples:
                # Only arrays can be triples; numbers/null would make len()
                # raise TypeError and 3-character strings would slip through.
                if not isinstance(triple, list):
                    continue
                key = str(triple)
                previous = seen_counts.get(key, 0)
                seen_counts[key] = previous + 1
                if previous == 1:
                    # Same occurrence the flat scan already added.
                    continue
                if len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples

In [137]:
# Apply the Mistral extractor to every results file whose name contains 'mistral'.
post_process(model_name='mistral', extract_func=extract_triples_mistral)

Processing ./results/nyt10m_rand_500_mistral_1.json...


100%|██████████| 500/500 [00:00<00:00, 36774.08it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_mistral_1.json

Processing ./results/cdr_rand_200_mistral_1.json...


100%|██████████| 200/200 [00:00<00:00, 11221.32it/s]


Processed data saved to ./processed_results/cdr_rand_200_mistral_1.json

Processing ./results/docred_rand_200_mistral_1.json...


100%|██████████| 200/200 [00:00<00:00, 13182.38it/s]


Processed data saved to ./processed_results/docred_rand_200_mistral_1.json

Processing ./results/tacred_rand_800_mistral_1.json...


100%|██████████| 800/800 [00:00<00:00, 41110.05it/s]


Processed data saved to ./processed_results/tacred_rand_800_mistral_1.json

Processing ./results/wiki20m_rand_500_mistral_1.json...


100%|██████████| 500/500 [00:00<00:00, 58834.39it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_mistral_1.json

Processing ./results/wiki80_rand_800_mistral_1.json...


100%|██████████| 800/800 [00:00<00:00, 66091.06it/s]

Processed data saved to ./processed_results/wiki80_rand_800_mistral_1.json






### GALACTICA

In [138]:
def extract_triples_galactica(data):
    """Extract 3-element JSON arrays (triples) from raw model output strings.

    The whole output string is parsed (no prompt prefix is stripped). It is
    scanned twice: once for individual bracketed arrays and once for
    list-of-lists blocks, whose first inner array the flat scan cannot parse.
    An occurrence counter keeps arrays already produced by the flat scan from
    being added a second time.

    Args:
        data: Mapping from input text to the raw output string for that text.

    Returns:
        A ``defaultdict(list)`` mapping every key of ``data`` to the list of
        parsed arrays that have exactly three elements.
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        # Number of times each parsed array has been seen, keyed by str(array).
        seen_counts = {}
        flat_matches = re.findall(r'\[.*?\]', triples_str, re.DOTALL)
        # Allow whitespace between the outer and inner brackets so that
        # pretty-printed lists are matched too; otherwise their first triple
        # would be lost.
        nested_matches = re.findall(r'\[\s*\[.*?\]\s*\]', triples_str, re.DOTALL)
        triple_list = []

        for match in flat_matches:
            try:
                # Attempt to parse each match as JSON
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            seen_counts[key] = seen_counts.get(key, 0) + 1

        for match in nested_matches:
            try:
                # Attempt to parse each match as JSON
                triples = json.loads(match)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
            for triple in triples:
                # Only arrays can be triples; numbers/null would make len()
                # raise TypeError and 3-character strings would slip through.
                if not isinstance(triple, list):
                    continue
                key = str(triple)
                previous = seen_counts.get(key, 0)
                seen_counts[key] = previous + 1
                if previous == 1:
                    # Same occurrence the flat scan already added.
                    continue
                if len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples

In [139]:
# Apply the Galactica extractor to every results file whose name contains 'galactica'.
post_process(model_name='galactica', extract_func=extract_triples_galactica)

Processing ./results/wiki80_rand_800_galactica-30b_1.json...


100%|██████████| 800/800 [00:00<00:00, 35243.29it/s]


Processed data saved to ./processed_results/wiki80_rand_800_galactica-30b_1.json

Processing ./results/tacred_rand_800_galactica-30b_1.json...


100%|██████████| 800/800 [00:00<00:00, 25013.37it/s]


Processed data saved to ./processed_results/tacred_rand_800_galactica-30b_1.json

Processing ./results/wiki20m_rand_500_galactica-30b_1.json...


100%|██████████| 500/500 [00:00<00:00, 29751.90it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_galactica-30b_1.json

Processing ./results/docred_rand_200_galactica-30b_1.json...


100%|██████████| 200/200 [00:00<00:00, 604802.31it/s]


Processed data saved to ./processed_results/docred_rand_200_galactica-30b_1.json

Processing ./results/cdr_rand_200_galactica-30b_1.json...


100%|██████████| 200/200 [00:00<00:00, 286202.93it/s]


Processed data saved to ./processed_results/cdr_rand_200_galactica-30b_1.json

Processing ./results/nyt10m_rand_500_galactica-30b_1.json...


100%|██████████| 500/500 [00:00<00:00, 24438.34it/s]

Processed data saved to ./processed_results/nyt10m_rand_500_galactica-30b_1.json






### zephyr

In [140]:
def extract_triples_zephyr(data):
    """Extract 3-element JSON arrays (triples) from the assistant part of each output.

    Only the segment that follows the first marker (``<|assistant|>`` on its
    own line) is parsed; if the marker occurs again, the segment ends there.
    That segment is scanned twice: once for individual bracketed arrays and
    once for list-of-lists blocks, whose first inner array the flat scan cannot
    parse. An occurrence counter keeps arrays already produced by the flat scan
    from being added a second time.

    Args:
        data: Mapping from input text to the raw output string for that text.

    Returns:
        A ``defaultdict(list)`` mapping every key of ``data`` to the list of
        parsed arrays that have exactly three elements.

    Raises:
        IndexError: If an output string does not contain the marker.
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        response = triples_str.split("\n<|assistant|>\n")[1]
        # Number of times each parsed array has been seen, keyed by str(array).
        seen_counts = {}
        flat_matches = re.findall(r'\[.*?\]', response, re.DOTALL)
        # Allow whitespace between the outer and inner brackets so that
        # pretty-printed lists are matched too; otherwise their first triple
        # would be lost.
        nested_matches = re.findall(r'\[\s*\[.*?\]\s*\]', response, re.DOTALL)
        triple_list = []

        for match in flat_matches:
            try:
                # Attempt to parse each match as JSON
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            seen_counts[key] = seen_counts.get(key, 0) + 1

        for match in nested_matches:
            try:
                # Attempt to parse each match as JSON
                triples = json.loads(match)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
            for triple in triples:
                # Only arrays can be triples; numbers/null would make len()
                # raise TypeError and 3-character strings would slip through.
                if not isinstance(triple, list):
                    continue
                key = str(triple)
                previous = seen_counts.get(key, 0)
                seen_counts[key] = previous + 1
                if previous == 1:
                    # Same occurrence the flat scan already added.
                    continue
                if len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples

In [141]:
# Apply the Zephyr extractor to every results file whose name contains 'zephyr'.
post_process(model_name='zephyr', extract_func=extract_triples_zephyr)

Processing ./results/tacred_rand_800_zephyr-7b-beta_1.json...


100%|██████████| 800/800 [00:00<00:00, 28903.81it/s]


Processed data saved to ./processed_results/tacred_rand_800_zephyr-7b-beta_1.json

Processing ./results/docred_rand_200_zephyr-7b-beta_1.json...


100%|██████████| 200/200 [00:00<00:00, 9055.65it/s]


Processed data saved to ./processed_results/docred_rand_200_zephyr-7b-beta_1.json

Processing ./results/cdr_rand_200_zephyr-7b-beta_1.json...


100%|██████████| 200/200 [00:00<00:00, 6513.05it/s]


Processed data saved to ./processed_results/cdr_rand_200_zephyr-7b-beta_1.json

Processing ./results/nyt10m_rand_500_zephyr-7b-beta_1.json...


100%|██████████| 500/500 [00:00<00:00, 21075.21it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_zephyr-7b-beta_1.json

Processing ./results/wiki20m_rand_500_zephyr-7b-beta_1.json...


100%|██████████| 500/500 [00:00<00:00, 9445.40it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_zephyr-7b-beta_1.json

Processing ./results/wiki80_rand_800_zephyr-7b-beta_1.json...


100%|██████████| 800/800 [00:00<00:00, 34703.46it/s]


Processed data saved to ./processed_results/wiki80_rand_800_zephyr-7b-beta_1.json



### openchat

In [142]:
def extract_triples_openchat(data):
    """Extract 3-element triples from raw OpenChat generations.

    Args:
        data: Mapping of input text -> raw generation string. Only the segment
            that follows the first "GPT4 Correct Assistant:" marker (and ends
            at the next marker, if any) is scanned. When the marker is missing
            the whole string is scanned instead of raising ``IndexError``.

    Returns:
        ``defaultdict(list)`` mapping each input text to the list of
        3-element JSON arrays found in the scanned segment.
    """
    marker = "GPT4 Correct Assistant:"
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        segments = triples_str.split(marker)
        if len(segments) > 1:
            triples_str = segments[1]
        # else: no marker in this generation -> keep the full string so one
        # malformed sample does not abort the whole file.

        repeat_map = {}  # str(triple) -> number of times it has been seen
        flat_matches = re.findall(r'\[.*?\]', triples_str, re.DOTALL)
        nested_matches = re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL)
        triple_list = []

        # Pass 1: every non-greedy "[...]" span, i.e. individual flat arrays.
        for match in flat_matches:
            try:
                triple = json.loads(match)
            except json.JSONDecodeError:
                # Unbalanced fragments such as '[["a", "b", "c"]' land here.
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            repeat_map[key] = repeat_map.get(key, 0) + 1

        # Pass 2: "[[...]]" spans, i.e. a list of arrays.
        for match in nested_matches:
            try:
                triples = json.loads(match)
            except json.JSONDecodeError:
                continue
            for triple in triples:
                key = str(triple)
                seen = repeat_map.get(key, 0)
                repeat_map[key] = seen + 1
                if seen == 1:
                    # Second sighting: typically pass 1 already collected this
                    # inner array, so adding it again would duplicate it.
                    continue
                # Elements of a nested array may be scalars or strings; only
                # real 3-element lists count as triples.
                if isinstance(triple, list) and len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples

In [143]:
# Post-process every ./results/*openchat*.json file with the OpenChat
# extractor; the parsed triples are written under ./processed_results/.
post_process(model_name='openchat', extract_func=extract_triples_openchat)

Processing ./results/wiki80_rand_800_openchat_1.json...


100%|██████████| 800/800 [00:00<00:00, 39946.70it/s]


Processed data saved to ./processed_results/wiki80_rand_800_openchat_1.json

Processing ./results/tacred_rand_800_openchat_1.json...


100%|██████████| 800/800 [00:00<00:00, 38657.63it/s]


Processed data saved to ./processed_results/tacred_rand_800_openchat_1.json

Processing ./results/nyt10m_rand_500_openchat_1.json...


100%|██████████| 500/500 [00:00<00:00, 33350.59it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_openchat_1.json

Processing ./results/cdr_rand_200_openchat_1.json...


100%|██████████| 200/200 [00:00<00:00, 15350.82it/s]


Processed data saved to ./processed_results/cdr_rand_200_openchat_1.json

Processing ./results/docred_rand_200_openchat_1.json...


100%|██████████| 200/200 [00:00<00:00, 11334.88it/s]


Processed data saved to ./processed_results/docred_rand_200_openchat_1.json

Processing ./results/wiki20m_rand_500_openchat_1.json...


100%|██████████| 500/500 [00:00<00:00, 38895.21it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_openchat_1.json

Processing ./results/wiki20m_rand_500_openchat_54.json...


100%|██████████| 161/161 [00:00<00:00, 38769.26it/s]

Processed data saved to ./processed_results/wiki20m_rand_500_openchat_54.json






In [144]:
import re
import json
from collections import defaultdict

def extract_triples(data):
    """Generic extractor: pull 3-element triples out of raw model output.

    Args:
        data: Mapping of input text -> raw generation string.

    Returns:
        ``defaultdict(list)`` mapping each input text to the list of
        3-element JSON arrays found in its generation. Bracketed spans that
        are not valid JSON are skipped, matching the model-specific extractors.
    """
    text_triples = defaultdict(list)
    for text, triples_str in data.items():
        # Model-specific prompt stripping; enable the line that applies:
        # triples_str = triples_str.split("\nASSISTANT:")[1] (For Vicuna)
        # triples_str = triples_str.split("[/INST]")[1] (For LLaMA & Mistral)
        # triples_str = triples_str.split("GPT4 Correct Assistant:")[1] (For OpenChat)
        repeat_map = {}  # str(triple) -> number of times it has been seen
        flat_matches = re.findall(r'\[.*?\]', triples_str, re.DOTALL)
        nested_matches = re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL)
        triple_list = []

        # Pass 1: every non-greedy "[...]" span, i.e. individual flat arrays.
        for match in flat_matches:
            try:
                triple = json.loads(match)
            except json.JSONDecodeError:
                # For nested output the first span is always an unbalanced
                # fragment like '[["a", "b", "c"]'; skip it so pass 2 can run.
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            repeat_map[key] = repeat_map.get(key, 0) + 1

        # Pass 2: "[[...]]" spans, i.e. a list of arrays.
        for match in nested_matches:
            try:
                triples = json.loads(match)
            except json.JSONDecodeError:
                continue
            for triple in triples:
                key = str(triple)
                seen = repeat_map.get(key, 0)
                repeat_map[key] = seen + 1
                if seen == 1:
                    # Second sighting: typically pass 1 already collected this
                    # inner array, so adding it again would duplicate it.
                    continue
                # Elements of a nested array may be scalars or strings; only
                # real 3-element lists count as triples.
                if isinstance(triple, list) and len(triple) == 3:
                    triple_list.append(triple)

        text_triples[text] = triple_list
    return text_triples


## semi-open

In [145]:
import re
import json
from collections import defaultdict
from tqdm import tqdm

def extract_triples_semi(data):
    """Extract triples from semi-open generations and strip entity suffixes.

    Single quotes in the generation are first converted to double quotes so
    that Python-style lists parse as JSON. After extraction, anything from the
    first ":" onwards is removed from string subjects and objects.

    Args:
        data: Mapping of input text -> raw generation string.

    Returns:
        ``defaultdict(list)`` mapping each input text to a list of
        ``(subject, predicate, object)`` tuples.
    """
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        repeat_map = {}  # str(triple) -> number of times it has been seen
        # Blanket quote swap: a span whose text contains an apostrophe becomes
        # invalid JSON and is dropped by the handlers below.
        triples_str = triples_str.replace("'", '"')
        flat_matches = re.findall(r'\[.*?\]', triples_str, re.DOTALL)
        nested_matches = re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL)
        triple_list = []

        # Pass 1: every non-greedy "[...]" span, i.e. individual flat arrays.
        for match in flat_matches:
            try:
                triple = json.loads(match)
            except json.JSONDecodeError:
                continue
            if len(triple) == 3:
                triple_list.append(triple)
            key = str(triple)
            repeat_map[key] = repeat_map.get(key, 0) + 1

        # Pass 2: "[[...]]" spans, i.e. a list of arrays.
        for match in nested_matches:
            try:
                triples = json.loads(match)
            except json.JSONDecodeError:
                continue
            for triple in triples:
                key = str(triple)
                seen = repeat_map.get(key, 0)
                repeat_map[key] = seen + 1
                if seen == 1:
                    # Second sighting: typically pass 1 already collected this
                    # inner array, so adding it again would duplicate it.
                    continue
                # Only real 3-element lists can be unpacked as triples below.
                if isinstance(triple, list) and len(triple) == 3:
                    triple_list.append(triple)

        cleaned_triples = []
        for subject_, predicate_, object_ in triple_list:
            # Entities may be emitted as numbers/null; only strings can carry
            # a ":"-separated suffix, so leave other values untouched.
            if isinstance(subject_, str):
                subject_ = subject_.split(":")[0]
            if isinstance(object_, str):
                object_ = object_.split(":")[0]
            cleaned_triples.append((subject_, predicate_, object_))

        text_triples[text] = cleaned_triples
    return text_triples


In [146]:
# Post-process every ./results/*semi*.json file with the semi-open extractor;
# the parsed triples are written under ./processed_results/.
post_process(model_name='semi', extract_func=extract_triples_semi)

Processing ./results/nyt10m_gpt-3.5_semi_1.json...


100%|██████████| 1/1 [00:00<00:00, 8240.28it/s]


Processed data saved to ./processed_results/nyt10m_gpt-3.5_semi_1.json

Processing ./results/nyt10m_rand_500_gpt-3.5_semi_1.json...


100%|██████████| 500/500 [00:00<00:00, 29569.00it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-3.5_semi_1.json

Processing ./results/cdr_rand_200_gpt-3.5_semi_1.json...


100%|██████████| 200/200 [00:00<00:00, 22149.31it/s]

Processed data saved to ./processed_results/cdr_rand_200_gpt-3.5_semi_1.json






## closed

In [147]:
# Load the closed-setting results file; `data` is consumed by the next cell.
# NOTE(review): the path is absolute and machine-specific.
closed_results_path = '/home/pj20/GREScore/results/cdr_rand_200_gpt-3.5_closed_1.json'
with open(closed_results_path, 'r') as f:
    data = json.load(f)

In [148]:
from collections import defaultdict

# Normalise the closed-setting triples loaded into `data`: drop one leading
# space from each relation label and store every triple as a
# [subject, relation, object] list keyed by its source text.
results = defaultdict(list)

for text, triples in data.items():
    for triple in triples:
        subject_, relation, object_ = triple
        # The truthiness check guards the index: an empty relation would
        # otherwise raise IndexError on relation[0].
        if relation and relation[0] == ' ':
            relation = relation[1:]
        results[text].append([subject_, relation, object_])

with open('/home/pj20/GREScore/processed_results/cdr_rand_200_gpt-3.5_closed_1.json', 'w') as f:
    json.dump(results, f, indent=6)
        