In [1]:
import os
import json
import glob
from collections import defaultdict
from tqdm import tqdm
import re

def post_process(model_name, extract_func):
    # Directory where your files are stored
    directory = './results/'

    # Pattern to match the files of interest
    pattern = directory + f'*{model_name}*.json'

    # Process each file
    for file_path in glob.glob(pattern):
        print(f"Processing {file_path}...")
        
        # Read the file
        with open(file_path, 'r') as f:
            data = json.load(f)
        
        # Extract triples
        text_triples = extract_func(data)
        
        # Define new file name for processed results
        base_name = os.path.basename(file_path)
        new_file_name = base_name.replace('results', 'processed_results')
        new_file_path = os.path.join('./processed_results/', new_file_name)
        
        # Write the processed data to a new file
        with open(new_file_path, 'w') as f:
            json.dump(text_triples, f, indent=6)
        
        print(f"Processed data saved to {new_file_path}")
        print()

## GPT Family: gpt-3.5-turbo-instruct & gpt-4 & text-davinci-003

In [41]:
import re
import json
from collections import defaultdict
from tqdm import tqdm

def extract_triples_gpt(data):
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        # Use regex to find all JSON array-like structures
        repeat_map = {}
        matches_1 = re.findall(r'\[.*?\]', triples_str, re.DOTALL)
        matches_2 = re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL)
        triple_list = []
        for match in matches_1:
            try:
                # Attempt to parse each match as JSON
                triple = json.loads(match)
                if len(triple) == 3:
                    triple_list.append(triple)
                if str(triple) not in repeat_map:
                    repeat_map[str(triple)] = 1
                else:
                    repeat_map[str(triple)] += 1

            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                    continue
        text_triples[text] = triple_list
        
        
        for match in matches_2:
            try:
                # Attempt to parse each match as JSON
                triples = json.loads(match)
                for triple in triples:
                    if str(triple) not in repeat_map:
                        repeat_map[str(triple)] = 1
                    elif repeat_map[str(triple)] == 1:
                        repeat_map[str(triple)] += 1
                        continue
                    else:
                        repeat_map[str(triple)] += 1
                    if len(triple) == 3:
                        triple_list.append(triple)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
        text_triples[text] = triple_list
    return text_triples


In [42]:
post_process('gpt-3.5', extract_triples_gpt)
post_process('gpt-4', extract_triples_gpt)
post_process('davinci', extract_triples_gpt)


Processing ./results/nyt10m_rand_500_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 500/500 [00:00<00:00, 30116.35it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-3.5-turbo-instruct_1.json

Processing ./results/nyt10m_rand_500_gpt-3.5-turbo-1106_1.json...


100%|██████████| 500/500 [00:00<00:00, 46401.27it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-3.5-turbo-1106_1.json

Processing ./results/wiki80_rand_800_gpt-3.5-turbo-1106_1.json...


100%|██████████| 800/800 [00:00<00:00, 62373.47it/s]


Processed data saved to ./processed_results/wiki80_rand_800_gpt-3.5-turbo-1106_1.json

Processing ./results/docred_rand_200_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 200/200 [00:00<00:00, 10014.22it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-3.5-turbo-instruct_1.json

Processing ./results/cdr_rand_200_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 200/200 [00:00<00:00, 10671.31it/s]


Processed data saved to ./processed_results/cdr_rand_200_gpt-3.5-turbo-instruct_1.json

Processing ./results/tacred_rand_800_gpt-3.5-turbo-1106_1.json...


100%|██████████| 800/800 [00:00<00:00, 51623.00it/s]


Processed data saved to ./processed_results/tacred_rand_800_gpt-3.5-turbo-1106_1.json

Processing ./results/tacred_rand_800_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 800/800 [00:00<00:00, 33895.08it/s]


Processed data saved to ./processed_results/tacred_rand_800_gpt-3.5-turbo-instruct_1.json

Processing ./results/wiki20m_rand_500_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 500/500 [00:00<00:00, 37443.13it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-3.5-turbo-instruct_1.json

Processing ./results/wiki80_rand_800_gpt-3.5-turbo-instruct_1.json...


100%|██████████| 800/800 [00:00<00:00, 38182.10it/s]


Processed data saved to ./processed_results/wiki80_rand_800_gpt-3.5-turbo-instruct_1.json

Processing ./results/docred_rand_200_gpt-3.5-turbo_1.json...


100%|██████████| 200/200 [00:00<00:00, 11652.95it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-3.5-turbo_1.json

Processing ./results/wiki20m_rand_500_gpt-3.5-turbo-1106_1.json...


100%|██████████| 500/500 [00:00<00:00, 66311.01it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-3.5-turbo-1106_1.json

Processing ./results/nyt10m_rand_500_gpt-4_1.json...


100%|██████████| 500/500 [00:00<00:00, 33939.99it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_gpt-4_1.json

Processing ./results/docred_rand_200_gpt-4_1.json...


100%|██████████| 200/200 [00:00<00:00, 362.65it/s]


Processed data saved to ./processed_results/docred_rand_200_gpt-4_1.json

Processing ./results/cdr_rand_200_gpt-4_1.json...


100%|██████████| 200/200 [00:00<00:00, 11921.06it/s]


Processed data saved to ./processed_results/cdr_rand_200_gpt-4_1.json

Processing ./results/wiki80_rand_800_gpt-4_1.json...


100%|██████████| 800/800 [00:00<00:00, 42144.81it/s]


Processed data saved to ./processed_results/wiki80_rand_800_gpt-4_1.json

Processing ./results/tacred_rand_800_gpt-4_1.json...


100%|██████████| 800/800 [00:00<00:00, 39185.83it/s]


Processed data saved to ./processed_results/tacred_rand_800_gpt-4_1.json

Processing ./results/wiki20m_rand_500_gpt-4_1.json...


100%|██████████| 500/500 [00:00<00:00, 44498.12it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_gpt-4_1.json

Processing ./results/nyt10m_rand_500_text-davinci-003_1.json...


100%|██████████| 500/500 [00:00<00:00, 35586.57it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_text-davinci-003_1.json

Processing ./results/docred_rand_200_text-davinci-003_1.json...


100%|██████████| 200/200 [00:00<00:00, 11558.22it/s]


Processed data saved to ./processed_results/docred_rand_200_text-davinci-003_1.json

Processing ./results/cdr_rand_200_text-davinci-003_1.json...


100%|██████████| 200/200 [00:00<00:00, 13860.43it/s]


Processed data saved to ./processed_results/cdr_rand_200_text-davinci-003_1.json

Processing ./results/tacred_rand_800_text-davinci-003_1.json...


100%|██████████| 800/800 [00:00<00:00, 40561.91it/s]


Processed data saved to ./processed_results/tacred_rand_800_text-davinci-003_1.json

Processing ./results/wiki20m_rand_500_text-davinci-003_1.json...


100%|██████████| 500/500 [00:00<00:00, 43637.94it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_text-davinci-003_1.json

Processing ./results/wiki80_rand_800_text-davinci-003_1.json...


100%|██████████| 800/800 [00:00<00:00, 42733.61it/s]


Processed data saved to ./processed_results/wiki80_rand_800_text-davinci-003_1.json



## LLAMA Family: Vicuna-7B, Vicuna-33B, LLAMA-2-7B, LLAMA-2-70B

In [39]:
def extract_triples_vicuna(data):
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        triples_str = triples_str.split("ASSISTANT:")[1]
        repeat_map = {}
        matches_1 = re.findall(r'\[.*?\]', triples_str, re.DOTALL)
        matches_2 = re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL)
        triple_list = []
        for match in matches_1:
            try:
                # Attempt to parse each match as JSON
                triple = json.loads(match)
                if len(triple) == 3:
                    triple_list.append(triple)
                if str(triple) not in repeat_map:
                    repeat_map[str(triple)] = 1
                else:
                    repeat_map[str(triple)] += 1

            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                    continue
        text_triples[text] = triple_list
        
        
        for match in matches_2:
            try:
                # Attempt to parse each match as JSON
                triples = json.loads(match)
                for triple in triples:
                    if str(triple) not in repeat_map:
                        repeat_map[str(triple)] = 1
                    elif repeat_map[str(triple)] == 1:
                        repeat_map[str(triple)] += 1
                        continue
                    else:
                        repeat_map[str(triple)] += 1
                    if len(triple) == 3:
                        triple_list.append(triple)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
        text_triples[text] = triple_list
    return text_triples

In [40]:
post_process('vicuna', extract_triples_vicuna)

Processing ./results/nyt10m_rand_500_vicuna-1.3-33b_1.json...


100%|██████████| 500/500 [00:00<00:00, 32511.97it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_vicuna-1.3-33b_1.json

Processing ./results/tacred_rand_800_vicuna-1.3-33b_1.json...


100%|██████████| 800/800 [00:00<00:00, 36678.32it/s]


Processed data saved to ./processed_results/tacred_rand_800_vicuna-1.3-33b_1.json

Processing ./results/wiki20m_rand_500_vicuna-1.3-33b_1.json...


100%|██████████| 500/500 [00:00<00:00, 41254.91it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_vicuna-1.3-33b_1.json

Processing ./results/wiki80_rand_800_vicuna-1.3-33b_1.json...


100%|██████████| 800/800 [00:00<00:00, 41158.96it/s]


Processed data saved to ./processed_results/wiki80_rand_800_vicuna-1.3-33b_1.json

Processing ./results/cdr_rand_200_vicuna-1.3-33b_1.json...


100%|██████████| 200/200 [00:00<00:00, 13935.95it/s]


Processed data saved to ./processed_results/cdr_rand_200_vicuna-1.3-33b_1.json

Processing ./results/cdr_rand_200_vicuna-1.5-7b_1.json...


100%|██████████| 200/200 [00:00<00:00, 13515.41it/s]


Processed data saved to ./processed_results/cdr_rand_200_vicuna-1.5-7b_1.json

Processing ./results/docred_rand_200_vicuna-1.3-33b_1.json...


100%|██████████| 200/200 [00:00<00:00, 11505.74it/s]


Processed data saved to ./processed_results/docred_rand_200_vicuna-1.3-33b_1.json

Processing ./results/docred_rand_200_vicuna-1.5-7b_1.json...


100%|██████████| 200/200 [00:00<00:00, 12732.58it/s]


Processed data saved to ./processed_results/docred_rand_200_vicuna-1.5-7b_1.json

Processing ./results/wiki20m_rand_500_vicuna-1.5-7b_1.json...


100%|██████████| 1/1 [00:00<00:00, 11748.75it/s]

Processed data saved to ./processed_results/wiki20m_rand_500_vicuna-1.5-7b_1.json






## LLAMA

In [37]:
def extract_triples_llama(data):
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        triples_str = triples_str.split("[/INST]")[1]
        repeat_map = {}
        matches_1 = re.findall(r'\[.*?\]', triples_str, re.DOTALL)
        matches_2 = re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL)
        triple_list = []
        for match in matches_1:
            try:
                # Attempt to parse each match as JSON
                triple = json.loads(match)
                if len(triple) == 3:
                    triple_list.append(triple)
                if str(triple) not in repeat_map:
                    repeat_map[str(triple)] = 1
                else:
                    repeat_map[str(triple)] += 1

            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                    continue
        text_triples[text] = triple_list
        
        
        for match in matches_2:
            try:
                # Attempt to parse each match as JSON
                triples = json.loads(match)
                for triple in triples:
                    if str(triple) not in repeat_map:
                        repeat_map[str(triple)] = 1
                    elif repeat_map[str(triple)] == 1:
                        repeat_map[str(triple)] += 1
                        continue
                    else:
                        repeat_map[str(triple)] += 1
                    if len(triple) == 3:
                        triple_list.append(triple)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
        text_triples[text] = triple_list
    return text_triples

In [38]:
post_process('llama', extract_triples_llama)

Processing ./results/wiki20m_rand_500_llama-2-70b_1.json...


100%|██████████| 500/500 [00:00<00:00, 31431.19it/s]


Processed data saved to ./processed_results/wiki20m_rand_500_llama-2-70b_1.json

Processing ./results/docred_rand_200_llama-2-70b_1.json...


100%|██████████| 200/200 [00:00<00:00, 1100.69it/s]


Processed data saved to ./processed_results/docred_rand_200_llama-2-70b_1.json

Processing ./results/docred_rand_200_llama-2-7b_1.json...


100%|██████████| 200/200 [00:00<00:00, 44032.38it/s]


Processed data saved to ./processed_results/docred_rand_200_llama-2-7b_1.json

Processing ./results/cdr_rand_200_llama-2-70b_1.json...


100%|██████████| 141/141 [00:00<00:00, 13329.36it/s]


Processed data saved to ./processed_results/cdr_rand_200_llama-2-70b_1.json

Processing ./results/cdr_rand_200_llama-2-7b_1.json...


100%|██████████| 200/200 [00:00<00:00, 18854.19it/s]


Processed data saved to ./processed_results/cdr_rand_200_llama-2-7b_1.json

Processing ./results/nyt10m_rand_500_llama-2-70b_1.json...


100%|██████████| 500/500 [00:00<00:00, 31239.70it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_llama-2-70b_1.json

Processing ./results/cdr_rand_200_llama-2-70b_2.json...


100%|██████████| 200/200 [00:00<00:00, 14066.82it/s]

Processed data saved to ./processed_results/cdr_rand_200_llama-2-70b_2.json






## Mistral

In [43]:
def extract_triples_mistral(data):
    text_triples = defaultdict(list)
    for text, triples_str in tqdm(data.items()):
        triples_str = triples_str.split("[/INST]")[1]
        repeat_map = {}
        matches_1 = re.findall(r'\[.*?\]', triples_str, re.DOTALL)
        matches_2 = re.findall(r'\[\[.*?\]\]', triples_str, re.DOTALL)
        triple_list = []
        for match in matches_1:
            try:
                # Attempt to parse each match as JSON
                triple = json.loads(match)
                if len(triple) == 3:
                    triple_list.append(triple)
                if str(triple) not in repeat_map:
                    repeat_map[str(triple)] = 1
                else:
                    repeat_map[str(triple)] += 1

            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                    continue
        text_triples[text] = triple_list
        
        
        for match in matches_2:
            try:
                # Attempt to parse each match as JSON
                triples = json.loads(match)
                for triple in triples:
                    if str(triple) not in repeat_map:
                        repeat_map[str(triple)] = 1
                    elif repeat_map[str(triple)] == 1:
                        repeat_map[str(triple)] += 1
                        continue
                    else:
                        repeat_map[str(triple)] += 1
                    if len(triple) == 3:
                        triple_list.append(triple)
            except json.JSONDecodeError:
                # Handle cases where the match is not valid JSON
                continue
        text_triples[text] = triple_list
    return text_triples

In [44]:
post_process('mistral', extract_triples_mistral)

Processing ./results/nyt10m_rand_500_mistral_1.json...


100%|██████████| 500/500 [00:00<00:00, 34971.77it/s]


Processed data saved to ./processed_results/nyt10m_rand_500_mistral_1.json

Processing ./results/cdr_rand_200_mistral_1.json...


100%|██████████| 200/200 [00:00<00:00, 11048.98it/s]


Processed data saved to ./processed_results/cdr_rand_200_mistral_1.json

Processing ./results/docred_rand_200_mistral_1.json...


100%|██████████| 200/200 [00:00<00:00, 12780.30it/s]

Processed data saved to ./processed_results/docred_rand_200_mistral_1.json




