In [1]:
from config import Param

param = Param()
args = param.args
args

args.task_name = args.dataname

# rel_per_task
args.rel_per_task = 8 if args.dataname == "FewRel" else 4

In [2]:
print(args)

Namespace(gpu=0, dataname='FewRel', task_name='FewRel', device='cuda', batch_size=32, num_tasks=10, rel_per_task=8, pattern='entity_marker', max_length=128, encoder_output_size=768, vocab_size=30522, marker_size=4, num_workers=0, classifier_lr=0.01, encoder_lr=0.001, prompt_pool_lr=0.001, sgd_momentum=0.1, gmm_num_components=1, pull_constraint_coeff=0.1, classifier_epochs=10, encoder_epochs=10, prompt_pool_epochs=10, replay_s_e_e=256, replay_epochs=100, seed=2021, max_grad_norm=10, data_path='/home/luungoc/Thesis - 2023.2/datasets/', bert_path='bert-base-uncased', cov_mat=True, max_num_models=10, sample_freq=5, prompt_length=1, prompt_embed_dim=768, prompt_pool_size=80, prompt_top_k=8, prompt_init='uniform', prompt_key_init='uniform', drop_p=0.1, gradient_accumulation_steps=4, total_round=6, drop_out=0.5, use_gpu=True, rank=8, hidden_size=768)


In [3]:
import pickle
import random
import json, os
from transformers import BertTokenizer
import numpy as np  


def get_tokenizer(args):
    tokenizer = BertTokenizer.from_pretrained(args.bert_path, additional_special_tokens=["[E11]", "[E12]", "[E21]", "[E22]"])
    return tokenizer


class data_sampler(object):
    def __init__(self, args, seed=None):
        self.set_path(args)
        self.args = args

        # data path
        file_name = "{}.pkl".format("-".join([str(x) for x in [args.dataname, args.seed]]))
        mid_dir = ""
        for temp_p in ["datasets", "_process_path"]:
            mid_dir = os.path.join(mid_dir, temp_p)
            if not os.path.exists(mid_dir):
                os.mkdir(mid_dir)
        self.save_data_path = os.path.join(mid_dir, file_name)

        # import tokenizer
        self.tokenizer = get_tokenizer(args)

        # read relation data
        self.id2rel, self.rel2id = self._read_relations(args.relation_file)

        # random sampling
        self.seed = seed
        if self.seed is not None:
            random.seed(self.seed)
        self.shuffle_index = list(range(len(self.id2rel)))
        random.shuffle(self.shuffle_index)
        self.shuffle_index = np.argsort(self.shuffle_index)

        # regenerate data
        self.training_dataset, self.valid_dataset, self.test_dataset = self._read_data(self.args.data_file)

        # generate the task number
        self.batch = 0
        self.task_length = len(self.id2rel) // self.args.rel_per_task

        # record relations
        self.seen_relations = []
        self.history_test_data = {}
        
        if args.dataname in ["FewRel"]:
            self.id2rel = json.load(open(os.path.join(args.data_path, "id2rel.json"), 'r'))
        else:
            self.id2rel = json.load(open(os.path.join(args.data_path, "id2rel_tacred.json"), 'r'))
        
        self.rel2id = {label: idx for idx, label in enumerate(self.id2rel)}
        

    def set_path(self, args):
        use_marker = ""
        if args.dataname in ["FewRel"]:
            args.data_file = os.path.join(args.data_path, "data_with{}_marker.json".format(use_marker))
            args.relation_file = os.path.join(args.data_path, "id2rel.json")
            args.num_of_relation = 80
            args.num_of_train = 420
            args.num_of_val = 140
            args.num_of_test = 140
            
        elif args.dataname in ["TACRED"]:
            args.data_file = os.path.join(args.data_path, "data_with{}_marker_tacred.json".format(use_marker))
            args.relation_file = os.path.join(args.data_path, "id2rel_tacred.json")
            args.num_of_relation = 40
            args.num_of_train = 420
            args.num_of_val = 140
            args.num_of_test = 140

    def set_seed(self, seed):
        self.seed = seed
        if self.seed != None:
            random.seed(self.seed)
        self.shuffle_index = list(range(len(self.id2rel)))
        random.shuffle(self.shuffle_index)
        self.shuffle_index = np.argsort(self.shuffle_index)

    def __iter__(self):
        return self

    def __next__(self):
        if self.batch == self.task_length:
            raise StopIteration()

        indexs = self.shuffle_index[self.args.rel_per_task * self.batch : self.args.rel_per_task * (self.batch + 1)]
        self.batch += 1

        current_relations = []
        cur_training_data = {}
        cur_valid_data = {}
        cur_test_data = {}

        for index in indexs:
            current_relations.append(self.id2rel[index])
            self.seen_relations.append(self.id2rel[index])
            cur_training_data[self.id2rel[index]] = self.training_dataset[index]
            cur_valid_data[self.id2rel[index]] = self.valid_dataset[index]
            cur_test_data[self.id2rel[index]] = self.test_dataset[index]
            self.history_test_data[self.id2rel[index]] = self.test_dataset[index]

        return cur_training_data, cur_valid_data, cur_test_data, current_relations, self.history_test_data, self.seen_relations

    def _read_data(self, file):
        if os.path.isfile(self.save_data_path):
            with open(self.save_data_path, "rb") as f:
                datas = pickle.load(f)
            train_dataset, val_dataset, test_dataset = datas
            return train_dataset, val_dataset, test_dataset
        else:
            data = json.load(open(file, "r", encoding="utf-8"))
            train_dataset = [[] for i in range(self.args.num_of_relation)]
            val_dataset = [[] for i in range(self.args.num_of_relation)]
            test_dataset = [[] for i in range(self.args.num_of_relation)]
            for relation in data.keys():
                rel_samples = data[relation]
                if self.seed != None:
                    random.seed(self.seed)
                random.shuffle(rel_samples)
                count = 0
                count1 = 0
                for i, sample in enumerate(rel_samples):
                    tokenized_sample = {}
                    tokenized_sample["relation"] = self.rel2id[sample["relation"]]
                    tokenized_sample["text"] = " ".join(sample["tokens"])
                    tokenized_sample["tokens"] = self.tokenizer.encode(" ".join(sample["tokens"]), padding="max_length", truncation=True, max_length=self.args.max_length)


                    if self.args.task_name == "FewRel":
                        if i < self.args.num_of_train:
                            train_dataset[self.rel2id[relation]].append(tokenized_sample)
                        elif i < self.args.num_of_train + self.args.num_of_val:
                            val_dataset[self.rel2id[relation]].append(tokenized_sample)
                        else:
                            test_dataset[self.rel2id[relation]].append(tokenized_sample)
                    else:
                        if i < len(rel_samples) // 5 and count <= 40:
                            count += 1
                            test_dataset[self.rel2id[relation]].append(tokenized_sample)
                        else:
                            count1 += 1
                            train_dataset[self.rel2id[relation]].append(tokenized_sample)
                            if count1 >= 320:
                                break

                    
            with open(self.save_data_path, "wb") as f:
                pickle.dump((train_dataset, val_dataset, test_dataset), f)
            return train_dataset, val_dataset, test_dataset

    def _read_relations(self, file):
        id2rel = json.load(open(file, "r", encoding="utf-8"))
        rel2id = {}
        for i, x in enumerate(id2rel):
            rel2id[x] = i
        return id2rel, rel2id


In [4]:
import random

random.seed(args.seed)
print(args.seed)

2021


In [6]:
data = data_sampler(args, seed=2021)
list_data = []

for steps, (training_data, valid_data, test_data, current_relations, historic_test_data, seen_relations) in enumerate(data):
    
    # task_x = []
    # print(current_relations)
    # for relation in current_relations:
    #     for sample in test_data[relation]:
    #         task_x.append({
    #             'text': sample['text'],
    #             'relation': sample['relation'],
    #             'description': new_pt,
    #             'chunk': current_relations,
    #         })
    # list_data.append(task_x)
    
    print(training_data[current_relations[0]][0])
    
    

{'relation': 26, 'text': 'She is noted for playing a vintage Rhodes Piano on both Denali albums , their [E11] self - titled album [E12] and its follow - up , [E21] The Instinct [E22] .', 'tokens': [101, 2016, 2003, 3264, 2005, 2652, 1037, 13528, 10588, 3682, 2006, 2119, 7939, 11475, 4042, 1010, 2037, 30522, 2969, 1011, 4159, 2201, 30523, 1998, 2049, 3582, 1011, 2039, 1010, 30524, 1996, 12753, 30525, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{'relation': 72, 'text': '[E11] Renata Pokupić [E12] ( born July 24 , 1972 , in Virovitica , Republic of Croatia ) is a Croatian operatic [E21] mezzo - soprano [E22] .', 'tokens': [101, 30522, 14916, 6790, 13433, 5283, 24330, 30523, 1006, 2141, 2251, 2484, 1010, 3285, 1010, 1999, 6819, 12298, 18291, 2050,

In [None]:
from FlagEmbedding import BGEM3FlagModel

In [40]:
import json

data_ds = json.load(open('/home/luungoc/Thesis - 2023.2/Thesis_NgocLT/format/prompt.json', 'r'))
data2 = json.load(open('/home/luungoc/Thesis - 2023.2/Thesis_NgocLT/format/prompt1.json', 'r'))

data_ds += data2
len(data_ds)

10

In [74]:
json.dump(lis <sdghk'
          23, open('/home/luungoc/Thesis - 2023.2/Thesis_NgocLT/datasets/retrieval.json', 'w'), ensure_ascii=False)

In [30]:
print(data[9])

The relations described above serve to categorize and clarify the connections or associations between different entities within text data. Understanding these relations helps in organizing information more effectively, particularly in fields like knowledge graph construction, information retrieval, and natural language processing tasks. Here's an explanation of the sense of each relation:

1. **org:shareholders** - This relation identifies a party (individual or group) that owns shares in a corporation or organization, indicating a financial interest or partial ownership. The relation clarifies who has a stake in the company, which can affect decisions, control, or influence within the organization.

2. **org:parents** - This relation defines the ownership or control relationship between two corporations, specifically pointing out that one entity (the parent) owns another entity (the subsidiary). This can influence the operations, policies, and strategic direction of the subsidiary and

In [68]:
format = json.load(open('/home/luungoc/Thesis - 2023.2/Thesis_NgocLT/format/format.json', 'r'))

In [69]:
temp = "You are a useful information extraction machine. Read the examples carefully and explain the sense of five relations above (note: not analysis the examples)."

In [70]:
new_pt = []

for it1, it2 in zip(format, data_ds):
    new_pt.append(it1.replace(temp, it2))

In [71]:
print(new_pt[3])


User information
----------------
Example 1: 
{
    'context': `` [E21] ontario [E22] is taking the next step towards recovering taxpayer dollars spent fighting tobacco-related illnesses , '' ontario attorney general [E11] chris bentley [E12] said in a statement .,
    'entity_1': chris bentley,
    'entity_2': ontario,
    'relation': per:stateorprovinces_of_residence
}

Example 2: 
{
    'context': beijing , dec 27 -lrb- xinhua -rrb- the original china southwest airlines and [E11] china national aviation corporation [E12] -lrb- [E21] cnac [E22] -rrb- will use the same airline code and numeric code as air china from january 1 , 2003 .,
    'entity_1': china national aviation corporation,
    'entity_2': cnac,
    'relation': org:alternate_names
}

Example 3: 
{
    'context': the recent merger announced between the [E11] american bankers association [E12] and [E21] america [E22] 's community bankers has done more than create the nation 's largest banking trade association .,
    'ent

In [3]:
import os

# Lấy đường dẫn của tập tin hiện tại
current_file_path = os.path.abspath(__file__)

print("Đường dẫn của tập tin hiện tại:", current_file_path)


NameError: name '__file__' is not defined