In [1]:
from config import Param

# Build the run configuration from the project's Param helper.
param = Param()
args = param.args

# The task name simply mirrors the dataset name.
args.task_name = args.dataname

# Relations introduced per task: 8 for FewRel, 4 for every other dataset.
args.rel_per_task = {"FewRel": 8}.get(args.dataname, 4)

In [2]:
# Show the fully resolved configuration namespace for this run.
print(args)

Namespace(gpu=0, dataname='TACRED', task_name='TACRED', device='cuda', batch_size=64, num_tasks=10, rel_per_task=4, pattern='entity_marker', max_length=192, encoder_output_size=768, vocab_size=30522, marker_size=4, num_workers=0, save_checkpoint='./checkpoint/', classifier_epochs=10, seed=2021, max_grad_norm=10, data_path='/home/luungoc/Thesis - 2023.2/Thesis_NgocLT/datasets/standard/', bert_path='bert-base-uncased', drop_p=0.1, gradient_accumulation_steps=4, total_round=6, drop_out=0.5, use_gpu=True, hidden_size=768, rank_lora=8, bge_model='/kaggle/working/model_bge/checkpoint-6000', description_path='/kaggle/input/train-bge/standard.json', type_similar='colbert', num_protos=10)


In [3]:
import pickle
import random
import json, os
from transformers import BertTokenizer
import numpy as np  


def get_tokenizer(args):
    """Load the BERT tokenizer named by ``args.bert_path``.

    The four entity marker tokens ``[E11]``, ``[E12]``, ``[E21]`` and ``[E22]``
    are registered as additional special tokens.

    Args:
        args: configuration object exposing ``bert_path``.

    Returns:
        The ``BertTokenizer`` instance returned by ``from_pretrained``.
    """
    entity_markers = ["[E11]", "[E12]", "[E21]", "[E22]"]
    return BertTokenizer.from_pretrained(
        args.bert_path,
        additional_special_tokens=entity_markers,
    )


class data_sampler(object):
    """Serve a relation-classification dataset one task at a time.

    Relation ids are put in a (optionally seeded) random order and handed out
    in consecutive groups of ``args.rel_per_task``. Iterating over the sampler
    yields, for each group, the train / validation / test samples of its
    relations plus the accumulated test data and relation names seen so far.
    """

    def __init__(self, args, seed=None):
        """Resolve file paths, load (or build) the tokenized splits and shuffle the relation order.

        Args:
            args: configuration namespace. ``dataname``, ``seed``, ``data_path``,
                ``bert_path``, ``rel_per_task`` are read here; ``set_path`` adds
                the dataset-specific attributes.
            seed: seed for the relation order and the per-relation sample
                shuffle; ``None`` leaves the global ``random`` state as is.
        """
        self.set_path(args)
        self.args = args

        # Location of the cached, already tokenized splits.
        # NOTE(review): the cache file name uses ``args.seed`` while shuffling
        # uses the ``seed`` argument — confirm the two are always kept equal,
        # otherwise a cache built with another seed is silently reused.
        file_name = "{}.pkl".format("-".join([str(x) for x in [args.dataname, args.seed]]))
        mid_dir = os.path.join("datasets", "_process_path")
        os.makedirs(mid_dir, exist_ok=True)
        self.save_data_path = os.path.join(mid_dir, file_name)

        # Tokenizer with the entity marker tokens registered.
        self.tokenizer = get_tokenizer(args)

        # Relation names and their ids.
        self.id2rel, self.rel2id = self._read_relations(args.relation_file)

        # Random relation order.
        self.set_seed(seed)

        # Tokenized splits, indexed by relation id.
        self.training_dataset, self.valid_dataset, self.test_dataset = self._read_data(self.args.data_file)

        # Task bookkeeping.
        self.batch = 0
        self.task_length = len(self.id2rel) // self.args.rel_per_task

        # Running record across tasks.
        self.seen_relations = []
        self.history_test_data = {}

        # Re-read the relation list straight from ``args.data_path``: FewRel uses
        # id2rel.json, every other dataset name uses id2rel_tacred.json.
        relation_file_name = "id2rel.json" if args.dataname in ["FewRel"] else "id2rel_tacred.json"
        self.id2rel, self.rel2id = self._read_relations(os.path.join(args.data_path, relation_file_name))

    def set_path(self, args):
        """Attach dataset-specific file paths and split sizes to ``args``.

        Only ``"FewRel"`` and ``"TACRED"`` are recognised; for any other
        ``args.dataname`` the namespace is left untouched.
        """
        use_marker = ""
        if args.dataname in ["FewRel"]:
            data_template, relation_name, relation_total = "data_with{}_marker.json", "id2rel.json", 80
        elif args.dataname in ["TACRED"]:
            data_template, relation_name, relation_total = "data_with{}_marker_tacred.json", "id2rel_tacred.json", 40
        else:
            return

        args.data_file = os.path.join(args.data_path, data_template.format(use_marker))
        args.relation_file = os.path.join(args.data_path, relation_name)
        args.num_of_relation = relation_total
        args.num_of_train = 420
        args.num_of_val = 140
        args.num_of_test = 140

    def set_seed(self, seed):
        """Store ``seed`` and redraw the relation order.

        When ``seed`` is not ``None`` the global ``random`` generator is
        re-seeded first, so the resulting order is reproducible.
        """
        self.seed = seed
        if self.seed is not None:
            random.seed(self.seed)
        self.shuffle_index = list(range(len(self.id2rel)))
        random.shuffle(self.shuffle_index)
        # argsort of a permutation is its inverse, which is again a permutation.
        self.shuffle_index = np.argsort(self.shuffle_index)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the data of the next task.

        Returns:
            A 6-tuple ``(train, valid, test, current_relations,
            history_test_data, seen_relations)``. The first three map relation
            name to that relation's samples for the current task only; the last
            two are the sampler's own accumulating objects (not copies).

        Raises:
            StopIteration: once ``task_length`` tasks have been served.
        """
        if self.batch == self.task_length:
            raise StopIteration()

        start = self.args.rel_per_task * self.batch
        indexs = self.shuffle_index[start : start + self.args.rel_per_task]
        self.batch += 1

        current_relations = []
        cur_training_data = {}
        cur_valid_data = {}
        cur_test_data = {}

        for index in indexs:
            name = self.id2rel[index]
            current_relations.append(name)
            self.seen_relations.append(name)
            cur_training_data[name] = self.training_dataset[index]
            cur_valid_data[name] = self.valid_dataset[index]
            cur_test_data[name] = self.test_dataset[index]
            self.history_test_data[name] = self.test_dataset[index]

        return cur_training_data, cur_valid_data, cur_test_data, current_relations, self.history_test_data, self.seen_relations

    def _read_data(self, file):
        """Load the tokenized splits from cache, or build and cache them from ``file``.

        Args:
            file: path of the JSON file mapping relation name to a list of
                samples, each carrying ``"relation"`` and ``"tokens"``.

        Returns:
            ``(train, val, test)``; each is a list with one sample list per
            relation id. Every sample is a dict with ``"relation"`` (id),
            ``"text"`` (space-joined tokens) and ``"tokens"`` (encoded ids).
        """
        if os.path.isfile(self.save_data_path):
            # NOTE: unpickling can execute arbitrary code — only reuse cache
            # files that were produced by this class.
            with open(self.save_data_path, "rb") as f:
                train_dataset, val_dataset, test_dataset = pickle.load(f)
            return train_dataset, val_dataset, test_dataset

        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)

        train_dataset = [[] for _ in range(self.args.num_of_relation)]
        val_dataset = [[] for _ in range(self.args.num_of_relation)]
        test_dataset = [[] for _ in range(self.args.num_of_relation)]

        for relation, rel_samples in data.items():
            # Re-seeding per relation means every relation is shuffled from the
            # same generator state.
            if self.seed is not None:
                random.seed(self.seed)
            random.shuffle(rel_samples)

            test_count = 0
            train_count = 0
            for i, sample in enumerate(rel_samples):
                text = " ".join(sample["tokens"])
                tokenized_sample = {
                    "relation": self.rel2id[sample["relation"]],
                    "text": text,
                    "tokens": self.tokenizer.encode(
                        text, padding="max_length", truncation=True, max_length=self.args.max_length
                    ),
                }
                rel_index = self.rel2id[relation]

                if self.args.task_name == "FewRel":
                    # Positional split: train, then validation, then test.
                    if i < self.args.num_of_train:
                        train_dataset[rel_index].append(tokenized_sample)
                    elif i < self.args.num_of_train + self.args.num_of_val:
                        val_dataset[rel_index].append(tokenized_sample)
                    else:
                        test_dataset[rel_index].append(tokenized_sample)
                elif i < len(rel_samples) // 5 and test_count <= 40:
                    # First fifth of the samples goes to test.
                    # NOTE(review): ``<= 40`` admits up to 41 test samples —
                    # confirm whether 40 was intended.
                    test_count += 1
                    test_dataset[rel_index].append(tokenized_sample)
                else:
                    # Everything else is training data, capped at 320 per
                    # relation; the validation split stays empty on this path.
                    train_count += 1
                    train_dataset[rel_index].append(tokenized_sample)
                    if train_count >= 320:
                        break

        with open(self.save_data_path, "wb") as f:
            pickle.dump((train_dataset, val_dataset, test_dataset), f)
        return train_dataset, val_dataset, test_dataset

    def _read_relations(self, file):
        """Read the relation list stored in ``file``.

        Returns:
            ``(id2rel, rel2id)``: the loaded JSON value and a dict mapping each
            of its entries to its position.
        """
        with open(file, "r", encoding="utf-8") as f:
            id2rel = json.load(f)
        rel2id = {name: idx for idx, name in enumerate(id2rel)}
        return id2rel, rel2id


In [6]:
data = data_sampler(args, seed=2021)
list_data = []

# Walk through the tasks and print the sample at position 21 of the first
# relation of each task (a quick visual sanity check of the tokenized data).
for steps, task in enumerate(data):
    training_data, valid_data, test_data, current_relations, historic_test_data, seen_relations = task
    first_relation = current_relations[0]
    print(training_data[first_relation][21])
    
    

{'relation': 3, 'text': "When [E11] he [E12] rode streetcars as a child in [E21] Shreveport [E22] , he often sat in the back but found that the conductors would exclude him by moving the `` blacks only '' sign behind him so he would be in the white section .", 'tokens': [101, 2043, 30522, 2002, 30523, 8469, 21420, 2015, 2004, 1037, 2775, 1999, 30524, 23740, 30525, 1010, 2002, 2411, 2938, 1999, 1996, 2067, 2021, 2179, 2008, 1996, 23396, 2052, 23329, 2032, 2011, 3048, 1996, 1036, 1036, 10823, 2069, 1005, 1005, 3696, 2369, 2032, 2061, 2002, 2052, 2022, 1999, 1996, 2317, 2930, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [15]:
# Persist the collected items. The file is opened in a ``with`` block so it is
# flushed and closed deterministically, and written as UTF-8 because
# ``ensure_ascii=False`` emits non-ASCII characters verbatim.
with open('./prototype.json', 'w', encoding='utf-8') as fh:
    json.dump(list_data, fh, ensure_ascii=False)

In [5]:
import json

# Load the records from train.json; the context manager closes the handle
# instead of leaving it to the garbage collector.
with open('/home/luungoc/Thesis - 2023.2/Thesis_NgocLT/hoang/train.json', 'r', encoding='utf-8') as fh:
    x = json.load(fh)

In [8]:
# Peek at the value stored under 'relations' on the first record.
x[0]['relations']

['per:cities_of_residence', 'per:other_family', 'org:founded', 'per:origin']

In [39]:
text = """The relation name "org:stateorprovince_of_headquarters" refers to the specific type of relationship between an organization (E11, E12) and the state or province where that organization's headquarters are located (E21, E22). This relationship is identified from textual data that explicitly or implicitly mentions the geographical location of the main office or central operating facility of an organization. The examples provided illustrate various instances where the headquarters of an organization are linked to a particular state or province, showcasing the diversity in organizational types and the geographical spread of their central operations.

The "org:stateorprovince_of_headquarters" relation is pivotal for understanding the geographic distribution of organizational headquarters, which can offer insights into the regional economic impact, employment patterns, and the strategic decisions behind where companies choose to base their main operations. This relationship helps in mapping out the corporate landscape of different regions, indicating the concentration of industries, the preference of organizations for certain locales due to logistical, economic, or regulatory advantages, and the potential influence of these organizations on local economies and policies.

Understanding this relation is crucial for economic analysis, regional planning, and the study of corporate strategies. It aids in identifying how organizations are distributed across states or provinces, which can be essential for supply chain logistics, marketing strategies, and understanding the regional dynamics of business operations. This relation also has implications for tax policies, infrastructure development, and workforce distribution, offering a lens through which to view the interactions between corporations and the geographical contexts of their headquarters.
"""
# Overwrite the 'text' field of every record whose relation matches the one
# described above; "Yes" is printed once per patched record.
target_relation = "org:stateorprovince_of_headquarters"
for item in x:
    if item['relation'] != target_relation:
        continue
    print("Yes")
    item['text'] = text

Yes


In [14]:
import json

# Load the TACRED training samples; the context manager closes the file handle.
with open('/home/luungoc/Thesis - 2023.2/tacred-relation/dataset/tacred/train.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)

# Distinct object / subject type labels found across the samples.
obj_type = {sample['obj_type'] for sample in data}
sub_type = {sample['subj_type'] for sample in data}

In [115]:
import json

# Load the marker-free TACRED file; the context manager closes the handle.
with open('/home/luungoc/Thesis - 2023.2/Thesis_NgocLT/datasets/standard/data_no_marker_tacred.json', 'r', encoding='utf-8') as fh:
    data_old = json.load(fh)

In [8]:
from datasets import load_dataset

# Load the 'xiaobendanyn/tacred' dataset through the `datasets` library
# (the recorded output shows it being served from the local cache).
data_ori = load_dataset('xiaobendanyn/tacred')

Found cached dataset text (/home/luungoc/.cache/huggingface/datasets/xiaobendanyn___text/xiaobendanyn--tacred-6917f9984c48fec2/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# One empty bucket per entry of the train split's 'text' column, in all three
# split dictionaries.
# NOTE(review): the entries of 'text' look like whole JSON records (they are
# parsed with json.loads elsewhere in this notebook), so these keys are
# probably raw records rather than relation names — confirm this cell is
# still needed.
relations = list(data_ori['train']['text'])

training_data = {relation: [] for relation in relations}
test_data = {relation: [] for relation in relations}
valid_data = {relation: [] for relation in relations}

In [11]:
# data['train'][1]['text']


import json
from copy import deepcopy


def extract_entities(json_string):
    """Parse one JSON-encoded sample and return it with entity markers inserted.

    The fields ``token``, ``relation``, ``h.name`` and ``t.name`` are pulled out
    of the decoded object (missing ones fall back to ``[]`` / ``None``) and the
    reduced record is handed to ``insert_entity_tokens``.

    Args:
        json_string: JSON text of a single sample.

    Returns:
        Whatever ``insert_entity_tokens`` returns for the reduced record.
    """
    # Decode the JSON string into a dictionary.
    record = json.loads(json_string)

    head = record.get('h', {})
    tail = record.get('t', {})

    # Keep only the fields the marker insertion needs.
    return insert_entity_tokens({
        'token': record.get('token', []),
        'relation': record.get('relation', None),
        'entity_1': head.get('name', None),
        'entity_2': tail.get('name', None),
    })



def insert_entity_tokens(data):
    """Wrap the first occurrence of each entity name in marker tokens.

    ``entity_1`` is enclosed in ``[E11]`` / ``[E12]`` and ``entity_2`` in
    ``[E21]`` / ``[E22]``. The search for ``entity_2`` runs on the token list
    that already contains the ``entity_1`` markers. An entity that is empty,
    ``None`` or not found as a contiguous run of whitespace-split words is
    skipped without error.

    The input record is not modified: markers are inserted into a private copy
    of the token list.

    Args:
        data: dict with ``'relation'`` and optionally ``'token'`` (list of
            str), ``'entity_1'`` and ``'entity_2'`` (str or None).

    Returns:
        dict with ``'relation'``, ``'token_with_marker'`` (tokens plus markers)
        and ``'token'`` (a copy of the original tokens).
    """
    original_tokens = list(data.get('token', []))
    marked_tokens = list(original_tokens)

    marker_plan = (
        (data.get('entity_1', None), "[E11]", "[E12]"),
        (data.get('entity_2', None), "[E21]", "[E22]"),
    )
    for entity, open_marker, close_marker in marker_plan:
        if not entity:
            continue
        # Locate the entity in the (possibly already marked) token list.
        start = find_entity_index(marked_tokens, entity)
        if start is None:
            continue
        marked_tokens.insert(start, open_marker)
        # +1 accounts for the opening marker that was just inserted.
        marked_tokens.insert(start + len(entity.split()) + 1, close_marker)

    return {
        'relation': data['relation'],
        'token_with_marker': marked_tokens,
        'token': original_tokens,
    }

def find_entity_index(token, entity):
    """Return the start index of the first run of tokens equal to ``entity``'s words.

    ``entity`` is split on whitespace and compared against every window of the
    same length in ``token``.

    Args:
        token: list of token strings.
        entity: entity name as a single string.

    Returns:
        Index of the first matching window, or ``None`` when there is none.
    """
    words = entity.split()
    width = len(words)
    candidates = (
        start
        for start in range(len(token) - width + 1)
        if token[start:start + width] == words
    )
    return next(candidates, None)

In [12]:
# Collect every relation label that occurs in the train split.
relations = {extract_entities(sample['text'])['relation'] for sample in data_ori['train']}

# Start each split with one empty bucket per relation.
training_data = {relation: [] for relation in relations}
test_data = {relation: [] for relation in relations}
valid_data = {relation: [] for relation in relations}

list(relations)

['org:stateorprovince_of_headquarters',
 'per:stateorprovince_of_death',
 'per:cities_of_residence',
 'org:top_members/employees',
 'org:parents',
 'per:origin',
 'per:parents',
 'per:religion',
 'per:date_of_birth',
 'org:dissolved',
 'org:country_of_headquarters',
 'per:charges',
 'NA',
 'per:city_of_birth',
 'per:title',
 'org:number_of_employees/members',
 'per:stateorprovince_of_birth',
 'per:age',
 'org:founded_by',
 'per:alternate_names',
 'org:political/religious_affiliation',
 'org:shareholders',
 'org:founded',
 'per:country_of_death',
 'org:member_of',
 'per:siblings',
 'org:members',
 'org:alternate_names',
 'per:country_of_birth',
 'per:children',
 'per:countries_of_residence',
 'org:website',
 'org:city_of_headquarters',
 'per:city_of_death',
 'per:other_family',
 'org:subsidiaries',
 'per:spouse',
 'per:date_of_death',
 'per:schools_attended',
 'per:employee_of',
 'per:cause_of_death',
 'per:stateorprovinces_of_residence']

In [13]:
# Parse the train split once and reuse the parsed samples both for discovering
# the relation labels and for filling the buckets.
train_samples = [extract_entities(sample['text']) for sample in data_ori['train']]
relations = {fix['relation'] for fix in train_samples}

# One bucket per relation seen in train, for each split.
training_data = {relation: [] for relation in relations}
test_data = {relation: [] for relation in relations}
valid_data = {relation: [] for relation in relations}

for fix in train_samples:
    training_data[fix['relation']].append(fix)

# A relation that appears only in test/validation has no bucket and raises
# KeyError here.
for sample in data_ori['test']:
    fix = extract_entities(sample['text'])
    test_data[fix['relation']].append(fix)

for sample in data_ori['validation']:
    fix = extract_entities(sample['text'])
    valid_data[fix['relation']].append(fix)

# Merge the three splits per relation. 'per:country_of_death' keeps an empty
# list and does not contribute to the count.
excluded_relation = 'per:country_of_death'
total = {}
count = 0

for relation in training_data:
    if relation == excluded_relation:
        total[relation] = []
        continue
    total[relation] = training_data[relation] + test_data[relation] + valid_data[relation]
    count += len(total[relation])

count

106203

In [31]:
# Keep at most the first 500 'NA' samples, with the marker-annotated tokens.
relation_na = [
    {
        'relation': sample['relation'],
        'tokens': sample['token_with_marker'],
    }
    for sample in total['NA'][:500]
]

# Write through a context manager so the file is flushed and closed, and as
# UTF-8 because ``ensure_ascii=False`` emits non-ASCII characters verbatim.
with open('/home/luungoc/Thesis - 2023.2/Thesis_NgocLT/datasets/standard/no_relation.json', 'w', encoding='utf-8') as fh:
    json.dump(relation_na, fh, ensure_ascii=False)

# def find_index(arr, value):
#     for i in range(len(arr)):
#         if arr[i] == value:
#             return i
#     return -1
        
# for keys, values in enumerate(total):
#     if values == 'NA':
#         for sample in total[values]:
#             if find_index(sample['token_with_marker'], '[E11]') + 1 == find_index(sample['token_with_marker'], '[E12]') \
#             or find_index(sample['token_with_marker'], '[E21]') + 1 == find_index(sample['token_with_marker'], '[E22]'):
#                 print(sample['token_with_marker'])
#             # print(find_index(sample['token_with_marker'], value='[E11]'))
#             # print(sample['token_with_marker'], v)

In [111]:
# Build two parallel views of `total`: one with the marker-annotated tokens and
# one with the plain tokens. Samples labelled 'per:country_of_death' or 'NA'
# are left out, so those keys map to empty lists.
skipped_relations = ['per:country_of_death', 'NA']
train_marker, train_no_marker = {}, {}

for relation_name, samples in total.items():
    kept = [sample for sample in samples if sample['relation'] not in skipped_relations]
    train_marker[relation_name] = [
        {'relation': sample['relation'], 'tokens': sample['token_with_marker']}
        for sample in kept
    ]
    train_no_marker[relation_name] = [
        {'relation': sample['relation'], 'tokens': sample['token']}
        for sample in kept
    ]

In [3]:
import json

# NOTE(review): the file name contains a literal "{steps}" (the string is not an
# f-string) — confirm that this is the intended file and not a missing `f` prefix.
with open('/home/luungoc/Thesis - 2023.2/Thesis_NgocLT/datasets/train_step_{steps}.json', 'r', encoding='utf-8') as fh:
    data_old = json.load(fh)

In [7]:
# Print entry 1 stored under the 'neg' key of record 1.
print(data_old[1]['neg'][1])

In the extracted passage, tokens [E11], [E12], [E21], [E22] appear to mark the positions of entities. However, the words or phrases between them are not linked or refer to any relationship between these entities.
Example: Survivors include his wife of 58 years , Jane Callaghan Gude of Washington ; five children , Sharon Gude of Rockville , Adrienne Lewis of Washington , Gilbert Gude Jr. of Bethesda and [E11] Gregory Gude [E12] and [E21] Daniel Gude [E22] , both of Cabin John , Md. ; and three grandchildren .


In [10]:
from collections import Counter

def most_frequent(arr):
    """Return the element that occurs most often in ``arr``.

    When several elements share the highest count, the one that appears first
    in ``arr`` is returned.

    Raises:
        IndexError: if ``arr`` is empty.
    """
    ranked = Counter(arr).most_common(1)  # single (value, count) pair with the highest count
    top_value, _occurrences = ranked[0]
    return top_value

# Example usage: 2 and 5 both occur four times; 2 is reported because it
# appears first in the array.
array = [1, 2, 3, 4, 2, 2, 3, 1, 4, 2, 5, 5, 5, 5]
print("Giá trị có tần suất xuất hiện nhiều nhất là:", most_frequent(array))


Giá trị có tần suất xuất hiện nhiều nhất là: 2


In [12]:
def get_values_from_indices(array_n, array_m):
    """Pick the elements of ``array_n`` located at the positions in ``array_m``.

    Positions outside ``array_n`` are skipped. Negative positions follow normal
    Python indexing (``-1`` is the last element) and are skipped as well when
    they reach before the start of the array, instead of raising ``IndexError``.

    Args:
        array_n: sequence to read from.
        array_m: iterable of integer positions.

    Returns:
        List of the selected elements, in the order of ``array_m``.
    """
    size = len(array_n)
    return [array_n[index] for index in array_m if -size <= index < size]

# Example usage:
array_n = [50, 20, 30, 40, 50, 60, 70]  # source array of n elements
array_m = [0, 2, 4]  # m positions to look up in array_n

# The looked-up values are reduced to their most frequent one before printing.
m_values = most_frequent(get_values_from_indices(array_n, array_m))
print("Các giá trị tương ứng từ mảng n với các chỉ số từ mảng m:", m_values)


Các giá trị tương ứng từ mảng n với các chỉ số từ mảng m: 50


In [13]:
def most_frequent_value(array):
    """
    Return the most frequent value of an array together with one of its indices.

    Parameters:
        array (list): Values to inspect.

    Returns:
        tuple: ``(value, index)`` where ``value`` has the highest number of
        occurrences (the earliest-appearing one on ties) and ``index`` is the
        position of its first occurrence.

    Raises:
        ValueError: if ``array`` is empty.
    """
    # Record, for every distinct value, the positions at which it occurs.
    positions = {}
    for position, element in enumerate(array):
        positions.setdefault(element, []).append(position)

    # Highest number of occurrences among all values.
    top_count = max(len(found) for found in positions.values())

    # The first value (in order of first appearance) reaching that count wins;
    # report it with its first position.
    for element, found in positions.items():
        if len(found) == top_count:
            return element, found[0]

# Example usage:
array = [1, 2, 3, 4, 2, 2, 3, 2, 5, 5, 5, 1, 2]
most_frequent_val, index_of_most_frequent_val = most_frequent_value(array)
print("Giá trị có tần suất xuất hiện nhiều nhất:", most_frequent_val)
print("Chỉ số của giá trị có tần suất xuất hiện nhiều nhất:", index_of_most_frequent_val)


Giá trị có tần suất xuất hiện nhiều nhất: 2
Chỉ số của giá trị có tần suất xuất hiện nhiều nhất: 1
