In [1]:
import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# Relation label -> class index ('1:NR:2' is index 0, '1:CID:2' is index 1).
cdr_rel2id = {'1:NR:2': 0, '1:CID:2': 1}
# Tokenizer that is later handed to read_cdr to split each word into pieces.
# NOTE(review): cache_dir is an absolute, user-specific path — consider making it configurable.
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.1', cache_dir='/data/pj20/.cache')

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Global accumulator filled by read_cdr: pmid -> {'text', 'relations', 'hts'}.
text2triples = {}

def chunks(l, n):
    """Cut ``l`` into consecutive slices of exactly ``n`` items each.

    Returns the slices as a list, in order. An AssertionError is raised when
    any slice comes out with a length other than ``n`` (i.e. ``len(l)`` is
    not a multiple of ``n``).
    """
    pieces = [l[offset:offset + n] for offset in range(0, len(l), n)]
    for piece in pieces:
        # Every group must be complete; a short trailing slice is an error.
        assert len(piece) == n
    return pieces

def read_cdr(file_in, tokenizer):
    """Parse a tab-separated CDR relation file into tokens and pair labels.

    Each non-blank line is split on tabs into ``pmid``, the document text
    (sentences separated by ``|``, words by single spaces) and a flat run of
    fields that is cut into groups of 17 with ``chunks``. Within a group the
    function reads: field 0 (relation label, or ``"not_include"``), field 1
    (direction, ``"L2R"`` or anything else), fields 5/7/8/9 (id, type,
    ``:``-joined start offsets, ``:``-joined end offsets of the first entity)
    and fields 11/13/14/15 (the same for the second entity).

    Every word is tokenized with ``tokenizer.tokenize`` and a ``"*"`` token
    is placed before the first word and after the last word of each distinct
    ``(start, end, type)`` mention.

    Only the first line seen for a given ``pmid`` is processed.

    Args:
        file_in: Path of the data file to read.
        tokenizer: Object exposing ``tokenize(str)`` that returns a list of
            token strings.

    Returns:
        A dict mapping each pmid parsed by this call to a record with keys
        ``'text'`` (flat token list including ``"*"`` markers),
        ``'relations'`` (one multi-hot list of length ``len(cdr_rel2id)``
        per entity pair) and ``'hts'`` (``[head_index, tail_index]`` per
        pair, aligned with ``'relations'``). The same records are also
        stored in the module-level ``text2triples`` dict.

    Raises:
        AssertionError: If the trailing fields are not a multiple of 17
            (raised by ``chunks``).
        KeyError: If a relation label is missing from ``cdr_rel2id`` or an
            offset does not correspond to a word position in the text.
    """

    def _offsets(field):
        # "0:11:66" -> [0, 11, 66]
        return [int(part) for part in field.split(':')]

    parsed = {}
    with open(file_in, 'r') as infile:
        # Stream the file instead of materialising every line up front.
        for raw_line in infile:
            stripped = raw_line.rstrip()
            if not stripped:
                # Blank lines carry no record; skip them rather than crash.
                continue
            line = stripped.split('\t')
            pmid = line[0]
            if pmid in parsed:
                # Only the first line for a document is used.
                continue

            text = line[1]
            prs = chunks(line[2:], 17)

            # Distinct (start, end, type) mentions across both entities of
            # every group, including "not_include" groups.
            mention_spans = set()
            for p in prs:
                for type_col in (7, 13):
                    starts = _offsets(p[type_col + 1])
                    ends = _offsets(p[type_col + 2])
                    for start, end in zip(starts, ends):
                        mention_spans.add((start, end, p[type_col]))

            # How many markers open at / close before each word index. This
            # yields the same tokens as scanning every span for every word.
            opening, closing = {}, {}
            for start, end, _ in mention_spans:
                opening[start] = opening.get(start, 0) + 1
                closing[end] = closing.get(end, 0) + 1

            tokens = []
            sent_map = {}  # word index -> index of its first token in `tokens`
            i_t = 0
            for sent in text.split('|'):
                for word in sent.split(' '):
                    pieces = (["*"] * opening.get(i_t, 0)
                              + list(tokenizer.tokenize(word))
                              + ["*"] * closing.get(i_t + 1, 0))
                    sent_map[i_t] = len(tokens)
                    tokens.extend(pieces)
                    i_t += 1
                # Lets an end offset that equals the sentence length resolve.
                sent_map[i_t] = len(tokens)

            ent2idx = {}
            # NOTE(review): entity_pos is built but is not part of the stored
            # record; it is kept so offsets are still validated via sent_map.
            entity_pos = []
            pair_labels = {}  # (head_index, tail_index) -> multi-hot labels
            for p in prs:
                if p[0] == "not_include":
                    continue
                # "L2R": first entity is the head; otherwise the second is.
                if p[1] == "L2R":
                    head_col, tail_col = 5, 11
                else:
                    head_col, tail_col = 11, 5
                h_id, t_id = p[head_col], p[tail_col]

                h_start = [sent_map[idx] for idx in _offsets(p[head_col + 3])]
                h_end = [sent_map[idx] for idx in _offsets(p[head_col + 4])]
                t_start = [sent_map[idx] for idx in _offsets(p[tail_col + 3])]
                t_end = [sent_map[idx] for idx in _offsets(p[tail_col + 4])]

                if h_id not in ent2idx:
                    ent2idx[h_id] = len(ent2idx)
                    entity_pos.append(list(zip(h_start, h_end)))
                if t_id not in ent2idx:
                    ent2idx[t_id] = len(ent2idx)
                    entity_pos.append(list(zip(t_start, t_end)))

                pair = (ent2idx[h_id], ent2idx[t_id])
                r = cdr_rel2id[p[0]]
                labels = pair_labels.setdefault(pair, [0] * len(cdr_rel2id))
                labels[r] = 1

            record = {
                'text': tokens,
                'relations': list(pair_labels.values()),
                'hts': [[h, t] for h, t in pair_labels],
            }
            text2triples[pmid] = record
            parsed[pmid] = record

    # Previously nothing was returned, so callers assigning the result got None.
    return parsed
            

In [7]:
# Parse the CDR test file; read_cdr stores every parsed document in text2triples.
# NOTE(review): absolute, user-specific dataset path — consider a configurable data dir.
test_features = read_cdr('/home/pj20/GREScore/datasets/cdr/test_filter.data', tokenizer)

In [10]:
# Show the parsed documents (pmid -> {'text', 'relations', 'hts'}).
# NOTE(review): this displays the entire dict; a single entry would keep the output short.
text2triples

{'8701013': {'text': ['*',
   '▁F',
   'am',
   'ot',
   'id',
   'ine',
   '*',
   '▁-',
   '▁associated',
   '*',
   '▁del',
   'ir',
   'ium',
   '*',
   '▁.',
   '▁A',
   '▁series',
   '▁of',
   '▁six',
   '▁cases',
   '▁.',
   '*',
   '▁F',
   'am',
   'ot',
   'id',
   'ine',
   '*',
   '▁is',
   '▁a',
   '▁hist',
   'amine',
   '▁H',
   '2',
   '▁-',
   '▁re',
   'ceptor',
   '▁ant',
   'agon',
   'ist',
   '▁used',
   '▁in',
   '▁in',
   'patient',
   '▁settings',
   '▁for',
   '▁prevention',
   '▁of',
   '▁stress',
   '*',
   '▁ul',
   'cers',
   '*',
   '▁and',
   '▁is',
   '▁showing',
   '▁increasing',
   '▁popularity',
   '▁because',
   '▁of',
   '▁its',
   '▁low',
   '▁cost',
   '▁.',
   '▁Although',
   '▁all',
   '▁of',
   '▁the',
   '▁currently',
   '▁available',
   '▁H',
   '2',
   '▁-',
   '▁re',
   'ceptor',
   '▁ant',
   'agon',
   'ists',
   '▁have',
   '▁shown',
   '▁the',
   '▁prop',
   'ensity',
   '▁to',
   '▁cause',
   '*',
   '▁del',
   'ir',
   'ium',
   '*',

In [7]:
# Load the raw file lines to inspect the on-disk record layout directly.
# NOTE(review): same absolute path as the read_cdr call — consider sharing one constant.
with open('/home/pj20/GREScore/datasets/cdr/test_filter.data', 'r') as f:
    lines = f.readlines()

In [8]:
# First raw record: tab-separated pmid, '|'-joined sentences, then the pair fields.
lines[0]

'8701013\tFamotidine - associated delirium .|A series of six cases .|Famotidine is a histamine H2 - receptor antagonist used in inpatient settings for prevention of stress ulcers and is showing increasing popularity because of its low cost .|Although all of the currently available H2 - receptor antagonists have shown the propensity to cause delirium , only two previously reported cases have been associated with famotidine .|The authors report on six cases of famotidine - associated delirium in hospitalized patients who cleared completely upon removal of famotidine .|The pharmacokinetics of famotidine are reviewed , with no change in its metabolism in the elderly population seen .|The implications of using famotidine in elderly persons are discussed .\t1:CID:2\tL2R\tNON-CROSS\t0-1\t3-4\tD015738\tFamotidine|Famotidine|famotidine|famotidine|famotidine|famotidine|famotidine\tChemical\t0:11:66:75:88:93:113\t1:12:67:76:89:94:114\t0:2:3:4:4:5:6\tD003693\tdelirium|delirium|delirium\tDisease\t3