<a href="https://colab.research.google.com/github/nafis-momeni/BioRED_LLM/blob/main/Data_process.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import json
import time
import random

In [3]:
# Load the three dataset splits from JSON files under /content.
# Later cells index each split with string keys ("0", "1", ...).
with open('/content/new_test.json') as file:
    new_test = json.load(file)
with open('/content/new_dev.json') as file:
    new_dev = json.load(file)
with open('/content/new_train.json') as file:
    new_train = json.load(file)

In [4]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import euclidean

# Per-document statistics for the training split: one dict per document, in index order.

train_stat = []

# Short feature name -> entity / relation type label counted under that name.
ent_short = {'g': 'GeneOrGeneProduct', 'c': 'ChemicalEntity', 'd': 'DiseaseOrPhenotypicFeature',
             'v': 'SequenceVariant', 'o': 'OrganismTaxon', 'cl': 'CellLine'}
rel_short = {'a': 'Association', 'pc': 'Positive_Correlation', 'b': 'Bind', 'nc': 'Negative_Correlation',
             'cmp': 'Comparison', 'cnv': 'Conversion', 'ct': 'Cotreatment', 'di': 'Drug_Interaction'}

for i in range(len(new_train)):
    # Fresh zeroed counters per document; a type outside these labels raises KeyError.
    ent_type = dict.fromkeys(ent_short.values(), 0)
    rel_type = dict.fromkeys(rel_short.values(), 0)
    doc = new_train[str(i)]

    for e in doc["entities"]:
        ent_type[e["type"]] += 1

    rel_c = 0
    novel = 0
    for r in doc["relation"]:
        rel_type[r["infons"]["type"]] += 1
        rel_c += 1
        novel += int(r["infons"]["novel"] == "Novel")

    # Number of distinct entity / relation types that occur at least once.
    ent_present = sum(1 for count in ent_type.values() if count > 0)
    rel_present = sum(1 for count in rel_type.values() if count > 0)

    # NOTE(review): 'ent_c' and 'ent_diversity' carry the same value (distinct entity
    # types present), so this quantity enters the feature matrix twice.
    stat = {'pmid': doc['pmid'], 'ent_c': ent_present, 'rel_c': rel_c}
    for short, label in ent_short.items():
        stat[short] = ent_type[label]
    for short, label in rel_short.items():
        stat[short] = rel_type[label]
    # Share of relations flagged "Novel"; 0 when the document has no relations.
    stat['novel_ratio'] = novel / rel_c if rel_c > 0 else 0
    stat['ent_diversity'] = ent_present
    stat['rel_diversity'] = rel_present
    train_stat.append(stat)



In [5]:
# Build a (documents x features) matrix from train_stat, one row per document.
features = ['ent_c', 'rel_c', 'g', 'c', 'd', 'v', 'o', 'cl', 'a', 'pc', 'b', 'nc', 'cmp', 'cnv', 'ct', 'di', 'novel_ratio', 'ent_diversity', 'rel_diversity']
data_matrix = np.array([[row[name] for name in features] for row in train_stat])

# Mean of every feature over all documents (the "average document"), on unscaled features.
average_values = data_matrix.mean(axis=0)

# Similarity to the average document: 1 / (1 + Euclidean distance), so closer rows score higher.
similarity_scores = 1 / (1 + np.array([euclidean(row, average_values) for row in data_matrix]))

# Min-max scale the similarities to [0, 1]; the result is the representativeness score.
scaler = MinMaxScaler()
representativeness_scores = scaler.fit_transform(similarity_scores[:, np.newaxis]).flatten()

# Store each document's representativeness score next to its statistics.
for stat, score in zip(train_stat, representativeness_scores):
    stat['representativeness_score'] = score

# Keep the k documents with the highest representativeness (ascending order, best last).
k = 20  # Number of samples to select
top_k_indices = np.argsort(representativeness_scores)[-k:]
selected_samples = [train_stat[i] for i in top_k_indices]

# Print the selected samples with their representativeness scores
for stat in selected_samples:
    print(stat)

print(top_k_indices)

{'pmid': '16575011', 'ent_c': 5, 'rel_c': 10, 'g': 1, 'c': 3, 'd': 3, 'v': 2, 'o': 1, 'cl': 0, 'a': 5, 'pc': 0, 'b': 0, 'nc': 3, 'cmp': 0, 'cnv': 0, 'ct': 2, 'di': 0, 'novel_ratio': 0.2, 'ent_diversity': 5, 'rel_diversity': 3, 'representativeness_score': 0.7602071860669584}
{'pmid': '15122708', 'ent_c': 4, 'rel_c': 7, 'g': 3, 'c': 0, 'd': 4, 'v': 1, 'o': 1, 'cl': 0, 'a': 6, 'pc': 1, 'b': 0, 'nc': 0, 'cmp': 0, 'cnv': 0, 'ct': 0, 'di': 0, 'novel_ratio': 0.2857142857142857, 'ent_diversity': 4, 'rel_diversity': 2, 'representativeness_score': 0.773053760850865}
{'pmid': '28346429', 'ent_c': 3, 'rel_c': 8, 'g': 5, 'c': 0, 'd': 2, 'v': 0, 'o': 1, 'cl': 0, 'a': 5, 'pc': 0, 'b': 1, 'nc': 2, 'cmp': 0, 'cnv': 0, 'ct': 0, 'di': 0, 'novel_ratio': 0.75, 'ent_diversity': 3, 'rel_diversity': 3, 'representativeness_score': 0.7734901884600571}
{'pmid': '28098423', 'ent_c': 5, 'rel_c': 11, 'g': 7, 'c': 1, 'd': 3, 'v': 0, 'o': 1, 'cl': 1, 'a': 4, 'pc': 3, 'b': 1, 'nc': 3, 'cmp': 0, 'cnv': 0, 'ct': 0, 'di'

In [None]:
  # Lists of document indices; each larger set extends the previous one.
  # NOTE(review): set_5 holds 6 indices, so set_10 and set_15 hold 11 and 16 — confirm this is intended.
  set_5 = [363, 399,185,319, 339,196]
  set_10 = set_5 + [263,288, 68, 359, 78]
  set_15 = set_10 + [22, 368, 367, 373, 51]
  # NOTE(review): presumably indices of documents without relations — TODO confirm (363 is also in set_5).
  zero_docs = [128,169,205,315,323, 363]
# Recorded index lists, bound to names so they can be reused by other cells.
# The original cell could not run: "19 168" was missing a comma and the
# "rep + dive" label was bare text instead of a comment.

# representativeness
representativeness_top = [85, 10, 294, 109, 71, 327, 366, 19, 168, 283, 279, 46, 358, 196, 201, 76, 320, 225, 56, 95]

# diversity
diversity_top = [322, 51, 78, 99, 355, 173, 368, 179, 371, 373, 339, 301, 288, 363, 185, 151, 22, 399, 263, 359, 196]

# 2  (NOTE(review): presumably a second diversity run with different settings — TODO confirm)
diversity_top_2 = [301, 319, 352, 22, 322, 78, 367, 151, 368, 51, 371, 355, 185, 373, 288, 263, 339, 196, 363, 359, 399]

# rep + dive
rep_and_diverse_a = [225, 56, 95, 363, 399]
rep_and_diverse_b = [196, 363, 399, 56, 95]

In [6]:
# Feature matrix: one row per training document, columns in `features` order.
features = ['ent_c', 'rel_c', 'g', 'c', 'd', 'v', 'o', 'cl', 'a', 'pc', 'b', 'nc', 'cmp', 'cnv', 'ct', 'di', 'novel_ratio', 'ent_diversity', 'rel_diversity']
data_matrix = np.array([[row[name] for name in features] for row in train_stat])

# Min-max scale every feature column to [0, 1].
scaler = MinMaxScaler()
normalized_data_matrix = scaler.fit_transform(data_matrix)

# Per-feature weights; features not listed here get weight 0 and do not contribute.
# Higher weights go to the scarcer relation types and to relation-type diversity.
feature_weight = {'b': 2, 'nc': 1, 'cmp': 3, 'cnv': 2, 'ct': 2, 'di': 3,
                  'novel_ratio': 1, 'ent_diversity': 2, 'rel_diversity': 6}
weights = np.array([feature_weight.get(name, 0) for name in features])
weighted_data_matrix = normalized_data_matrix * weights

# Composite score of a document = sum of its weighted, scaled features.
composite_scores = np.sum(weighted_data_matrix, axis=1)

# Rescale the composite scores to [0, 1] (the same scaler object is re-fitted here).
normalized_composite_scores = scaler.fit_transform(composite_scores[:, np.newaxis]).flatten()

# Store each document's diversity score next to its statistics.
for stat, score in zip(train_stat, normalized_composite_scores):
    stat['diversity_score'] = score

# Keep the k documents with the highest diversity score (ascending order, best last).
k = 21  # Number of samples to select
top_k_indices = np.argsort(normalized_composite_scores)[-k:]
selected_samples = [train_stat[i] for i in top_k_indices]

# Print the selected samples with diversity scores
for stat in selected_samples:
    print(stat)
print(top_k_indices)

{'pmid': '19463742', 'ent_c': 4, 'rel_c': 46, 'g': 12, 'c': 3, 'd': 0, 'v': 0, 'o': 2, 'cl': 2, 'a': 21, 'pc': 13, 'b': 1, 'nc': 11, 'cmp': 0, 'cnv': 0, 'ct': 0, 'di': 0, 'novel_ratio': 0.8695652173913043, 'ent_diversity': 4, 'rel_diversity': 4, 'representativeness_score': 0.05281731918696182, 'diversity_score': 0.7155205827772616}
{'pmid': '25218136', 'ent_c': 4, 'rel_c': 18, 'g': 14, 'c': 1, 'd': 1, 'v': 0, 'o': 1, 'cl': 0, 'a': 11, 'pc': 2, 'b': 3, 'nc': 2, 'cmp': 0, 'cnv': 0, 'ct': 0, 'di': 0, 'novel_ratio': 0.6111111111111112, 'ent_diversity': 4, 'rel_diversity': 4, 'representativeness_score': 0.2576172195828939, 'diversity_score': 0.720138586109711}
{'pmid': '16920333', 'ent_c': 3, 'rel_c': 6, 'g': 0, 'c': 5, 'd': 1, 'v': 0, 'o': 1, 'cl': 0, 'a': 0, 'pc': 1, 'b': 0, 'nc': 2, 'cmp': 3, 'cnv': 0, 'ct': 0, 'di': 0, 'novel_ratio': 0.8333333333333334, 'ent_diversity': 3, 'rel_diversity': 3, 'representativeness_score': 0.409884110438935, 'diversity_score': 0.7215812752079771}
{'pmid': 

In [17]:
from collections import defaultdict
# Rebuild the per-document statistics for the training split (one dict per document).
train_stat = []

for i in range(len(new_train)):
  # Counters are re-created for every document; a type outside these labels raises KeyError.
  ent_type = {'GeneOrGeneProduct':0, 'ChemicalEntity': 0, 'DiseaseOrPhenotypicFeature': 0, 'SequenceVariant': 0, 'OrganismTaxon': 0, 'CellLine': 0}
  rel_type = {'Association': 0, 'Positive_Correlation': 0, 'Bind': 0, 'Negative_Correlation': 0, 'Comparison': 0, 'Conversion': 0, 'Cotreatment': 0, 'Drug_Interaction': 0}
  rel_c = 0
  novel =0
  doc = new_train[str(i)]
  for e in doc["entities"]:
    ent_type[e["type"]] += 1
  for r in doc["relation"]:
    rel_type[r["infons"]["type"]] += 1
    rel_c += 1
    if r["infons"]["novel"] == "Novel":
      novel += 1
  stat = {
      'pmid': doc['pmid'],
      # Number of distinct entity types present (same value as 'ent_diversity').
      'ent_c': sum(value > 0 for value in ent_type.values()),
      'rel_c': rel_c,
      'g': ent_type['GeneOrGeneProduct'], 'c': ent_type['ChemicalEntity'],'d': ent_type['DiseaseOrPhenotypicFeature'], 'v': ent_type['SequenceVariant'], 'o': ent_type['OrganismTaxon'], 'cl': ent_type['CellLine'],
      'a': rel_type['Association'], 'pc': rel_type['Positive_Correlation'], 'b': rel_type['Bind'], 'nc': rel_type['Negative_Correlation'], 'cmp': rel_type['Comparison'], 'cnv': rel_type['Conversion'], 'ct': rel_type['Cotreatment'], 'di': rel_type['Drug_Interaction'],
      # A document without relations would otherwise raise ZeroDivisionError;
      # it gets a ratio of 0, matching the first statistics cell of this notebook.
      'novel_ratio': novel / rel_c if rel_c > 0 else 0,
      'ent_diversity': sum(value > 0 for value in ent_type.values()),
      'rel_diversity': sum(value > 0 for value in rel_type.values())
  }
  train_stat.append(stat)


In [23]:
# Print the mean of each collected statistic, in the order given by the label line below.
# NOTE(review): ent_c, rel_c, g, ..., rel_diversities are not assigned in any visible cell
# of this notebook; they presumably come from a removed cell, so this fails on a fresh kernel.
lissst = [ent_c, rel_c, g, c, d, v, o, cl, a, pc, b, nc, comp, cnv, ct, di, novel_ratios, ent_diversities, rel_diversities]
r = []
for l in lissst:
  r.append(np.mean(l))
print('[ent_c, rel_c, g, c, d, v, o, cl, a, pc, b, nc, comp, cnv, ct, di, novel_ratios, ent_diversities, rel_diversities]')
print(r)

[ent_c, rel_c, g, c, d, v, o, cl, a, pc, b, nc, comp, cnv, ct, di, novel_ratios, ent_diversities, rel_diversities]
[3.96, 11.63, 4.25, 2.24, 3.41, 1.39, 1.13, 0.22, 6.35, 3.25, 0.09, 1.71, 0.06, 0.01, 0.14, 0.02, 0.6631150732325763, 3.96, 2.22]


In [30]:
# Min-max scale the relation counts to [0, 1] and print raw and scaled values.
# NOTE(review): the statistics cells leave rel_c as a single int; the printed output shows a
# list, so this presumably relies on a list built in a removed cell — TODO confirm.
scaler = MinMaxScaler()
normalized_rel_c = scaler.fit_transform(np.array(rel_c).reshape(-1, 1)).flatten()
print(rel_c)
print(normalized_rel_c)

[18, 3, 15, 7, 4, 11, 48, 3, 1, 12, 14, 12, 2, 6, 4, 15, 15, 7, 11, 11, 19, 32, 31, 8, 107, 2, 1, 5, 5, 7, 11, 10, 6, 45, 2, 4, 12, 6, 2, 17, 6, 15, 10, 43, 4, 7, 5, 13, 4, 23, 15, 5, 6, 9, 4, 7, 23, 4, 2, 7, 5, 6, 7, 29, 12, 2, 1, 41, 8, 11, 6, 11, 21, 8, 8, 14, 7, 2, 11, 6, 11, 6, 17, 5, 3, 4, 25, 12, 5, 10, 24, 6, 10, 1, 14, 8, 4, 5, 6, 8]
[0.16037736 0.01886792 0.13207547 0.05660377 0.02830189 0.09433962
 0.44339623 0.01886792 0.         0.10377358 0.12264151 0.10377358
 0.00943396 0.04716981 0.02830189 0.13207547 0.13207547 0.05660377
 0.09433962 0.09433962 0.16981132 0.29245283 0.28301887 0.06603774
 1.         0.00943396 0.         0.03773585 0.03773585 0.05660377
 0.09433962 0.08490566 0.04716981 0.41509434 0.00943396 0.02830189
 0.10377358 0.04716981 0.00943396 0.1509434  0.04716981 0.13207547
 0.08490566 0.39622642 0.02830189 0.05660377 0.03773585 0.11320755
 0.02830189 0.20754717 0.13207547 0.03773585 0.04716981 0.0754717
 0.02830189 0.05660377 0.20754717 0.02830189 0.009433

In [None]:
from collections import defaultdict

# pmid -> list of unique relation tuples parsed from a results text file.
# The file is split into chunks on blank lines; the first line of a chunk is read as
# "<label>:<pmid>" and every following line is split on commas into one tuple.
pairs_data = defaultdict(list)
dup =0  # lines that repeat a relation already stored for the same pmid
# TODO: fill in the path of the results file; with the empty string open() raises FileNotFoundError.
with open('' , 'r') as file:
  content = file.read()
  results = content.split('\n\n')
  for result in results:
    lines = result.strip().split('\n')
    # A blank chunk (e.g. after trailing newlines at the end of the file) yields [''];
    # skip it instead of failing with IndexError on the pmid lookup below.
    if lines == ['']:
      continue
    pmid = lines[0].split(':')[1].strip(' ')
    for line in lines[1:]:
      parts = line.split(',')
      relation = tuple(parts)
      if relation not in pairs_data[pmid]:
        pairs_data[pmid].append(relation)
      else:
        # print("duplication: "+ line + '\n')
        dup+=1

In [None]:
blocked_docs= ["27959387", "19484664", "17935240"]
def create_input(doc):
  """Build the prompt text for one document, including its candidate entity pairs.

  Layout: title line, article line, one "[name/ name],id,type" line per entity,
  two blank lines, the header "entity pairs:", then one line per pair stored for
  the document's pmid in the global ``pairs_data``.

  Args:
    doc: dict with "pmid", "title", "article" and "entities"; every entity has
      "names" (list of str), "id" and "type".

  Returns:
    The assembled prompt string.
  """
  pmid = doc["pmid"]
  ents = ""
  for e in doc["entities"]:
    ents += '[' + "/ ".join(e["names"]) + ']' + "," + e["id"]+ ","  + e["type"] + "\n"

  # pairs_data stores tuples of comma-split fields, so each one is joined back into a
  # line (plain strings are passed through). .get() avoids inserting empty entries
  # into the defaultdict for pmids without any pairs.
  pairs = ""
  for p in pairs_data.get(pmid, []):
    pairs += (p if isinstance(p, str) else ",".join(p)) + '\n'

  prompt = doc["title"] + "\n"
  prompt += doc["article"] + "\n"
  prompt += ents + "\n\n"
  # The original used "+" instead of "+=" here, so the header was silently dropped.
  prompt += "entity pairs:\n"
  prompt += pairs
  return prompt

In [None]:

def create_tune_dataset(filename, pairs_only):
  """Build [input, output] text pairs for fine-tuning, one per document.

  Args:
    filename: despite its name, the loaded dataset: a mapping whose keys are the
      strings "0" .. str(len - 1); each value has "title", "article",
      "entities" and "relation".
    pairs_only: if true, each output line is "entityA,entityB"; otherwise it is
      "type,entityA,entityB,novelty". In both cases the two entity ids of a
      relation are written in ascending string order.

  Returns:
    A list of [input, output] lists in document-index order. The input is the
    title, the article and one "[name/ name],id,type" line per entity.
  """
  examples = []

  # Iterate indices directly so the order does not depend on set iteration order.
  for i in range(len(filename)):
    doc = filename[str(i)]

    ents = "".join(
        '[' + "/ ".join(e["names"]) + ']' + "," + e["id"] + "," + e["type"] + "\n"
        for e in doc["entities"])

    out_lines = []
    for r in doc["relation"]:
      infons = r["infons"]
      first, second = infons["entity1"], infons["entity2"]
      if first > second:
        first, second = second, first
      if pairs_only:
        out_lines.append(first + "," + second + "\n")
      else:
        out_lines.append(infons["type"] + "," + first + "," + second + "," + infons["novel"] + "\n")

    prompt = doc["title"] + "\n" + doc["article"] + "\n" + ents
    examples.append([prompt, "".join(out_lines)])

  return examples



In [None]:
import json

# Target JSONL file. It is opened in append mode, so re-running this cell adds the
# same rows to the file again.
file_name = "train_ds_pair_no_ins.jsonl"
rows = create_tune_dataset(new_train, True)
with open(file_name, "a") as outfile:
  for i, row in enumerate(rows):
    new_data = {"messages": [
        {"role": "user", "content": row[0]},
        {"role": "model", "content": row[1]},
    ]}
    # Length in characters of the record's string form (not a tokenizer count).
    prompt_token = len(str(new_data))
    if prompt_token <= 321767 and len(row[0]) <= 8191 and len(row[1]) <= 8191:
      outfile.write(json.dumps(new_data) + "\n")
    else:
      # Over one of the size limits: report the example and leave it out of the file.
      print(i)
      print(prompt_token, len(row[0]), len(row[1]))

In [None]:
# Dump every split as human-readable text: for each document its index, the model
# input and the expected relations. Files are opened in append mode, so re-running
# the cell adds the content again.
for split, filename in ((new_dev, "dev_readable.txt"),
                        (new_test, "test_readable.txt"),
                        (new_train, "train_readable.txt")):
  data = create_tune_dataset(split, False)
  with open(filename, 'a') as f:
    for i, d in enumerate(data):
      f.write(str(i) + '\n' + d[0] + '\n' + d[1] + '\n')

In [None]:
import csv
# Column headers of the CSV file.
fields = ['input:', 'output:']
# One [input, output] row per dev document, with pairs-only targets.
rows = create_tune_dataset(new_dev, True)
# Name of the CSV file.
filename = "dev_tune_pairs_20.csv"
# newline='' lets the csv module control line endings itself; without it, the
# newlines embedded in the (multi-line) fields and the row terminators can be
# rewritten on platforms that translate "\n" on write.
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    # Header row first, then all data rows.
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)

In [None]:
import json
# Reload the training split (same path as the load cell at the top of the notebook).
with open('/content/new_train.json') as file:
    new_train = json.load(file)

In [None]:
# Reload all three splits.
# NOTE: new_dev is read from new_dev100.json here, while the first load cell reads
# new_dev.json — which dev set later cells see depends on which cell ran last.
with open('/content/new_dev100.json') as file:
    new_dev = json.load(file)
with open('/content/new_train.json') as file:
    new_train = json.load(file)
with open('/content/new_test.json') as file:
    new_test = json.load(file)

In [None]:
# Show the formatted example for training document 363.
# NOTE(review): create_one_example is defined in a later cell of this notebook, so on a
# fresh kernel that cell has to be run before this one.
print(create_one_example(363, new_train))

In [None]:
def create_one_example(id, filename):
  """Format one document as a few-shot example string.

  Args:
    id: document index; it is converted with str() to look up the document.
    filename: the loaded dataset (mapping from string index to document dict).

  Returns:
    Title line, article line, a blank line, one "[name/ name],id,type" line per
    entity, a blank line, one "type,entity1,entity2,novelty" line per relation
    (entity order as stored), followed by two extra newlines.
  """
  doc = filename[str(id)]

  entity_lines = [
      '[' + "/ ".join(e["names"]) + ']' + "," + e["id"] + "," + e["type"] + "\n"
      for e in doc["entities"]
  ]
  relation_lines = [
      r["infons"]["type"] + "," + r["infons"]["entity1"] + "," + r["infons"]["entity2"] + "," + r["infons"]["novel"] + "\n"
      for r in doc["relation"]
  ]

  header = doc["title"] + "\n" + doc["article"] + "\n"
  return header + "\n" + "".join(entity_lines) + "\n" + "".join(relation_lines) + "\n\n"

In [None]:
# Inspect the raw "infons" dict of every relation in the first dev document.
for rel in new_dev['0']["relation"] :
  print(rel['infons'])

In [None]:
from collections import defaultdict
# `file_name` holds the loaded split (a dict), not a path.
file_name = new_dev

# Document index -> per-document statistics.
stats = {}

# Totals over the whole split.
file_type_c, file_novel_c = defaultdict(int), defaultdict(int)
file_rel = 0

for i in range(len(file_name)):
  doc = file_name[str(i)]
  pmid = doc['pmid']
  # "tokens" is the whitespace-separated word count of the formatted example.
  tokens = len(create_one_example(i, file_name).split())

  # Tally relation types and novelty labels of this document.
  type_c, novel_c = defaultdict(int), defaultdict(int)
  for rel in doc['relation']:
    infons = rel['infons']
    type_c[infons['type']] += 1
    novel_c[infons['novel']] += 1
  rel_c = len(doc['relation'])

  # Fold the document tallies into the split-level totals.
  for label, count in type_c.items():
    file_type_c[label] += count
  for label, count in novel_c.items():
    file_novel_c[label] += count
  file_rel += rel_c

  stats[i] = {
      'pmid': pmid,
      'types_count': type_c,
      'novelty_count': novel_c,
      'rel_c': rel_c,
      'tokens': tokens
  }

print(stats)

{0: {'pmid': '14510914', 'types_count': defaultdict(<class 'int'>, {'Positive_Correlation': 4, 'Association': 4, 'Negative_Correlation': 4}), 'novelty_count': defaultdict(<class 'int'>, {'Novel': 7, 'No': 5}), 'rel_c': 12, 'tokens': 365}, 1: {'pmid': '15096016', 'types_count': defaultdict(<class 'int'>, {'Positive_Correlation': 1}), 'novelty_count': defaultdict(<class 'int'>, {'No': 1}), 'rel_c': 1, 'tokens': 203}}


In [None]:
'''
dev:
Total number of relations: 1162
Relation type counts: {'Positive_Correlation': 352, 'Association': 560, 'Negative_Correlation': 216, 'Bind': 19, 'Cotreatment': 10, 'Comparison': 5}
Novelty counts: {'Novel': 835, 'No': 327}

train:
Total number of relations: 4178
Relation type counts: {'Association': 2192, 'Positive_Correlation': 1089, 'Bind': 61, 'Negative_Correlation': 763, 'Comparison': 28, 'Conversion': 3, 'Cotreatment': 31, 'Drug_Interaction': 11}
Novelty counts: {'No': 1340, 'Novel': 2838}

test:
Total number of relations: 1163
Relation type counts: {'Association': 635, 'Positive_Correlation': 325, 'Negative_Correlation': 171, 'Comparison': 6, 'Bind': 9, 'Conversion': 1, 'Cotreatment': 14, 'Drug_Interaction': 2}
Novelty counts: {'Novel': 859, 'No': 304}
'''
'''



'''

In [None]:
from collections import Counter

# Debug version of the statistics loop: it prints every relation and the running
# novelty counters as it goes.
# NOTE: only the first 4 documents are processed (range(4)), so the file_* totals
# cover those documents only.
file_name = new_train
stats = {}
file_type_c = Counter()
file_novel_c = Counter()
file_rel, file_tokens = 0,0

for i in range(4):
    print(i)
    doc = file_name[str(i)]
    pmid = doc['pmid']
    rel_c = 0
    type_c = Counter()
    novel_c = Counter()
    # Whitespace-separated word count of the formatted example.
    tokens = len(create_one_example(i, file_name).split())
    file_tokens +=tokens
    for rel in doc['relation']:
        rel_type = rel['infons']['type']
        novelty = rel['infons']['novel']
        print(rel)
        type_c[rel_type] += 1
        file_type_c[rel_type] += 1
        novel_c[novelty] += 1
        print(novel_c)
        file_novel_c[novelty] += 1
        print(file_novel_c)
        rel_c += 1
        file_rel += 1

    stats[i] = {
        'pmid': pmid,
        'types_count': dict(type_c),
        'novelty_count': dict(novel_c),
        'rel_c': rel_c,
        'tokens': tokens
    }



In [None]:
# Relation type to rank documents by.
target_rel_type = 'Bind'

# (index, stats) pairs ordered by how often the target type occurs, most frequent first.
sorted_docs = list(stats.items())
sorted_docs.sort(key=lambda entry: entry[1]['types_count'].get(target_rel_type, 0), reverse=True)

# Split-level totals gathered by the statistics cell that ran last.
# file_tokens is only assigned by the statistics cell that also counts tokens.
print(f"Total number of relations: {file_rel}")
print(f"Total number of tokens: {file_tokens}")
print(f"Relation type counts: {dict(file_type_c)}")
print(f"Novelty counts: {dict(file_novel_c)}")

# sorted_docs is kept for a per-document listing (pmid, relation count, tokens,
# type and novelty counts) that is currently not printed.

Total number of relations: 1163
Total number of tokens: 27880
Relation type counts: {'Association': 635, 'Positive_Correlation': 325, 'Negative_Correlation': 171, 'Comparison': 6, 'Bind': 9, 'Conversion': 1, 'Cotreatment': 14, 'Drug_Interaction': 2}
Novelty counts: {'Novel': 859, 'No': 304}


In [None]:
from collections import Counter
import openpyxl

# `file_name` holds the loaded split (a dict), not a path.
file_name = new_test
stats = {}
file_type_c = Counter()
file_novel_c = Counter()
file_rel= 0

for i in range(len(file_name)):
    doc = file_name[str(i)]
    relations = doc['relation']

    # Per-document tallies of relation types and novelty labels.
    type_c = Counter(rel['infons']['type'] for rel in relations)
    novel_c = Counter(rel['infons']['novel'] for rel in relations)
    rel_c = len(relations)

    # Add them to the split-level totals.
    file_type_c.update(type_c)
    file_novel_c.update(novel_c)
    file_rel += rel_c

    stats[i] = {
        'pmid': doc['pmid'],
        'types_count': dict(type_c),
        'novelty_count': dict(novel_c),
        'rel_c': rel_c,
        # Whitespace-separated word count of the formatted example.
        'tokens': len(create_one_example(i, file_name).split())
    }

# Spreadsheet with one row per document.
workbook = openpyxl.Workbook()
worksheet = workbook.active

# Columns: id, totals, then one column per relation type and per novelty label seen in the split.
type_columns = list(file_type_c.keys())
novelty_columns = list(file_novel_c.keys())
header = ['id', 'relation_count', 'tokens'] + type_columns + novelty_columns
worksheet.append(header)

for doc_idx, doc_stats in stats.items():
    row = [doc_idx, doc_stats['rel_c'], doc_stats['tokens']]
    # Types / labels that do not occur in a document are written as 0.
    row += [doc_stats['types_count'].get(name, 0) for name in type_columns]
    row += [doc_stats['novelty_count'].get(name, 0) for name in novelty_columns]
    worksheet.append(row)

# Save the workbook
workbook.save('output.xlsx')

In [None]:
def create_output(doc):
  """Format the relations of one document as the expected-output text.

  Args:
    doc: dict with "pmid" and "relation"; each relation has an "infons" dict with
      "type", "entity1", "entity2" and "novel".

  Returns:
    "pmid:<pmid>", then one "type,entityA,entityB,novelty" line per relation with
    the two entity ids in ascending string order, followed by two extra newlines.
  """
  rel_lines = []
  for r in doc["relation"]:
    infons = r["infons"]
    ent1, ent2 = infons["entity1"], infons["entity2"]
    # Put the smaller id first so a pair is written the same way in either direction.
    if ent1 > ent2:
      ent1, ent2 = ent2, ent1
    rel_lines.append(infons["type"] + "," + ent1 + "," + ent2 + "," + infons["novel"] + "\n")
  return "pmid:" + doc["pmid"] + "\n" + "".join(rel_lines) + "\n\n"

In [None]:
# Expected output for the whole test split: one create_output() chunk per document,
# in index order. This replaces a copy of create_output's body and a dead "i += 1"
# (the loop variable is reassigned by range() on every iteration).
output = "".join(create_output(new_test[str(i)]) for i in range(len(new_test)))

In [None]:
#test output for pairs
output = ""
for i in range(len(new_test)):
  rels = ""
  doc = new_test[str(i)]
  for r in doc["relation"]:
      if r["infons"]["entity1"] <= r["infons"]["entity2"]:
        ent1 = r["infons"]["entity1"]
        ent2 = r["infons"]["entity2"]
      else:
        ent1 = r["infons"]["entity2"]
        ent2 = r["infons"]["entity1"]

      rels +=  ent1 + "," + ent2 + "\n"
  output += "pmid:" + doc["pmid"] +"\n"
  output += rels +"\n\n"
  i += 1

In [None]:
# Write the `output` string built by an earlier cell to disk ("w" overwrites an existing file).
# NOTE(review): two earlier cells assign `output` (full relations vs. pairs only);
# whichever of them ran last determines what is written here.
with open("/content/expected.txt", "w") as f:
    f.write(output)

In [None]:
def entity_id_to_name(doc, r):
  """Render a relation with entity names in place of entity ids.

  Args:
    doc: dict with "entities"; each entity has "id" and "names" (list of str).
    r: relation dict with an "infons" dict holding "type", "entity1", "entity2"
      and "novel".

  Returns:
    "type,[names of entity1],[names of entity2],novelty" plus a newline, where the
    names are joined with "/ ". An id that matches no entity yields an empty name;
    if several entities share an id, the last one in the list is used.
  """
  infons = r["infons"]
  names = {"entity1": "", "entity2": ""}
  for e in doc["entities"]:
    for slot in ("entity1", "entity2"):
      if infons[slot] == e["id"]:
        names[slot] = '[' + "/ ".join(e["names"]) + ']'
  return infons["type"] + "," + names["entity1"] + "," + names["entity2"] + "," + infons["novel"] + "\n"


In [None]:
def convert_examples(filename):
  """Convert a raw annotated corpus into the per-document format used in this notebook.

  Args:
    filename: despite its name, the loaded corpus: a dict with a "documents"
      list. Each document has "id", "relations" and at least two "passages"
      (the first is used as the title, the second as the article); each passage
      has "text" and "annotations", and each annotation has "text" and an
      "infons" dict with "identifier" and "type".

  Returns:
    A dict keyed by the integer position of the document, with the values
    {"pmid", "title", "article", "entities", "relation"}. "entities" holds one
    {"id", "names", "type"} dict per distinct identifier, in order of first
    mention; "names" collects the distinct mention texts (compared
    case-insensitively) in order of appearance.
  """
  pubmed = {}
  for index, doc in enumerate(filename["documents"]):
    title_passage, article_passage = doc["passages"][0], doc["passages"][1]

    entities = []
    # The original code assigned into an undefined name `entities[i]`, which
    # raised NameError on the first annotation; entities are now collected here.
    for passage in (title_passage, article_passage):
      for annotation in passage["annotations"]:
        identifier = annotation["infons"]["identifier"]
        name = annotation["text"]

        known = None
        for e in entities:
          if identifier == e["id"]:
            known = e
            break

        if known is None:
          entities.append({
              "id": identifier,
              "names": [name],
              "type": annotation["infons"]["type"]
          })
        elif name.lower() not in [n.lower() for n in known["names"]]:
          # Same entity mentioned under a new surface form.
          known["names"].append(name)

    pubmed[index] = {
        "pmid": doc["id"],
        "title": title_passage["text"],
        "article": article_passage["text"],
        "entities": entities,
        "relation": doc["relations"]
    }
  return pubmed




In [None]:

# Build a prompt for one test document (index 35), send it to the model and print the
# collected answer in "pmid:<id>" + response format.
# NOTE(review): create_inputs, add_exaples_prompt and model are not defined in any visible
# cell of this notebook, and init_prompt is assigned in a later cell — this cell only
# works with state from elsewhere. TODO confirm where these come from.
inputs = create_inputs()
outputs = ""
prompt = add_exaples_prompt(init_prompt)
i = 35
print(new_test[str(i)]["pmid"])
print(inputs[i])
prompt += "produce similar output(pmid, relations) for this article:" + '\n'
prompt += inputs[i]
response = model.generate_content(prompt)
outputs += "pmid:" + new_test[str(i)]["pmid"]+ "\n" + response.text + "\n\n"

print(outputs)

18257781
Co-inheritance of a PKD1 mutation and homozygous PKD2 variant: a potential modifier in autosomal dominant polycystic kidney disease.
BACKGROUND: Autosomal dominant polycystic kidney disease (ADPKD), which is caused by mutations in polycystins 1 (PC1) and 2 (PC2), is one of the most commonly inherited renal diseases, affecting ~1 : 1000 Caucasians. MATERIALS AND METHODS: We screened Greek ADPKD patients with the denaturing gradient gel electrophoresis (DGGE) assay and direct sequencing. RESULTS: We identified a patient homozygous for a nucleotide change c.1445T > G, resulting in a novel homozygous substitution of the non-polar hydrophobic phenylalanine to the polar hydrophilic cysteine in exon 6 at codon 482 (p.F482C) of the PKD2 gene and a de-novo PKD1 splice-site variant IVS21-2delAG. We did not find this PKD2 variant in a screen of 280 chromosomes of healthy subjects, supporting its pathogenicity. The proband's parents did not have the PKD1 mutation. Real-time PCR of the PKD

In [None]:
# Print the expected relations of test document 35 in create_output()'s text format.
print(create_output(new_test[str(35)]))

pmid:18257781
Positive_Correlation,D007690,c|DEL|IVS21-2|AG,Novel
Association,5311,D007690,No
Association,5310,D007690,No
Positive_Correlation,D007690,rs75762896,Novel





In [None]:
# Base instruction prompt: task description, relation / entity / novelty definitions,
# the input and output formats, and one fully worked example.
# (The assignment target used to read "init_prompt----", which is a SyntaxError;
# the prompt text itself is unchanged.)
init_prompt = '''
you are a helpful assistants in extracting biomedical relations from biomedical articles. your task is to find relations and produce structured results. each relation has tow entities and it has specific relation type. here is the entity and relation types and their definition.:
Based on the BioRED corpus, here are short definitions for the 8 relation types:

Association: The relation between two entities where the association cannot be categorized as positive or negative correlation, or the description is unclear.
Comparison: The relation that compares the effects or properties of two chemicals or drugs.
Conversion: A chemical is transformed or converted into another chemical through a chemical reaction or process.
Cotreatment: The use of two or more chemicals/drugs together as a combination therapy for treating a disease or condition.
Negative_Correlation: A relation indicating an inverse or opposing effect between two entities, such as a chemical decreasing the expression of a gene, or a variant causing resistance to a drug.
Positive_Correlation: A relation indicating a direct or reinforcing effect between two entities, such as a chemical increasing the expression of a gene, or a variant causing sensitivity to a drug.
Bind: A physical interaction or binding between two entities, such as a chemical binding to a gene or its protein product.
Drug_Interaction: A pharmacological interaction that occurs when two drugs are administered together, potentially affecting their efficacy or side effects.

Here are short descriptions for the 6 entity types in the BioRED corpus:
(1) Gene: This entity type includes genes, proteins, mRNA, and other gene products. Entity linking is performed to map gene mentions to specific NCBI Gene identifiers.
(2) Chemical: This covers chemicals and drugs. Chemical mentions are linked to MeSH identifiers.
(3) Disease: This includes diseases, symptoms, and some disease-related phenotypes. Disease mentions are mapped to concept identifiers from a combination of MeSH and OMIM.
(4) Variant: This represents genomic and protein variants, including substitutions, deletions, insertions, and others. Variant mentions are normalized to dbSNP accession numbers or their component representations when an identifier is not available.
(5) Species: This covers species names from the hierarchical taxonomy of organisms. Species mentions are linked to NCBI Taxonomy identifiers.
(6) CellLine: This represents cell line names. Cell line mentions are mapped to identifiers from the Cellosaurus database.

novelty:
Novel: It is used for relations that are related to the main point or novelty of the abstract. Any information that would be part of the results or conclusions of the paper is considered novel.
No: It is for relations that are background information, typically providing context for the abstract, such as results of previous studies or relevant details that are needed to understand why the paper is important:

the input is the article and the entities with their name and type and an identifier. first line:title second line: article then entities in this format([entity names],entity id,type) the output is like this: relation type,entity1,entity2,novelty :


Curcumin prevents maleate-induced nephrotoxicity: relation to hemodynamic alterations, oxidative stress, mitochondrial oxygen consumption and activity of respiratory complex I.
The potential protective effect of the dietary antioxidant curcumin (120 mg/Kg/day for 6 days) against the renal injury induced by maleate was evaluated. Tubular proteinuria and oxidative stress were induced by a single injection of maleate (400 mg/kg) in rats. Maleate-induced renal injury included increase in renal vascular resistance and in the urinary excretion of total protein, glucose, sodium, neutrophil gelatinase-associated lipocalin (NGAL) and N-acetyl b-D-glucosaminidase (NAG), upregulation of kidney injury molecule (KIM)-1, decrease in renal blood flow and claudin-2 expression besides of necrosis and apoptosis of tubular cells on 24 h. Oxidative stress was determined by measuring the oxidation of lipids and proteins and diminution in renal Nrf2 levels. Studies were also conducted in renal epithelial LLC-PK1 cells and in mitochondria isolated from kidneys of all the experimental groups. Maleate induced cell damage and reactive oxygen species (ROS) production in LLC-PK1 cells in culture. In addition, maleate treatment reduced oxygen consumption in ADP-stimulated mitochondria and diminished respiratory control index when using malate/glutamate as substrate. The activities of both complex I and aconitase were also diminished. All the above-described alterations were prevented by curcumin. It is concluded that curcumin is able to attenuate in vivo maleate-induced nephropathy and in vitro cell damage. The in vivo protection was associated to the prevention of oxidative stress and preservation of mitochondrial oxygen consumption and activity of respiratory complex I, and the in vitro protection was associated to the prevention of ROS production.

[Curcumin],D003474,ChemicalEntity
[maleate],C030272,ChemicalEntity
[nephrotoxicity/ renal injury/ nephropathy],D007674,DiseaseOrPhenotypicFeature
[oxygen],D010100,ChemicalEntity
[respiratory complex I],D042967,ChemicalEntity
[proteinuria],D011507,DiseaseOrPhenotypicFeature
[rats],10116,OrganismTaxon
[glucose],D005947,ChemicalEntity
[sodium],D012964,ChemicalEntity
[neutrophil gelatinase-associated lipocalin/ NGAL],170496,GeneOrGeneProduct
[N-acetyl b-D-glucosaminidase/ NAG],-,GeneOrGeneProduct
[kidney injury molecule (KIM)-1],286934,GeneOrGeneProduct
[claudin-2],733684,GeneOrGeneProduct
[necrosis and apoptosis of tubular cells],D007673,DiseaseOrPhenotypicFeature
[lipids],D008055,ChemicalEntity
[Nrf2],83619,GeneOrGeneProduct
[LLC-PK1],CVCL_0391,CellLine
[reactive oxygen species/ ROS],D017382,ChemicalEntity
[ADP],D000244,ChemicalEntity
[malate],C030298,ChemicalEntity
[glutamate],D018698,ChemicalEntity
[aconitase],50655,GeneOrGeneProduct

Positive_Correlation,D011507,C030272,Novel
Positive_Correlation,D007674,286934,Novel
Negative_Correlation,D007674,733684,Novel
Association,D007674,170496,Novel
Association,D007674,D012964,Novel
Association,D007674,D005947,Novel
Negative_Correlation,C030272,50655,Novel
Association,C030272,D010100,Novel
Positive_Correlation,C030272,D017382,Novel
Positive_Correlation,C030272,286934,Novel
Negative_Correlation,C030272,733684,Novel
Association,C030272,170496,Novel
Association,C030272,D012964,Novel
Association,C030272,D005947,Novel
Positive_Correlation,C030272,D007674,No
Negative_Correlation,D003474,D007674,Novel
Negative_Correlation,D003474,C030272,Novel

Considarations: the output only should have the relations in the mentioned structure! each entity pair only has one relation.
'''