In [1]:
!pip install transformers
!pip install peft
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install datasets

Collecting transformers
  Downloading transformers-4.39.2-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.22.2-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading 

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig
import torch

model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id) #, force_download=True, resume_download=False)

# Load the model
model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1",
                                             load_in_4bit=True,
                                             torch_dtype=torch.float16,
                                             device_map="cuda",
                                             low_cpu_mem_usage=True,
                                             # force_download=True, 
                                             # resume_download=False
                                             )

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [6]:
# Load the PeftConfig from the specified directory
adapter_model_id = "results/checkpoint-500"
peft_config = PeftConfig.from_pretrained(adapter_model_id, device_map='cuda')

# Add the adapter with the loaded config
model.add_adapter(peft_config)

In [8]:
instruction =  "Context: I want you to extract semantic triples from the following paragraph.\nRules: The subject of each triple is a chemical. Only use information explicitly present in the paragraph, do not hallucinate. Do not create any fictional or incorrect outputs. Strictly follow all rules and guidelines given. If the object of the triple has more than one noun, split it into separate triples with only one noun each. Do not repeat any triples. The output must be a list of the triples you are most confident in, with each triple in the format [\u201csubject\u201d, \u201cpredicate\u201d, \u201cobject\u201d].\nQ: Extract triples on the part of the human body the chemical subjects can be found in. Use the predicate \"biolocation is\" for these triples. The object of each triple must be a human body part. If you cannot find any information on this, do not output any triples.Q: Extract triples on the human exposure route of the chemical subjects. Use the predicate \"exposed through\" for these triples. If you cannot find any information on this, do not output any triples.Q: Extract triples on what food or organism the chemical subject is sourced from. Use the predicate \"sourced through\" for these triples. If you cannot find any information on this, do not output any triples.Q: Extract triples on what disease the chemical subject causes. Use the predicate \"causes\" for these triples. The object of each triple must be a disease. If you cannot find any information on this, do not output any triples.Q: Extract triples on the biological mechanism the chemical subject is a part of. Use the predicate \"involved in\" for these triples. If you cannot find any information on this, do not output any triples.Q: Extract triples on the biological role the chemical subject has. Use the predicate \"has role of\" for these triples. If you cannot find any information on this, do not output any triples. Paragraph:"
abstract = "The study explores the metabolites trans-2-Enoyl-OPC6-CoA, derived from Mallard duck, and an Irbesartan derivative, M2, with a specific localization in the Kidney. Additionally, the compound (E)-3,7-Dimethyl-1,5,7-octatrien-3-ol was found to be localized in the Cytoplasm. Understanding the distribution of these compounds in different cellular compartments could provide insights into their potential biological significance. Further research is warranted to elucidate the metabolic pathways and interactions involving these molecules and their impact on cellular processes. This investigation sheds light on the unique distribution patterns of these compounds and their potential roles in kidney and cytoplasmic functions."
text = instruction + abstract

# Generate input IDs
inputs = tokenizer(text, return_tensors="pt", truncation=True).to('cuda')

# Generate output
output = model.generate(**inputs, max_new_tokens=2048) #, max_time=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Context: I want you to extract semantic triples from the following paragraph.
Rules: The subject of each triple is a chemical. Only use information explicitly present in the paragraph, do not hallucinate. Do not create any fictional or incorrect outputs. Strictly follow all rules and guidelines given. If the object of the triple has more than one noun, split it into separate triples with only one noun each. Do not repeat any triples. The output must be a list of the triples you are most confident in, with each triple in the format [“subject”, “predicate”, “object”].
Q: Extract triples on the part of the human body the chemical subjects can be found in. Use the predicate "biolocation is" for these triples. The object of each triple must be a human body part. If you cannot find any information on this, do not output any triples.Q: Extract triples on the human exposure route of the chemical subjects. Use the predicate "exposed through" for these triples. If you cannot find any information

In [9]:
!pip install jaro-winkler
!pip install scikit-learn
!pip install torcheval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting jaro-winkler
  Downloading jaro_winkler-2.0.3-py3-none-any.whl.metadata (2.6 kB)
Downloading jaro_winkler-2.0.3-py3-none-any.whl (33 kB)
Installing collected packages: jaro-winkler
Successfully installed jaro-winkler-2.0.3
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting scikit-learn
  Downloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.4.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torcheval
Successfully installed torcheval-0.0.7
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [10]:
import torch
from torcheval.metrics.functional import binary_f1_score
from sklearn.metrics import precision_score, recall_score
import json
from jaro import jaro_winkler_metric

def read_lines(file: str) -> list[str]:
    """Read lines from a file and return as a list."""
    with open(file, "r") as f:
        lines = f.readlines()
    return lines

def read_jsonl(file: str) -> list[list[tuple[str]]]:
    """Read JSON lines from a file and parse into a list of lists of tuples."""
    lines = read_lines(file)
    result = [[tuple(triplet) for triplet in json.loads(line)] for line in lines]
    return result

def read_txt(file: str) -> list[str]:
    """Read lines from a text file and return as a list."""
    return read_lines(file)

def to_binary(preds, gt):
    """Convert prediction and ground truth samples to binary classification."""
    max_len = max(len(preds), len(gt))
    output = [1 if pred in gt else 0 for pred in preds]
    target = [1 if gt in preds else 0 for gt in gt]

    # Pad the shorter tensor with zeros
    if len(output) < max_len:
        output.extend([0] * (max_len - len(output)))
    if len(target) < max_len:
        target.extend([0] * (max_len - len(target)))

    return torch.tensor(output), torch.tensor(target)

def calculate_scores(preds, gt):
    """Calculate F1 score, precision, recall, and Jaro-Winkler score."""
    max_len = max(len(gt), len(preds))
    f1_score_sum = precision_sum = recall_sum = jw_score_sum = 0

    for i in range(max_len):
        # Calculate F1 score, precision, and recall
        pred_sample = preds[min(i, len(preds) - 1)]
        gt_sample = gt[min(i, len(gt) - 1)]

        # Convert samples to lowercase
        pred_sample = [[item.lower() for item in triplet] for triplet in pred_sample]
        gt_sample = [[item.lower() for item in triplet] for triplet in gt_sample]

        output, target = to_binary(pred_sample, gt_sample)
        output, target = torch.tensor(output), torch.tensor(target)
        f1_score_sum += binary_f1_score(output, target)
        precision_sum += precision_score(target, output)
        recall_sum += recall_score(target, output)

        # Calculate Jaro-Winkler score
        pred_sample_txt = ' '.join([' '.join(triplet) for triplet in pred_sample])
        gt_sample_txt = ' '.join([' '.join(triplet) for triplet in gt_sample])
        jw_score_sum += jaro_winkler_metric(pred_sample_txt, gt_sample_txt)

    # Calculate averages
    f1_score = f1_score_sum / max_len
    precision = precision_sum / max_len
    recall = recall_sum / max_len
    jw_score = jw_score_sum / max_len

    return float(f1_score), precision, recall, jw_score

In [None]:
# Inport test set
import json
import ast
import re
import statistics
from tqdm import tqdm

# Create lists
f1_score_list = []
precision_list = []
recall_list = []
jw_score_list = []
successful_output = 0
failed_output = 0

test_set_path = 'test_80.jsonl'
output_file_path = 'output_80.jsonl'
with open(test_set_path, 'r') as infile, open(output_file_path, "w") as outfile:
  for line in tqdm(infile):
    data = json.loads(line)
    abstract = data['input']
    ground_truth = data['output']
    
    # Create input
    text = instruction + abstract

    # Generate input IDs
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to('cuda')
    
    # Generate output
    output = model.generate(**inputs, max_new_tokens=2048) #, max_time=30)
    result = tokenizer.decode(output[0], skip_special_tokens=True)

    valid = False
    # Use ast.literal_eval() to convert the string to a list

    output = []
    try:
        # Turn output to a list
        # my_list = ast.literal_eval(result)
        pattern = r'\["([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\]'
        extracted_list = re.findall(pattern, result, re.MULTILINE)
        my_list = [triple for triple in extracted_list if 'NA' not in triple]
        output = my_list
        gt = [[tuple(triplet) for triplet in ground_truth]]
        preds = [[tuple(triplet) for triplet in my_list]]

        # Get scores
        f1_score, precision, recall, jw_score = calculate_scores(preds, gt)

        # Print outputs
        print("\n\n")
        print(f"Ground Truth: {gt}")
        print(f"Output:       {preds}")
        print(f1_score, precision, recall, jw_score)

        # Increment variables
        f1_score_list.append(f1_score)
        precision_list.append(precision)
        recall_list.append(recall)
        jw_score_list.append(jw_score)
        successful_output += 1
        valid = True

    except (SyntaxError, ValueError) as e:
        failed_output += 1
        print(result)

    # Save the {'input': abstract, 'output': output} pair to JSONL
    json.dump({'input': abstract, 'output':output, 'output_complete': result, 'valid':valid}, outfile)
    outfile.write('\n')

# Calculate statistics for each list
print("\n\n\n")
for name, score_list in [("F1 Score", f1_score_list), ("Precision", precision_list), ("Recall", recall_list), ("Jaro-Winkler Score", jw_score_list)]:
    avg_score = sum(score_list) / successful_output
    min_score = min(score_list)
    max_score = max(score_list)
    std_dev = statistics.stdev(score_list) if len(score_list) > 1 else 0  # Std. dev. requires at least 2 data points

    # Print statistics
    print(f"{name}:")
    print(f"  Average: {avg_score}")
    print(f"  Minimum: {min_score}")
    print(f"  Maximum: {max_score}")
    print(f"  Standard Deviation: {std_dev}")

# Print other information
total_outputs = successful_output + failed_output
print(f"Successful Outputs: {successful_output}")
print(f"Failed Outputs: {failed_output}")
print(f"Total Outputs: {total_outputs}")


0it [00:00, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  output, target = torch.tensor(output), torch.tensor(target)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.





Ground Truth: [[('Uroporphyrinogen III', 'sourced through', 'Passion fruit'), ('PA(18:3(9Z,12Z,15Z)/22:0)', 'sourced through', 'Passion fruit'), ('3-oxo-eicosatrienoyl-CoA', 'sourced through', 'Passion fruit'), ('Uroporphyrinogen III', 'sourced through', 'Hedge mustard'), ('α-D-ribose-1-phosphate', 'sourced through', 'Passion fruit')]]
Output:       [[]]
0.0 0.0 0.0 0.0


  output, target = torch.tensor(output), torch.tensor(target)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.





Ground Truth: [[('Phosphate', 'involved in', 'De Novo Triacylglycerol Biosynthesis TG(a-25:0/18:0/8:0)'), ('Phosphate', 'involved in', 'De Novo Triacylglycerol Biosynthesis TG(i-20:0/a-15:0/i-19:0)'), ('Phosphate', 'involved in', 'De Novo Triacylglycerol Biosynthesis TG(20:3(5Z,8Z,11Z)/20:5(5Z,8Z,11Z,14Z,17Z)/20:5(5Z,8Z,11Z,14Z,17Z))')]]
Output:       [[('phosphate', 'biolocation is', 'human body'), ('phosphate', 'exposed through', 'in vitro enzymatic assays'), ('phosphate', 'exposed through', 'cellular models'), ('phosphate', 'sourced through', 'triacylglycerol biosynthesis'), ('phosphate', 'causes', 'increased incorporation of phosphate into triacylglycerols with different fatty acid compositions'), ('phosphate', 'involved in', 'triacylglycerol biosynthesis'), ('phosphate', 'has role of', 'regulating enzyme activities and gene expression related to triacylglycerol biosynthesis')]]
0.0 0.0 0.0 0.5354030388994442


In [21]:
# Inport test set
import json
import ast
import re
import statistics
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor


# Create lists
f1_score_list = []
precision_list = []
recall_list = []
jw_score_list = []
successful_output = 0
failed_output = 0

test_set_path = 'test_80.jsonl'
output_file_path = 'output_80.jsonl'

# Function to process a single line
def process_line(line):
    data = json.loads(line)
    abstract = data['input']
    ground_truth = data['output']
    
    text = instruction + abstract

    inputs = tokenizer(text, return_tensors="pt", truncation=True).to('cuda')

    output = model.generate(**inputs, max_new_tokens=2048)
    result = tokenizer.decode(output[0], skip_special_tokens=True)

    valid = False
    output = []
    try:
        pattern = r'\["([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\]'
        extracted_list = re.findall(pattern, result, re.MULTILINE)
        my_list = [triple for triple in extracted_list if 'NA' not in triple]
        output = my_list
        gt = [[tuple(triplet) for triplet in ground_truth]]
        preds = [[tuple(triplet) for triplet in my_list]]

        # Calculate scores
        f1_score, precision, recall, jw_score = calculate_scores(preds, gt)

        # Print outputs
        print("\n\n")
        print(f"Ground Truth: {gt}")
        print(f"Output:       {preds}")
        print(f1_score, precision, recall, jw_score)

        f1_score_list.append(f1_score)
        precision_list.append(precision)
        recall_list.append(recall)
        jw_score_list.append(jw_score)
        successful_output += 1
        valid = True

    except (SyntaxError, ValueError) as e:
        failed_output += 1
        print(result)

    return {'input': abstract, 'output':output, 'output_complete': result, 'valid':valid}

# Main processing loop
with open(test_set_path, 'r') as infile, open(output_file_path, "w") as outfile:
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(process_line, infile), total=total_lines))

        # Write results to output file
        for result in results:
            json.dump(result, outfile)
            outfile.write('\n')
  

# Calculate statistics for each list
print("\n\n\n")
for name, score_list in [("F1 Score", f1_score_list), ("Precision", precision_list), ("Recall", recall_list), ("Jaro-Winkler Score", jw_score_list)]:
    avg_score = sum(score_list) / successful_output
    min_score = min(score_list)
    max_score = max(score_list)
    std_dev = statistics.stdev(score_list) if len(score_list) > 1 else 0  # Std. dev. requires at least 2 data points

    # Print statistics
    print(f"{name}:")
    print(f"  Average: {avg_score}")
    print(f"  Minimum: {min_score}")
    print(f"  Maximum: {max_score}")
    print(f"  Standard Deviation: {std_dev}")

# Print other information
total_outputs = successful_output + failed_output
print(f"Successful Outputs: {successful_output}")
print(f"Failed Outputs: {failed_output}")
print(f"Total Outputs: {total_outputs}")


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

KeyboardInterrupt: 