In [1]:
import pandas as pd 
import numpy as np
import json 
import random
import os

#### Software description from Biocontainers and Software ontology

In [2]:
def format_to_phi3_input(path, output_file):
    formatted_data = []

    # Read the JSONL file
    with open(path, 'r', encoding='utf-8') as f:
        i = 0
        for line in f:
            entry = json.loads(line.strip())  # Load each line as JSON
            question = entry["question"]
            answer = entry["answer"]
            
            # Create a list of dictionaries with "messages" key
            formatted_entry = {
               "prompt": "You are a bioinformatics assistant. Please provide a concise and accurate answer describing this tool.",
                "prompt_id": f"prompt_{i}",
                "messages": [
                    {
                        "role": "user",
                        "content": question
                    },
                    {
                        "role": "assistant",
                        "content": answer
                    }
                ]
            }
            formatted_data.append(formatted_entry)
            i += 1

    # Print the formatted data for verification
    print(formatted_data)

    # Write the formatted data to a new JSONL file
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in formatted_data:
            json.dump(entry, f)
            f.write('\n')  # Write newline after each JSON object

    print(f"Formatted data saved to {output_file}")
    

In [None]:
input_file = r"./biocontainers_1.jsonl"
output_file = r"./FT_tools.jsonl"
format_to_phi3_input(input_file, output_file) #tools descriptions

Formatted data saved to C:\Users\t-nmehandru\OneDrive - Microsoft\benchmark\fine_tuning\FT_data\FT_tools.jsonl


#### Bioconatiners Help Documentation 


In [92]:
def format_to_phi3_input_h(path, output_file):
    formatted_data = []

    # Read the JSONL file
    with open(path, 'r', encoding='utf-8') as f:
        i = 0
        for line in f:
            entry = json.loads(line.strip())  # Load each line as JSON
            
            # Create a dictionary with "prompt", "prompt_id", and "messages" keys
            formatted_entry = {
                "prompt": "You are a bioinformatics assistant. Please provide help documentation and details about the arguments for these tools.",
                "prompt_id": f"prompt_{i}",
                "messages": entry.get("messages", [])
            }
            formatted_data.append(formatted_entry)
            i += 1

    # Print the formatted data for verification
    print(formatted_data)

    # Write the formatted data to a new JSONL file
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in formatted_data:
            json.dump(entry, f)
            f.write('\n')  # Write newline after each JSON object

    print(f"Formatted data saved to {output_file}")


In [None]:
input_file_h = r"./biocontainers_help.jsonl"
output_file_h = r"./FT_docs.jsonl"

format_to_phi3_input_h(input_file_h, output_file_h) #help documentation

Formatted data saved to C:\Users\t-nmehandru\OneDrive - Microsoft\benchmark\fine_tuning\FT_data\FT_docs.jsonl


In [95]:
def count_lines(json_file):
    count = 0 
    with open(json_file, 'r', encoding = 'utf-8') as f:
        for line in f:
            count += 1
    return count
num_entries = count_lines(input_file)
print(f"Number of lines in description jsonl file:{num_entries}")

num_entries_h = count_lines(input_file_h)
print(f"Number of lines in help doc jsonl file:{num_entries_h}")

Number of lines in description jsonl file:12754
Number of lines in help doc jsonl file:258


#### Train/ Test Split.

In [47]:

import random
random.seed(44)
def split_jsonl(input_file, train_file, test_file, train_ratio=0.8):
    # Read the JSONL file
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Shuffle the lines to ensure random distribution
    random.shuffle(lines)

    # Calculate the split index
    total_lines = len(lines)
    split_index = int(total_lines * train_ratio)

    # Split into training and testing datasets
    train_lines = lines[:split_index]
    test_lines = lines[split_index:]

    # Write training data to file
    with open(train_file, 'w', encoding='utf-8') as f:
        f.writelines(train_lines)

    # Write testing data to file
    with open(test_file, 'w', encoding='utf-8') as f:
        f.writelines(test_lines)

    print(f"Data split into {len(train_lines)} training samples and {len(test_lines)} testing samples.")
    print(f"Training data saved to {train_file}")
    print(f"Testing data saved to {test_file}")
 


Data split into 206 training samples and 52 testing samples.
Training data saved to C:\Users\t-nmehandru\OneDrive - Microsoft\benchmark\fine_tuning\FT_data\train_1.jsonl
Testing data saved to C:\Users\t-nmehandru\OneDrive - Microsoft\benchmark\fine_tuning\FT_data\test_1.jsonl


### Split Biocontainer tool description

In [None]:
#split FT_tools

input_file_1 = r"...\04_fine_tuning\input_data\format_input_data\FT_docs.jsonl"
train_file_1 =  r"...train_1.jsonl"
test_file_1 =  r"...test_1.jsonl"  #output 

    # Split the JSONL file
split_jsonl(input_file_1, train_file_1, test_file_1)

Data split into 10203 training samples and 2551 testing samples.
Training data saved to C:\Users\t-nmehandru\OneDrive - Microsoft\benchmark\fine_tuning\FT_data\train_1.jsonl
Testing data saved to C:\Users\t-nmehandru\OneDrive - Microsoft\benchmark\fine_tuning\FT_data\test_1.jsonl


### Split Biocontainer commandline options

In [None]:
#split FT_docs
input_file_2 = r"...\04_fine_tuning\input_data\format_input_data\FT_docs.jsonl\FT_docs.jsonl"
train_file_2 =  r"....\train_2.jsonl"
test_file_2 =  r"...\test_2.jsonl" #output

# Split the JSONL file
split_jsonl(input_file_2, train_file_2, test_file_2)

Data split into 206 training samples and 52 testing samples.
Training data saved to C:\Users\t-nmehandru\OneDrive - Microsoft\benchmark\fine_tuning\FT_data\train_2.jsonl
Testing data saved to C:\Users\t-nmehandru\OneDrive - Microsoft\benchmark\fine_tuning\FT_data\test_2.jsonl


#### Combine splits from tool descriptions and help documentation. 

In [None]:
#powershell terminal

Get-Content train_1.jsonl, train_2.jsonl | Set-Content -Path train.jsonl


SyntaxError: invalid syntax (3582741285.py, line 2)

In [None]:
# powershell terminal

Get-Content test_1.jsonl, test_2.jsonl | Set-Content -Path test.jsonl

#### Format QA pairs from biostars data into jsonl format to evaluate on fine-tuned Phi-3 model. The QA pairs are from the test data (1 question upvote and 3 answer upvotes). Take csv file where the questions were tagged by GPT-3.5 as being both a tools and an analysis question. 

In [None]:
def csv_to_jsonl(input, output):
    df = pd.read_csv(input)
    with open(output, 'w', encoding = 'utf-8') as f:
        for i, row in df.iterrows():
            entry = row.to_dict()
            json.dump(entry, f)
            f.write('\n')
    print(f"Data successfully converted to JSONL format and saved to {output}")


csv_file = r"../../Evaluation/analysis_and_tools_only.csv"

csv_to_jsonl(csv_file, jsonl_file)

Data successfully converted to JSONL format and saved to C:\Users\t-nmehandru\OneDrive - Microsoft\benchmark\04_fine_tuning\ft_phi3_input.jsonl


In [None]:
def format_to_phi3_input_ta(path, output_file):
    formatted_data = []

    # Read the JSONL file
    with open(path, 'r', encoding='utf-8') as f:
        i = 0
        for line in f:
            entry = json.loads(line.strip())  # Load each line as JSON
            question = entry["content"]
            answer = entry["answer_content"]
            
            # Create a list of dictionaries with "messages" key
            formatted_entry = {
               "prompt": "You are a bioinformatics assistant. Please provide a concise and accurate answer to the following question:",
                "prompt_id": f"prompt_{i}",
                "messages": [
                    {
                        "role": "user",
                        "content": question
                    },
                    {
                        "role": "assistant",
                        "content": answer
                    }
                ]
            }
            formatted_data.append(formatted_entry)
            i += 1

    # Print the formatted data for verification
    print(formatted_data)

    # Write the formatted data to a new JSONL file
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in formatted_data:
            json.dump(entry, f)
            f.write('\n')  # Write newline after each JSON object

    print(f"Formatted data saved to {output_file}")
    