In [1]:
import pandas as pd
from tqdm.notebook import tqdm

tqdm.pandas()
import warnings

warnings.filterwarnings("ignore")
import re
import time

from together import Together

import os

# Load the assay table, keep only the id and free-text description, drop rows
# whose description repeats, and draw a reproducible 20,000-row sample
# (fixed random_state) to send to the LLM.
df = (
    pd.read_parquet("/mnt/d/Downloads/new_assay.prqt")[
        ["assay_chembl_id", "description"]
    ]
    .drop_duplicates(subset="description")
    .sample(20000, random_state=116)
    .reset_index(drop=True)
)

# Never commit a real key to the notebook: read it from the environment.
# The literal placeholder is kept only as a fallback so the cell still runs
# (the API call itself will then fail with an authentication error).
api_key = os.environ.get("TOGETHER_API_KEY", "MY API KEY")
client = Together(api_key=api_key)

### Try few-shot assay labeling with together.ai API, meta-llama/Llama-3.3-70B-Instruct

In [2]:
def extract_substrate(description, model_name):
    """Ask a chat model for the substrate and target named in an assay description.

    A few-shot prompt (seven worked examples) is built around ``description``
    and sent as a single user message through the module-level ``client``.

    Args:
        description: Free-text description of a biochemical assay.
        model_name: Model identifier passed to ``client.chat.completions.create``.

    Returns:
        The raw text of the first completion choice. The prompt asks for the
        form ``The substrate is <...> . The target is <...> .`` but the reply
        is returned unvalidated.

    Raises:
        Whatever the client raises for the request; nothing is caught here.
    """
    # NOTE: the final instruction line used to read "Nothing else should not
    # appear", a double negative that inverted the intended constraint.
    prompt = f"""
    Task: Extract the substrate name and the target name from the description of a biochemical assay. If no substrate is mentioned, return "The substrate is None", if no target is mentioned, return "The target is None".

    
    Examples:

    Example 1:
    input: Inhibition of human JNK2 at 1 uM using ATF2 as substrate
    output: The substrate is ATF2. The target is JNK2.

    Example 2:
    input: Displacement of [3H]NMS from human muscarinic M5 receptor expressed in CHO cells
    output: The substrate is [3H]NMS. The target is muscarinic M5 receptor.

    Example 3:
    input: This experiment measured the total protein concentration in a sample using the Bradford assay.
    output: The substrate is None. The target is None.

    Example 4:
    input: Mixed-type of inhibition of mushroom tyrosinase assessed as Vmax using L-DOPA as substrate at 0.05 uM measured at 475 nm by Lineweaver-Burk double reciprocal plot analysis (Rvb = 0.257 to 0.267 deltaOD/min)
    output: The substrate is L-DOPA. The target is tyrosinase.

    Example 5:
    input: Analysis of cell viability of human HEK293 cells after treatment with different concentrations of atezolizumab.
    output: The substrate is atezolizumab. The target is None.

    Example 6:
    input: Inhibition of FITC-LDEETGEFL-NH2 binding to recombinant human Keap1 Kelch domain (321 to 609 residues) expressed in Escherichia Coli BL21(DE3)pLysS cells at 5 uM centrifuged for 2 mins followed by 30 mins incubation under shaking condition by fluorescence polarization assay relative to control
    output: The substrate is FITC-LDEETGEFL-NH2. The target is Keap1.

    Example 7:
    input: Substrate activity at recombinant human NQO1 expressed in Escherichia coli assessed as compound metabolized at 600 uL in presence of NADH and FAD measured after 12 hrs by HPLC analysis
    output: The substrate is None. The target is NQO1.

    
    Now extract the substrate name and the target name from the following description.
    Answer format should be : The substrate is <substrate name or None> . The target is <target name or None> .
    Nothing else should appear in the answer.
    
    input: {description}
    output:
    """

    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
    )

    # Only the first choice is used.
    return response.choices[0].message.content

In [3]:
# Ids whose request failed, and the raw model answer for every id that succeeded.
error_list = []
ans_dict = {}

# Only the first 10,000 sampled rows are labeled in this run.
df = df.iloc[:10000, :].reset_index(drop=True)
for index, row in tqdm(df.iterrows(), total=len(df)):
    try:
        ans = extract_substrate(
            row.description, model_name="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
        )
        ans_dict[row.assay_chembl_id] = ans
    except KeyboardInterrupt:
        # Manual stop: keep everything collected so far.
        break
    except Exception as exc:
        # Best-effort labeling: remember the failed id and move on, but report
        # the cause instead of hiding it. Catching Exception (not a bare
        # except) lets SystemExit and friends propagate.
        error_list.append(row.assay_chembl_id)
        print(f"smth wrong at {index} row, {row.assay_chembl_id}: {exc!r}")
        # Short pause before the next request after a failure.
        time.sleep(3)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [7]:
# Keep only the rows that received an answer. The explicit copy makes `ans`
# an independent frame, so adding a column is not a chained assignment on a
# possible view of `df`.
ans = df.query("assay_chembl_id in @ans_dict").copy()
# Every remaining id is a key of ans_dict, so the lookup cannot miss.
ans["llama_output"] = ans.assay_chembl_id.map(ans_dict)

In [2]:
# Each labeling run was saved to its own file with a line like the one below
# (kept commented out so re-running this cell does not overwrite a chunk).
# ans.to_parquet("/mnt/d/Downloads/llama_outputs_3.prqt", index=False)

# The period that ends each sentence is matched literally and the capture is
# non-greedy, so names that contain dots survive and the optional spaces
# before the period ("X .") are not captured.
_SUBSTRATE_RE = re.compile(r"The substrate is\s+(.*?)\s*\.\s*The target is")
# The trailing period is optional so an answer without it is not truncated;
# MULTILINE lets `$` stop at the end of the answer line if more text follows.
_TARGET_RE = re.compile(r"The target is\s+(.*?)\s*\.?\s*$", re.MULTILINE)


def _parse_llama_field(text, pattern):
    """Return the captured name from a model answer, or "None" if it cannot be parsed.

    "None" is the same sentinel string the prompt asks the model to use for a
    missing substrate/target, so unparseable answers are treated as "absent"
    instead of raising AttributeError on a failed match.
    """
    if not isinstance(text, str):
        return "None"
    match = pattern.search(text)
    if match is None:
        return "None"
    return match.group(1).strip() or "None"


final_ans = pd.concat(
    [
        pd.read_parquet("/mnt/d/Downloads/llama_outputs_1.prqt"),
        pd.read_parquet("/mnt/d/Downloads/llama_outputs_2.prqt"),
        pd.read_parquet("/mnt/d/Downloads/llama_outputs_3.prqt"),
    ]
).reset_index(drop=True)

final_ans["substrate"] = final_ans.llama_output.apply(
    lambda x: _parse_llama_field(x, _SUBSTRATE_RE)
)
final_ans["target"] = final_ans.llama_output.apply(
    lambda x: _parse_llama_field(x, _TARGET_RE)
)

final_ans.to_parquet("/mnt/d/Downloads/llama_output_all.prqt", index=False)

### Assign an NER tag to each token

In [79]:
def prepare_ner_data(row):
    """Tokenize an assay description and tag its substrate/target tokens (BIO scheme).

    The first case-insensitive occurrence of each entity in the description is
    padded with spaces so that it becomes its own whitespace-delimited
    token(s) even when it is glued to other characters (e.g. ``GST-USP9X(``).
    The padded text is split on whitespace and every token window equal to the
    entity's tokens (compared case-insensitively, consistent with the search)
    is tagged ``B-<LABEL>`` followed by ``I-<LABEL>``.

    Args:
        row: Mapping (e.g. a DataFrame row or dict) with the keys
            ``"description"``, ``"substrate"`` and ``"target"``. The string
            ``"None"``, a blank string, or any non-string value means
            "no entity".

    Returns:
        A list of ``(token, label)`` tuples where label is ``"O"``,
        ``"B-SUBSTRATE"``, ``"I-SUBSTRATE"``, ``"B-TARGET"`` or ``"I-TARGET"``.
        Target labels are written after substrate labels and overwrite them
        where the two overlap.
    """

    def normalize(value):
        # Map "no entity" spellings (sentinel "None", blank, non-string such as
        # None/NaN) to None; otherwise return the stripped name.
        if not isinstance(value, str):
            return None
        value = value.strip()
        if value == "" or value == "None":
            return None
        return value

    def isolate(text, entity):
        # Surround the first case-insensitive occurrence of `entity` with
        # spaces so that str.split() yields it as separate token(s).
        if entity is None:
            return text
        match = re.search(re.escape(entity), text, re.IGNORECASE)
        if match is None:
            return text
        start, end = match.span()
        return " ".join([text[:start], text[start:end], text[end:]])

    substrate = normalize(row["substrate"])
    target = normalize(row["target"])

    description = isolate(row["description"], substrate)
    description = isolate(description, target)

    tokens = description.split()
    labels = ["O"] * len(tokens)
    # Case-folded view of the tokens: the occurrence was located ignoring
    # case, so the labeling comparison must ignore case as well.
    folded_tokens = [token.casefold() for token in tokens]

    def mark_entity(entity, label_prefix):
        if entity is None:
            return
        entity_tokens = [token.casefold() for token in entity.split()]
        entity_length = len(entity_tokens)

        for i in range(len(tokens) - entity_length + 1):
            if folded_tokens[i : i + entity_length] == entity_tokens:
                labels[i] = f"B-{label_prefix}"
                for j in range(1, entity_length):
                    labels[i + j] = f"I-{label_prefix}"

    mark_entity(substrate, "SUBSTRATE")
    mark_entity(target, "TARGET")

    return list(zip(tokens, labels))


# Run prepare_ner_data on every row (axis=1) and store the resulting list of
# (token, label) tuples in a new "ner_data" column.
final_ans["ner_data"] = final_ans.apply(prepare_ner_data, axis=1)

In [99]:
# Preview the first rows, including the parsed substrate/target and the NER tags.
final_ans.head()

Unnamed: 0,assay_chembl_id,description,llama_output,substrate,target,ner_data
0,CHEMBL4605978,Inhibition of human GST-tagged USP9X CD (1554 ...,The substrate is Ubiquitin-Rhodamine110-glycin...,Ubiquitin-Rhodamine110-glycine,USP9X,"[(Inhibition, O), (of, O), (human, O), (GST-ta..."
1,CHEMBL5039331,Inhibition of LSD1 in human MGC-803 cells asse...,The substrate is None. The target is LSD1.,,LSD1,"[(Inhibition, O), (of, O), (LSD1, B-TARGET), (..."
2,CHEMBL827279,Inhibitory concentration against human glutath...,The substrate is None. The target is glutathio...,,glutathione reductase,"[(Inhibitory, O), (concentration, O), (against..."
3,CHEMBL846760,In vitro vitamin D receptor-mediated antiproli...,The substrate is None. The target is vitamin D...,,vitamin D receptor,"[(In, O), (vitro, O), (vitamin, B-TARGET), (D,..."
4,CHEMBL4885798,"TRIM33(PHD,Bromo.) DiscoverX BROMOscan",The substrate is None. The target is TRIM33.,,TRIM33,"[(TRIM33, B-TARGET), ((PHD,Bromo.), O), (Disco..."
