In [None]:
%%capture
# ! pip install trl
! pip install datasets
! pip install rdkit
# ! pip install peft
# ! pip install git+https://github.com/bitsandbytes-foundation/bitsandbytes.git
# ! pip install bitsandbytes==0.45.1

In [None]:
from datasets import load_dataset , Dataset
# from trl import SFTConfig, SFTTrainer
import pandas as pd

## building sft dataset manual

In [None]:
# Single place to change the dataset location.
# NOTE(review): this is an absolute Google Drive path and no drive-mount call is
# visible in this part of the notebook, so the drive presumably has to be mounted
# beforehand — confirm.
REASONING_CSV_PATH = "/content/drive/MyDrive/Papers/Original_Papers/Llm_drug_prediction/Code/reasoning_data_api/test_reason.csv"

# Raw table as a pandas DataFrame.
df = pd.read_csv(REASONING_CSV_PATH)
# The same rows wrapped as a Hugging Face `Dataset`.
dset = Dataset.from_pandas(df)

In [None]:
# Work on an independent copy of the first five rows. `head()` returns a slice of
# `df`, and adding columns to that slice is what makes pandas emit
# SettingWithCopyWarning in the cells that follow.
df_sample = df.head().copy()

In [None]:
# from rdkit import Chem
# from rdkit.Chem import Descriptors, Draw, AllChem, Lipinski, Crippen, rdMolDescriptors
# from rdkit.Chem import rdDepictor
# import pandas as pd

# def get_molecule_properties(smiles):
#     """
#     Compute molecular properties and analyses for a given SMILES string.

#     Args:
#         smiles (str): SMILES string of the molecule.

#     Returns:
#         dict: A dictionary containing all computed properties and analyses.
#     """
#     # Initialize the molecule
#     molecule = Chem.MolFromSmiles(smiles)
#     if molecule is None:
#         raise ValueError("Invalid SMILES string provided.")

#     # Dictionary to store all properties
#     properties = {}


#     properties["Molecular_Weight"] = Descriptors.MolWt(molecule)
#     properties["LogP"] = Crippen.MolLogP(molecule)
#     properties["Molecular_Refractivity"] = Crippen.MolMR(molecule)
#     properties["TPSA"] = Descriptors.TPSA(molecule)
#     properties["Hydrogen_Bond_Donors"] = Lipinski.NumHDonors(molecule)
#     properties["Hydrogen_Bond_Acceptors"] = Lipinski.NumHAcceptors(molecule)
#     properties["Rotatable_Bonds"] = Lipinski.NumRotatableBonds(molecule)
#     properties["Chiral_Centers"] = Chem.FindMolChiralCenters(molecule)

#     return properties

from rdkit import Chem
from rdkit.Chem import Descriptors, Draw, AllChem, Lipinski, Crippen, rdMolDescriptors, FilterCatalog
from rdkit.Chem.FilterCatalog import FilterCatalogParams
from rdkit.Chem import rdDepictor
import pandas as pd

# Lazily-built PAINS + Brenk catalog shared by every call (see helper below).
_STRUCTURAL_ALERT_CATALOG = None


def _get_structural_alert_catalog():
    """Return the shared PAINS + Brenk filter catalog, building it on first use."""
    global _STRUCTURAL_ALERT_CATALOG
    if _STRUCTURAL_ALERT_CATALOG is None:
        params = FilterCatalogParams()
        params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS)
        params.AddCatalog(FilterCatalogParams.FilterCatalogs.BRENK)
        _STRUCTURAL_ALERT_CATALOG = FilterCatalog.FilterCatalog(params)
    return _STRUCTURAL_ALERT_CATALOG


def get_molecule_properties(smiles):
    """
    Compute RDKit descriptors, structural alerts (PAINS/Brenk) and ring counts
    for a molecule given as a SMILES string.

    Args:
        smiles (str): SMILES string of the molecule.

    Returns:
        dict: Descriptor name -> value, in a fixed insertion order. All values are
        numbers except "Structural Alerts", which is a list of alert descriptions
        when at least one catalog entry matches and the string "None" otherwise.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` (``MolFromSmiles`` returns None).

    Notes:
        - The keys "Hydrogen Bond_Donors" and "Hydrogen_Bond Acceptors" mix spaces
          and underscores; they are kept verbatim because the dictionary is
          embedded as-is into prompts built from it.
        - "Total Atoms" is ``GetNumAtoms()`` on the parsed molecule. In the sample
          output it equals "Heavy Atoms", i.e. hydrogens left implicit in the
          SMILES are not counted.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        raise ValueError("Invalid SMILES string provided.")

    ring_info = molecule.GetRingInfo()

    # Basic descriptors and size/charge counts. Key order matters: the dict is
    # rendered verbatim downstream.
    properties = {
        "Molecular Weight": Descriptors.MolWt(molecule),
        "LogP": Crippen.MolLogP(molecule),
        "Molecular Refractivity": Crippen.MolMR(molecule),
        "TPSA": Descriptors.TPSA(molecule),
        "Hydrogen Bond_Donors": Lipinski.NumHDonors(molecule),
        "Hydrogen_Bond Acceptors": Lipinski.NumHAcceptors(molecule),
        "Rotatable Bonds": Lipinski.NumRotatableBonds(molecule),
        # Count of chiral centers rather than the list RDKit returns.
        "Chiral Centers": len(Chem.FindMolChiralCenters(molecule)),
        "Total Atoms": molecule.GetNumAtoms(),
        "Heavy Atoms": molecule.GetNumHeavyAtoms(),
        "Total Bonds": molecule.GetNumBonds(),
        "Formal Charge": Chem.rdmolops.GetFormalCharge(molecule),
        "Total Rings": ring_info.NumRings(),
    }

    # Structural alerts (PAINS/Brenk). The catalog is built once and reused
    # instead of being reconstructed for every molecule.
    matches = _get_structural_alert_catalog().GetMatches(molecule)
    if matches:
        properties["Structural Alerts"] = [entry.GetDescription() for entry in matches]
    else:
        properties["Structural Alerts"] = "None"

    # Ring composition: a ring is aromatic only if every atom in it is aromatic;
    # every other ring is counted as aliphatic.
    atom_rings = ring_info.AtomRings()
    aromatic_rings = sum(
        1
        for ring in atom_rings
        if all(molecule.GetAtomWithIdx(atom_idx).GetIsAromatic() for atom_idx in ring)
    )
    aliphatic_rings = len(atom_rings) - aromatic_rings

    properties["Aromatic Rings"] = aromatic_rings
    properties["Aliphatic Rings"] = aliphatic_rings
    return properties



In [None]:
# Human-readable approval status derived from the binary `labels` column.
df_sample["status"] = df_sample["labels"].map(
    {
        1: "Approved",
        0: "Not Approved",
    }
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample['status'] = df_sample['labels'].map({1: 'Approved', 0 :'Not Approved'})


In [None]:
# One descriptor dictionary per row, computed from that row's SMILES string.
df_sample["rdkit_info"] = df_sample["SMILES"].apply(get_molecule_properties)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample["rdkit_info"] = df_sample["SMILES"].apply(lambda x: get_molecule_properties(x))


In [None]:
# Inspect the descriptor dictionary of the first row (index label 0).
df_sample.loc[0, "rdkit_info"]

{'Molecular Weight': 324.39900000000006,
 'LogP': 3.812980000000003,
 'Molecular Refractivity': 90.91400000000006,
 'TPSA': 36.26,
 'Hydrogen Bond_Donors': 0,
 'Hydrogen_Bond Acceptors': 3,
 'Rotatable Bonds': 5,
 'Chiral Centers': 0,
 'Total Atoms': 24,
 'Heavy Atoms': 24,
 'Total Bonds': 26,
 'Formal Charge': 0,
 'Total Rings': 3,
 'Structural Alerts': 'None',
 'Aromatic Rings': 2,
 'Aliphatic Rings': 1}

In [None]:
# System prompt, variant 4: qualitative checklist (drug-likeness rules, ADMET,
# information from similar/dissimilar drugs) with no numeric scoring scheme.
# Not referenced by any other cell visible in this part of the notebook.
# NOTE(review): the closing sentence reads "...criteria. and predict it will be
# approved or not" — wording left untouched because this text is sent to the model.
sys_prompt4 ="""
You are a **chemist specializing in drug discovery and molecular modeling**, tasked with analyzing chemical compounds for drug-likeness and viability. Your assessment considers:

1. **Drug-Likeness Rules**:
   - **Lipinski’s Rule of Five** (MW, LogP, H-bond donors/acceptors).
   - **Veber’s Rule** (PSA, rotatable bonds).
   - **Ghose, Egan, and Rule of Three** (lead-like properties).
   - **PAINS & Brenk Filters** (toxic/assay-interfering structures).

2. **ADMET Profiling**:
   - **Absorption** (solubility, permeability).
   - **Distribution** (Vd, plasma protein binding).
   - **Metabolism** (CYP450 interactions, stability).
   - **Excretion** (renal, biliary clearance).
   - **Toxicity** (hepatotoxicity, cardiotoxicity, Ames test).

3. **Information From similar and dissimilar  drugs**


Your goal is to determine the compound’s potential as a drug candidate by integrating these computational and experimental criteria. and predict it will be approved or not
"""

In [None]:
# System prompt, variant 5: weighted scoring scheme (drug-likeness 30%, structural
# alerts +/-10, model similarity 40%, toxicity & pharmacokinetics 20%).
# Not referenced by any other cell visible in this part of the notebook.
# NOTE(review): the section numbering jumps from 2 to 4 (there is no item 3), and
# "wheather" is misspelled; both are left untouched because this text is sent to
# the model verbatim.
sys_prompt5 = """
You are a **chemist specializing in drug discovery and molecular modeling**. your task is to predict wheather the compound will be approved or not.
you should consider the following criteria:


1. Drug-Likeness Rules(Weight: 30%):

Assign 1 point per passed rule (e.g., Lipinski, Veber, Ghose).
Normalize to total applicable rules (e.g., 8/12 rules passed = 0.67×30=20.1).

2. Structural Alerts
PAINS/Brenk Filters:

No alerts: +10.

Alerts present: -10.

4. Model Similarity & Bias Adjustment (Weight: 40%):
Adjusted Similarity Score=(Avg. Approved Sim×Precision−Avg. Unapproved Sim×(1 - Precision))

5. Toxicity & Pharmacokinetics (Weight: 20%)
Approved Similar Drugs:

If ≥3/5 have low/acceptable toxicity: +15.

Missing data assumed neutral (no penalty).

Unapproved Similar Drugs:

If ≥3/5 have high toxicity/unknown: +5.



"""

In [None]:
# System prompt, variant 6: the one passed to get_first_prompt when the
# "1st_prompt" column is built.
# NOTE(review): four drug-likeness rules are listed (Lipinski, Veber, Ghose, Egan)
# but the normalization example uses a denominator of 5 ("1/5 rules passed").
# NOTE(review): "PAINS & Brenk Filters" appears both at the end of the ADMET list
# (without indentation) and again under "3. Structural Alerts".
# NOTE(review): "wheather" is misspelled. None of this is changed here because the
# text is sent to the model verbatim.
sys_prompt6 = """
You are a **chemist specializing in drug discovery and molecular modeling**. your task is to predict wheather the compound will be approved or not.

you should consider the following criteria:
1. Drug-Likeness Rules:
   - Lipinski’s Rule of Five (MW, LogP, H-bond donors/acceptors).
   - Veber’s Rule (PSA, rotatable bonds).
   - Ghose
   - Egan

    Assign 1 point per passed rule
    Normalize to total applicable rules (e.g., 1/5 rules passed =0.20).


2. ADMET Profiling:
   - Absorption (solubility, permeability).
   - Distribution (Vd, plasma protein binding).
   - Metabolism (CYP450 interactions, stability).
   - Excretion (renal, biliary clearance).
   - Toxicity (hepatotoxicity, cardiotoxicity, Ames test).
- PAINS & Brenk Filters (toxic/assay-interfering structures).


3. Structural Alerts:
    PAINS/Brenk Filters

4. Model Similarity & Bias Adjustment:
Adjusted Similarity Score=(Avg. Approved Sim×Precision−Avg. Unapproved Sim×(1 - Precision))

5. Toxicity & Pharmacokinetics


"""

In [None]:
# System prompt, variant 7: asks for a single score out of 100 over the same
# criteria families as the earlier variants.
# Not referenced by any other cell visible in this part of the notebook.
# NOTE(review): item 3 reads "Structural AlertsP" (stray trailing "P") and
# "wheather" is misspelled; left untouched because this text is sent to the model
# verbatim.
sys_prompt7 = """
You are a **chemist specializing in drug discovery and molecular modeling**. your task is to predict wheather the compound will be approved or not.
you should consider the following criteria:
the total sum of scoring is 100. analysis the following information and return the score.
1. **Drug-Likeness Rules**:
   - **Lipinski’s Rule of Five** (MW, LogP, H-bond donors/acceptors).
   - **Veber’s Rule** (PSA, rotatable bonds).
   - **Ghose, Egan, and Rule of Three** (lead-like properties).
   - **PAINS & Brenk Filters** (toxic/assay-interfering structures).

2. **ADMET Profiling**:
   - **Absorption** (solubility, permeability).
   - **Distribution** (Vd, plasma protein binding).
   - **Metabolism** (CYP450 interactions, stability).
   - **Excretion** (renal, biliary clearance).
   - **Toxicity** (hepatotoxicity, cardiotoxicity, Ames test).

3. Structural AlertsP
4. Model Similarity & Bias Adjustment
5. Toxicity & Pharmacokinetics

"""

In [None]:
## I should review step by step and check again.

In [None]:
########################################################################################################################
########################################################################################################################
########################################################################################################################


# Index label of the df_sample row being annotated: later cells read
# df_sample[...][n] and write df_sample.loc[n, ...] for this row.
n = 0



########################################################################################################################
########################################################################################################################
########################################################################################################################

In [None]:
def get_first_prompt(list_of_most_approved_info_for_n_similar , list_of_most_nonapproved_info_for_n_similar,rdkit_info , sys_prompt):
    """Assemble the first-round prompt for one compound.

    The prompt consists of the system instructions, a fixed description of the
    similarity classifier (including its hard-coded performance metrics), the
    RDKit analysis of the compound, and the two lists of similar molecules.

    Args:
        list_of_most_approved_info_for_n_similar: Info on the most similar approved
            molecules; interpolated into the prompt via its string form.
        list_of_most_nonapproved_info_for_n_similar: Info on the most similar
            non-approved molecules; interpolated via its string form.
        rdkit_info: RDKit analysis of the compound; interpolated via its string form.
        sys_prompt: System instructions placed at the top of the prompt.

    Returns:
        str: The assembled prompt text.
    """
    # Fix: the RDKit line previously ended with a stray literal `df["SMILES"]`
    # (code text sitting outside the braces), which leaked into every prompt.
    prompt = f"""
    {sys_prompt}

    I have developed a model that can predict the drug likelibility of compound X.
    to predict that I have trained a classifier on a dataset of approved and unapproved drugs. the model performance is {{'F1 score': 0.80,
    'Accuracy': 0.69,
    'Precision': 0.68,
    'Recall': 0.975,
    'AUC': 0.68,
    'Specificity': 0.20}}
    This model outputs two lists:
    1. The five most similar approved small molecules (with their properties and similarity scores).
    2. The five most similar non-approved small molecules (with their properties and similarity scores).

    Your task is to analyze the likelihood of compound X receiving regulatory approval based on its similarity to known molecules.

    - RDKit Analysis of Compound X:
    {rdkit_info}

    Using the provided data:
    - Approved molecules (with similarity scores): {list_of_most_approved_info_for_n_similar}
    - Non-approved molecules (with similarity scores): {list_of_most_nonapproved_info_for_n_similar}
    """
    return prompt

In [None]:
# Build the first-round prompt for every row, using sys_prompt6 as the system text.
df_sample["1st_prompt"] = df_sample.apply(
    lambda row: get_first_prompt(
        row["list_of_most_approved_info_for_n_similar"],
        row["list_of_most_nonapproved_info_for_n_similar"],
        row["rdkit_info"],
        sys_prompt6,
    ),
    axis=1,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample["1st_prompt"] = df_sample.apply(lambda x : get_first_prompt(x["list_of_most_approved_info_for_n_similar"],x["list_of_most_nonapproved_info_for_n_similar"], x["rdkit_info"], sys_prompt6) ,axis = 1)


In [None]:
# Display the sample frame, including the columns added by the cells above.
df_sample

Unnamed: 0,SMILES,labels,Dicts,list_of_most_approved_info_for_n_similar,list_of_most_nonapproved_info_for_n_similar,status,rdkit_info,1st_prompt
0,CN(C)CCCC1(c2ccc(F)cc2)OCc2cc(C#N)ccc21,1,{'most_app': ...,"[{'name': 'Escitalopram', 'similarity score': ...","[{'name': 'Cebranopadol', 'similarity score': ...",Approved,"{'Molecular Weight': 324.39900000000006, 'LogP...",\n \nYou are a **chemist specializing in dr...
1,CC(C)CCCCC(=O)N[C@@H](CCN)C(=O)N[C@@H](CN[C@@H...,1,{'most_app': ...,"[{'name': None, 'similarity score': 0.96003526...","[{'name': None, 'similarity score': 0.83061236...",Approved,"{'Molecular Weight': 2296.970999999999, 'LogP'...",\n \nYou are a **chemist specializing in dr...
2,CC(C)(C)c1cc(C(C)(C)C)c(NC(=O)c2c[nH]c3ccccc3c...,1,{'most_app': ...,"[{'name': 'Atorvastatin', 'similarity score': ...","[{'name': 'Ritobegron', 'similarity score': 0....",Approved,"{'Molecular Weight': 392.49900000000014, 'LogP...",\n \nYou are a **chemist specializing in dr...
3,CCCCCCCCCCCCCCCCCCCCCCO,1,{'most_app': ...,"[{'name': 'Salmeterol', 'similarity score': 0....","[{'name': 'Ethanolamine oleate', 'similarity s...",Approved,"{'Molecular Weight': 326.60900000000004, 'LogP...",\n \nYou are a **chemist specializing in dr...
4,CC(C)(CO)C(O)C(=O)NCCCO,0,{'most_app': ...,"[{'name': 'Dexpanthenol', 'similarity score': ...","[{'name': 'Duvoglustat', 'similarity score': 0...",Not Approved,"{'Molecular Weight': 205.25399999999996, 'LogP...",\n \nYou are a **chemist specializing in dr...


In [None]:
# Ground-truth label of the row currently being annotated.
df_sample.loc[n, "labels"]

np.int64(1)

In [None]:
# First-round prompt of the row currently being annotated.
df_sample.loc[n, "1st_prompt"]

'\n    \nYou are a **chemist specializing in drug discovery and molecular modeling**. your task is to predict wheather the compound will be approved or not.\n\nyou should consider the following criteria:\n1. Drug-Likeness Rules:\n   - Lipinski’s Rule of Five (MW, LogP, H-bond donors/acceptors).\n   - Veber’s Rule (PSA, rotatable bonds).\n   - Ghose\n   - Egan\n\n    Assign 1 point per passed rule\n    Normalize to total applicable rules (e.g., 1/5 rules passed =0.20).\n\n\n2. ADMET Profiling:\n   - Absorption (solubility, permeability).\n   - Distribution (Vd, plasma protein binding).\n   - Metabolism (CYP450 interactions, stability).\n   - Excretion (renal, biliary clearance).\n   - Toxicity (hepatotoxicity, cardiotoxicity, Ames test).\n- PAINS & Brenk Filters (toxic/assay-interfering structures).\n\n\n3. Structural Alerts:\n    PAINS/Brenk Filters\n\n4. Model Similarity & Bias Adjustment:\nAdjusted Similarity Score=(Avg. Approved Sim×Precision−Avg. Unapproved Sim×(1 - Precision))\n\n

In [None]:
# Store the first-round reasoning text for row `n` in the "think1" column.
# NOTE(review): presumably a hand-pasted model response to df_sample["1st_prompt"][n]
# — confirm provenance.
# NOTE(review): the text quotes two different Egan LogP cut-offs (5.1 early on,
# 5.88 later); verify before using it as training data.
df_sample.loc[n, "think1"] = """
Okay, let's tackle this problem step by step. The user wants to predict if compound X will be approved based on several criteria. Let's start by breaking down each part of the analysis as outlined.

First, Drug-Likeness Rules. The user mentioned Lipinski’s Rule of Five, Veber’s Rule, Ghose, and Egan. I need to check each of these for compound X using the provided RDKit analysis.

Looking at Lipinski's Rule:

- Molecular Weight (MW) should be ≤500. Compound X has 324.4, so that's a pass.

- LogP ≤5. The LogP here is 3.81, which is under 5, so another pass.

- Hydrogen Bond Donors ≤5. The compound has 0, so pass.

- Hydrogen Bond Acceptors ≤10. It has 3, so pass. That's 4/4 for Lipinski.

Veber's Rule looks at Polar Surface Area (TPSA) ≤140 Å² and Rotatable Bonds ≤10. Compound X has TPSA of 36.26 and 5 rotatable bonds. Both pass, so that's 2 points.

Ghose Filter:

- MW between 160-480. 324.4 is within range.

- LogP between -0.4 to 5.6. 3.81 is okay.

- Molar Refractivity (MR) between 40-130. The MR here is 90.91, so pass.

- Total atoms between 20-70. Compound X has 24 atoms. Pass. So 4/4 for Ghose.

Egan's Rule:

- LogP ≤5.1 and TPSA ≤131.6. Compound X meets both, so 2 points.

Adding up all applicable rules. Wait, the user says to assign 1 point per passed rule and normalize. Let me check how many total rules there are. Lipinski has 4, Veber 2, Ghose 4, Egan 2. Total of 12 rules? Wait, actually, each rule (Lipinski, Veber, Ghose, Egan) is a set of criteria. For example, Lipinski's Rule of Five has four criteria, but passing all four counts as 1 point for Lipinski. Or is each criterion a separate point? The user says "1 point per passed rule". So each rule (e.g., Lipinski's) is considered a single rule, and passing all its criteria gives 1 point. But wait, no, the user might mean each individual criterion. The initial instruction says "Assign 1 point per passed rule. Normalize to total applicable rules (e.g., 1/5 rules passed =0.20)." Hmm, maybe each rule is a separate check. For example, Lipinski's Rule has four components (MW, LogP, H donors, H acceptors). But Lipinski's Rule is considered as a single rule where if a compound passes all four, it gets 1 point. Similarly, Veber's is two components (PSA and rotatable bonds), and passing both gives 1 point. So each of the four rules (Lipinski, Veber, Ghose, Egan) are each a single rule. But the user says "1 point per passed rule". So if a compound passes Lipinski's Rule (all four criteria), that's 1 point. If it passes Veber's, another 1 point. Then Ghose and Egan each 1 point if passed. But how many total applicable rules? For example, if some rules aren't applicable. But the user hasn't mentioned that. So total applicable rules would be 4 (Lipinski, Veber, Ghose, Egan). So the normalization is over 4. But in the example given, 1/5, maybe there are 5 rules? Wait, maybe the user made a typo. Let me recheck the initial instruction:

"Drug-Likeness Rules:

- Lipinski’s Rule of Five (MW, LogP, H-bond donors/acceptors).

- Veber’s Rule (PSA, rotatable bonds).

- Ghose

- Egan

Assign 1 point per passed rule

Normalize to total applicable rules (e.g., 1/5 rules passed =0.20)."

Wait, the example says 1/5, which would imply 5 rules, but the user listed four. Maybe Ghose and Egan each have more than one component? Let me recall the actual rules.

Lipinski's Rule of Five: MW ≤500, LogP ≤5, H-donors ≤5, H-acceptors ≤10. All four must be met to pass the rule. So passing Lipinski is 1 point.

Veber's Rule: PSA ≤140 Å² and rotatable bonds ≤10. Both needed for 1 point.

Ghose Filter: MW 160-480, LogP -0.4–5.6, MR 40-130, total atoms 20-70. All four needed for 1 point.

Egan’s Rule: LogP ≤5.88 and TPSA ≤131.6. Both needed for 1 point.

So each of the four rules (Lipinski, Veber, Ghose, Egan) is a separate rule. So total applicable rules are 4. Each passed rule gives 1 point. So the normalization would be points / 4.

Now, checking compound X against each:

Lipinski: All four criteria met (MW 324.4, LogP 3.81, H-donors 0, H-acceptors 3). So 1 point.

Veber: PSA 36.26 ≤140 and rotatable bonds 5 ≤10. Pass. 1 point.

Ghose: MW 324.4 within 160-480, LogP 3.81 within -0.4–5.6, MR 90.91 within 40-130, total atoms 24 within 20-70. All pass. 1 point.

Egan: LogP 3.81 ≤5.88 and TPSA 36.26 ≤131.6. Both pass. 1 point.

Total points: 4/4 = 1.0. So normalized score is 1.0. That's perfect for drug-likeness.

Next, ADMET Profiling. The user lists Absorption, Distribution, Metabolism, Excretion, Toxicity, PAINS/Brenk. But the data provided doesn't directly give ADMET parameters. However, the approved and non-approved molecules' info might have some clues. For example, looking at the approved similar drugs, their metabolism and toxicity profiles might be similar. But without specific ADMET data for compound X, maybe we can infer based on similarity to approved drugs. For example, if the approved drugs have good ADMET profiles and compound X is similar, it might also have good ADMET.

Structural Alerts: The RDKit analysis says "Structural Alerts: None", so that's good. PAINS/Brenk filters are passed, so no issues there.

Model Similarity & Bias Adjustment: The adjusted similarity score formula is given. Need to calculate that. The approved molecules have similarity scores, and the non-approved as well. The model's precision is 0.68.

First, compute the average similarity score for approved and non-approved. The approved list has 5 entries, but some have 'None' for name. Let's check:

Approved molecules:

1. Escitalopram: 0.9846

2. None: 0.9304 (this entry seems incomplete, but assuming it's approved? The approval groups are 'approved' for others. This one has approval groups as None. Maybe a data issue. Perhaps this entry should be excluded? Or maybe it's a mistake. Let's check the user input.

Looking at the user's data for approved molecules:

The first entry is Escitalopram, approval groups: ['approved'].

Second entry: name None, approval groups None. Probably an error or missing data. Maybe this is a non-approved, but in the approved list. Hmm. But the user says "approved molecules (with similarity scores)" and lists five entries. The second one has approval groups as None, so maybe it's a mistake. Perhaps exclude that entry. So taking only the four approved with valid data: Escitalopram (0.9846), Rotigotine (0.9241), Clopidogrel (0.9227), Azelastine (0.9129). The None entry has 0.9304 but no approval info; perhaps it's non-approved but mistakenly in the approved list. Alternatively, maybe it's approved but data missing. For safety, maybe calculate the average of all five, but the second entry's approval group is None. Alternatively, maybe the user intended all five in the approved list. Let's proceed with the given data as is.

Approved similarity scores: [0.9846, 0.9304, 0.9241, 0.9227, 0.9129]. Average = sum them and divide by 5.

Sum: 0.9846 + 0.9304 = 1.915; +0.9241=2.8391; +0.9227=3.7618; +0.9129=4.6747. Average ≈4.6747/5 ≈0.9349.

Non-approved molecules: their similarity scores are [0.9189, 0.9010, 0.8997, 0.8997, 0.8987]. Sum: 0.9189 +0.9010=1.8199; +0.8997=2.7196; +0.8997=3.6193; +0.8987=4.518. Average ≈4.518/5 ≈0.9036.

Model's precision is 0.68.

Adjusted Similarity Score = (Avg. Approved Sim × Precision) - (Avg. Unapproved Sim × (1 - Precision)).

Plugging in:

(0.9349 * 0.68) - (0.9036 * (1 - 0.68)) = 0.6357 - (0.9036 * 0.32) ≈ 0.6357 - 0.2892 ≈0.3465. So adjusted score is ~0.35.

Toxicity & Pharmacokinetics: The approved drugs have known toxicity profiles. For example, Escitalopram has overdose info, but no severe structural alerts. The non-approved ones might have less favorable profiles. Since compound X has no structural alerts, that's a positive sign. Also, looking at the approved similar drugs, their toxicity is manageable, suggesting compound X might as well.

Now, the model performance: The classifier has high recall (0.975) but low specificity (0.20), meaning it's good at identifying approved drugs but has many false positives. Precision is 0.68, so when it predicts approved, 68% are correct. F1 is 0.80, which is decent.

Combining all factors:

- Drug-likeness score is 1.0, excellent.

- ADMET: Assuming similar to approved drugs, which have acceptable profiles. Structural alerts none.

- Adjusted similarity score positive (0.35), favoring approved.

- Model's precision is 0.68, but with high recall. However, the adjusted score might offset some bias.

But how to combine these? The user wants a final prediction. The high drug-likeness, no structural alerts, and similarity to approved compounds (especially Escitalopram which is very similar) suggest a higher chance of approval. The model's adjusted score is positive but not extremely high. However, the classifier itself has a precision of 0.68, meaning 68% of its positive predictions are correct. If the model is predicting compound X as approved (since it's asking for the prediction), then with 68% precision, but considering the high similarity to approved drugs, the likelihood might be higher.

But wait, the user hasn't explicitly stated the model's prediction for compound X. The model outputs similar approved and non-approved molecules, but the actual prediction would be based on those similarities. The adjusted similarity score is a way to weigh the similarity towards approved, considering the model's precision. The adjusted score is positive (0.35), which might tip towards approved.

Putting it all together, despite the model's moderate precision, the strong drug-likeness, structural safety, and high similarity to approved drugs (especially the top one being almost 0.98 similar to Escitalopram) suggest that compound X has a good chance of approval. However, the non-approved similar molecules are also quite similar (scores around 0.90), which might be a concern, but the adjusted score still favors approved.

Final conclusion: Likely to be approved.

"""

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample.loc[n, "think1"] = """


In [None]:
# Store the first-round final answer for row `n` in the "answer1" column.
# Fix: the adjusted-similarity formula was present twice — once split into one
# token per line (a copy-paste artifact of a rendered equation) and once intact.
# Only the intact single-line form is kept.
df_sample.loc[n, "answer1"] = """
Analysis of Compound X's Approval Likelihood
1. Drug-Likeness Rules
Compound X passes all 4 drug-likeness rules (Lipinski, Veber, Ghose, Egan) with a normalized score of 1.0 (4/4 rules passed). Key highlights:

Lipinski: MW = 324.4 (<500), LogP = 3.81 (<5), H-bond donors = 0 (<5), H-bond acceptors = 3 (<10).

Veber: TPSA = 36.26 Å² (<140), rotatable bonds = 5 (<10).

Ghose/Egan: All parameters (MW, LogP, MR, TPSA) within acceptable ranges.

Conclusion: Excellent drug-likeness profile.

2. ADMET & Structural Alerts
ADMET: No direct data, but similarity to approved drugs (e.g., escitalopram, clopidogrel) suggests favorable absorption, metabolism, and toxicity profiles.

Structural Alerts: None detected (passes PAINS/Brenk filters).

3. Model Similarity & Bias Adjustment
Avg. Approved Similarity: 0.935 (high similarity to escitalopram [0.985], rotigotine, clopidogrel).

Avg. Unapproved Similarity: 0.904 (closest non-approved: cebranopadol [0.919], rucaparib).

Adjusted Similarity Score:

(0.935×0.68)−(0.904×0.32)=0.35
Positive score indicates bias toward approved drugs.

4. Toxicity & Pharmacokinetics
Similarity to approved drugs (e.g., escitalopram) suggests manageable toxicity (no cardiotoxicity/hepatotoxicity alerts).

Pharmacokinetic parameters (e.g., rotatable bonds = 5, TPSA = 36.26) suggest good permeability and solubility.

5. Model Performance Context
High Recall (97.5%): Model rarely misses approved drugs (low false negatives).

Low Specificity (20%): High false positives, but adjusted similarity score mitigates bias.

Precision (68%): 68% of predicted "approved" compounds are truly approved.

Final Prediction
Compound X has a high likelihood of regulatory approval due to:

Perfect drug-likeness score.

High structural similarity to approved drugs (e.g., escitalopram).

No structural alerts or toxicity red flags.

Positive adjusted similarity score (0.35) despite model bias.

Recommendation: Proceed to preclinical trials with focus on CYP450 metabolism and long-term toxicity studies.


"""

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample.loc[n, "answer1"] = """


In [None]:
def get_second_prompt(label):
    """Build the follow-up prompt that reveals the ground-truth label.

    Args:
        label: Ground-truth approval status to insert into the prompt.

    Returns:
        str: Prompt text stating ``label`` as the true answer and asking for a
        step-by-step justification.
    """
    opening = "But wait the true answer is "
    closing = ". think step by step and tell why"
    return opening + format(label) + closing
## reevaluation
## on the second thought
# Only the "status" column is read, so map over that Series instead of running a
# row-wise DataFrame.apply(axis=1); the result is a Series aligned on df_sample's index.
# NOTE(review): the SettingWithCopyWarning printed under this cell comes from df_sample
# being created as `df.head()` (a slice of df); creating it as `df.head().copy()` removes it.
df_sample["2nd_prompt"] = df_sample["status"].map(get_second_prompt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample["2nd_prompt"] = df_sample.apply(lambda x : get_second_prompt(x["status"]) ,axis = 1)


In [None]:
# Single label-based lookup instead of chained indexing (df[col][n]).
df_sample.loc[n, "2nd_prompt"]

'But wait the true answer is Approved. think step by step and tell why'

In [None]:
# Reasoning trace stored for row n as the "think2" column: the text written in response
# to the "2nd_prompt" (the prompt that reveals the true label). It is kept verbatim because
# it is training text -- presumably pasted from the model's reply; confirm the source.
df_sample.loc[n , "think2"]= """
Okay, let's try to figure out why the compound X wasn't approved even though the initial analysis suggested it might be. First, I need to recall the data provided. The user mentioned that the model listed similar approved and non-approved molecules. The approved ones like Dexpanthenol, Miglustat, Glucosamine, and Nadolol had high similarity scores. The non-approved ones included Duvoglustat, Migalastat, Arbutin, and Indole-3-carbinol.

The initial analysis looked at drug-likeness rules (Lipinski, Veber) and ADMET properties, and everything seemed okay. Molecular weight, LogP, TPSA, rotatable bonds—all within acceptable ranges. Toxicity data from similar approved drugs suggested low acute toxicity. So why the rejection?

Hmm, maybe the answer lies in the specifics of the non-approved molecules. Let me check those again. Migalastat is mentioned as approved but under accelerated pathways, which sometimes means there's conditional approval pending further studies. Duvoglustat is investigational, so maybe it has issues. Arbutin is approved but for topical use, which is different from systemic administration. Indole-3-carbinol is investigational, possibly with unresolved safety issues.

Wait, the user said the true answer is "Not Approved." So what's missing here? Maybe the ADMET profile isn't as good as thought. Let's re-examine the ADMET part. The compound has low LogP (-1.14), which is quite hydrophilic. High solubility is good, but too low LogP can mean poor membrane permeability, making oral bioavailability challenging. Even if similar approved drugs have low LogP, maybe this compound's combination of properties leads to poor absorption despite passing the rules.

Another angle: the similar non-approved drugs. If the compound is structurally closer to non-approved ones with issues, that's a red flag. For example, Migalastat's approval is under accelerated pathways with ongoing trials. If compound X's mechanism or metabolism mirrors a drug that later showed toxicity in confirmatory trials, that could be a problem.

Also, looking at the approved analogs: Dexpanthenol is used topically or as a supplement. Miglustat is for a rare disease (Gaucher) with specific indications. If compound X is intended for a broader use, the safety profile might not hold up. The high similarity to Dexpanthenol might not translate to efficacy in a different therapeutic area.

Toxicity data: The LD50 values from Dexpanthenol are high, but maybe compound X has off-target effects not captured in acute toxicity. Chronic toxicity or specific organ toxicity (like hepatotoxicity) could be a concern. The initial analysis didn't mention CYP interactions or detailed metabolism beyond what's in similar drugs. If compound X has unique metabolites that are toxic, that's a risk.

Pharmacodynamics and mechanism of action: If the approved analogs work through well-understood mechanisms but compound X's MoA is novel or less understood, regulators might be cautious. The non-approved analogs might have mechanisms that failed in clinical trials, and similarity to those could be a negative.

Another point: The presence of 4 hydrogen bond donors and acceptors. While within Lipinski's limits, this might lead to high polarity, affecting passive diffusion. Even if TPSA is under 140, the specific arrangement could hinder absorption.

Also, the rotatable bonds (6) are at the higher end of Veber's criteria. This might increase molecular flexibility, potentially leading to poor bioavailability or rapid metabolism. Approved drugs with similar counts might have optimized structures that compound X lacks.

Looking at the non-approved list, Arbutin is approved but for topical use. If compound X is intended for systemic use, the safety profile from topical applications doesn't guarantee safety orally or intravenously. Similarly, Indole-3-carbinol is investigational, possibly due to insufficient efficacy or safety concerns.

Regulatory agencies also look at therapeutic need. If compound X is similar to existing drugs but doesn't offer significant advantages (e.g., better efficacy, fewer side effects), approval might be denied. The market might already be saturated with similar molecules.

Lastly, maybe the model's similarity scores aren't the whole story. Structural similarity doesn't always translate to similar pharmacokinetics or dynamics. Subtle differences in the structure could lead to unexpected issues in clinical trials, like drug interactions or immune responses not seen in analogs.

Putting it all together, despite passing initial drug-likeness and ADMET checks, compound X's high similarity to drugs with limited use (topical, rare diseases) or non-approved molecules, combined with potential bioavailability issues and lack of therapeutic innovation, likely led to its rejection.

"""

In [None]:
# Final answer text stored for row n as the "answer2" column, i.e. the written
# justification that accompanies the "think2" reasoning trace. Kept verbatim because it is
# training text (presumably pasted from the model's reply -- confirm the source).
df_sample.loc[n , "answer2"]= """
1. Critical Drug-Likeness and ADMET Red Flags
Lipinski/VEBER Rule Limitations
Low LogP (-1.14): Extremely hydrophilic, suggesting poor membrane permeability despite good solubility. While Lipinski’s rules are met, passive absorption for systemic delivery may fail (e.g., analogs like Dexpanthenol are used topically or in niche indications, not orally).

High Rotatable Bonds (6): At the upper limit of Veber’s criteria, increasing metabolic instability and reducing bioavailability compared to optimized drugs like Miglustat.

ADMET Weaknesses
Absorption: Low LogP and moderate TPSA (89.79) indicate poor oral bioavailability. Approved analogs (e.g., Dexpanthenol) bypass this via non-oral routes (injection/topical).

Toxicity Gaps:

No Ames test data for mutagenicity. Structural similarity to non-mutagens is not definitive.

Chronic toxicity risks (e.g., hepatotoxicity) are unaddressed. Non-approved analogs like Duvoglustat/Indole-3-carbinol highlight unresolved safety concerns.

2. Structural Similarity to High-Risk Analogs
Non-Approved Molecules
Migalastat (0.866 similarity): Approved under accelerated pathways with ongoing confirmatory trials. Compound X’s similarity raises questions about long-term safety validation.

Arbutin (0.865 similarity): Approved only for topical use. Systemic administration of Compound X may inherit unstudied risks (e.g., hydroquinone release).

Duvoglustat/Indole-3-carbinol: Investigational status implies efficacy/safety failures in trials (e.g., anti-HIV activity abandoned due to toxicity).

Approved Molecules with Niche Use
Dexpanthenol: Topical/epithelial use only. Systemic efficacy unproven.

Miglustat: Targets rare diseases (Gaucher/Niemann-Pick). Broad indications for Compound X may require stricter safety thresholds.

3. Lack of Therapeutic Differentiation
Mechanistic Redundancy: Overlaps with existing drugs (e.g., Glucosamine for joint health) without novel targets or improved efficacy.

No Market Need: Similar to Nadolol (beta-blocker), which is no longer first-line due to newer selective agents.

4. Hidden Metabolic/Toxicological Risks
Metabolism: Unlike Miglustat (minimal hepatic metabolism), Compound X’s undefined metabolic pathway risks toxic metabolite formation.

CYP450 Interactions: No data on enzyme inhibition/induction, critical for chronic use.

Conclusion: Key Reasons for Rejection
Poor Oral Bioavailability: Hydrophilicity and rotatable bonds limit systemic absorption.

Structural Links to Non-Approved/High-Risk Drugs: Migalastat’s conditional approval and Arbutin’s topical-only use signal unaddressed risks.

Lack of Therapeutic Innovation: Redundant mechanism without safety/efficacy advantages.

Incomplete Toxicity Profile: Missing mutagenicity and chronic toxicity data.

Final Verdict: Regulatory agencies prioritize risk-benefit balance. Compound X’s similarity to niche-use drugs, unresolved ADMET gaps, and lack of differentiation outweigh its drug-like properties. Not Approved.

Recommendation: Redesign for improved permeability (e.g., prodrug) or pursue topical indications with rigorous chronic toxicity studies."""

In [None]:
## add possible routes (oral/topical):

def get_third_prompt(label):
    """Build the prompt asking for the analysis in a JSON-like summary format.

    Args:
        label: Ground-truth approval status, inserted as the value of the
            ``status(Approved/NonApproved)`` field.

    Returns:
        str: The multi-line prompt (leading newline, four-space indented lines,
        trailing indented blank line).
    """
    # NOTE(review): the skeleton below is not valid JSON (the label is unquoted, the
    # closing quote of the "score" key sits after the colon, and the last three fields
    # have no commas). It is reproduced exactly because it is the prompt text itself.
    prompt_lines = [
        "",
        "    write your thinking in the following format",
        '    {"status(Approved/NonApproved)": ' + format(label) + ",",
        '    "score(0=notapproved to 10=approved): "',
        '    "reason_to_be_approved":',
        '    "reason_to_be_notapproved"   }',
        "    ",
    ]
    return "\n".join(prompt_lines)
# Only the "status" column is read, so map over that Series instead of running a
# row-wise DataFrame.apply(axis=1); the result is a Series aligned on df_sample's index.
# NOTE(review): the SettingWithCopyWarning printed under this cell comes from df_sample
# being created as `df.head()` (a slice of df); creating it as `df.head().copy()` removes it.
df_sample["3rd_prompt"] = df_sample["status"].map(get_third_prompt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample["3rd_prompt"] = df_sample.apply(lambda x : get_third_prompt(x["status"]) ,axis = 1)


In [None]:
# Single label-based lookup instead of chained indexing (df[col][n]).
df_sample.loc[n, "3rd_prompt"]

'\n    write your thinking in the following format\n    {"status(Approved/NonApproved)": Not Approved,\n    "score(0=notapproved to 10=approved): "\n    "reason_to_be_approved":\n    "reason_to_be_notapproved"   }\n    '

In [None]:
# Reasoning trace stored for row n as the "think3" column: the text written in response
# to the "3rd_prompt" (the request to restate the verdict in the JSON-like format).
df_sample.loc[n , "think3"]= """
Alright, the user wants me to format the previous analysis into a specific JSON structure. Let me start by recalling the conclusion from the last response: Compound X was deemed Not Approved.

First, I need to assign a score. The original analysis mentioned strengths like meeting drug-likeness rules and similarity to approved drugs, but significant weaknesses in bioavailability and toxicity. Considering the balance, a score around 3.5/10 seems appropriate—low but not the absolute lowest.

For "reason_to_be_approved", I should list the positives: adherence to Lipinski's and Veber's rules, similarity to safe approved drugs like Dexpanthenol, and low acute toxicity. These are valid points that could support approval under certain conditions.

Now, "reason_to_be_notapproved" requires detailing the critical flaws. Poor oral bioavailability due to low LogP and high rotatable bonds is a key issue. Structural similarity to non-approved or topically used drugs like Arbutin and Migalastat (with their own risks) raises red flags. Lack of data on chronic toxicity and CYP interactions, plus no therapeutic innovation compared to existing drugs, are major hurdles. The market need isn't addressed, making approval unlikely.

I need to ensure the JSON syntax is correct, with proper commas and quotes. Also, the score should be a float. Let me double-check each key and value to match the required format.
"""

In [None]:
# Final structured answer for row n, stored as a JSON-formatted string.
# NOTE: the column key "anwer3" is misspelled, but the saved CSV header and the
# (commented-out) sft_input builder both use this exact spelling, so it must stay
# as is unless every use is renamed together.
df_sample.loc[n , "anwer3"] = """
{
    "status(Approved/NonApproved)": "Not Approved",
    "score(0=notapproved to 10=approved)": 3.5,
    "reason_to_be_approved": "Compound X meets key drug-likeness rules (Lipinski/Veber), aligns structurally with safe approved drugs (e.g., Dexpanthenol, Miglustat), and shows low acute toxicity in analogs.",
    "reason_to_be_notapproved": "1. **Poor oral bioavailability**: Extremely low LogP (-1.14) and high rotatable bonds (6) limit systemic absorption. 2. **Structural similarity to non-approved/topical drugs**: Migalastat (accelerated approval with unresolved safety) and Arbutin (topical-only use) suggest unaddressed systemic risks. 3. **Incomplete toxicity profile**: No Ames test or chronic toxicity data. 4. **No therapeutic differentiation**: Overlaps with existing drugs (e.g., Glucosamine) without novel mechanisms. 5. **Metabolic uncertainty**: Undefined CYP450 interactions and potential toxic metabolites."
}
"""

In [None]:
# Persist the annotated sample rows (prompts, reasoning traces, answers) without the
# DataFrame index column; the "training sft" section reads this file back.
df_sample.to_csv("/content/drive/MyDrive/Papers/Original_Papers/Llm_drug_prediction/Code/reasoning_data_api/for_sft.csv" , index = False)

## training sft

In [None]:
# Reload the annotated rows written by the dataset-building section.
df_sft = pd.read_csv("/content/drive/MyDrive/Papers/Original_Papers/Llm_drug_prediction/Code/reasoning_data_api/for_sft.csv" )

In [None]:
df_sft

Unnamed: 0,SMILES,labels,Dicts,list_of_most_approved_info_for_n_similar,list_of_most_nonapproved_info_for_n_similar,status,rdkit_info,1st_prompt,think1,answer1,2nd_prompt,think2,answer2,3rd_prompt,think3,anwer3
0,CN(C)CCCC1(c2ccc(F)cc2)OCc2cc(C#N)ccc21,1,{'most_app': ...,"[{'name': 'Escitalopram', 'similarity score': ...","[{'name': 'Cebranopadol', 'similarity score': ...",Approved,"{'Molecular_Weight': 324.39900000000006, 'LogP...",\n \nYou are a **chemist specializing in dr...,"\nOkay, let's tackle this analysis. The user w...",\nBased on the analysis of Compound X's proper...,But wait the true answer is Approved. think st...,"\nOkay, let's see. The user previously asked f...",\nStep-by-Step Explanation for Regulatory Appr...,\n write your thinking in the following for...,"\nOkay, let's tackle this step by step. The us...","\n{\n ""status(Approved/NonApproved)"": ""Appr..."
1,CC(C)CCCCC(=O)N[C@@H](CCN)C(=O)N[C@@H](CN[C@@H...,1,{'most_app': ...,"[{'name': None, 'similarity score': 0.96003526...","[{'name': None, 'similarity score': 0.83061236...",Approved,"{'Molecular_Weight': 2296.970999999999, 'LogP'...",\n \nYou are a **chemist specializing in dr...,"\nOkay, let's start by looking at the data pro...",\nAnalysis of Compound X's Drug Approval Poten...,But wait the true answer is Approved. think st...,,\nRevised Analysis: Why Compound X Could Be Ap...,\n write your thinking in the following for...,"\nOkay, let's tackle this. The user wants the ...","\n{\n ""status(Approved/NonApproved)"": ""Appr..."
2,CC(C)(C)c1cc(C(C)(C)C)c(NC(=O)c2c[nH]c3ccccc3c...,1,{'most_app': ...,"[{'name': 'Atorvastatin', 'similarity score': ...","[{'name': 'Ritobegron', 'similarity score': 0....",Approved,"{'Molecular_Weight': 392.49900000000014, 'LogP...",\n \nYou are a **chemist specializing in dr...,"\nOkay, let's start by looking at Compound X's...",\nAnalysis of Compound X's Likelihood for Regu...,But wait the true answer is Approved. think st...,"\nOkay, so the user initially provided a detai...",\n1. High Structural Similarity to Atorvastati...,\n write your thinking in the following for...,"\nAlright, let's tackle this query step by ste...","\n{\n ""status(Approved/NonApproved)"": ""Appr..."
3,CCCCCCCCCCCCCCCCCCCCCCO,1,{'most_app': ...,"[{'name': 'Salmeterol', 'similarity score': 0....","[{'name': 'Ethanolamine oleate', 'similarity s...",Approved,"{'Molecular_Weight': 326.60900000000004, 'LogP...",\n \nYou are a **chemist specializing in dr...,"\nOkay, let's try to figure out whether compou...",\nAnalysis of Compound X's Drug Approval Likel...,But wait the true answer is Approved. think st...,"\nOkay, the user previously thought the compou...","\n1. Drug-Likeness Rules Are Guidelines, Not A...",\n write your thinking in the following for...,"\nOkay, let's tackle this. The user wants me t...","\n{\n ""status(Approved/NonApproved)"": ""Appr..."
4,CC(C)(CO)C(O)C(=O)NCCCO,0,{'most_app': ...,"[{'name': 'Dexpanthenol', 'similarity score': ...","[{'name': 'Duvoglustat', 'similarity score': 0...",Not Approved,"{'Molecular_Weight': 205.25399999999996, 'LogP...",\n \nYou are a **chemist specializing in dr...,"\nOkay, let me start by looking at the data pr...",\nAnalysis of Compound X for Regulatory Approv...,But wait the true answer is Not Approved. thin...,"\nOkay, let's try to figure out why the compou...",\n1. Critical Drug-Likeness and ADMET Red Flag...,\n write your thinking in the following for...,"\nAlright, the user wants me to format the pre...","\n{\n ""status(Approved/NonApproved)"": ""Not ..."


In [None]:
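## Hugging Face access token removed: never store credentials in the notebook. Revoke the token that was committed here and authenticate via notebook_login() or a Colab HF_TOKEN secret instead.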

In [None]:
from transformers import AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
import torch

# Tokenizer of the base checkpoint that is fine-tuned below; get_tokenize() reads this
# global when the dataset is mapped.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:

# Table of ready-made training texts (the "sft_input" column is what gets tokenized).
df_sft_sft = pd.read_csv(
    "/content/drive/MyDrive/Papers/Original_Papers/Llm_drug_prediction/Code/reasoning_data_api/just_for_sft.csv"
)

# Stack 100 copies of the table end to end and renumber the rows 0..N-1.
df_sft_sft_100 = pd.concat([df_sft_sft for _ in range(100)], axis=0, ignore_index=True)

# Hand the replicated frame to Hugging Face `datasets`.
dset = Dataset.from_pandas(df_sft_sft_100)

def get_tokenize(dset):
    """Tokenize the ``sft_input`` text of one dataset example.

    Args:
        dset: A single example exposing an ``"sft_input"`` entry. (The name is kept
            for compatibility; ``Dataset.map`` is called without ``batched`` below,
            so this receives one example at a time, not the whole dataset.)

    Returns:
        The tokenizer output for that text, truncated to the tokenizer's maximum
        length and left unpadded.
    """
    # Do not pad to "max_length" here: that stretches every example to
    # tokenizer.model_max_length (16384 for this tokenizer, per the check further down)
    # with pad tokens, wasting memory and compute. Padding is left to the trainer's
    # data collator, which pads each batch only to its longest sequence.
    # NOTE(review): the earlier run padded first and the trainer then truncated the
    # padded rows; that may be why it logged a training loss of 0.0 -- confirm on re-run.
    return tokenizer(dset["sft_input"], truncation=True)


dset2train = dset.map(get_tokenize)
def get_sft_prompt(input , thinking , answer):
    """Format one training example in the DeepSeek-R1 chat layout.

    The reasoning block belongs to the assistant turn: the assistant marker comes
    first, then ``<think>...</think>``, then the final answer, and the sequence ends
    at the end-of-sentence token. The previous template placed the ``<think>`` block
    inside the user turn and appended a second ``<｜Assistant｜>`` marker after the
    end-of-sentence token, so the model was trained on a layout it never sees at
    inference time.

    Args:
        input: User prompt text.
        thinking: Reasoning trace to place between ``<think>`` and ``</think>``.
        answer: Final assistant answer that follows the reasoning block.

    Returns:
        str: The fully formatted training text.
    """
    return (
        f"<｜begin▁of▁sentence｜><｜User｜>{input}"
        f"<｜Assistant｜><think>\n{thinking}\n</think>\n\n{answer}<｜end▁of▁sentence｜>"
    )


# df_sft["sft_input"] = df_sft.apply(lambda row : get_sft_prompt(row["1st_prompt"] ,f"""{row["think1"]} \n{row["answer1"]} \n but wait lets analysis again \n{row["think2"]}\n {row["answer2"]}"""  , row["anwer3"]) , axis = 1)
# Load model directly


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
tokenizer.model_max_length

16384

In [None]:
# Load the base checkpoint unquantised in bfloat16.
# Author's note: this checkpoint was not working here; alternatives under consideration were
# a 7B model, "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B" or "meta-llama/Llama-3.1-8B-Instruct".
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
                                             device_map="auto",  # Automatically places layers on available devices
                                             torch_dtype=torch.bfloat16,
                                             )

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.67G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/7.39G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]



In [None]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_

In [None]:
from transformers import AutoConfig

# Fetch only the model configuration (no weights) to inspect its settings.
config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
print(config.max_position_embeddings)  # The model's context length, in positions

131072


In [None]:
# Interactive Hugging Face login widget; done before loading the
# "meta-llama/Llama-3.1-8B-Instruct" tokenizer in the next cell.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# NOTE(review): this rebinds the global `tokenizer` (previously the DeepSeek-R1-Distill
# tokenizer) to the Llama-3.1 tokenizer. get_tokenize() reads that global, so re-running
# the dataset mapping after this cell would tokenize with the Llama tokenizer. A later
# cell loads the same Llama tokenizer under the separate name `tokenizer2`.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
# Throwaway three-turn conversation used only to inspect how the tokenizer's chat
# template lays out system / user / assistant turns.
message = [
    {"role":"system","content":"you are a helpful assistant"},
    {"role": "user", "content":"hi what is yourname? jsekhgflkja;`qwhbufgln qwbgl;uq b"},
    {"role":"assistant","content":" foul " }
]
# Render the conversation with the chat template and return the token ids as a PyTorch tensor.
tokenized = tokenizer.apply_chat_template(message, tokenize=True, return_tensors="pt")

tokenized

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1627,  10263,    220,   2366,     19,    271,   9514,    527,
            264,  11190,  18328, 128009, 128006,    882, 128007,    271,   6151,
           1148,    374,    701,    609,     30,    503,    325,  31764,     70,
           1517,     74,   5697,     26,     63,     80,   1336,   6034,     70,
           2312,   2874,  20824,   6200,     26,     84,     80,    293, 128009,
         128006,  78191, 128007,    271,     69,  11206, 128009]])

In [None]:
tokenizer.decode(tokenized[0])

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nyou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nhi what is yourname? jsekhgflkja;`qwhbufgln qwbgl;uq b<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nfoul<|eot_id|>'

In [None]:
# 4-bit NF4 weight quantisation with double quantisation; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# LoRA adapters on the attention projections and on the MLP projections.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,  # Rank of the low-rank matrices
    lora_alpha=16,  # Scaling factor
    lora_dropout=0.05,
    bias="none",
    # Modules to apply LoRA to
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"] + ["down_proj", "up_proj", "gate_proj"],
)

# Author's note: this checkpoint was not working here; alternatives under consideration were
# a 7B model, "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", "meta-llama/Llama-3.1-8B-Instruct",
# "manycore-research/SpatialLM-Llama-1B" or "deepseek-ai/DeepSeek-R1".
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    quantization_config=bnb_config,
    device_map="auto",  # Automatically places layers on available devices
    torch_dtype=torch.bfloat16,
    #  attn_implementation="flash_attention_2"
)

# Prepare the quantised model for k-bit training, then wrap it with the LoRA adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

# [len(tokenizer.tokenize(i)) for i in df_sft["sft_input"]]


RuntimeError: Failed to import transformers.integrations.bitsandbytes because of the following error (look up to see its traceback):
0 active drivers ([]). There should only be one.

In [None]:
!pip install -q torch ninja packaging
!pip install -q flash-attn --no-build-isolation

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m422.8/422.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone


In [None]:
!pip install -q  accelerate

In [None]:

# df_sft_sft = df_sft[["sft_input"]]
# df_sft_sft.to_csv("/content/drive/MyDrive/Papers/Original_Papers/Llm_drug_prediction/Code/reasoning_data_api/just_for_sft.csv", index = False)
offload_dir = "./offload"
# Hyper-parameters for the supervised fine-tuning run.
# NOTE(review): the model is loaded with torch_dtype=torch.bfloat16 while fp16=True asks
# for fp16 mixed precision -- confirm this combination is intended; the recorded run
# logged a training loss of 0.0 at every step.
training_args = SFTConfig(
    output_dir="/tmp",                  # Directory for output files
    num_train_epochs=1,                # Number of training epochs
    per_device_train_batch_size=2,     # Batch size per device
    gradient_accumulation_steps=1,     # Number of updates steps to accumulate
    learning_rate=2e-4,                # Learning rate
    weight_decay=0.01,                 # Weight decay
    logging_dir="./logs",              # Directory for logs
    logging_steps=1,                  # Log every X steps
    save_steps=500,                    # Save checkpoint every X steps
    save_total_limit=2,                # Max number of checkpoints to keep
    # evaluation_strategy="steps",       # Evaluate every X steps
    # eval_steps=100,                    # Evaluation steps
    fp16=True,                         # Use mixed precision training
    # The run has 500 examples / (2 per step x 1 accumulation) = 250 optimizer steps on one
    # device, so the former warmup_steps=500 kept the whole run inside warmup (the learning
    # rate never reached 2e-4 and the cosine decay never started). Warm up for ~10% instead.
    warmup_steps=25,                   # Number of warmup steps
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    report_to = "none",
    torch_compile =True,
    gradient_checkpointing= True,
    # use_flash_attention_2=True,
    # low_cpu_mem_usage=True  ,
    # torch_dtype="auto",  # Automatically uses bf16/fp16 if supported
    # device_map="auto",
)

# Fine-tune `model` on the tokenized dataset using the settings in `training_args`.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    train_dataset=dset2train,
)
trainer.train()

The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here.


Truncating train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
W0326 10:54:59.310000 6752 torch/_inductor/utils.py:1137] [14/0] Not enough SMs to use max_autotune_gemm mode


Step,Training Loss
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0


KeyboardInterrupt: 

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Rebind `tokenizer` to the DeepSeek-R1-Distill tokenizer; the model itself is not loaded here.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
# model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

In [None]:
tokenizer.model_max_length

16384

In [None]:
# Interactive Hugging Face login widget; done before loading the
# "meta-llama/Llama-3.1-8B-Instruct" tokenizer in the next cell.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:

# Llama-3.1 tokenizer kept under its own name so the DeepSeek `tokenizer` stays bound.
tokenizer2 = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
tokenizer2.model_max_length

131072