In [2]:
import sqlite3
import pandas as pd

# Path to your database
db_path = 'chembl_35/chembl_35_sqlite/chembl_35.db'

# Connect
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# List available tables
tables = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
print("Available tables:")
for table in tables:
    print("-", table[0])


Available tables:
- action_type
- assay_type
- chembl_id_lookup
- confidence_score_lookup
- curation_lookup
- chembl_release
- source
- relationship_type
- target_type
- variant_sequences
- bioassay_ontology
- data_validity_lookup
- activity_smid
- activity_stds_lookup
- assay_classification
- atc_classification
- bio_component_sequences
- component_sequences
- protein_classification
- domains
- go_classification
- structural_alert_sets
- products
- frac_classification
- hrac_classification
- irac_classification
- research_stem
- organism_class
- patent_use_codes
- usan_stems
- version
- cell_dictionary
- docs
- target_dictionary
- tissue_dictionary
- molecule_dictionary
- activity_supp
- component_class
- component_domains
- component_go
- component_synonyms
- structural_alerts
- defined_daily_dose
- product_patents
- protein_class_synonyms
- research_companies
- assays
- compound_records
- binding_sites
- biotherapeutics
- compound_properties
- compound_structural_alerts
- compound_s

In [3]:
query = """
SELECT md.chembl_id, cs.canonical_smiles
FROM molecule_dictionary md
JOIN compound_structures cs ON md.molregno = cs.molregno
LIMIT 10;
"""
df_smiles = pd.read_sql_query(query, conn)
df_smiles.head()


Unnamed: 0,chembl_id,canonical_smiles
0,CHEMBL6329,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl
1,CHEMBL6328,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1
2,CHEMBL265667,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1
3,CHEMBL6362,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1
4,CHEMBL267864,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1


In [6]:
def show_columns(table_name):
    query = f"PRAGMA table_info({table_name})"
    cols = pd.read_sql_query(query, conn)
    print(f"\nüìä Columns in table `{table_name}`:\n")
    print(cols[['name', 'type']])

# List columns in key tables
for table in [
    'activities',
    'molecule_dictionary',
    'compound_structures',
    'target_dictionary',
    'drug_indication',
    'assays'
]:
    show_columns(table)



üìä Columns in table `activities`:

                     name           type
0             activity_id         BIGINT
1                assay_id         BIGINT
2                  doc_id         BIGINT
3               record_id         BIGINT
4                molregno         BIGINT
5       standard_relation    VARCHAR(50)
6          standard_value        NUMERIC
7          standard_units   VARCHAR(100)
8           standard_flag       SMALLINT
9           standard_type   VARCHAR(250)
10       activity_comment  VARCHAR(4000)
11  data_validity_comment    VARCHAR(30)
12    potential_duplicate       SMALLINT
13          pchembl_value  NUMERIC(4, 2)
14           bao_endpoint    VARCHAR(11)
15               uo_units    VARCHAR(10)
16             qudt_units    VARCHAR(70)
17                   toid        INTEGER
18            upper_value        NUMERIC
19   standard_upper_value        NUMERIC
20                 src_id        INTEGER
21                   type   VARCHAR(250)
22               re

In [7]:
query = """
SELECT 
    a.assay_id,
    a.molregno,
    md.chembl_id AS drug_id,
    md.pref_name AS drug_name,
    a.standard_type,
    a.standard_value,
    a.standard_units,
    a.standard_relation,
    ass.tid,
    td.pref_name AS target_name,
    td.organism AS target_organism
FROM activities a
JOIN molecule_dictionary md ON a.molregno = md.molregno
JOIN assays ass ON a.assay_id = ass.assay_id
JOIN target_dictionary td ON ass.tid = td.tid
WHERE a.standard_type IN ('IC50', 'Ki', 'Kd')
  AND a.standard_value IS NOT NULL
  AND a.standard_units = 'nM'
LIMIT 100;
"""
df_activity = pd.read_sql_query(query, conn)
df_activity.head()


Unnamed: 0,assay_id,molregno,drug_id,drug_name,standard_type,standard_value,standard_units,standard_relation,tid,target_name,target_organism
0,54505,180094,CHEMBL113081,,IC50,100000.0,nM,>,63,DNA topoisomerase II alpha,Homo sapiens
1,83907,182268,CHEMBL324340,,IC50,2500.0,nM,=,11653,Heparanase,Homo sapiens
2,88152,182268,CHEMBL324340,,IC50,50000.0,nM,>,22221,NON-PROTEIN TARGET,
3,83907,182855,CHEMBL109600,,IC50,9000.0,nM,=,11653,Heparanase,Homo sapiens
4,154606,252199,CHEMBL357278,,IC50,4000.0,nM,=,10483,Palmitoyl-CoA oxidase,Rattus norvegicus


In [8]:
query = """
SELECT 
    md.chembl_id AS drug_id,
    cs.canonical_smiles
FROM molecule_dictionary md
JOIN compound_structures cs ON md.molregno = cs.molregno
WHERE cs.canonical_smiles IS NOT NULL
LIMIT 100;
"""
df_smiles = pd.read_sql_query(query, conn)
df_smiles.head()


Unnamed: 0,drug_id,canonical_smiles
0,CHEMBL6329,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl
1,CHEMBL6328,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1
2,CHEMBL265667,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1
3,CHEMBL6362,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1
4,CHEMBL267864,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1


In [9]:
query = """
SELECT 
    md.chembl_id AS drug_id,
    di.mesh_id,
    di.mesh_heading,
    di.efo_id,
    di.efo_term
FROM drug_indication di
JOIN molecule_dictionary md ON di.molregno = md.molregno
LIMIT 100;
"""
df_indications = pd.read_sql_query(query, conn)
df_indications.head()


Unnamed: 0,drug_id,mesh_id,mesh_heading,efo_id,efo_term
0,CHEMBL1201823,D045743,"Scleroderma, Diffuse",EFO:0000404,diffuse scleroderma
1,CHEMBL1201823,D001172,"Arthritis, Rheumatoid",EFO:0000685,rheumatoid arthritis
2,CHEMBL1201584,D009203,Myocardial Infarction,EFO:0000612,myocardial infarction
3,CHEMBL271227,D011471,Prostatic Neoplasms,EFO:0001663,prostate carcinoma
4,CHEMBL1566,D003920,Diabetes Mellitus,EFO:0000400,diabetes mellitus


In [10]:
def show_columns(table_name):
    print(f"\nüìä Columns in `{table_name}`")
    query = f"PRAGMA table_info({table_name})"
    df = pd.read_sql_query(query, conn)
    display(df[['name', 'type']])

for table in [
    'compound_properties',
    'structural_alerts',
    'compound_structural_alerts',
    'structural_alert_sets',
    'drug_warning',
    'metabolism',
    'drug_mechanism'
]:
    show_columns(table)



üìä Columns in `compound_properties`


Unnamed: 0,name,type
0,molregno,BIGINT
1,mw_freebase,"NUMERIC(9, 2)"
2,alogp,"NUMERIC(9, 2)"
3,hba,INTEGER
4,hbd,INTEGER
5,psa,"NUMERIC(9, 2)"
6,rtb,INTEGER
7,ro3_pass,VARCHAR(3)
8,num_ro5_violations,SMALLINT
9,cx_most_apka,"NUMERIC(9, 2)"



üìä Columns in `structural_alerts`


Unnamed: 0,name,type
0,alert_id,BIGINT
1,alert_set_id,BIGINT
2,alert_name,VARCHAR(100)
3,smarts,VARCHAR(4000)



üìä Columns in `compound_structural_alerts`


Unnamed: 0,name,type
0,cpd_str_alert_id,BIGINT
1,molregno,BIGINT
2,alert_id,BIGINT



üìä Columns in `structural_alert_sets`


Unnamed: 0,name,type
0,alert_set_id,BIGINT
1,set_name,VARCHAR(100)
2,priority,SMALLINT





Unnamed: 0,name,type
0,warning_id,BIGINT
1,record_id,BIGINT
2,molregno,BIGINT
3,warning_type,VARCHAR(20)
4,warning_class,VARCHAR(100)
5,warning_description,VARCHAR(4000)
6,warning_country,VARCHAR(1000)
7,warning_year,INTEGER
8,efo_term,VARCHAR(200)
9,efo_id,VARCHAR(20)



üìä Columns in `metabolism`


Unnamed: 0,name,type
0,met_id,BIGINT
1,drug_record_id,BIGINT
2,substrate_record_id,BIGINT
3,metabolite_record_id,BIGINT
4,pathway_id,BIGINT
5,pathway_key,VARCHAR(50)
6,enzyme_name,VARCHAR(200)
7,enzyme_tid,BIGINT
8,met_conversion,VARCHAR(200)
9,organism,VARCHAR(100)



üìä Columns in `drug_mechanism`


Unnamed: 0,name,type
0,mec_id,BIGINT
1,record_id,BIGINT
2,molregno,BIGINT
3,mechanism_of_action,VARCHAR(250)
4,tid,BIGINT
5,site_id,BIGINT
6,action_type,VARCHAR(50)
7,direct_interaction,SMALLINT
8,molecular_mechanism,SMALLINT
9,disease_efficacy,SMALLINT


In [11]:
import pandas as pd

def get_table_columns(connection):
    tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
    tables = pd.read_sql_query(tables_query, connection)['name'].tolist()
    
    column_dict = {}
    for table in tables:
        try:
            cols = pd.read_sql_query(f"PRAGMA table_info({table});", connection)
            column_dict[table] = cols[['name', 'type']]
        except:
            column_dict[table] = 'Error reading columns'
    return column_dict

all_columns = get_table_columns(conn)

# Display results for verification
for table, cols in all_columns.items():
    print(f"\nüì¶ Table: {table}")
    print(cols)



üì¶ Table: action_type
          name          type
0  action_type   VARCHAR(50)
1  description  VARCHAR(200)
2  parent_type   VARCHAR(50)

üì¶ Table: assay_type
         name          type
0  assay_type    VARCHAR(1)
1  assay_desc  VARCHAR(250)

üì¶ Table: chembl_id_lookup
          name         type
0    chembl_id  VARCHAR(20)
1  entity_type  VARCHAR(50)
2    entity_id       BIGINT
3       status  VARCHAR(10)
4  last_active      INTEGER

üì¶ Table: confidence_score_lookup
               name          type
0  confidence_score      SMALLINT
1       description  VARCHAR(100)
2    target_mapping   VARCHAR(30)

üì¶ Table: curation_lookup
          name          type
0   curated_by   VARCHAR(32)
1  description  VARCHAR(100)

üì¶ Table: chembl_release
                name         type
0  chembl_release_id      INTEGER
1     chembl_release  VARCHAR(20)
2      creation_date     DATETIME

üì¶ Table: source
              name           type
0           src_id        INTEGER
1  src_descr

In [1]:
!pip install tdqm

Collecting tdqm
  Downloading tdqm-0.0.1.tar.gz (1.4 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: tdqm
  Building wheel for tdqm (setup.py): started
  Building wheel for tdqm (setup.py): finished with status 'done'
  Created wheel for tdqm: filename=tdqm-0.0.1-py3-none-any.whl size=1395 sha256=5191731ad923de20eb37a12f5f25c2b923479d11aab233cd2e1168dc996bd3bc
  Stored in directory: c:\users\konde\appdata\local\pip\cache\wheels\af\02\71\aae0f7ee738abf19498353918ddae0f90a0d6ceb337b0bbc91
Successfully built tdqm
Installing collected packages: tdqm
Successfully installed tdqm-0.0.1


  DEPRECATION: Building 'tdqm' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'tdqm'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [1]:
import sqlite3
import pandas as pd
import time
from tqdm import tqdm

start_total = time.time()

print("üîå Connecting to database...")
conn = sqlite3.connect("chembl_35/chembl_35_sqlite/chembl_35.db")

print("üîç Step 1: Running TEST query with LIMIT 50000...")
start = time.time()
query = """
SELECT 
    md.chembl_id AS drug_id,
    cs.canonical_smiles AS SMILES,
    cp.cx_logd AS logD,
    cp.qed_weighted AS drug_likeness,
    cp.psa AS psa,
    cp.cx_logp AS logP,
    act.standard_value AS IC50,
    td.pref_name AS target,
    di.efo_term AS indication,
    sa.alert_name AS toxicity_alert,
    dw.warning_description AS warning,
    met.enzyme_name AS metabolism_enzyme,
    met.met_conversion AS metabolism_conversion

FROM molecule_dictionary md
JOIN compound_structures cs ON md.molregno = cs.molregno
JOIN compound_properties cp ON md.molregno = cp.molregno
JOIN activities act ON md.molregno = act.molregno AND act.standard_type = 'IC50' AND act.standard_value IS NOT NULL
JOIN assays a ON act.assay_id = a.assay_id AND a.confidence_score >= 8
JOIN target_dictionary td ON a.tid = td.tid
LEFT JOIN drug_indication di ON md.molregno = di.molregno
LEFT JOIN compound_structural_alerts csa ON md.molregno = csa.molregno
LEFT JOIN structural_alerts sa ON csa.alert_id = sa.alert_id
LEFT JOIN drug_warning dw ON md.molregno = dw.molregno
LEFT JOIN compound_records cr ON md.molregno = cr.molregno
LEFT JOIN metabolism met ON cr.record_id = met.substrate_record_id

LIMIT 50000
"""

df = pd.read_sql_query(query, conn)
conn.close()
print(f"‚úÖ Query finished in {round(time.time() - start, 2)} seconds")
print(f"üî¢ Rows fetched: {len(df):,}")

# Step 2: Cleaning
print("üßπ Step 2: Cleaning...")
start = time.time()
df.dropna(subset=["SMILES", "IC50", "target"], inplace=True)
df.drop_duplicates(inplace=True)
df["IC50"] = pd.to_numeric(df["IC50"], errors="coerce")
print(f"‚úÖ Cleaned in {round(time.time() - start, 2)} seconds")

# Step 3: Save to CSV + Parquet
print("üíæ Step 3: Saving to disk...")
start = time.time()

# CSV with tqdm
with tqdm(total=len(df), desc="Saving CSV", unit="rows") as pbar:
    df.to_csv("test_clean_chembl.csv", index=False)
    pbar.update(len(df))

# Parquet (fast!)
df.to_parquet("test_clean_chembl.parquet", index=False)
print(f"‚úÖ Saved CSV + Parquet in {round(time.time() - start, 2)} seconds")

print(f"üèÅ ALL DONE in {round(time.time() - start_total, 2)} seconds")


üîå Connecting to database...
üîç Step 1: Running TEST query with LIMIT 50000...
‚úÖ Query finished in 2.01 seconds
üî¢ Rows fetched: 50,000
üßπ Step 2: Cleaning...
‚úÖ Cleaned in 0.09 seconds
üíæ Step 3: Saving to disk...


Saving CSV: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1871/1871 [00:00<00:00, 42983.98rows/s]


‚úÖ Saved CSV + Parquet in 0.91 seconds
üèÅ ALL DONE in 3.01 seconds


In [5]:
import pandas as pd
import numpy as np
# Load dataset (choose either CSV or Parquet)
df = pd.read_csv("test_clean_chembl.csv")  # OR
# df = pd.read_parquet("test_clean_chembl.parquet")

# üîç Preview top rows
print("üîπ Preview:")
print(df.head())

# üìä Summary info
print("\nüîπ Data Info:")
print(df.info())

# üßº Missing values check
print("\nüîπ Null Values:")
print(df.isnull().sum())

# üî¢ Unique targets and indications
print("\nüîπ Unique targets:", df['target'].nunique())
print("üîπ Unique indications:", df['indication'].nunique())

# üìà IC50 distribution
print("\nüîπ IC50 (nM) stats:")
print(df['IC50'].describe())

# üìä Most common targets
print("\nüîπ Top 10 Targets:")
print(df['target'].value_counts().head(10))

# üß™ Optional: Convert IC50 to pIC50 (‚Äìlog10(IC50 in molar units))
df["pIC50"] = -df["IC50"].astype(float).div(1e9).apply(lambda x: np.log10(x) if x > 0 else None)

# üéØ Filter: Only rows targeting EGFR
egfr_df = df[df["target"].str.contains("EGFR", na=False)]
print("\nüîç Drugs targeting EGFR:")
print(egfr_df[["drug_id", "SMILES", "IC50", "indication"]].head())


üîπ Preview:
        drug_id                                             SMILES  logD  \
0  CHEMBL113081           c1ccc(-c2nc3c(-c4nc5ccccc5o4)cccc3o2)cc1  4.72   
1  CHEMBL324340  Cc1ccc2oc(-c3cccc(N4C(=O)c5ccc(C(=O)O)cc5C4=O)...  0.74   
2  CHEMBL109600  COc1ccccc1-c1ccc2oc(-c3ccc(OC)c(N4C(=O)c5ccc(C...  1.56   
3  CHEMBL357278  Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4ccc(Cl)c(C(...  3.62   
4  CHEMBL357119  Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)NCCc4ccccc4)CC...  2.12   

   drug_likeness     psa  logP      IC50                      target  \
0           0.44   52.06  4.72  100000.0  DNA topoisomerase II alpha   
1           0.52  100.71  4.12    2500.0                  Heparanase   
2           0.28  119.17  4.94    9000.0                  Heparanase   
3           0.44   77.93  3.71    4000.0       Palmitoyl-CoA oxidase   
4           0.48   77.93  2.22   17000.0       Palmitoyl-CoA oxidase   

0        NaN            NaN      NaN               NaN                   NaN  
1        NaN    p

In [6]:
egfr_df = df[df["target"].str.contains("epidermal growth factor", case=False, na=False)]


In [7]:
# Keep rows with non-null SMILES, IC50, and indication
filtered_df = df.dropna(subset=["SMILES", "IC50", "indication"])
print(f"‚úÖ Rows with all critical data: {len(filtered_df)}")


‚úÖ Rows with all critical data: 186


In [8]:
import numpy as np
df["pIC50"] = -df["IC50"].div(1e9).apply(lambda x: np.log10(x) if x > 0 else None)
strong_binders = df[df["pIC50"] > 7]
strong_binders.to_csv("strong_binders.csv", index=False)


In [9]:
df1 = pd.read_csv("strong_binders.csv")
df1.head()

Unnamed: 0,drug_id,SMILES,logD,drug_likeness,psa,logP,IC50,target,indication,toxicity_alert,warning,metabolism_enzyme,metabolism_conversion,pIC50
0,CHEMBL305153,CC(C)(C)NC[C@H](O)CON=C1c2ccccc2-c2ccccc21,1.08,0.71,53.85,3.55,29.4,Beta-1 adrenergic receptor,,imine,,,,7.531653
1,CHEMBL305153,CC(C)(C)NC[C@H](O)CON=C1c2ccccc2-c2ccccc21,1.08,0.71,53.85,3.55,29.4,Beta-1 adrenergic receptor,,Oxygen-nitrogen single bond,,,,7.531653
2,CHEMBL305153,CC(C)(C)NC[C@H](O)CON=C1c2ccccc2-c2ccccc21,1.08,0.71,53.85,3.55,29.4,Beta-1 adrenergic receptor,,Long aliphatic chain,,,,7.531653
3,CHEMBL305153,CC(C)(C)NC[C@H](O)CON=C1c2ccccc2-c2ccccc21,1.08,0.71,53.85,3.55,30.8,Beta-2 adrenergic receptor,,imine,,,,7.511449
4,CHEMBL305153,CC(C)(C)NC[C@H](O)CON=C1c2ccccc2-c2ccccc21,1.08,0.71,53.85,3.55,30.8,Beta-2 adrenergic receptor,,Oxygen-nitrogen single bond,,,,7.511449


In [11]:
import pandas as pd
import sqlite3

# Connect to your ChEMBL SQLite database
conn = sqlite3.connect("chembl_35/chembl_35_sqlite/chembl_35.db")

# Define the solid SQL query
query = """
SELECT DISTINCT
    md.chembl_id                    AS drug_id,
    md.pref_name                    AS drug_name,
    cs.canonical_smiles             AS SMILES,
    cp.cx_logd                      AS logD,
    cp.cx_logp                      AS logP,
    cp.psa                          AS psa,
    cp.qed_weighted                 AS drug_likeness,
    md.max_phase                    AS max_phase,
    act.standard_value              AS IC50,
    td.pref_name                    AS target,
    td.organism                     AS organism,
    td.target_type                  AS target_type,
    dm.mechanism_of_action          AS mechanism_of_action,
    di.efo_term                     AS efo_term,
    di.efo_id                       AS efo_id,
    di.mesh_heading                 AS mesh_heading,
    di.mesh_id                      AS mesh_id,
    sa.alert_name                   AS toxicity_alert

FROM molecule_dictionary md
INNER JOIN compound_structures cs ON md.molregno = cs.molregno
INNER JOIN compound_properties cp ON md.molregno = cp.molregno
INNER JOIN activities act ON md.molregno = act.molregno
    AND act.standard_type = 'IC50'
    AND act.standard_value IS NOT NULL
INNER JOIN assays a ON act.assay_id = a.assay_id
    AND a.confidence_score >= 8
INNER JOIN target_dictionary td ON a.tid = td.tid
LEFT JOIN drug_mechanism dm ON md.molregno = dm.molregno AND dm.tid = td.tid
LEFT JOIN drug_indication di ON md.molregno = di.molregno
LEFT JOIN compound_structural_alerts csa ON md.molregno = csa.molregno
LEFT JOIN structural_alerts sa ON csa.alert_id = sa.alert_id
LIMIT 50000;
"""

# Run the query and load into DataFrame
df = pd.read_sql_query(query, conn)

# Clean: drop any rows with NULL SMILES or IC50 just in case
df.dropna(subset=["SMILES", "IC50"], inplace=True)

# Convert IC50 to pIC50 (optional)
import numpy as np
df["pIC50"] = -df["IC50"].astype(float).div(1e9).apply(lambda x: np.log10(x) if x > 0 else None)

# Preview
df.head()


Unnamed: 0,drug_id,drug_name,SMILES,logD,logP,psa,drug_likeness,max_phase,IC50,target,organism,target_type,mechanism_of_action,efo_term,efo_id,mesh_heading,mesh_id,toxicity_alert,pIC50
0,CHEMBL113081,,c1ccc(-c2nc3c(-c4nc5ccccc5o4)cccc3o2)cc1,4.72,4.72,52.06,0.44,,100000.0,DNA topoisomerase II alpha,Homo sapiens,SINGLE PROTEIN,,,,,,,4.0
1,CHEMBL324340,,Cc1ccc2oc(-c3cccc(N4C(=O)c5ccc(C(=O)O)cc5C4=O)...,0.74,4.12,100.71,0.52,,2500.0,Heparanase,Homo sapiens,SINGLE PROTEIN,,,,,,phthalimide,5.60206
2,CHEMBL109600,,COc1ccccc1-c1ccc2oc(-c3ccc(OC)c(N4C(=O)c5ccc(C...,1.56,4.94,119.17,0.28,,9000.0,Heparanase,Homo sapiens,SINGLE PROTEIN,,,,,,phthalimide,5.045757
3,CHEMBL357278,,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4ccc(Cl)c(C(...,3.62,3.71,77.93,0.44,,4000.0,Palmitoyl-CoA oxidase,Rattus norvegicus,SINGLE PROTEIN,,,,,,,5.39794
4,CHEMBL357119,,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)NCCc4ccccc4)CC...,2.12,2.22,77.93,0.48,,17000.0,Palmitoyl-CoA oxidase,Rattus norvegicus,SINGLE PROTEIN,,,,,,,4.769551


In [16]:
df = df[df["max_phase"] >= 2]             # Keep only clinical-stage drugs
df = df[df["mechanism_of_action"].notna()]  # Only drugs with known MoA
df = df[df["efo_term"].notna()]             # Keep drugs with disease linkage
df.head()
df.shape[0]

3761

In [2]:
import pandas as pd
import sqlite3
import numpy as np

# Connect to your ChEMBL SQLite database
conn = sqlite3.connect("chembl_35/chembl_35_sqlite/chembl_35.db")

# Optimized SQL query (no LIMIT)
query = """
SELECT DISTINCT
    md.chembl_id                    AS drug_id,
    md.pref_name                    AS drug_name,
    cs.canonical_smiles             AS SMILES,
    cp.cx_logd                      AS logD,
    cp.cx_logp                      AS logP,
    cp.psa                          AS psa,
    cp.qed_weighted                 AS drug_likeness,
    md.max_phase                    AS max_phase,
    act.standard_value              AS IC50,
    td.pref_name                    AS target,
    td.organism                     AS organism,
    td.target_type                  AS target_type,
    dm.mechanism_of_action          AS mechanism_of_action,
    di.efo_term                     AS efo_term,
    di.efo_id                       AS efo_id,
    di.mesh_heading                 AS mesh_heading,
    di.mesh_id                      AS mesh_id,
    sa.alert_name                   AS toxicity_alert
FROM molecule_dictionary md
INNER JOIN compound_structures cs ON md.molregno = cs.molregno
INNER JOIN compound_properties cp ON md.molregno = cp.molregno
INNER JOIN activities act ON md.molregno = act.molregno
    AND act.standard_type = 'IC50'
    AND act.standard_value IS NOT NULL
INNER JOIN assays a ON act.assay_id = a.assay_id
    AND a.confidence_score >= 8
INNER JOIN target_dictionary td ON a.tid = td.tid
LEFT JOIN drug_mechanism dm ON md.molregno = dm.molregno AND dm.tid = td.tid
LEFT JOIN drug_indication di ON md.molregno = di.molregno
LEFT JOIN compound_structural_alerts csa ON md.molregno = csa.molregno
LEFT JOIN structural_alerts sa ON csa.alert_id = sa.alert_id
"""

# Run the query
print("üîÑ Loading data...")
df = pd.read_sql_query(query, conn)

# Drop rows missing SMILES or IC50
df.dropna(subset=["SMILES", "IC50"], inplace=True)

# Filter to clinically relevant + annotated data
df = df[df["max_phase"] >= 2]
df = df[df["mechanism_of_action"].notna()]
df = df[df["efo_term"].notna()]

# Drop duplicates
df.drop_duplicates(inplace=True)

# Compute pIC50
df["pIC50"] = -df["IC50"].astype(float).div(1e9).apply(lambda x: np.log10(x) if x > 0 else None)

# Save to CSV (added line)
df.to_csv("cleaned_clinical_drugs_dataset.csv", index=False)

# Summary
print("‚úÖ Final dataset shape:", df.shape)
print("üß™ Columns:", list(df.columns))


üîÑ Loading data...
‚úÖ Final dataset shape: (553992, 19)
üß™ Columns: ['drug_id', 'drug_name', 'SMILES', 'logD', 'logP', 'psa', 'drug_likeness', 'max_phase', 'IC50', 'target', 'organism', 'target_type', 'mechanism_of_action', 'efo_term', 'efo_id', 'mesh_heading', 'mesh_id', 'toxicity_alert', 'pIC50']


In [7]:
df.head(30)

Unnamed: 0,drug_id,drug_name,SMILES,logD,logP,psa,drug_likeness,max_phase,IC50,target,organism,target_type,mechanism_of_action,efo_term,efo_id,mesh_heading,mesh_id,toxicity_alert,pIC50
0,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,AIDS,EFO:0000765,Acquired Immunodeficiency Syndrome,D000163,triple bond,9.124939
1,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,immunodeficiency disease,MONDO:0021094,Autoimmune Diseases,D001327,triple bond,9.124939
2,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,type 2 diabetes mellitus,MONDO:0005148,"Diabetes Mellitus, Type 2",D003924,triple bond,9.124939
3,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,hepatitis C virus infection,EFO:0003047,Hepatitis C,D006526,triple bond,9.124939
4,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,infection,EFO:0000544,Infections,D007239,triple bond,9.124939
5,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,non-Hodgkins lymphoma,EFO:0005952,"Lymphoma, Non-Hodgkin",D008228,triple bond,9.124939
6,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,malaria,EFO:0001068,Malaria,D008288,triple bond,9.124939
7,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,neoplasm,EFO:0000616,Neoplasms,D009369,triple bond,9.124939
8,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,pancreatic carcinoma,EFO:0002618,Pancreatic Neoplasms,D010190,triple bond,9.124939
9,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,tuberculosis,MONDO:0018076,Tuberculosis,D014376,triple bond,9.124939


In [4]:
# Number of unique drugs
num_unique_drugs = df['drug_id'].nunique()
print(f"üî¢ Unique drugs: {num_unique_drugs}")


üî¢ Unique drugs: 850


In [7]:
df['target'].unique()

array(['Human immunodeficiency virus type 1 reverse transcriptase',
       'Matrix metalloproteinase 9', 'Matrix metalloproteinase 13',
       'Steroid 5-alpha-reductase 2', 'Androgen Receptor',
       'Xanthine dehydrogenase', 'FK506-binding protein 1A',
       'Human immunodeficiency virus type 1 protease', 'Cyclophilin A',
       'Ileal bile acid transporter', 'Nitric oxide synthase, inducible',
       'Matrix metalloproteinase-1', 'Matrix metalloproteinase-2',
       'Matrix metalloproteinase 3', 'Tyrosine-protein kinase SRC',
       'Serotonin 1d (5-HT1d) receptor', 'Carbonic anhydrase II',
       'Dihydrofolate reductase', 'Cyclooxygenase-2', 'Aldose reductase',
       'Thymidylate synthase', 'Cytochrome P450 17A1',
       'Muscarinic acetylcholine receptor M3',
       'Farnesyl diphosphate synthase', 'HMG-CoA reductase',
       'Epidermal growth factor receptor erbB1',
       'Receptor protein-tyrosine kinase erbB-2',
       'Angiotensin-converting enzyme', 'Neuraminidase',
    

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load BioGPT model from Microsoft
model_name = "microsoft/BioGPT-Large"
biogpt_tokenizer = AutoTokenizer.from_pretrained(model_name)
biogpt_model = AutoModelForCausalLM.from_pretrained(model_name)

print("‚úÖ BioGPT-Large loaded and ready.")

‚úÖ BioGPT-Large loaded and ready.


In [2]:
save_directory = "./saved_models/biogpt-large"

# Save model
biogpt_model.save_pretrained(save_directory)

# Save tokenizer
biogpt_tokenizer.save_pretrained(save_directory)

print("üì¶ BioGPT-Large saved to:", save_directory)


üì¶ BioGPT-Large saved to: ./saved_models/biogpt-large


In [3]:
pip install peft bitsandbytes accelerate transformers datasets



Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-win_amd64.whl.metadata (10 kB)
Downloading bitsandbytes-0.46.1-py3-none-win_amd64.whl (72.2 MB)
   ---------------------------------------- 0.0/72.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.2 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.2 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.2 MB ? eta -:--:--
   ---------------------------------------- 0.5/72.2 MB 837.5 kB/s eta 0:01:26
   ---------------------------------------- 0.8/72.2 MB 1.0 MB/s eta 0:01:11
    --------------------------------------- 1.3/72.2 MB 1.2 MB/s eta 0:00:58
    --------------------------------------- 1.6/72.2 MB 1.3 MB/s eta 0:00:54
   - -------------------------------------- 1.8/72.2 MB 1.4 MB/s eta 0:00:50
   - -------------------------------------- 2.1/72.2 MB 1.4 MB/s eta 0:00:51
   - -------------------------------------- 2.4/72.2 MB 1.4 MB/s eta 0:00:52
   - ----------

In [6]:
import pandas as pd

# Load your dataset
df = pd.read_csv("data/cleaned_clinical_drugs_dataset.csv")
print("‚úÖ Loaded:", df.shape)
df.head(30)


‚úÖ Loaded: (553992, 19)


Unnamed: 0,drug_id,drug_name,SMILES,logD,logP,psa,drug_likeness,max_phase,IC50,target,organism,target_type,mechanism_of_action,efo_term,efo_id,mesh_heading,mesh_id,toxicity_alert,pIC50
0,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,AIDS,EFO:0000765,Acquired Immunodeficiency Syndrome,D000163,triple bond,9.124939
1,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,immunodeficiency disease,MONDO:0021094,Autoimmune Diseases,D001327,triple bond,9.124939
2,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,type 2 diabetes mellitus,MONDO:0005148,"Diabetes Mellitus, Type 2",D003924,triple bond,9.124939
3,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,hepatitis C virus infection,EFO:0003047,Hepatitis C,D006526,triple bond,9.124939
4,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,infection,EFO:0000544,Infections,D007239,triple bond,9.124939
5,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,non-Hodgkins lymphoma,EFO:0005952,"Lymphoma, Non-Hodgkin",D008228,triple bond,9.124939
6,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,malaria,EFO:0001068,Malaria,D008288,triple bond,9.124939
7,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,neoplasm,EFO:0000616,Neoplasms,D009369,triple bond,9.124939
8,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,pancreatic carcinoma,EFO:0002618,Pancreatic Neoplasms,D010190,triple bond,9.124939
9,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,tuberculosis,MONDO:0018076,Tuberculosis,D014376,triple bond,9.124939


In [2]:
columns_to_keep = [
    "drug_name", "SMILES", "target", "mechanism_of_action",
    "pIC50", "efo_term", "mesh_heading"
]

df = df[columns_to_keep].dropna()
print("üßº After cleaning:", df.shape)


üßº After cleaning: (553987, 7)


In [3]:
def create_prompt(row):
    prompt = f"""### Instruction:
Given the following drug information, predict the target and mechanism.

Drug Name: {row['drug_name']}
SMILES: {row['SMILES']}

### Response:
Target: {row['target']}
Mechanism: {row['mechanism_of_action']}"""
    return prompt

df['text'] = df.apply(create_prompt, axis=1)
formatted_df = df[['text']].rename(columns={'text': 'text'})

print("üìÑ Sample Prompt:\n", formatted_df.iloc[0]['text'])


üìÑ Sample Prompt:
 ### Instruction:
Given the following drug information, predict the target and mechanism.

Drug Name: EFAVIRENZ
SMILES: O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1

### Response:
Target: Human immunodeficiency virus type 1 reverse transcriptase
Mechanism: Human immunodeficiency virus type 1 reverse transcriptase inhibitor


In [4]:
import json

output_path = "formatted_data.jsonl"

with open(output_path, "w", encoding="utf-8") as f:
    for record in formatted_df['text']:
        json.dump({"text": record}, f)
        f.write("\n")

print(f"üíæ Saved formatted dataset to {output_path}")


üíæ Saved formatted dataset to formatted_data.jsonl


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch

# Load tokenizer and model
model_name = "microsoft/BioGPT-Large"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model in FP16 and manually push to GPU
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16
).to("cuda")

# Prepare for LoRA training (no quantization)
model = prepare_model_for_kbit_training(model)

# LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # fallback: try ["c_attn"] or inspect if error
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

# ‚úÖ Show trainable parameters
def print_trainable_parameters(model):
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"‚úÖ Trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")

print_trainable_parameters(model)


AssertionError: Torch not compiled with CUDA enabled

In [6]:
# Count number of lines in the saved JSONL file
with open("formatted_data.jsonl", "r", encoding="utf-8") as f:
    num_lines = sum(1 for _ in f)

print(f"üìä Lines saved in formatted_data.jsonl: {num_lines}")
print(f"üìÅ Expected rows: {len(formatted_df)}")


üìä Lines saved in formatted_data.jsonl: 553987
üìÅ Expected rows: 553987


In [13]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA configuration
peft_config = LoraConfig(
    r=4,  # Lower rank to reduce trainable parameters
    lora_alpha=16,
    target_modules=["q_proj"],  # Only adapting query projection layer
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Load base BioGPT model
base_model = AutoModelForCausalLM.from_pretrained("microsoft/BioGPT-Large")

# Apply LoRA configuration
model = get_peft_model(base_model, peft_config)

# Print trainable parameters
model.print_trainable_parameters()


trainable params: 614,400 || all params: 1,571,803,200 || trainable%: 0.039088863033234694


In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch

# Step 1: Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/BioGPT-Large")

# Step 2: Load formatted dataset
dataset = load_dataset("json", data_files="formatted_data.jsonl", split="train")

# Step 3: Tokenization
def tokenize_function(example):
    texts = [p + " " + c for p, c in zip(example["prompt"], example["completion"])]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Step 4: LoRA configuration (already optimized)
peft_config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Step 5: Load base model and apply LoRA
base_model = AutoModelForCausalLM.from_pretrained("microsoft/BioGPT-Large")
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

# Step 6: Training arguments
training_args = TrainingArguments(
    output_dir="./biogpt-lora-llm4mol",
    per_device_train_batch_size=4,       # Tune if GPU OOM
    gradient_accumulation_steps=4,       # Effective batch size = 16
    num_train_epochs=1,
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),      # Use FP16 if CUDA available
    report_to="none"
)

# Step 7: Data collator (for causal LM)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Step 8: Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# Step 9: Train the model
trainer.train()


The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.
  from scipy.stats import pearsonr, spearmanr


RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject