In [1]:
print("test")

test


In [1]:
!curl -L -o padel.zip https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip
!curl -L -o padel.sh https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  289k    0  289k    0     0   508k      0 --:--:-- --:--:-- --:--:--  518k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  289k    0  289k    0     0   688k      0 --:--:-- --:--:-- --:--:--  698k


In [6]:
!ls -lh padel.zip    # check file size
file padel.zip      # check actual file type


SyntaxError: invalid syntax (3744295861.py, line 2)

In [7]:
!curl -L -o padel.zip https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  289k    0  289k    0     0   555k      0 --:--:-- --:--:-- --:--:--  561k


In [8]:
!unzip padel.zip

Archive:  padel.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of padel.zip or
        padel.zip.zip, and cannot find padel.zip.ZIP, period.


In [3]:
import pandas as pd
from padelpy import from_smiles
import os

# 1. Load your data
input_file = '../data/combined_bbb_classification_fixed.csv'
df = pd.read_csv(input_file)

# 2. Prepare data for PaDEL
# PaDEL accepts a .smi file with columns: [SMILES] [Name]
# We'll create a unique ID for each molecule to ensure accuracy (handling missing/duplicate names)
df['unique_id'] = 'Mol_' + df.index.astype(str) + '_' + df['name'].fillna('Unknown').astype(str)

# Select only SMILES and ID, drop any rows without SMILES
smi_data = df[['smiles', 'unique_id']].dropna(subset=['smiles'])

# Save to a temporary .smi file (tab-separated, no header)
smi_file = 'temp_molecules.smi'
smi_data.to_csv(smi_file, sep='\t', index=False, header=False)

# 3. Calculate Descriptors
# descriptors=True calculates 1D/2D descriptors. 
# fingerprints=True would add fingerprints (PubChem, etc.) but takes longer.
output_file = 'padel_descriptors_output.csv'
print(f"Calculating PaDEL descriptors for {len(smi_data)} molecules...")

try:
    # This function calls the PaDEL-Descriptor Java executable
    from_smiles(
        smi_file, 
        output_csv=output_file, 
        descriptors=True,   # Set to True for standard descriptors
        fingerprints=False, # Set to True if you also need fingerprints (slower)
        threads=4           # Use 4 CPU cores for speed
    )
    print(f"Success! Descriptors saved to: {output_file}")

except Exception as e:
    print("An error occurred. Make sure Java is installed and in your PATH.")
    print(f"Error details: {e}")

finally:
    # 4. Cleanup temporary file
    if os.path.exists(smi_file):
        os.remove(smi_file)

Calculating PaDEL descriptors for 9637 molecules...
An error occurred. Make sure Java is installed and in your PATH.
Error details: PaDEL-Descriptor failed on temp_molecules.smi. Ensure input structure is correct.


In [7]:
import pandas as pd
from padelpy import from_smiles
import os
import time

# 1. Load your input data
input_file = '../data/combined_bbb_classification_fixed.csv'
df = pd.read_csv(input_file)

# Output file name
output_file = 'padel_results_loop.csv'

# Check if file exists to resume
if os.path.isfile(output_file):
    print(f"Resuming... appending to {output_file}")
    existing_df = pd.read_csv(output_file)
    processed_count = len(existing_df)
    print(f"Already processed {processed_count} molecules.")
    df_to_process = df.iloc[processed_count:]
else:
    print(f"Starting new output file: {output_file}")
    processed_count = 0
    df_to_process = df

print(f"Processing {len(df_to_process)} molecules one by one...")

for index, row in df_to_process.iterrows():
    smi = row['smiles']
    mol_name = row['name']
    
    try:
        # Calculate descriptors
        descriptors = from_smiles(smi, descriptors=True, fingerprints=False, timeout=45)
        
        # Check if descriptors were actually returned
        if descriptors:
            # FIX IS HERE: Wrap 'descriptors' in brackets [ ] to make it a list
            desc_row = pd.DataFrame([descriptors])
            
            # Add identifiers
            desc_row.insert(0, 'Original_Name', mol_name)
            desc_row.insert(1, 'Original_SMILES', smi)
            
            # Write to CSV
            write_header = not os.path.isfile(output_file)
            desc_row.to_csv(output_file, mode='a', index=False, header=write_header)
            
            print(f"Row {index} ({mol_name}): Success")
        else:
            print(f"Row {index} ({mol_name}): FAILED (Empty result from PaDEL)")

    except Exception as e:
        # This catches errors so the loop doesn't stop
        print(f"Row {index} ({mol_name}): CRASHED - {e}")
        
    # Optional: small sleep to prevent Java from choking on rapid restart
    # time.sleep(0.1)

print("Done!")

Starting new output file: padel_results_loop.csv
Processing 9637 molecules one by one...
Row 0 (sulphasalazine): Success
Row 1 (moxalactam): Success
Row 2 (clioquinol): Success
Row 3 (bbcpd11 (cimetidine analog) (y-g13)): Success
Row 4 (schembl614298): Success
Row 5 (uk-240,455): Success
Row 6 (morphine-6-glucuronide): Success
Row 7 (nitrofurantoin): Success
Row 8 (l-701,324): Success
Row 9 (33419-42-0): Success
Row 10 (icotidine): Success
Row 11 (ro 64-0802): Success
Row 12 (temelastine): Success
Row 13 (disodium;(6r,7s)-7-[[4-(2-amino-1-carboxylato-2-oxoethylidene)-1,3-dithietane-2-carbonyl]amino]-7-methoxy-3-[(1-methyltetrazol-5-yl)sulfanylmethyl]-8-oxo-5-thia-1-azabicyclo[4.2.0]oct-2-ene-2-carboxylate): Success
Row 14 (cefotetan): Success
Row 15 (1848-75-5): Success
Row 16 (2-[4-(5-bromo-3-methylpyridin-2-yl)butylamino]-5-[(6-methylpyridin-3-yl)methyl]-1,3-diazinan-4-one): Success
Row 17 (m2l-663581): Success
Row 18 (ritonavir): Success
Row 19 (bis-hydroxylated-l-663581): Success
R

KeyboardInterrupt: 

In [2]:
#adding current dataset to brainroute db 

import pandas as pd
import numpy as np
import gspread

# File paths - update these with your actual file paths
csv_file = '../data/combined_bbb_classification_fixed.csv'
google_sheet_url = 'https://docs.google.com/spreadsheets/d/1sCVKMH_n-Uc-LLPxKhUi1g-XmKVOSlUQZGwrsrDVP_4/edit?gid=0#gid=0'  # Full URL of your Google Sheet
worksheet_name = 'Sheet1'  # Name of the worksheet/tab
api_key = 'AIzaSyAnJ_4h11oUrKy1XJK35scSIjJ1NEwDhZ8'  # Your Google Sheets API key

# Set up Google Sheets authentication with API key
client = gspread.auth.api_key(api_key)

# Read the files
csv_df = pd.read_csv(csv_file)

# Open Google Sheet and read data
sheet = client.open_by_url(google_sheet_url).worksheet(worksheet_name)
excel_df = pd.DataFrame(sheet.get_all_records())

print(f"CSV rows: {len(csv_df)}")
print(f"Excel rows before merge: {len(excel_df)}")

# Transform BBB column to prediction format (0 -> BBB-, 1 -> BBB+)
if 'BBB' in csv_df.columns:
    csv_df['prediction'] = csv_df['BBB'].apply(lambda x: 'BBB+' if x == 1 else 'BBB-')
    csv_df = csv_df.drop(columns=['BBB'])

# Get all columns from excel sheet
excel_columns = excel_df.columns.tolist()

# Create a dataframe for CSV data with Excel's column structure
csv_aligned = pd.DataFrame()

for col in excel_columns:
    if col in csv_df.columns:
        # Column exists in CSV, copy the data
        csv_aligned[col] = csv_df[col]
    else:
        # Column doesn't exist in CSV, fill with NA
        csv_aligned[col] = np.nan

# Identify a unique key to check for duplicates
# Update 'id' below with your actual unique identifier column name
unique_key = 'name'  # Change this to your actual unique key column

if unique_key in excel_columns and unique_key in csv_aligned.columns:
    # Filter out rows that already exist in excel
    existing_keys = excel_df[unique_key].values
    new_rows = csv_aligned[~csv_aligned[unique_key].isin(existing_keys)]
    print(f"New rows to add: {len(new_rows)}")
else:
    # If no unique key, add all CSV rows (may create duplicates)
    new_rows = csv_aligned
    print("Warning: No unique key specified, adding all CSV rows")

# Combine excel data with new CSV rows
result_df = pd.concat([excel_df, new_rows], ignore_index=True)

# Update Google Sheet with merged data
sheet.clear()  # Clear existing data
sheet.update([result_df.columns.values.tolist()] + result_df.values.tolist())

print(f"Google Sheet rows after merge: {len(result_df)}")
print(f"Updated Google Sheet: {google_sheet_name}")
print("\nColumn mapping summary:")
print(f"- Google Sheet columns: {len(excel_columns)}")
print(f"- CSV columns matched: {sum(1 for col in excel_columns if col in csv_df.columns or col == 'prediction')}")
print(f"- Columns filled with NA: {sum(1 for col in excel_columns if col not in csv_df.columns and col != 'prediction')}")

CSV rows: 9637
Excel rows before merge: 33
New rows to add: 9633


  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_aligned[col] = csv_df[col]
  csv_alig

APIError: APIError: [401]: API keys are not supported by this API. Expected OAuth2 access token or other authentication credentials that assert a principal. See https://cloud.google.com/docs/authentication

In [1]:
print("test")

test


In [5]:
import pandas as pd
import numpy as np

# File paths - update these with your actual file paths
csv_file = '../data/combined_bbb_classification_fixed.csv'
google_sheet_url = 'https://docs.google.com/spreadsheets/d/1sCVKMH_n-Uc-LLPxKhUi1g-XmKVOSlUQZGwrsrDVP_4/edit?gid=0#gid=0'  # Full URL of your Google Sheet
worksheet_name = 'Sheet1'  # Name of the worksheet/tab where you want to append data

# Read the CSV
csv_df = pd.read_csv(csv_file)

print(f"CSV rows: {len(csv_df)}")
print(f"CSV columns: {list(csv_df.columns)}")

# Transform BBB column to prediction format (0 -> BBB-, 1 -> BBB+)
if 'BBB' in csv_df.columns:
    csv_df['prediction'] = csv_df['BBB'].apply(lambda x: 'BBB+' if x == 1 else 'BBB-')
    csv_df = csv_df.drop(columns=['BBB'])
    print("Transformed BBB column to prediction column")

# Open the Google Sheet manually in your browser and:
# 1. Make sure you're logged in
# 2. Go to File > Download > Comma-separated values (.csv)
# 3. Save it as 'existing_sheet.csv'
# Then read it here to get the column structure
existing_sheet_file = '../data/Neurogate_database - Sheet1.csv'  # Download your current Google Sheet as CSV

try:
    existing_df = pd.read_csv(existing_sheet_file)
    excel_columns = existing_df.columns.tolist()
    print(f"\nGoogle Sheet columns: {excel_columns}")
except FileNotFoundError:
    print("\nCouldn't find existing_sheet.csv. Using CSV columns as-is.")
    excel_columns = csv_df.columns.tolist()

# Align CSV data with Google Sheet columns
aligned_data = {}
for col in excel_columns:
    if col in csv_df.columns:
        aligned_data[col] = csv_df[col]
    else:
        aligned_data[col] = np.nan

csv_aligned = pd.DataFrame(aligned_data)

print(f"\nAligned CSV data: {len(csv_aligned)} rows")
print(f"Columns matched: {sum(1 for col in excel_columns if col in csv_df.columns)}")
print(f"Columns filled with NA: {sum(1 for col in excel_columns if col not in csv_df.columns)}")

# Save the aligned data to a new CSV file
output_file = 'data_to_append.csv'
csv_aligned.to_csv(output_file, index=False)

print(f"\n✓ Saved aligned data to: {output_file}")
print("\nNext steps:")
print("1. Open your Google Sheet")
print(f"2. Go to the last row of data in the '{worksheet_name}' tab")
print(f"3. Open {output_file} in Excel/Sheets")
print("4. Copy all data (Ctrl+A, then Ctrl+C)")
print("5. Paste into your Google Sheet below the last row")
print("\nOr use Google Sheets import:")
print("1. File > Import")
print(f"2. Upload {output_file}")
print("3. Choose 'Append to current sheet'")
print("4. Click Import")

CSV rows: 9637
CSV columns: ['Unnamed: 0', 'name', 'smiles', 'BBB', 'MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 

In [1]:
import pandas as pd 
df = pd.read_csv("../data/padel_results_loop.csv")
columns_with_na = df.columns[df.isnull().any()].tolist()


In [3]:
print(len(columns_with_na))

717


In [4]:
nan_counts = df.isnull().sum()

In [5]:
print(nan_counts)

Original_Name      1099
Original_SMILES       0
nAcid                 0
ALogP                64
ALogp2               64
                   ... 
Ts                 2586
As                 2586
Vs                 2586
Ks                 2586
Ds                 2588
Length: 1877, dtype: int64
