In [56]:
import os
os.chdir("/Users/Mohammed/Desktop/SMALLMOL_MD")

In [57]:
import os
os.environ["PATH"] += ":/opt/homebrew/bin"

In [58]:
import os
import pandas as pd
from subprocess import run
import MDAnalysis as mda
import numpy as np
import matplotlib.pyplot as plt
from MDAnalysis.analysis import distances
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.rdmolfiles import MolFromPDBFile

results = [] # Data storehouse

# 📥 Read SMALLMOL IDs from second column
csv_file = "SMALLMOL_SMILES_LIST.csv"
try:
    mol_df = pd.read_csv(csv_file, encoding="utf-8")
    cid_list = mol_df.iloc[:, 1].dropna().astype(str).tolist()
except FileNotFoundError:
    print("❌ ERROR: File 'SMALLMOL_SMILES_LIST.csv' not found!")
    exit()

def run_cmd(command, input_text="0", cwd=None):
    result = run(f"echo {input_text} | {command}", shell=True, cwd=cwd, capture_output=True, text=True)
    if result.returncode != 0:
        print("ERROR:")
        print(result.stderr)
    else:
        print("INFO:")
        print(result.stdout)

for cid in cid_list:
    base_dir = f"/Users/Mohammed/Desktop/SMALLMOL_MD/P{cid}"
    print(base_dir)

    if not os.path.exists(base_dir):
        print(f"⚠️  SMALLMOL {cid} directory not found, skipping...")
        continue

    
    for rep in range(1, 4):
        rep_dir = os.path.join(base_dir, f"md_rep{rep}")
        if not os.path.exists(rep_dir):
            print(f"⚠️  {rep_dir} not found, skipping...")
            continue

        xtc_path = os.path.join(rep_dir, "md_whole.xtc")
        tpr_path = os.path.join(rep_dir, "md.tpr")

        if os.path.exists(xtc_path):
            print(f"✅  md_whole.xtc already exists for P{cid} rep{rep}, skipping GROMACS post-processing...")
        else:
            print(f"\n📂 Processing P{cid} — md_rep{rep}...\n")

            # Step 1: Index creation
            run_cmd("gmx make_ndx -f md.tpr -o index.ndx", input_text="q", cwd=rep_dir)

            # Step 2: Remove PBC jumps
            run_cmd("gmx trjconv -s md.tpr -f md.trr -o md_nojump.xtc -pbc nojump", input_text="0", cwd=rep_dir)

            # Step 3: Center on UNL but keep all atoms
            run_cmd("gmx trjconv -s md.tpr -f md_nojump.xtc -o md_center.xtc -center -pbc mol", input_text="2 0", cwd=rep_dir)

            # Step 4: Make molecule whole
            run_cmd("gmx trjconv -s md.tpr -f md_center.xtc -o md_whole.xtc -pbc whole", input_text="0", cwd=rep_dir)

        # Step 5: Load trajectory
        try:
            u = mda.Universe(tpr_path, xtc_path)
        except Exception as e:
            print(f"❌ ERROR: Failed to load trajectory for P{cid} rep{rep}: {e}")
            continue

        ligand = u.select_atoms("resname UNL")

        lengths, rgs, areas, tpsa_values = [], [], [], []
        rmsds, com_xs, com_ys, com_zs = [], [], [], []

        ref_positions = ligand.positions.copy()

        #Save a frame to PDB for TPSA
        try:
            u.trajectory[0]
            frame_pdb = os.path.join(rep_dir, "frame0.pdb")
            ligand.write(frame_pdb)

            mol = MolFromPDBFile(frame_pdb, sanitize=False)
            mol = Chem.AddHs(mol)
            Chem.SanitizeMol(mol)
            ref_tpsa = rdMolDescriptors.CalcTPSA(mol)
        except Exception as e:
            print(f"⚠️  TPSA calc failed for P{cid} rep{rep}: {e}")
            ref_tpsa = np.nan

        # Frame analysis
        for ts in u.trajectory:
            try:
                D = distances.self_distance_array(ligand.positions)
                mol_length = np.max(D)
                rg = ligand.radius_of_gyration()
                coords = ligand.positions - ligand.positions.mean(axis=0)
                _, S, _ = np.linalg.svd(coords)
                area = np.pi * S[0] * S[1]
                # RMSD calculation (against frame 0)
                rmsd = np.sqrt(np.mean(np.sum((ligand.positions - ref_positions) ** 2, axis=1)))
                rmsds.append(rmsd)

                # Center of mass tracking
                com = ligand.center_of_mass()
                com_xs.append(com[0])
                com_ys.append(com[1])
                com_zs.append(com[2])

            except Exception as e:
                print(f"⚠️  Frame error: {e}")
                continue

            lengths.append(mol_length)
            rgs.append(rg)
            areas.append(area)
            tpsa_values.append(ref_tpsa)

        # Compute COM displacement
        com_coords = np.column_stack((com_xs, com_ys, com_zs))
        disp = com_coords - com_coords[0]
        disp_magnitude = np.linalg.norm(disp, axis=1)

        # Estimate diffusion coefficient from MSD (linear fit)
        try:
            msd = np.sum(disp**2, axis=1)
            frame_interval_ps = 20  # Based on mdp: 2 fs × 10000 steps
            times = np.arange(len(msd)) * frame_interval_ps
            from scipy.stats import linregress
            slope, _, _, _, _ = linregress(times, msd)
            diffusion_coeff = slope / 6  # 3D Einstein relation
            diffusion_coeff = diffusion_coeff * 1e-16  # [Å²/ps → cm²/s]

        except Exception as e:
            print(f"⚠️  Diffusion coeff calc failed for P{cid} rep{rep}: {e}")
            diffusion_coeff = np.nan

        # Save results
        df = pd.DataFrame({
            "ID": cid,
            "rep": rep,
            "frame": range(len(lengths)),
            "length": lengths,
            "rg": rgs,
            "area": areas,
            "tpsa": tpsa_values,
            "rmsd": rmsds,
            "com_x": com_xs,
            "com_y": com_ys,
            "com_z": com_zs,
            "com_disp": disp_magnitude,
            "diff_coeff": [diffusion_coeff] * len(lengths),  # same for all frames
        })

        results.append(df)
        #df.to_csv(output_csv, index=False)
        print(f"✅ Collected analysis for P{cid} rep{rep}")
    
final_df = pd.concat(results, ignore_index=True)
#final_df.to_csv("final_results_1.csv", index=False)



/Users/Mohammed/Desktop/SMALLMOL_MD/P1
⚠️  SMALLMOL 1 directory not found, skipping...
/Users/Mohammed/Desktop/SMALLMOL_MD/P2
✅  md_whole.xtc already exists for P2 rep1, skipping GROMACS post-processing...
✅ Collected analysis for P2 rep1
✅  md_whole.xtc already exists for P2 rep2, skipping GROMACS post-processing...
✅ Collected analysis for P2 rep2
✅  md_whole.xtc already exists for P2 rep3, skipping GROMACS post-processing...
✅ Collected analysis for P2 rep3
/Users/Mohammed/Desktop/SMALLMOL_MD/P3
✅  md_whole.xtc already exists for P3 rep1, skipping GROMACS post-processing...
✅ Collected analysis for P3 rep1
✅  md_whole.xtc already exists for P3 rep2, skipping GROMACS post-processing...
✅ Collected analysis for P3 rep2
✅  md_whole.xtc already exists for P3 rep3, skipping GROMACS post-processing...
✅ Collected analysis for P3 rep3
/Users/Mohammed/Desktop/SMALLMOL_MD/P4
✅  md_whole.xtc already exists for P4 rep1, skipping GROMACS post-processing...
✅ Collected analysis for P4 rep1
✅  md

In [59]:
final_df.to_csv("MD_results.csv", index=False)

In [60]:
import pandas as pd

# Load data
df = pd.read_csv("MD_results.csv")
df["ID"] = df["ID"].astype(str)  # ensure ID is string for grouping

descriptor = "rg"
threshold = 0.02  # 2% difference tolerance
min_frames = 200
step = 5  # frames to discard per iteration

# Collect cleaned replicates
filtered_data = []
skipped = []

grouped = df.groupby(["ID", "rep"])

for (cid, rep), group in grouped:
    working = group.sort_values("frame").copy()

    while True:
        total_frames = len(working)
        if total_frames < min_frames:
            print(f"❌ Skipping {cid} rep {rep} — too few frames left")
            skipped.append((cid, rep))
            break

        mid = total_frames // 2
        first_half = working.iloc[:mid]
        last_half = working.iloc[-mid:]

        avg_first = first_half[descriptor].mean()
        avg_last = last_half[descriptor].mean()
        diff_pct = abs(avg_last - avg_first) / avg_last

        if diff_pct <= threshold:
            filtered_data.append(working)
            break
        else:
            working = working.iloc[step:]

# Combine and export
rg_truncated = pd.concat(filtered_data, ignore_index=True)
rg_truncated.to_csv("rg_truncated.csv", index=False)
print(f"✅ Trimming complete. Output saved to 'rg_truncated.csv'")
print(f"Retained {len(filtered_data)} replicates. Skipped {len(skipped)} due to instability.")

❌ Skipping 117 rep 1 — too few frames left
❌ Skipping 184 rep 1 — too few frames left
❌ Skipping 184 rep 2 — too few frames left
❌ Skipping 184 rep 3 — too few frames left
❌ Skipping 67 rep 3 — too few frames left
✅ Trimming complete. Output saved to 'rg_truncated.csv'
Retained 550 replicates. Skipped 5 due to instability.


In [61]:
import pandas as pd

# Load data
df = pd.read_csv("MD_results.csv")
df["ID"] = df["ID"].astype(str)  # ensure ID is string for grouping

descriptor = "length"
threshold = 0.02  # 2% difference tolerance
min_frames = 200
step = 5  # frames to discard per iteration

# Collect cleaned replicates
filtered_data = []
skipped = []

grouped = df.groupby(["ID", "rep"])

for (cid, rep), group in grouped:
    working = group.sort_values("frame").copy()

    while True:
        total_frames = len(working)
        if total_frames < min_frames:
            print(f"❌ Skipping {cid} rep {rep} — too few frames left")
            skipped.append((cid, rep))
            break

        mid = total_frames // 2
        first_half = working.iloc[:mid]
        last_half = working.iloc[-mid:]

        avg_first = first_half[descriptor].mean()
        avg_last = last_half[descriptor].mean()
        diff_pct = abs(avg_last - avg_first) / avg_last

        if diff_pct <= threshold:
            filtered_data.append(working)
            break
        else:
            working = working.iloc[step:]

# Combine and export
length_truncated = pd.concat(filtered_data, ignore_index=True)
length_truncated.to_csv("length_truncated.csv", index=False)
print(f"✅ Trimming complete. Output saved to 'length_truncated.csv'")
print(f"Retained {len(filtered_data)} replicates. Skipped {len(skipped)} due to instability.")

❌ Skipping 117 rep 1 — too few frames left
❌ Skipping 171 rep 1 — too few frames left
❌ Skipping 184 rep 1 — too few frames left
❌ Skipping 184 rep 2 — too few frames left
❌ Skipping 184 rep 3 — too few frames left
❌ Skipping 27 rep 3 — too few frames left
❌ Skipping 52 rep 3 — too few frames left
❌ Skipping 67 rep 1 — too few frames left
❌ Skipping 67 rep 3 — too few frames left
✅ Trimming complete. Output saved to 'length_truncated.csv'
Retained 546 replicates. Skipped 9 due to instability.


In [62]:
import pandas as pd

# Load data
df = pd.read_csv("MD_results.csv")
df["ID"] = df["ID"].astype(str)  # ensure ID is string for grouping

descriptor = "rmsd"
threshold = 0.02  # 2% difference tolerance
min_frames = 200
step = 5  # frames to discard per iteration

# Collect cleaned replicates
filtered_data = []
skipped = []

grouped = df.groupby(["ID", "rep"])

for (cid, rep), group in grouped:
    working = group.sort_values("frame").copy()

    while True:
        total_frames = len(working)
        if total_frames < min_frames:
            print(f"❌ Skipping {cid} rep {rep} — too few frames left")
            skipped.append((cid, rep))
            break

        mid = total_frames // 2
        first_half = working.iloc[:mid]
        last_half = working.iloc[-mid:]

        avg_first = first_half[descriptor].mean()
        avg_last = last_half[descriptor].mean()
        diff_pct = abs(avg_last - avg_first) / avg_last

        if diff_pct <= threshold:
            filtered_data.append(working)
            break
        else:
            working = working.iloc[step:]

# Combine and export
rmsd_truncated = pd.concat(filtered_data, ignore_index=True)
rmsd_truncated.to_csv("rmsd_truncated.csv", index=False)
print(f"✅ Trimming complete. Output saved to 'length_truncated.csv'")
print(f"Retained {len(filtered_data)} replicates. Skipped {len(skipped)} due to instability.")

❌ Skipping 14 rep 2 — too few frames left
❌ Skipping 159 rep 2 — too few frames left
❌ Skipping 176 rep 3 — too few frames left
❌ Skipping 184 rep 1 — too few frames left
❌ Skipping 184 rep 2 — too few frames left
❌ Skipping 184 rep 3 — too few frames left
❌ Skipping 187 rep 3 — too few frames left
❌ Skipping 19 rep 1 — too few frames left
❌ Skipping 27 rep 2 — too few frames left
❌ Skipping 50 rep 3 — too few frames left
❌ Skipping 57 rep 2 — too few frames left
❌ Skipping 74 rep 3 — too few frames left
❌ Skipping 77 rep 3 — too few frames left
❌ Skipping 87 rep 1 — too few frames left
✅ Trimming complete. Output saved to 'length_truncated.csv'
Retained 541 replicates. Skipped 14 due to instability.


In [63]:
import pandas as pd

# Load data
df = pd.read_csv("MD_results.csv")
df["ID"] = df["ID"].astype(str)  # ensure ID is string for grouping

descriptor = "area"
threshold = 0.02  # 2% difference tolerance
min_frames = 200
step = 5  # frames to discard per iteration

# Collect cleaned replicates
filtered_data = []
skipped = []

grouped = df.groupby(["ID", "rep"])

for (cid, rep), group in grouped:
    working = group.sort_values("frame").copy()

    while True:
        total_frames = len(working)
        if total_frames < min_frames:
            print(f"❌ Skipping {cid} rep {rep} — too few frames left")
            skipped.append((cid, rep))
            break

        mid = total_frames // 2
        first_half = working.iloc[:mid]
        last_half = working.iloc[-mid:]

        avg_first = first_half[descriptor].mean()
        avg_last = last_half[descriptor].mean()
        diff_pct = abs(avg_last - avg_first) / avg_last

        if diff_pct <= threshold:
            filtered_data.append(working)
            break
        else:
            working = working.iloc[step:]

# Combine and export
area_truncated = pd.concat(filtered_data, ignore_index=True)
area_truncated.to_csv("area_truncated.csv", index=False)
print(f"✅ Trimming complete. Output saved to 'area_truncated.csv'")
print(f"Retained {len(filtered_data)} replicates. Skipped {len(skipped)} due to instability.")

❌ Skipping 117 rep 1 — too few frames left
❌ Skipping 184 rep 1 — too few frames left
❌ Skipping 184 rep 2 — too few frames left
❌ Skipping 184 rep 3 — too few frames left
❌ Skipping 27 rep 3 — too few frames left
❌ Skipping 67 rep 3 — too few frames left
❌ Skipping 81 rep 2 — too few frames left
✅ Trimming complete. Output saved to 'area_truncated.csv'
Retained 548 replicates. Skipped 7 due to instability.


In [64]:
import pandas as pd

# Load data
df = pd.read_csv("MD_results.csv")
df["ID"] = df["ID"].astype(str)  # ensure ID is string for grouping

descriptor = "com_x"
threshold = 0.02  # 2% difference tolerance
min_frames = 200
step = 5  # frames to discard per iteration

# Collect cleaned replicates
filtered_data = []
skipped = []

grouped = df.groupby(["ID", "rep"])

for (cid, rep), group in grouped:
    working = group.sort_values("frame").copy()

    while True:
        total_frames = len(working)
        if total_frames < min_frames:
            print(f"❌ Skipping {cid} rep {rep} — too few frames left")
            skipped.append((cid, rep))
            break

        mid = total_frames // 2
        first_half = working.iloc[:mid]
        last_half = working.iloc[-mid:]

        avg_first = first_half[descriptor].mean()
        avg_last = last_half[descriptor].mean()
        diff_pct = abs(avg_last - avg_first) / avg_last

        if diff_pct <= threshold:
            filtered_data.append(working)
            break
        else:
            working = working.iloc[step:]

# Combine and export
com_x_truncated = pd.concat(filtered_data, ignore_index=True)
com_x_truncated.to_csv("com_x_truncated.csv", index=False)
print(f"✅ Trimming complete. Output saved to 'com_x_truncated.csv'")
print(f"Retained {len(filtered_data)} replicates. Skipped {len(skipped)} due to instability.")

❌ Skipping 122 rep 1 — too few frames left
❌ Skipping 123 rep 3 — too few frames left
❌ Skipping 176 rep 3 — too few frames left
❌ Skipping 184 rep 1 — too few frames left
❌ Skipping 184 rep 2 — too few frames left
❌ Skipping 184 rep 3 — too few frames left
❌ Skipping 27 rep 2 — too few frames left
✅ Trimming complete. Output saved to 'com_x_truncated.csv'
Retained 548 replicates. Skipped 7 due to instability.


In [65]:
import pandas as pd

# Load data
df = pd.read_csv("MD_results.csv")
df["ID"] = df["ID"].astype(str)  # ensure ID is string for grouping

descriptor = "com_y"
threshold = 0.02  # 2% difference tolerance
min_frames = 200
step = 5  # frames to discard per iteration

# Collect cleaned replicates
filtered_data = []
skipped = []

grouped = df.groupby(["ID", "rep"])

for (cid, rep), group in grouped:
    working = group.sort_values("frame").copy()

    while True:
        total_frames = len(working)
        if total_frames < min_frames:
            print(f"❌ Skipping {cid} rep {rep} — too few frames left")
            skipped.append((cid, rep))
            break

        mid = total_frames // 2
        first_half = working.iloc[:mid]
        last_half = working.iloc[-mid:]

        avg_first = first_half[descriptor].mean()
        avg_last = last_half[descriptor].mean()
        diff_pct = abs(avg_last - avg_first) / avg_last

        if diff_pct <= threshold:
            filtered_data.append(working)
            break
        else:
            working = working.iloc[step:]

# Combine and export
com_y_truncated = pd.concat(filtered_data, ignore_index=True)
com_y_truncated.to_csv("com_y_truncated.csv", index=False)
print(f"✅ Trimming complete. Output saved to 'com_y_truncated.csv'")
print(f"Retained {len(filtered_data)} replicates. Skipped {len(skipped)} due to instability.")

❌ Skipping 123 rep 3 — too few frames left
❌ Skipping 176 rep 3 — too few frames left
❌ Skipping 184 rep 1 — too few frames left
❌ Skipping 184 rep 2 — too few frames left
❌ Skipping 184 rep 3 — too few frames left
❌ Skipping 27 rep 2 — too few frames left
❌ Skipping 27 rep 3 — too few frames left
✅ Trimming complete. Output saved to 'com_y_truncated.csv'
Retained 548 replicates. Skipped 7 due to instability.


In [66]:
import pandas as pd

# Load data
df = pd.read_csv("MD_results.csv")
df["ID"] = df["ID"].astype(str)  # ensure ID is string for grouping

descriptor = "com_z"
threshold = 0.02  # 2% difference tolerance
min_frames = 200
step = 5  # frames to discard per iteration

# Collect cleaned replicates
filtered_data = []
skipped = []

grouped = df.groupby(["ID", "rep"])

for (cid, rep), group in grouped:
    working = group.sort_values("frame").copy()

    while True:
        total_frames = len(working)
        if total_frames < min_frames:
            print(f"❌ Skipping {cid} rep {rep} — too few frames left")
            skipped.append((cid, rep))
            break

        mid = total_frames // 2
        first_half = working.iloc[:mid]
        last_half = working.iloc[-mid:]

        avg_first = first_half[descriptor].mean()
        avg_last = last_half[descriptor].mean()
        diff_pct = abs(avg_last - avg_first) / avg_last

        if diff_pct <= threshold:
            filtered_data.append(working)
            break
        else:
            working = working.iloc[step:]

# Combine and export
com_z_truncated = pd.concat(filtered_data, ignore_index=True)
com_z_truncated.to_csv("com_z_truncated.csv", index=False)
print(f"✅ Trimming complete. Output saved to 'com_z_truncated.csv'")
print(f"Retained {len(filtered_data)} replicates. Skipped {len(skipped)} due to instability.")

❌ Skipping 149 rep 2 — too few frames left
❌ Skipping 176 rep 3 — too few frames left
❌ Skipping 184 rep 1 — too few frames left
❌ Skipping 184 rep 2 — too few frames left
❌ Skipping 184 rep 3 — too few frames left
❌ Skipping 27 rep 2 — too few frames left
❌ Skipping 67 rep 3 — too few frames left
✅ Trimming complete. Output saved to 'com_z_truncated.csv'
Retained 548 replicates. Skipped 7 due to instability.


In [67]:
import pandas as pd

# Load data
df = pd.read_csv("MD_results.csv")
df["ID"] = df["ID"].astype(str)  # ensure ID is string for grouping

descriptor = "tpsa"
threshold = 0.02  # 2% difference tolerance
min_frames = 200
step = 5  # frames to discard per iteration

# Collect cleaned replicates
filtered_data = []
skipped = []

grouped = df.groupby(["ID", "rep"])

for (cid, rep), group in grouped:
    working = group.sort_values("frame").copy()

    while True:
        total_frames = len(working)
        if total_frames < min_frames:
            print(f"❌ Skipping {cid} rep {rep} — too few frames left")
            skipped.append((cid, rep))
            break

        mid = total_frames // 2
        first_half = working.iloc[:mid]
        last_half = working.iloc[-mid:]

        avg_first = first_half[descriptor].mean()
        avg_last = last_half[descriptor].mean()
        diff_pct = abs(avg_last - avg_first) / avg_last

        if diff_pct <= threshold:
            filtered_data.append(working)
            break
        else:
            working = working.iloc[step:]

# Combine and export
tpsa_truncated = pd.concat(filtered_data, ignore_index=True)
tpsa_truncated.to_csv("tpsa_truncated.csv", index=False)
print(f"✅ Trimming complete. Output saved to 'tpsa_truncated.csv'")
print(f"Retained {len(filtered_data)} replicates. Skipped {len(skipped)} due to instability.")

❌ Skipping 184 rep 1 — too few frames left
❌ Skipping 184 rep 2 — too few frames left
❌ Skipping 184 rep 3 — too few frames left
✅ Trimming complete. Output saved to 'tpsa_truncated.csv'
Retained 552 replicates. Skipped 3 due to instability.


In [102]:
import pandas as pd

# Define files with feature names
files = {
    "length": "length_truncated.csv",
    "rg": "rg_truncated.csv",
    "area": "area_truncated.csv",
    "tpsa": "tpsa_truncated.csv",
    "rmsd": "rmsd_truncated.csv",
    "com_x": "com_x_truncated.csv",
    "com_y": "com_y_truncated.csv",
    "com_z": "com_z_truncated.csv"
}

df_list = []

for feature, file in files.items():
    df = pd.read_csv(file)

    # Ensure required columns are present
    required_cols = ["ID", "rep", "frame"]
    value_col = [col for col in df.columns if col not in required_cols][0]

    # Rename the value column to 'value'
    df = df.rename(columns={value_col: "value"})

    # Add MD_feature column
    df["MD_feature"] = feature

    # Append the selected columns only
    df_list.append(df[["ID", "rep", "frame", "MD_feature", "value"]])

# Final vertical concatenation
df_final = pd.concat(df_list, axis=0, ignore_index=True)

# Optionally save
df_final.to_csv("MD_Final.csv", index=False)

print(df_final.head())
print(df_final.shape)
print("Unique MD features found:")
print(features_counts)

   ID  rep  frame MD_feature      value
0  10    1      0     length  18.727976
1  10    1      1     length  17.356714
2  10    1      2     length  18.580078
3  10    1      3     length  17.390956
4  10    1      4     length  16.542931
(4165136, 5)
Unique MD features found:
MD_feature
tpsa      552552
com_z     544028
com_y     543298
com_x     534623
rg        528490
area      508423
length    501896
rmsd      451826
Name: count, dtype: int64


In [110]:
import pandas as pd

# Load long-form MD data
df = pd.read_csv("MD_Final.csv") 
summary_list = []
cid_list = df["ID"].unique().tolist()

# Relevant features for computation
features_needed = ["length", "rg", "area", "tpsa"]

# Pivot to simplify per-frame access
pivot_df = df[df["MD_feature"].isin(features_needed)].pivot_table(
    index=["ID", "rep", "frame"],
    columns="MD_feature",
    values="value"
).reset_index()

for cid in cid_list:
    for rep in range(1, 4):
        df_rep = pivot_df[(pivot_df["ID"] == cid) & (pivot_df["rep"] == rep)]

        if df_rep.empty:
            print(f"⚠️ Skipping ID {cid}, rep {rep} due to missing data")
            continue

        window = 200
        last_frames = df_rep[df_rep["frame"] >= df_rep["frame"].max() - window]

        convergence_score = last_frames["rg"].std() if not last_frames.empty else float('nan')
        flexibility_index = df_rep["rg"].std()

        folding_index_1 = df_rep["rg"].iloc[0]
        folding_index_2 = df_rep["rg"].iloc[50:].mean() if len(df_rep) > 50 else df_rep["rg"].mean()
        folding_index_rg = folding_index_1 - folding_index_2

        folding_index_l = (
            df_rep["length"].iloc[0] - df_rep["length"].iloc[50:].mean()
            if len(df_rep) > 50 else df_rep["length"].iloc[0] - df_rep["length"].mean()
        )

        rep_summary = pd.DataFrame({
            "ID": [cid],
            "rep": [rep],
            "mean_length": [df_rep["length"].mean()],
            "std_length": [df_rep["length"].std()],
            "min_length": [df_rep["length"].min()],
            "max_length": [df_rep["length"].max()],
            "mean_rg": [df_rep["rg"].mean()],
            "std_rg": [df_rep["rg"].std()],
            "mean_area": [df_rep["area"].mean()],
            "std_area": [df_rep["area"].std()],
            "mean_tpsa": [df_rep["tpsa"].mean()],
            "std_tpsa": [df_rep["tpsa"].std()],
            "convergence_score": [convergence_score],
            "folding_index_rg": [folding_index_rg],
            "folding_index_l": [folding_index_l],
            "flexibility_index": [flexibility_index]
        })

        summary_list.append(rep_summary)

# Combine replicate summaries
summary = pd.concat(summary_list, ignore_index=True)

# Average over replicates per ID
cid_summary = summary.groupby("ID").agg({
    "mean_length": "mean",
    "std_length": "mean",
    "min_length": "mean",
    "max_length": "mean",
    "mean_rg": "mean",
    "std_rg": "mean",
    "mean_area": "mean",
    "std_area": "mean",
    "mean_tpsa": "mean",
    "std_tpsa": "mean",
    "convergence_score": "mean",
    "folding_index_rg": "mean",
    "folding_index_l": "mean",
    "flexibility_index": "mean"
}).reset_index()


In [112]:
import pandas as pd
from tdc.single_pred import ADME

# Load small molecule ADMET data
smallmol = pd.read_csv('SMALLMOL_ADMET.csv')
smallmol = smallmol.rename(columns={'compound_ID': 'ID'})
smallmol = smallmol[['ID', 'smiles', 'Y', 'dataset']]

# Ensure ID is a string in both DataFrames
smallmol["ID"] = smallmol["ID"].astype(str)
cid_summary["ID"] = cid_summary["ID"].astype(str)  # Assuming you have `cid_summary` already

# Filter only molecules present in cid_summary
smallmol = smallmol[smallmol["ID"].isin(cid_summary["ID"])]

# Merge
merged = smallmol.merge(cid_summary, on="ID", how="left")
merged

Unnamed: 0,ID,smiles,Y,dataset,mean_length,std_length,min_length,max_length,mean_rg,std_rg,mean_area,std_area,mean_tpsa,std_tpsa,convergence_score,folding_index_rg,folding_index_l,flexibility_index
0,2,N#Cc1ccc(C(c2ccc(C#N)cc2)n2cncn2)cc1,1.00,bioavailability_ma,13.860008,0.882187,11.547472,15.849547,13.860008,0.882187,13.860008,0.882187,13.860008,0.882187,0.834302,-1.067115,-1.067115,0.882187
1,2,N#Cc1ccc(C(c2ccc(C#N)cc2)n2cncn2)cc1,45.00,half_life_obach,13.860008,0.882187,11.547472,15.849547,13.860008,0.882187,13.860008,0.882187,13.860008,0.882187,0.834302,-1.067115,-1.067115,0.882187
2,2,N#Cc1ccc(C(c2ccc(C#N)cc2)n2cncn2)cc1,1.70,lipophilicity_astrazeneca,13.860008,0.882187,11.547472,15.849547,13.860008,0.882187,13.860008,0.882187,13.860008,0.882187,0.834302,-1.067115,-1.067115,0.882187
3,3,COc1ccc2cc([C@H](C)C(=O)O)ccc2c1,1.00,bioavailability_ma,11.032185,0.353359,9.987534,12.345535,11.049788,0.369715,11.048087,0.366099,11.040256,0.367223,0.333455,0.455290,0.470696,0.369715
4,3,COc1ccc2cc([C@H](C)C(=O)O)ccc2c1,0.13,lipophilicity_astrazeneca,11.032185,0.353359,9.987534,12.345535,11.049788,0.369715,11.048087,0.366099,11.040256,0.367223,0.333455,0.455290,0.470696,0.369715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,189,N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O,-1.08,lipophilicity_astrazeneca,9.215979,0.516819,8.088687,10.627369,9.310864,0.541043,9.307710,0.540521,9.307710,0.540521,0.512765,0.139755,-0.016771,0.541043
348,190,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0.00,bbb_martins,17.003216,0.494119,14.631100,19.648101,17.003216,0.494119,16.980123,0.444100,17.003216,0.494119,0.376584,0.036495,0.036495,0.494119
349,190,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0.00,bioavailability_ma,17.003216,0.494119,14.631100,19.648101,17.003216,0.494119,16.980123,0.444100,17.003216,0.494119,0.376584,0.036495,0.036495,0.494119
350,192,C[C@H]1c2cccc(O)c2C(=O)C2=C(O)[C@]3(O)C(=O)C(C...,1.00,bioavailability_ma,12.993817,0.423772,11.137173,13.951446,12.993817,0.423772,12.993817,0.423772,12.993817,0.423772,0.439441,-0.462445,-0.462445,0.423772


In [114]:
merged.to_csv("SMALLMOL_MD.csv", index=False)

In [116]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from tqdm import tqdm

# Load SMILES file
df = pd.read_csv("SMALLMOL_SMILES_LIST.csv")
df.columns = [col.lower() for col in df.columns]
assert "smiles" in df.columns, "CSV must contain a 'smiles' column"
df["ID"] = df.get("compound_ID", df.index.astype(str))

# All RDKit descriptors excluding TPSA
descriptor_names = [desc[0] for desc in Descriptors.descList if desc[0] != "TPSA"]
descriptor_funcs = {name: getattr(Descriptors, name) for name in descriptor_names}

# Calculate descriptors
results = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    mol = Chem.MolFromSmiles(row["smiles"])
    if mol is None:
        continue
    desc_values = {desc: func(mol) for desc, func in descriptor_funcs.items()}
    desc_values["ID"] = row["ID"]
    desc_values["smiles"] = row["smiles"]
    results.append(desc_values)

# Save to DataFrame
df_rdkit = pd.DataFrame(results)
df_rdkit.to_csv("SMALLMOL_RDKit.csv", index=False)
print("✅ RDKit descriptors saved WITHOUT TPSA")

100%|██████████████████████████████████████████████████████████████████| 193/193 [00:02<00:00, 73.90it/s]


✅ RDKit descriptors saved WITHOUT TPSA


In [118]:
import pandas as pd

# Load both datasets
df_md = pd.read_csv("SMALLMOL_MD.csv")
df_rdkit = pd.read_csv("SMALLMOL_RDKit.csv")

# Standardize ID type
df_md["ID"] = df_md["ID"].astype(str)
df_rdkit["ID"] = df_rdkit["ID"].astype(str)

# If RDKit also has a 'smiles' column, rename it to avoid duplication during merge
if "smiles" in df_rdkit.columns and "smiles" in df_md.columns:
    df_rdkit = df_rdkit.drop(columns=["smiles"])  # prefer MD's version

# Merge on ID
df_merged = pd.merge(df_md, df_rdkit, on="ID", how="left")

# Reorder columns: ID, smiles, Y at beginning, rest follow
identifier_cols = [col for col in ["ID", "smiles", "Y"] if col in df_merged.columns]
other_cols = [col for col in df_merged.columns if col not in identifier_cols]
df_merged = df_merged[identifier_cols + other_cols]

# Save result
df_merged.to_csv("SMALLMOL_MD_RDKit_ADMET.csv", index=False)
print("✅ Merged file saved as: SMALLMOL_MD_RDKit_Merged.csv")

✅ Merged file saved as: SMALLMOL_MD_RDKit_Merged.csv


In [120]:
print("🧮 Merged DataFrame shape:", df_merged.shape)

🧮 Merged DataFrame shape: (352, 227)


In [122]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("SMALLMOL_MD_RDKit_ADMET.csv")

# Step 1: Define column groups
meta_cols = ['ID', 'smiles', 'dataset', 'Y']
md_keywords = ['length', 'rg', 'area', 'convergence', 'folding', 'flexibility', 'tpsa']
md_cols = [col for col in df.columns if any(k in col.lower() for k in md_keywords)]
rdkit_cols = [col for col in df.columns if col not in meta_cols + md_cols]

# Step 2: Filter RDKit features and drop NaNs
rdkit_df = df[rdkit_cols].dropna()
meta_md_df = df.loc[rdkit_df.index, meta_cols + md_cols]
y = meta_md_df['Y']

# Step 3: Correlation with Y
correlation_with_y = rdkit_df.corrwith(y).abs().sort_values(ascending=False)
sorted_features = correlation_with_y.index.tolist()

# Step 4: Drop RDKit features highly correlated with each other (threshold)
threshold = 0.9
selected_rdkit = []
corr_matrix = rdkit_df[sorted_features].corr().abs()

for feature in sorted_features:
    if not selected_rdkit:
        selected_rdkit.append(feature)
    elif all(corr_matrix.loc[feature, selected_rdkit] < threshold):
        selected_rdkit.append(feature)

# Step 5: Combine metadata, MD, and selected RDKit features
final_df = pd.concat([meta_md_df, rdkit_df[selected_rdkit]], axis=1)

# Optional: Save
# final_df.to_csv("MD_RDKit_Selected.csv", index=False)

# Summary
print(f"Original RDKit features: {len(rdkit_cols)}")
print(f"Selected RDKit features: {len(selected_rdkit)}")
print(f"Total columns in final DataFrame: {final_df.shape[1]}")
print("Top selected RDKit features:", selected_rdkit[:10])

Original RDKit features: 202
Selected RDKit features: 139
Total columns in final DataFrame: 164
Top selected RDKit features: ['fr_unbrch_alkane', 'PEOE_VSA1', 'VSA_EState4', 'fr_ether', 'VSA_EState8', 'SlogP_VSA2', 'fr_Nhpyrrole', 'SlogP_VSA8', 'NumHDonors', 'PEOE_VSA10']


In [124]:
# Step 1: Define preferred column order
ordered_cols = ['ID', 'smiles', 'dataset', 'Y']
remaining_cols = [col for col in final_df.columns if col not in ordered_cols]
final_ordered_df = final_df[ordered_cols + remaining_cols]

# Step 2: Save to CSV
final_ordered_df.to_csv("MD_ADMET_RDKIT_FS.csv", index=False)

print("✅ File saved as 'MD_ADMET_RDKIT_FS.csv' with proper column order.")

✅ File saved as 'MD_ADMET_RDKIT_FS.csv' with proper column order.


In [128]:
# --- Parse MD run summary text → table (+ averages) --------------------------
# Works in Jupyter. Paste your text into DATA below, or set DATA=None and
# provide a file path in DATA_FILE.

from __future__ import annotations
import csv
import io
import re
from datetime import timedelta
from typing import List, Dict, Optional

try:
    import pandas as pd
    HAVE_PANDAS = True
except Exception:
    HAVE_PANDAS = False

# === INPUT ==================================================================
# Option A: paste your text between the triple quotes:
DATA = """<134    1       Tue Jul  8 23:56:15 2025 Wed Jul  9 01:43:09 2025   01:46:54        269.406
134    2       Tue Jul  8 23:58:02 2025 Wed Jul  9 01:39:24 2025   01:41:22        284.135
134    3       Wed Jul  9 00:27:36 2025 Wed Jul  9 01:53:07 2025   01:25:31        336.763
135    1       Wed Jul  9 01:42:29 2025 Wed Jul  9 03:08:26 2025   01:25:57        335.085
135    2       Wed Jul  9 01:46:16 2025 Wed Jul  9 03:11:36 2025   01:25:20        337.534
135    3       Wed Jul  9 01:57:29 2025 Wed Jul  9 03:42:20 2025   01:44:51        274.652
136    1       Wed Jul  9 03:12:08 2025 Wed Jul  9 04:34:24 2025   01:22:16        350.038
136    2       Wed Jul  9 03:15:14 2025 Wed Jul  9 04:38:45 2025   01:23:31        344.842
136    3       Wed Jul  9 03:45:37 2025 Wed Jul  9 05:28:32 2025   01:42:55        279.872
137    1       Wed Jul  9 04:44:03 2025 Wed Jul  9 06:08:21 2025   01:24:18        341.601
137    2       Wed Jul  9 04:50:34 2025 Wed Jul  9 06:14:10 2025   01:23:36        344.514
137    3       Wed Jul  9 05:33:04 2025 Wed Jul  9 07:01:51 2025   01:28:47        324.373
138    1       Wed Jul  9 06:13:13 2025 Wed Jul  9 07:36:00 2025   01:22:47        347.909
138    2       Wed Jul  9 06:18:15 2025 Wed Jul  9 07:42:03 2025   01:23:48        343.654
138    3       Wed Jul  9 07:47:43 2025 Wed Jul  9 09:30:03 2025   01:42:20        281.447
139    1       Wed Jul  9 08:34:45 2025 Wed Jul  9 10:24:33 2025   01:49:48        262.320
139    2       Wed Jul  9 10:30:37 2025 Wed Jul  9 11:55:08 2025   01:24:31        340.762
139    3       Wed Jul  9 11:59:23 2025 Wed Jul  9 13:22:29 2025   01:23:06        346.563
140    1       Wed Jul  9 13:26:29 2025 Wed Jul  9 15:26:58 2025   02:00:29        239.026
140    2       Wed Jul  9 17:22:42 2025 Wed Jul  9 19:01:38 2025   01:38:56        291.107
140    3       Wed Jul  9 19:06:40 2025 Wed Jul  9 21:04:35 2025   01:57:55        244.267
141    1       Wed Jul  9 21:09:37 2025 Wed Jul  9 22:33:49 2025   01:24:12        342.052
141    2       Wed Jul  9 22:36:52 2025 Thu Jul 10 00:00:36 2025   01:23:44        343.935
141    3       Thu Jul 10 00:04:52 2025 Thu Jul 10 01:28:57 2025   01:24:05        342.549
142    1       Thu Jul 10 01:32:55 2025 Thu Jul 10 03:18:10 2025   01:45:15        273.623
142    2       Thu Jul 10 03:23:44 2025 Thu Jul 10 05:04:31 2025   01:40:47        285.769
142    3       Thu Jul 10 05:09:43 2025 Thu Jul 10 06:52:08 2025   01:42:25        281.223
143    1       Thu Jul 10 06:57:24 2025 Thu Jul 10 08:29:03 2025   01:31:39        314.232
143    2       Thu Jul 10 07:50:51 2025 Thu Jul 10 09:21:56 2025   01:31:05        316.199
143    3       Thu Jul 10 08:32:48 2025 Thu Jul 10 10:20:38 2025   01:47:50        267.063
144    1       Thu Jul 10 09:25:40 2025 Thu Jul 10 10:44:49 2025   01:19:09        363.910
144    2       Thu Jul 10 10:49:36 2025 Thu Jul 10 12:07:26 2025   01:17:50        370.070
144    3       Thu Jul 10 12:12:52 2025 Thu Jul 10 13:33:05 2025   01:20:13        359.040
145    1       Thu Jul 10 13:38:12 2025 Thu Jul 10 15:24:55 2025   01:46:43        269.880
145    2       Thu Jul 10 15:28:33 2025 Thu Jul 10 17:15:00 2025   01:46:27        270.538
145    3       Thu Jul 10 17:18:30 2025 Thu Jul 10 19:06:08 2025   01:47:38        267.579
146    1       Thu Jul 10 19:11:29 2025 Thu Jul 10 20:30:57 2025   01:19:28        362.369
146    2       Thu Jul 10 20:36:51 2025 Thu Jul 10 22:00:37 2025   01:23:46        343.796
146    3       Thu Jul 10 22:07:54 2025 Thu Jul 10 23:28:41 2025   01:20:47        356.507
147    1       Thu Jul 10 23:33:19 2025 Fri Jul 11 00:51:16 2025   01:17:57        369.442
147    2       Fri Jul 11 00:55:27 2025 Fri Jul 11 02:10:43 2025   01:15:16        382.631
147    3       Fri Jul 11 02:15:33 2025 Fri Jul 11 03:27:31 2025   01:11:58        400.185
148    1       Fri Jul 11 02:26:13 2025 Fri Jul 11 03:51:39 2025   01:25:26        337.127
148    2       Fri Jul 11 03:30:59 2025 Fri Jul 11 04:52:45 2025   01:21:46        352.252
148    3       Fri Jul 11 03:58:27 2025 Fri Jul 11 05:23:05 2025   01:24:38        340.253
149    1       Fri Jul 11 04:57:15 2025 Fri Jul 11 06:21:53 2025   01:24:38        340.289
149    2       Fri Jul 11 05:26:54 2025 Fri Jul 11 07:13:18 2025   01:46:24        270.696
149    3       Fri Jul 11 06:28:07 2025 Fri Jul 11 07:52:53 2025   01:24:46        339.761
151    1       Fri Jul 11 08:07:22 2025 Fri Jul 11 09:26:34 2025   01:19:12        363.650
151    2       Fri Jul 11 08:28:40 2025 Fri Jul 11 09:52:52 2025   01:24:12        342.053
151    3       Fri Jul 11 09:57:32 2025 Fri Jul 11 11:44:50 2025   01:47:18        268.422
152    1       Fri Jul 11 11:50:09 2025 Fri Jul 11 13:18:23 2025   01:28:14        326.408
152    2       Fri Jul 11 13:23:13 2025 Fri Jul 11 14:56:15 2025   01:33:02        309.566
152    3       Fri Jul 11 14:56:00 2025 Fri Jul 11 16:47:27 2025   01:51:27        258.423
153    1       Fri Jul 11 14:55:58 2025 Fri Jul 11 16:21:40 2025   01:25:42        336.044
153    2       Fri Jul 11 15:07:39 2025 Fri Jul 11 16:33:01 2025   01:25:22        337.364
153    3       Fri Jul 11 16:37:44 2025 Fri Jul 11 18:04:14 2025   01:26:30        332.898
154    1       Fri Jul 11 17:02:19 2025 Fri Jul 11 18:28:56 2025   01:26:37        332.529
154    2       Fri Jul 11 18:35:17 2025 Fri Jul 11 20:21:48 2025   01:46:31        270.359
154    3       Fri Jul 11 20:25:48 2025 Fri Jul 11 22:11:14 2025   01:45:26        273.170
155    1       Fri Jul 11 22:17:31 2025 Fri Jul 11 23:59:01 2025   01:41:30        283.734
155    2       Sat Jul 12 00:04:51 2025 Sat Jul 12 01:48:11 2025   01:43:20        278.691
155    3       Sat Jul 12 01:53:27 2025 Sat Jul 12 03:38:01 2025   01:44:34        275.404
156    1       Sat Jul 12 03:48:19 2025 Sat Jul 12 05:32:23 2025   01:44:04        276.743
156    2       Sat Jul 12 05:09:16 2025 Sat Jul 12 06:54:41 2025   01:45:25        273.176
156    3       Sat Jul 12 05:39:20 2025 Sat Jul 12 07:20:48 2025   01:41:28        283.847
157    1       Sat Jul 12 07:28:18 2025 Sat Jul 12 09:10:56 2025   01:42:38        280.612
157    2       Sat Jul 12 07:50:03 2025 Sat Jul 12 09:21:51 2025   01:31:48        313.771
157    3       Sat Jul 12 09:18:27 2025 Sat Jul 12 11:03:19 2025   01:44:52        274.658
158    1       Sat Jul 12 09:28:11 2025 Sat Jul 12 10:52:45 2025   01:24:34        340.603
158    2       Sat Jul 12 11:14:34 2025 Sat Jul 12 12:38:14 2025   01:23:40        344.179
158    3       Sat Jul 12 12:48:11 2025 Sat Jul 12 14:12:44 2025   01:24:33        340.592
159    1       Sat Jul 12 14:21:53 2025 Sat Jul 12 16:02:47 2025   01:40:54        285.455
159    2       Sat Jul 12 16:09:04 2025 Sat Jul 12 17:30:26 2025   01:21:22        353.925
159    3       Sat Jul 12 17:40:22 2025 Sat Jul 12 19:20:43 2025   01:40:21        286.982
160    1       Sat Jul 12 19:30:22 2025 Sat Jul 12 20:55:52 2025   01:25:30        336.808
160    2       Sat Jul 12 21:03:18 2025 Sat Jul 12 22:32:30 2025   01:29:12        322.908
160    3       Sat Jul 12 22:38:25 2025 Sun Jul 13 00:05:47 2025   01:27:22        329.662
161    1       Sat Jul 12 23:56:41 2025 Sun Jul 13 01:39:03 2025   01:42:22        281.312
161    2       Sun Jul 13 00:12:02 2025 Sun Jul 13 01:54:04 2025   01:42:02        282.288
161    3       Sun Jul 13 01:46:15 2025 Sun Jul 13 03:29:52 2025   01:43:37        277.957
162    1       Sun Jul 13 02:01:01 2025 Sun Jul 13 03:45:56 2025   01:44:55        274.522
162    2       Sun Jul 13 03:37:10 2025 Sun Jul 13 05:19:29 2025   01:42:19        281.473
162    3       Sun Jul 13 03:51:23 2025 Sun Jul 13 05:35:45 2025   01:44:22        275.949
163    1       Sun Jul 13 05:29:22 2025 Sun Jul 13 06:54:26 2025   01:25:04        338.521
163    2       Sun Jul 13 05:45:56 2025 Sun Jul 13 07:09:36 2025   01:23:40        344.249
163    3       Sun Jul 13 06:32:46 2025 Sun Jul 13 07:55:43 2025   01:22:57        347.161
164    1       Sun Jul 13 06:32:46 2025 Sun Jul 13 07:56:06 2025   01:23:20        345.588
164    2       Sun Jul 13 07:05:44 2025 Sun Jul 13 08:48:58 2025   01:43:14        278.962
164    3       Sun Jul 13 07:18:17 2025 Sun Jul 13 09:02:31 2025   01:44:14        276.332
165    1       Sun Jul 13 08:05:50 2025 Sun Jul 13 09:52:50 2025   01:47:00        269.145
165    2       Sun Jul 13 08:06:10 2025 Sun Jul 13 09:54:12 2025   01:48:02        266.562
165    3       Sun Jul 13 09:17:33 2025 Sun Jul 13 11:03:50 2025   01:46:17        270.971
166    1       Sun Jul 13 09:59:49 2025 Sun Jul 13 11:40:43 2025   01:40:54        285.426
166    2       Sun Jul 13 09:59:50 2025 Sun Jul 13 11:24:33 2025   01:24:43        339.929
166    3       Sun Jul 13 11:36:10 2025 Sun Jul 13 13:00:34 2025   01:24:24        341.219
167    1       Sun Jul 13 11:51:43 2025 Sun Jul 13 13:19:15 2025   01:27:32        329.050
167    2       Sun Jul 13 13:09:52 2025 Sun Jul 13 14:35:35 2025   01:25:43        336.014
167    3       Sun Jul 13 13:30:31 2025 Sun Jul 13 14:57:29 2025   01:26:58        331.177
168    1       Sun Jul 13 14:42:06 2025 Sun Jul 13 16:27:25 2025   01:45:19        273.492
168    2       Sun Jul 13 15:46:34 2025 Sun Jul 13 17:33:47 2025   01:47:13        268.645
168    3       Sun Jul 13 17:40:45 2025 Sun Jul 13 19:26:07 2025   01:45:22        273.293
170    1       Sun Jul 13 19:51:00 2025 Sun Jul 13 21:37:15 2025   01:46:15        271.044
170    2       Sun Jul 13 21:46:35 2025 Sun Jul 13 23:27:03 2025   01:40:28        286.633
170    3       Sun Jul 13 23:36:26 2025 Mon Jul 14 01:19:27 2025   01:43:01        279.567
171    1       Mon Jul 14 01:28:42 2025 Mon Jul 14 03:02:23 2025   01:33:41        307.380
171    2       Mon Jul 14 02:03:43 2025 Mon Jul 14 03:37:24 2025   01:33:41        307.386
171    3       Mon Jul 14 03:14:54 2025 Mon Jul 14 04:50:42 2025   01:35:48        300.634
172    1       Mon Jul 14 05:08:08 2025 Mon Jul 14 06:30:29 2025   01:22:21        349.764
172    2       Mon Jul 14 06:44:54 2025 Mon Jul 14 08:27:06 2025   01:42:12        281.798
172    3       Mon Jul 14 08:31:36 2025 Mon Jul 14 09:54:01 2025   01:22:25        349.390
173    1       Mon Jul 14 10:03:22 2025 Mon Jul 14 11:21:52 2025   01:18:30        366.873
173    2       Mon Jul 14 11:30:19 2025 Mon Jul 14 12:49:10 2025   01:18:51        365.256
173    3       Mon Jul 14 12:58:48 2025 Mon Jul 14 14:34:43 2025   01:35:55        300.292
174    1       Tue Jul 15 03:03:47 2025 Tue Jul 15 04:47:13 2025   01:43:26        278.427
174    2       Tue Jul 15 04:55:37 2025 Tue Jul 15 06:37:57 2025   01:42:20        281.422
174    3       Tue Jul 15 06:45:13 2025 Tue Jul 15 08:26:50 2025   01:41:37        283.429
175    1       Tue Jul 15 08:35:03 2025 Tue Jul 15 10:19:48 2025   01:44:45        274.961
175    2       Tue Jul 15 10:30:31 2025 Tue Jul 15 11:56:42 2025   01:26:11        334.114
175    3       Tue Jul 15 12:04:41 2025 Tue Jul 15 13:52:58 2025   01:48:17        265.947
176    1       Tue Jul 15 13:59:11 2025 Tue Jul 15 15:21:06 2025   01:21:55        351.520
176    2       Tue Jul 15 15:28:50 2025 Tue Jul 15 17:13:24 2025   01:44:34        275.454
176    3       Tue Jul 15 17:23:09 2025 Tue Jul 15 18:50:11 2025   01:27:02        330.937
177    1       Tue Jul 15 19:00:55 2025 Tue Jul 15 20:36:01 2025   01:35:06        302.824
177    2       Tue Jul 15 20:45:06 2025 Tue Jul 15 22:20:17 2025   01:35:11        302.577
177    3       Tue Jul 15 22:30:04 2025 Wed Jul 16 00:05:38 2025   01:35:34        301.389
178    1       Wed Jul 16 00:14:45 2025 Wed Jul 16 01:57:47 2025   01:43:02        279.525
178    2       Wed Jul 16 02:06:36 2025 Wed Jul 16 03:53:45 2025   01:47:09        268.761
178    3       Wed Jul 16 04:05:55 2025 Wed Jul 16 05:30:21 2025   01:24:26        341.151
179    1       Wed Jul 16 05:41:47 2025 Wed Jul 16 07:32:45 2025   01:50:58        259.541
179    2       Wed Jul 16 07:43:43 2025 Wed Jul 16 09:31:28 2025   01:47:45        267.259
179    3       Wed Jul 16 09:45:50 2025 Wed Jul 16 11:37:04 2025   01:51:14        258.899
180    1       Wed Jul 16 11:42:28 2025 Wed Jul 16 12:59:59 2025   01:17:31        371.494
180    2       Wed Jul 16 13:05:33 2025 Wed Jul 16 14:23:56 2025   01:18:23        367.446
180    3       Wed Jul 16 14:27:45 2025 Wed Jul 16 16:06:30 2025   01:38:45        291.633
181    1       Wed Jul 16 16:14:50 2025 Wed Jul 16 17:50:35 2025   01:35:45        300.770
181    2       Wed Jul 16 18:02:37 2025 Wed Jul 16 19:39:07 2025   01:36:30        298.461
181    3       Wed Jul 16 19:46:51 2025 Wed Jul 16 21:23:44 2025   01:36:53        297.283
182    1       Wed Jul 16 21:31:38 2025 Wed Jul 16 23:20:56 2025   01:49:18        263.483
182    2       Wed Jul 16 23:30:07 2025 Thu Jul 17 01:20:33 2025   01:50:26        260.789
182    3       Thu Jul 17 01:29:39 2025 Thu Jul 17 03:03:15 2025   01:33:36        307.721
183    1       Thu Jul 17 03:12:28 2025 Thu Jul 17 04:55:29 2025   01:43:01        279.580
183    2       Thu Jul 17 05:00:54 2025 Thu Jul 17 06:24:47 2025   01:23:53        343.326
183    3       Thu Jul 17 06:29:53 2025 Thu Jul 17 08:18:10 2025   01:48:17        266.002
184    1       Thu Jul 17 08:26:38 2025                            --:--:--             NA
184    2       Thu Jul 17 08:34:56 2025                            --:--:--             NA
184    3       Thu Jul 17 08:45:09 2025                            --:--:--             NA
185    1       Thu Jul 17 09:19:35 2025 Thu Jul 17 10:39:11 2025   01:19:36        361.810
185    2       Thu Jul 17 10:45:35 2025 Thu Jul 17 12:22:03 2025   01:36:28        298.551
185    3       Thu Jul 17 12:27:33 2025 Thu Jul 17 13:46:36 2025   01:19:03        364.313
186    1       Thu Jul 17 13:57:42 2025 Thu Jul 17 15:38:15 2025   01:40:33        286.437
186    2       Thu Jul 17 15:42:24 2025 Thu Jul 17 17:20:03 2025   01:37:39        294.923
186    3       Thu Jul 17 17:25:07 2025 Thu Jul 17 19:04:16 2025   01:39:09        290.500
187    1       Thu Jul 17 19:08:02 2025 Thu Jul 17 21:00:05 2025   01:52:03        257.023
187    2       Thu Jul 17 21:06:27 2025 Thu Jul 17 22:35:59 2025   01:29:32        321.657
187    3       Thu Jul 17 22:40:40 2025 Fri Jul 18 00:32:05 2025   01:51:25        258.502
188    1       Fri Jul 18 00:40:09 2025 Fri Jul 18 02:34:13 2025   01:54:04        252.504
188    2       Fri Jul 18 02:38:11 2025 Fri Jul 18 04:32:43 2025   01:54:32        251.469
188    3       Fri Jul 18 04:38:19 2025 Fri Jul 18 06:31:16 2025   01:52:57        254.984
189    1       Fri Jul 18 06:16:48 2025 Fri Jul 18 07:53:02 2025   01:36:14        299.232
189    2       Fri Jul 18 06:35:06 2025 Fri Jul 18 07:55:30 2025   01:20:24        358.226
189    3       Fri Jul 18 07:59:19 2025 Fri Jul 18 09:34:53 2025   01:35:34        301.367
190    1       Fri Jul 18 08:02:14 2025 Fri Jul 18 09:55:08 2025   01:52:54        255.091
190    2       Fri Jul 18 09:58:31 2025 Fri Jul 18 11:28:23 2025   01:29:52        320.516
190    3       Fri Jul 18 11:34:56 2025 Fri Jul 18 13:05:12 2025   01:30:16        319.032
191    1       Fri Jul 18 13:10:20 2025 Fri Jul 18 14:29:29 2025   01:19:09        363.914
191    2       Fri Jul 18 14:35:46 2025 Fri Jul 18 15:57:27 2025   01:21:41        352.539
191    3       Fri Jul 18 16:04:53 2025 Fri Jul 18 17:26:49 2025   01:21:56        351.438
192    1       Fri Jul 18 17:33:08 2025 Fri Jul 18 19:16:49 2025   01:43:41        277.779
192    2       Fri Jul 18 19:20:43 2025 Fri Jul 18 21:04:33 2025   01:43:50        277.359
192    3       Fri Jul 18 21:13:26 2025 Fri Jul 18 22:59:37 2025   01:46:11        271.209
193    1       Fri Jul 18 23:07:40 2025 Sat Jul 19 00:30:39 2025   01:22:59        347.075
193    2       Sat Jul 19 00:37:21 2025 Sat Jul 19 02:21:13 2025   01:43:52        277.270
193    3       Sat Jul 19 02:25:13 2025 Sat Jul 19 04:07:49 2025   01:42:36        280.733>"""
# Replace the "..." with the full middle of your block, or set DATA=None and
# use a file path:

# Option B: read from a file path (set to a string like "/path/to/file.txt")
DATA_FILE = None  # e.g., "/home/you/sim_table.txt"

# Write a CSV too?
WRITE_CSV = True
CSV_PATH = "simulation_times.csv"

# === PARSER =================================================================

def parse_block(text: str) -> List[Dict[str, Optional[str]]]:
    rows: List[Dict[str, Optional[str]]] = []
    for raw in text.strip().splitlines():
        line = raw.strip()
        if not line:
            continue

        # Strip leading "<" and trailing ">" if present
        if line.startswith("<"):
            line = line[1:].lstrip()
        if line.endswith(">"):
            line = line[:-1].rstrip()

        parts = line.split()
        if len(parts) < 8:
            # too short to be valid; skip
            continue

        mol = parts[0]
        rep = parts[1]

        # start timestamp is 5 tokens after rep
        start = " ".join(parts[2:7])

        # duration + performance are always the last two tokens
        dur_token = parts[-2] if len(parts) >= 2 else ""
        perf_token = parts[-1] if len(parts) >= 1 else ""

        # end timestamp exists when we have ≥14 tokens (2 id tokens + 5 start + 5 end + 2 tail)
        end = " ".join(parts[7:12]) if len(parts) >= 14 else ""

        # normalize duration/performance
        duration = None if dur_token == "--:--:--" else dur_token
        performance = None if perf_token == "NA" else perf_token

        rows.append(
            {
                "Molecule": mol,
                "Replicate": rep,
                "Start": start,
                "End": end,
                "Duration": duration,        # "HH:MM:SS" or None
                "Performance_ns_day": performance,  # string number or None
            }
        )
    return rows


def hhmmss_to_seconds(hhmmss: str) -> int:
    h, m, s = hhmmss.split(":")
    return int(h) * 3600 + int(m) * 60 + int(s)


def seconds_to_hhmmss(sec: float) -> str:
    sec = int(round(sec))
    return str(timedelta(seconds=sec))


# === MAIN ===================================================================

# Load text
if DATA is None and DATA_FILE:
    with open(DATA_FILE, "r") as f:
        DATA = f.read()
elif DATA is None:
    raise SystemExit("No input provided. Set DATA (pasted text) or DATA_FILE (path).")

# Parse
records = parse_block(DATA)

# Compute numeric durations (seconds) for completed rows
for r in records:
    d = r["Duration"]
    r["Duration_seconds"] = hhmmss_to_seconds(d) if d else None

# Display table (prefer pandas if available)
headers = ["Molecule", "Replicate", "Start", "End", "Duration", "Performance_ns_day"]
if HAVE_PANDAS:
    df = pd.DataFrame(records, columns=headers + ["Duration_seconds"])
    # Show a clean view
    display_cols = headers  # hide the seconds helper by default
    try:
        from IPython.display import display
        display(df[display_cols])
    except Exception:
        print(df[display_cols].to_string(index=False))
else:
    # fallback pretty print
    widths = [8, 9, 24, 24, 10, 18]
    print("".join(h.ljust(w) for h, w in zip(headers, widths)))
    print("-" * sum(widths))
    for r in records:
        row = [
            r["Molecule"] or "",
            r["Replicate"] or "",
            r["Start"] or "",
            r["End"] or "",
            r["Duration"] or "",
            r["Performance_ns_day"] or "",
        ]
        print("".join(str(val).ljust(w) for val, w in zip(row, widths)))

# Write CSV if requested
if WRITE_CSV:
    with open(CSV_PATH, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        for r in records:
            writer.writerow([
                r["Molecule"],
                r["Replicate"],
                r["Start"],
                r["End"],
                r["Duration"] if r["Duration"] else "",
                r["Performance_ns_day"] if r["Performance_ns_day"] else "",
            ])
    print(f"\nSaved CSV → {CSV_PATH}")

# Averages
completed_secs = [r["Duration_seconds"] for r in records if r["Duration_seconds"] is not None]
overall_avg = seconds_to_hhmmss(sum(completed_secs)/len(completed_secs)) if completed_secs else "NA"

# per-molecule avg (only completed)
per_mol: Dict[str, List[int]] = {}
for r in records:
    if r["Duration_seconds"] is not None:
        per_mol.setdefault(r["Molecule"], []).append(r["Duration_seconds"])

per_mol_avg = {mol: seconds_to_hhmmss(sum(v)/len(v)) for mol, v in per_mol.items()}

print("\n[AVERAGES]")
print(f"Overall average duration (completed only): {overall_avg}")
print(f"Per-molecule average durations (completed only):")
# Pretty print sorted by molecule id (as int if possible)
def sort_key(k: str):
    try:
        return int(k)
    except:
        return k

for mol in sorted(per_mol_avg.keys(), key=sort_key):
    print(f"  {mol}: {per_mol_avg[mol]}")

Unnamed: 0,Molecule,Replicate,Start,End,Duration,Performance_ns_day
0,134,1,Tue Jul 8 23:56:15 2025,Wed Jul 9 01:43:09 2025,01:46:54,269.406
1,134,2,Tue Jul 8 23:58:02 2025,Wed Jul 9 01:39:24 2025,01:41:22,284.135
2,134,3,Wed Jul 9 00:27:36 2025,Wed Jul 9 01:53:07 2025,01:25:31,336.763
3,135,1,Wed Jul 9 01:42:29 2025,Wed Jul 9 03:08:26 2025,01:25:57,335.085
4,135,2,Wed Jul 9 01:46:16 2025,Wed Jul 9 03:11:36 2025,01:25:20,337.534
...,...,...,...,...,...,...
169,192,2,Fri Jul 18 19:20:43 2025,Fri Jul 18 21:04:33 2025,01:43:50,277.359
170,192,3,Fri Jul 18 21:13:26 2025,Fri Jul 18 22:59:37 2025,01:46:11,271.209
171,193,1,Fri Jul 18 23:07:40 2025,Sat Jul 19 00:30:39 2025,01:22:59,347.075
172,193,2,Sat Jul 19 00:37:21 2025,Sat Jul 19 02:21:13 2025,01:43:52,277.270



Saved CSV → simulation_times.csv

[AVERAGES]
Overall average duration (completed only): 1:34:54
Per-molecule average durations (completed only):
  134: 1:37:56
  135: 1:32:03
  136: 1:29:34
  137: 1:25:34
  138: 1:29:38
  139: 1:32:28
  140: 1:52:27
  141: 1:24:00
  142: 1:42:49
  143: 1:36:51
  144: 1:19:04
  145: 1:46:56
  146: 1:21:20
  147: 1:15:04
  148: 1:23:57
  149: 1:31:56
  151: 1:30:14
  152: 1:37:34
  153: 1:25:51
  154: 1:39:31
  155: 1:43:08
  156: 1:43:39
  157: 1:39:46
  158: 1:24:16
  159: 1:34:12
  160: 1:27:21
  161: 1:42:40
  162: 1:43:52
  163: 1:23:54
  164: 1:36:56
  165: 1:47:06
  166: 1:30:00
  167: 1:26:44
  168: 1:45:58
  170: 1:43:15
  171: 1:34:23
  172: 1:28:59
  173: 1:24:25
  174: 1:42:28
  175: 1:39:44
  176: 1:31:10
  177: 1:35:17
  178: 1:38:12
  179: 1:49:59
  180: 1:24:53
  181: 1:36:23
  182: 1:44:27
  183: 1:38:24
  185: 1:25:02
  186: 1:39:07
  187: 1:44:20
  188: 1:53:51
  189: 1:30:44
  190: 1:37:41
  191: 1:20:55
  192: 1:44:34
  193: 1:36:29