# 📊 Create Training Data from Experimental CSV

This notebook transforms your experimental data (Daten_clean.csv) into the format required for Fluor-RLAT training.

**Input:** CSV with columns: `name, solvent, abs, em, epsilon, mw, plqy, smiles`

**Output:** Training data files for each property:
- `new_train_{target}.csv` - Main data (152 columns)
- `new_train_smiles_{target}.csv` - Molecule Morgan fingerprints (1024 columns)
- `new_train_sol_{target}.csv` - Solvent Morgan fingerprints (1024 columns)

---

## 1. Setup

In [1]:
# ============================================================================
# Install Dependencies
# ============================================================================
!pip install rdkit -q
print("‚úÖ Dependencies installed!")

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m36.7/36.7 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h‚úÖ Dependencies installed!


In [2]:
# ============================================================================
# Import Libraries
# ============================================================================
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors

print("‚úÖ Libraries imported")

‚úÖ Libraries imported


In [3]:
# ============================================================================
# Mount Google Drive
# ============================================================================
# Makes the Drive paths used by INPUT_CSV / OUTPUT_DIR (under
# /content/drive/MyDrive) available to the rest of the notebook.
from google.colab import drive

# Check if already mounted
# (`os` is already imported by the import cell; this repeat is harmless.)
import os
# The presence of the MyDrive folder is used as the "already mounted" signal,
# which lets this cell be re-run without calling drive.mount() again.
if os.path.exists('/content/drive/MyDrive'):
    print("‚úÖ Google Drive already mounted")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# ============================================================================
# Clone Repository (for reference data)
# ============================================================================
REPO_URL = "https://github.com/markste-in/fluor_tools.git"
REPO_DIR = "fluor_tools"

if not os.path.exists(REPO_DIR):
    print(f"üì• Cloning repository...")
    !git clone {REPO_URL} -q
    print("‚úÖ Repository cloned!")
else:
    print(f"‚úÖ Repository already exists")

DATA_DIR = f'./{REPO_DIR}/Fluor-RLAT/data'
print(f"üìÅ Reference data: {DATA_DIR}")

üì• Cloning repository...
‚úÖ Repository cloned!
üìÅ Reference data: ./fluor_tools/Fluor-RLAT/data


## 2. Configuration

In [5]:
# ============================================================================
# Configuration
# ============================================================================
# All user-tunable paths and lookup tables live in this cell.

# Input: Your experimental data CSV
INPUT_CSV = '/content/drive/MyDrive/fluor_models/Daten_clean.csv'

# Output: Where to save the training data
OUTPUT_DIR = '/content/drive/MyDrive/fluor_models/training_data'

# Create output directory
# (exist_ok=True keeps the cell re-runnable when the folder already exists.)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Solvent name -> SMILES mapping.
# Keys are solvent names as they may appear in the input CSV's `solvent`
# column: German and English spellings plus common abbreviations. Several
# keys intentionally map to the same SMILES. Names missing from this table
# cause the corresponding row to be skipped by the processing cell.
SOLVENT_NAME_TO_SMILES = {
    'Toluol': 'Cc1ccccc1',
    'toluol': 'Cc1ccccc1',
    'Toluene': 'Cc1ccccc1',
    'toluene': 'Cc1ccccc1',
    'EtOH': 'CCO',
    'Ethanol': 'CCO',
    'MeOH': 'CO',
    'Methanol': 'CO',
    'DCM': 'ClCCl',
    'CH2Cl2': 'ClCCl',
    'Dichlormethan': 'ClCCl',
    'CHCl3': 'ClC(Cl)Cl',
    'Chloroform': 'ClC(Cl)Cl',
    'Benzol': 'c1ccccc1',
    'Benzene': 'c1ccccc1',
    'DMSO': 'CS(C)=O',
    'Wasser': 'O',
    'Water': 'O',
    'Aceton': 'CC(C)=O',
    'Acetone': 'CC(C)=O',
    'THF': 'C1CCOC1',
    'Cyclohexan': 'C1CCCCC1',
    'Cyclohexane': 'C1CCCCC1',
    'Hexan': 'CCCCCC',
    'Hexane': 'CCCCCC',
    'Acetonitril': 'CC#N',
    'Acetonitrile': 'CC#N',
    'ACN': 'CC#N',
    'DMF': 'CN(C)C=O',
    'Diethylether': 'CCOCC',
    'Et2O': 'CCOCC',
}

print(f"üìÇ Input:  {INPUT_CSV}")
print(f"üìÇ Output: {OUTPUT_DIR}")

üìÇ Input:  /content/drive/MyDrive/fluor_models/Daten_clean.csv
üìÇ Output: /content/drive/MyDrive/fluor_models/training_data


## 3. Load Reference Data

In [6]:
# ============================================================================
# Load Reference Data (solvent mapping and substructures)
# ============================================================================

# Load solvent mapping from original training data
# (columns used: `solvent` = solvent SMILES, `solvent_num` = integer id).
solvent_mapping_df = pd.read_csv(f'{DATA_DIR}/00_solvent_mapping.csv')
SOLVENT_SMILES_TO_NUM = dict(zip(solvent_mapping_df['solvent'], solvent_mapping_df['solvent_num']))
print(f"‚úÖ Loaded {len(SOLVENT_SMILES_TO_NUM)} solvent mappings")

# Load substructure patterns for scaffold detection.
# Each entry is (row index in the CSV, compiled RDKit query); the row index is
# kept so that a fragment that fails to compile does not shift the positions
# of the remaining scaffold flags.
substructure_df = pd.read_csv(f'{DATA_DIR}/00_mmp_substructure.csv')
SUBSTRUCTURE_PATTERNS = []
unparsed_fragments = []  # (row index, reason) for every fragment that was dropped
for idx, row in substructure_df.iterrows():
    try:
        # Replace attachment point with wildcard for substructure matching
        smarts = row['fragment'].replace('[*:1]', '*')
        pattern = Chem.MolFromSmarts(smarts)
    except Exception as exc:
        # Still best-effort, but record the failure instead of hiding it, and
        # do not swallow KeyboardInterrupt/SystemExit as a bare `except` would.
        unparsed_fragments.append((idx, repr(exc)))
        continue
    if pattern:
        SUBSTRUCTURE_PATTERNS.append((idx, pattern))
    else:
        unparsed_fragments.append((idx, 'not a valid SMARTS pattern'))
print(f"‚úÖ Loaded {len(SUBSTRUCTURE_PATTERNS)} substructure patterns")
if unparsed_fragments:
    # The corresponding scaffold flags will always be 0 for every molecule.
    print(f"‚ö†Ô∏è  {len(unparsed_fragments)} substructure pattern(s) could not be loaded:")
    for bad_idx, reason in unparsed_fragments[:10]:
        print(f"  row {bad_idx}: {reason}")

# Tag name mapping (based on scaffold detection)
TAG_MAPPING = {
    'BODIPY': 5,
    'Coumarin': 3,
    'Rhodamine': 4,
    'Cyanine': 6,
    'PAHs': 8,
    'Other': 0
}

print("\nSolvent SMILES ‚Üí Number mapping (first 10):")
for smiles, num in list(SOLVENT_SMILES_TO_NUM.items())[:10]:
    print(f"  {smiles}: {num}")

‚úÖ Loaded 73 solvent mappings
‚úÖ Loaded 136 substructure patterns

Solvent SMILES ‚Üí Number mapping (first 10):
  ClCCl: 0
  CO: 1
  CCO: 2
  ClC(Cl)Cl: 3
  CC#N: 4
  C1CCOC1: 5
  Cc1ccccc1: 6
  CS(C)=O: 7
  O: 8
  CN(C)C=O: 9


## 4. Load Input Data

In [8]:
# ============================================================================
# Load Input CSV
# ============================================================================
# Reads the experimental data and prints a quick overview so that missing
# columns or sparse properties are visible before any processing happens.

input_df = pd.read_csv(INPUT_CSV)
print(f"üìÇ Loaded {len(input_df)} rows from {INPUT_CSV}")
print(f"\nColumns: {list(input_df.columns)}")
print(f"\nFirst 5 rows:")
# `display` is the rich-output helper provided by the notebook runtime.
display(input_df.head())

# Check data availability per property
# (counts non-missing values; properties whose column is absent are silently
# left out of the report).
print("\nüìä Data availability per property:")
for prop in ['abs', 'em', 'plqy', 'epsilon']:
    if prop in input_df.columns:
        count = input_df[prop].notna().sum()
        print(f"  {prop}: {count} samples with data")

üìÇ Loaded 126 rows from /content/drive/MyDrive/fluor_models/Daten_clean.csv

Columns: ['name', 'solvent', 'abs', 'em', 'epsilon', 'mw', 'plqy', 'smiles']

First 5 rows:


Unnamed: 0,name,solvent,abs,em,epsilon,mw,plqy,smiles
0,BBOT,Toluol,376.0,434.0,47577.985,430.57,1.0,CC(C)(C)c1ccc2oc(nc2c1)c1sc(cc1)c1oc2ccc(cc2n1...
1,Coumarin 1,,,,,231.295,,CC1=CC(=O)Oc2cc(ccc21)N(CC)CC
2,Coumarin 343,,,,,285.299,,O=C1Oc2c3CCCN4CCCc(cc2C=C1C(=O)O)c43
3,Coumarin 30,,,,,347.481,,Cn1c(nc2ccccc12)C1=Cc2ccc(cc2OC1=O)N(CC)CC
4,Coumarin 102,Toluol,373.0,420.0,,255.317,0.77,O=C1C=C(C)c2cc3CCCN4CCCc(c2O1)c43



üìä Data availability per property:
  abs: 119 samples with data
  em: 117 samples with data
  plqy: 10 samples with data
  epsilon: 78 samples with data


## 5. Feature Extraction Functions

In [9]:
# ============================================================================
# Feature Extraction Functions
# ============================================================================

def compute_morgan_fingerprint(smiles, radius=2, n_bits=1024):
    """Build the Morgan bit fingerprint of a molecule.

    Args:
        smiles: SMILES string of the molecule (or solvent).
        radius: Morgan radius passed to RDKit.
        n_bits: Number of bits requested from RDKit (``nBits``).

    Returns:
        The fingerprint converted to a numpy array of dtype int32, or
        ``None`` when RDKit cannot parse ``smiles``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(
            parsed, radius=radius, nBits=n_bits
        )
        return np.array(bit_vector, dtype=np.int32)
    # Unparseable SMILES: let the caller decide how to report it.
    return None


def compute_molecular_descriptors(smiles):
    """Calculate the scalar descriptor columns for one molecule.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        A dict with the keys ``Molecular_Weight``, ``LogP``, ``TPSA``,
        ``Double_Bond_Count`` and ``Ring_Count``, or ``None`` when RDKit
        cannot parse ``smiles``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None

    def _counts_as_double(bond):
        # Aromatic bonds are counted together with explicit double bonds.
        return bond.GetBondType() == Chem.BondType.DOUBLE or bond.GetIsAromatic()

    n_double_like = len([b for b in parsed.GetBonds() if _counts_as_double(b)])

    result = {}
    result['Molecular_Weight'] = Descriptors.MolWt(parsed)
    result['LogP'] = Descriptors.MolLogP(parsed)
    result['TPSA'] = Descriptors.TPSA(parsed)
    result['Double_Bond_Count'] = n_double_like
    result['Ring_Count'] = parsed.GetRingInfo().NumRings()
    return result


def detect_scaffold_tag(smiles):
    """Classify a molecule into a coarse scaffold family.

    The SMARTS queries are tried in a fixed priority order and the first
    match wins; if none matches, a ring-count heuristic decides between
    'PAHs' and 'Other'.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        ``(tag, tag_name)`` where ``tag`` is the integer id and ``tag_name``
        one of 'BODIPY', 'Coumarin', 'Rhodamine', 'Cyanine', 'PAHs', 'Other'.
        Unparseable SMILES yield ``(0, 'Other')``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 0, 'Other'

    # (SMARTS, tag id, tag name) in priority order.
    scaffold_queries = (
        # Boron carrying two fluorines and bonded to at least one nitrogen.
        ('[#5](-F)(-F)~[#7]', 5, 'BODIPY'),
        # Benzene ring fused with a pyrone ring.
        ('O=C1C=Cc2ccccc2O1', 3, 'Coumarin'),
        # Xanthene core with a pendant phenyl ring.
        ('c1ccc2c(c1)C(c1ccccc1O2)c1ccccc1', 4, 'Rhodamine'),
        # Polymethine chain between two nitrogens.
        ('[N+]=C-C=C-C=C-[N]', 6, 'Cyanine'),
    )
    for smarts, tag, tag_name in scaffold_queries:
        query = Chem.MolFromSmarts(smarts)
        if query and mol.HasSubstructMatch(query):
            return tag, tag_name

    # Fallback heuristic: at least three rings whose atoms are all aromatic.
    # Ring fusion itself is not checked here.
    ring_info = mol.GetRingInfo()
    if ring_info.NumRings() >= 3:
        fully_aromatic = 0
        for ring_atoms in ring_info.AtomRings():
            if all(mol.GetAtomWithIdx(a).GetIsAromatic() for a in ring_atoms):
                fully_aromatic += 1
        if fully_aromatic >= 3:
            return 8, 'PAHs'

    return 0, 'Other'


def compute_scaffold_flags(smiles, patterns, n_flags=136):
    """Compute binary scaffold flags for a set of substructure patterns.

    Args:
        smiles: SMILES string of the molecule.
        patterns: Iterable of ``(idx, pattern)`` pairs, where ``pattern`` is a
            compiled RDKit query and ``idx`` is the position of its flag.
        n_flags: Length of the returned vector. Defaults to 136, the number of
            ``fragment_*`` columns written by this notebook.

    Returns:
        An int32 numpy array of length ``n_flags``; element ``idx`` is 1 when
        the molecule contains the pattern with that index. The array is all
        zeros when ``smiles`` cannot be parsed.
    """
    flags = np.zeros(n_flags, dtype=np.int32)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return flags

    for idx, pattern in patterns:
        # Ignore indices outside the vector. The lower bound matters too:
        # a negative index would otherwise wrap around and set the wrong flag.
        if not 0 <= idx < n_flags:
            continue
        try:
            if mol.HasSubstructMatch(pattern):
                flags[idx] = 1
        except Exception:
            # Best effort: a pattern RDKit cannot match leaves its flag at 0.
            # `Exception` (not a bare except) so Ctrl-C still interrupts.
            continue

    return flags


def get_solvent_num(solvent_name, solvent_smiles_to_num, name_to_smiles):
    """Convert a solvent name to ``(solvent number, solvent SMILES)``.

    The name is first looked up exactly. If that fails and the name is a
    string, a second lookup ignores case and surrounding whitespace, so
    spellings such as ``'ETOH'`` or ``' toluol '`` resolve without needing a
    separate table entry for each variant.

    Args:
        solvent_name: Solvent name as found in the input data.
        solvent_smiles_to_num: Mapping of solvent SMILES to integer id.
        name_to_smiles: Mapping of solvent name to solvent SMILES.

    Returns:
        ``(solvent_num, solvent_smiles)``. For an unknown name the result is
        ``(0, None)``. For a known name whose SMILES has no id in
        ``solvent_smiles_to_num`` the number falls back to 0.
        NOTE(review): 0 may also be a real solvent id in the reference
        mapping, so this fallback is indistinguishable from that solvent.
    """
    # First try to get SMILES from the name exactly as given
    solvent_smiles = name_to_smiles.get(solvent_name)

    if not solvent_smiles and isinstance(solvent_name, str):
        # Fallback: case- and whitespace-insensitive match
        wanted = solvent_name.strip().casefold()
        solvent_smiles = next(
            (
                smi
                for name, smi in name_to_smiles.items()
                if isinstance(name, str) and name.strip().casefold() == wanted
            ),
            None,
        )

    if not solvent_smiles:
        return 0, None

    # Then get number from SMILES
    return solvent_smiles_to_num.get(solvent_smiles, 0), solvent_smiles


print("‚úÖ Feature extraction functions defined")

‚úÖ Feature extraction functions defined


## 6. Process Data

In [10]:
# ============================================================================
# Process All Molecules
# ============================================================================
# Turns every usable row of input_df into (a) one record of the main table,
# (b) one molecule fingerprint and (c) one solvent fingerprint. Rows that
# cannot be processed are collected in `skipped` together with the reason.
from tqdm.notebook import tqdm

# The three result lists are appended to together at the end of an iteration,
# so position i in each list always refers to the same molecule.
processed_rows = []
smiles_fps = []
solvent_fps = []
skipped = []  # (molecule name or row index, reason)

print("üîÑ Processing molecules...\n")

for idx, row in tqdm(input_df.iterrows(), total=len(input_df), desc="Processing"):
    smiles = row.get('smiles')
    # str() turns a missing solvent (NaN) into the text 'nan', which is then
    # reported below as an unknown solvent.
    solvent_name = str(row.get('solvent', '')).strip()
    
    # Skip if no SMILES
    if pd.isna(smiles) or not smiles:
        skipped.append((row.get('name', idx), 'No SMILES'))
        continue
    
    # Get solvent info
    solvent_num, solvent_smiles = get_solvent_num(
        solvent_name, SOLVENT_SMILES_TO_NUM, SOLVENT_NAME_TO_SMILES
    )
    
    # A row without a resolvable solvent cannot get a solvent fingerprint.
    if solvent_smiles is None:
        skipped.append((row.get('name', idx), f'Unknown solvent: {solvent_name}'))
        continue
    
    # Compute Morgan fingerprints
    mol_fp = compute_morgan_fingerprint(smiles)
    sol_fp = compute_morgan_fingerprint(solvent_smiles)
    
    # None means RDKit could not parse the corresponding SMILES.
    if mol_fp is None:
        skipped.append((row.get('name', idx), 'Invalid molecule SMILES'))
        continue
    if sol_fp is None:
        skipped.append((row.get('name', idx), 'Invalid solvent SMILES'))
        continue
    
    # Compute molecular descriptors
    descriptors = compute_molecular_descriptors(smiles)
    if descriptors is None:
        skipped.append((row.get('name', idx), 'Could not compute descriptors'))
        continue
    
    # Detect scaffold tag
    tag, tag_name = detect_scaffold_tag(smiles)
    
    # Compute scaffold flags
    scaffold_flags = compute_scaffold_flags(smiles, SUBSTRUCTURE_PATTERNS)
    
    # Compute log(epsilon) -> k
    # k is log10 of epsilon; it stays NaN when epsilon is missing or not
    # positive, so such rows are simply excluded from the 'k' target later.
    epsilon = row.get('epsilon', np.nan)
    k = np.log10(epsilon) if pd.notna(epsilon) and epsilon > 0 else np.nan
    
    # Build the row matching train_*.csv format
    # NOTE(review): the column order below is presumably required to match the
    # reference train_*.csv files - confirm before reordering.
    processed_row = {
        'split': 'train',  # Will be added to training set
        'smiles': smiles,
        'solvent': solvent_smiles,
        'abs': row.get('abs', np.nan),
        'em': row.get('em', np.nan),
        'plqy': row.get('plqy', np.nan),
        'k': k,
        'tag_name': tag_name,
        'solvent_num': solvent_num,
        'tag': tag,
        'Molecular_Weight': descriptors['Molecular_Weight'],
        'LogP': descriptors['LogP'],
        'TPSA': descriptors['TPSA'],
        'Double_Bond_Count': descriptors['Double_Bond_Count'],
        'Ring_Count': descriptors['Ring_Count'],
        'unimol_plus': 3.0,  # Default value
    }
    
    # Add scaffold flags
    # (columns are numbered from 1: flag index i is stored as fragment_{i+1})
    for i in range(136):
        processed_row[f'fragment_{i+1}'] = scaffold_flags[i]
    
    processed_rows.append(processed_row)
    smiles_fps.append(mol_fp)
    solvent_fps.append(sol_fp)

print(f"\n‚úÖ Processed {len(processed_rows)} molecules")
print(f"‚ö†Ô∏è  Skipped {len(skipped)} molecules")

# Only the first 10 skipped entries are listed to keep the output short.
if skipped:
    print("\nSkipped molecules:")
    for name, reason in skipped[:10]:
        print(f"  ‚Ä¢ {name}: {reason}")
    if len(skipped) > 10:
        print(f"  ... and {len(skipped) - 10} more")

üîÑ Processing molecules...



Processing:   0%|          | 0/126 [00:00<?, ?it/s]

[08:59:50] Explicit valence for atom # 10 B, 4, is greater than permitted



‚úÖ Processed 114 molecules
‚ö†Ô∏è  Skipped 12 molecules

Skipped molecules:
  ‚Ä¢ Coumarin 1: Unknown solvent: nan
  ‚Ä¢ Coumarin 343¬†¬†¬†: Unknown solvent: nan
  ‚Ä¢ Coumarin 30: Unknown solvent: nan
  ‚Ä¢ Coumarin 153: Unknown solvent: nan
  ‚Ä¢ FR-NH2: Unknown solvent: 10mM Tris pH8
  ‚Ä¢ Nilblau: Unknown solvent: nan
  ‚Ä¢ Rh-123: Unknown solvent: nan
  ‚Ä¢ Rh-110: Unknown solvent: nan
  ‚Ä¢ Rh-B: Unknown solvent: 10mM Tris pH8
  ‚Ä¢ Rh-NH2: Unknown solvent: 10mM Tris pH8
  ... and 2 more




## 7. Create DataFrames

In [11]:
# ============================================================================
# Create DataFrames
# ============================================================================
# Converts the lists built by the processing cell into three DataFrames that
# share the same row order (row i of each frame describes the same molecule).

# Main data DataFrame
main_df = pd.DataFrame(processed_rows)

# Fingerprint DataFrames
# The 1024 column names correspond to the default n_bits of
# compute_morgan_fingerprint, which is what the processing cell uses.
smiles_fp_df = pd.DataFrame(smiles_fps, columns=[f'smiles_fp_{i}' for i in range(1024)])
solvent_fp_df = pd.DataFrame(solvent_fps, columns=[f'sol_fp_{i}' for i in range(1024)])

print(f"‚úÖ Main DataFrame: {main_df.shape}")
print(f"‚úÖ SMILES FP DataFrame: {smiles_fp_df.shape}")
print(f"‚úÖ Solvent FP DataFrame: {solvent_fp_df.shape}")

# Show summary
print("\nüìä Data summary:")
print(main_df[['smiles', 'solvent', 'abs', 'em', 'plqy', 'k', 'tag_name']].head(10))

‚úÖ Main DataFrame: (114, 152)
‚úÖ SMILES FP DataFrame: (114, 1024)
‚úÖ Solvent FP DataFrame: (114, 1024)

üìä Data summary:
                                              smiles    solvent    abs     em  \
0  CC(C)(C)c1ccc2oc(nc2c1)c1sc(cc1)c1oc2ccc(cc2n1...  Cc1ccccc1  376.0  434.0   
1                  O=C1C=C(C)c2cc3CCCN4CCCc(c2O1)c43  Cc1ccccc1  373.0  420.0   
2            O=C1Oc2cc(ccc2C=C1c1sc2ccccc2n1)N(CC)CC  Cc1ccccc1  439.0  487.0   
3  O=C(N1CCC(CC1)CN)c1ccccc1C1=C2C=C\C(=N/CC(F)(F...         CO  512.0  533.0   
4        O=C1Oc2cc(ccc2C=C1c1oc2ccc(Cl)cc2n1)N(CC)CC  Cc1ccccc1  434.0  475.0   
5       O=C1Oc2cc(ccc2C(C#N)=C1c1oc2ccccc2n1)N(CC)CC  Cc1ccccc1  523.0  566.0   
6       O=C1Oc2cc(ccc2C(C#N)=C1c1oc2ccccc2n1)N(CC)CC  ClC(Cl)Cl  534.0  585.0   
7            O=C1C=C2Oc3cc(ccc3N=C2c2ccccc12)N(CC)CC  Cc1ccccc1  525.0  585.0   
8  CC\1=CC2=C(c3cc(C)c(NCC)cc3OC2=C/C/1=[NH+]/CC)...         CO  528.0  551.0   
9  O=C([O-])c1ccccc1C1=C2C=C3CCC[N+]=4CCCC(=C2Oc2...         CO 

## 8. Save Training Data Files

In [12]:
# ============================================================================
# Save Training Data for Each Target Property
# ============================================================================
# For every target, writes three row-aligned CSVs to OUTPUT_DIR containing
# only the molecules that have a value for that target.

# Target column in main_df -> human-readable description (used for printing).
targets = {
    'abs': 'Absorption wavelength (nm)',
    'em': 'Emission wavelength (nm)',
    'plqy': 'Quantum yield (0-1)',
    'k': 'Log molar absorptivity'
}

print("üíæ Saving training data files...\n")

for target, description in targets.items():
    # Filter rows with valid target values
    # The same boolean mask is applied to all three frames; this relies on
    # them sharing the same row order and index, as built in the previous cell.
    mask = main_df[target].notna()
    target_df = main_df[mask].copy()
    target_smiles_fp = smiles_fp_df[mask].copy()
    target_solvent_fp = solvent_fp_df[mask].copy()
    
    if len(target_df) == 0:
        print(f"‚ö†Ô∏è  {target}: No valid data, skipping")
        continue
    
    # Reset indices
    target_df = target_df.reset_index(drop=True)
    target_smiles_fp = target_smiles_fp.reset_index(drop=True)
    target_solvent_fp = target_solvent_fp.reset_index(drop=True)
    
    # Save files
    # (existing files with the same name in OUTPUT_DIR are overwritten)
    main_path = os.path.join(OUTPUT_DIR, f'new_train_{target}.csv')
    smiles_path = os.path.join(OUTPUT_DIR, f'new_train_smiles_{target}.csv')
    solvent_path = os.path.join(OUTPUT_DIR, f'new_train_sol_{target}.csv')
    
    target_df.to_csv(main_path, index=False)
    target_smiles_fp.to_csv(smiles_path, index=False)
    target_solvent_fp.to_csv(solvent_path, index=False)
    
    print(f"‚úÖ {target} ({description}):")
    print(f"   ‚Ä¢ {len(target_df)} samples")
    print(f"   ‚Ä¢ {main_path}")
    print(f"   ‚Ä¢ {smiles_path}")
    print(f"   ‚Ä¢ {solvent_path}")
    print()

üíæ Saving training data files...

‚úÖ abs (Absorption wavelength (nm)):
   ‚Ä¢ 114 samples
   ‚Ä¢ /content/drive/MyDrive/fluor_models/training_data/new_train_abs.csv
   ‚Ä¢ /content/drive/MyDrive/fluor_models/training_data/new_train_smiles_abs.csv
   ‚Ä¢ /content/drive/MyDrive/fluor_models/training_data/new_train_sol_abs.csv

‚úÖ em (Emission wavelength (nm)):
   ‚Ä¢ 112 samples
   ‚Ä¢ /content/drive/MyDrive/fluor_models/training_data/new_train_em.csv
   ‚Ä¢ /content/drive/MyDrive/fluor_models/training_data/new_train_smiles_em.csv
   ‚Ä¢ /content/drive/MyDrive/fluor_models/training_data/new_train_sol_em.csv

‚úÖ plqy (Quantum yield (0-1)):
   ‚Ä¢ 10 samples
   ‚Ä¢ /content/drive/MyDrive/fluor_models/training_data/new_train_plqy.csv
   ‚Ä¢ /content/drive/MyDrive/fluor_models/training_data/new_train_smiles_plqy.csv
   ‚Ä¢ /content/drive/MyDrive/fluor_models/training_data/new_train_sol_plqy.csv

‚úÖ k (Log molar absorptivity):
   ‚Ä¢ 74 samples
   ‚Ä¢ /content/drive/MyDrive/fluor_models

## 9. Merge with Existing Training Data (Optional)

In [13]:
# ============================================================================
# Merge New Data with Existing Training Data
# ============================================================================
# Appends the new_train_* files from OUTPUT_DIR to the reference train_* files
# from DATA_DIR and writes the result as merged_train_* into OUTPUT_DIR.

MERGE_WITH_EXISTING = True  # Set to False to skip merging


def _original_training_file(path):
    """Return the pristine copy of a reference training file.

    The deployment cell of this notebook replaces ``train_*.csv`` in DATA_DIR
    with the merged data and keeps the original as ``<name>.bak``. Reading
    that backup when it exists keeps this cell idempotent: re-running it after
    a deployment does not append the new rows a second time.
    """
    backup_path = f'{path}.bak'
    return backup_path if os.path.exists(backup_path) else path


if MERGE_WITH_EXISTING:
    print("üîÄ Merging new data with existing training data...\n")
    
    for target in ['abs', 'em', 'plqy', 'k']:
        # Paths
        existing_main = _original_training_file(os.path.join(DATA_DIR, f'train_{target}.csv'))
        existing_smiles = _original_training_file(os.path.join(DATA_DIR, f'train_smiles_{target}.csv'))
        existing_solvent = _original_training_file(os.path.join(DATA_DIR, f'train_sol_{target}.csv'))
        
        new_main = os.path.join(OUTPUT_DIR, f'new_train_{target}.csv')
        new_smiles = os.path.join(OUTPUT_DIR, f'new_train_smiles_{target}.csv')
        new_solvent = os.path.join(OUTPUT_DIR, f'new_train_sol_{target}.csv')
        
        # Check if new data exists
        if not os.path.exists(new_main):
            print(f"‚ö†Ô∏è  {target}: No new data to merge")
            continue
        
        # Load existing data
        existing_main_df = pd.read_csv(existing_main)
        existing_smiles_df = pd.read_csv(existing_smiles)
        existing_solvent_df = pd.read_csv(existing_solvent)
        
        # Load new data
        new_main_df = pd.read_csv(new_main)
        new_smiles_df = pd.read_csv(new_smiles)
        new_solvent_df = pd.read_csv(new_solvent)
        
        # pd.concat aligns the main tables on column names, so a mismatch
        # would silently produce extra, NaN-padded columns. Surface it.
        mismatched_cols = set(existing_main_df.columns) ^ set(new_main_df.columns)
        if mismatched_cols:
            print(f"‚ö†Ô∏è  {target}: main-table columns differ between existing and new data: {sorted(mismatched_cols)}")
        
        # Rename columns to match existing format
        # (positional rename; raises if the number of fingerprint columns differs)
        new_smiles_df.columns = existing_smiles_df.columns
        new_solvent_df.columns = existing_solvent_df.columns
        
        # Concatenate (existing rows first, new rows appended)
        merged_main = pd.concat([existing_main_df, new_main_df], ignore_index=True)
        merged_smiles = pd.concat([existing_smiles_df, new_smiles_df], ignore_index=True)
        merged_solvent = pd.concat([existing_solvent_df, new_solvent_df], ignore_index=True)
        
        # Save merged files
        merged_main_path = os.path.join(OUTPUT_DIR, f'merged_train_{target}.csv')
        merged_smiles_path = os.path.join(OUTPUT_DIR, f'merged_train_smiles_{target}.csv')
        merged_solvent_path = os.path.join(OUTPUT_DIR, f'merged_train_sol_{target}.csv')
        
        merged_main.to_csv(merged_main_path, index=False)
        merged_smiles.to_csv(merged_smiles_path, index=False)
        merged_solvent.to_csv(merged_solvent_path, index=False)
        
        print(f"‚úÖ {target}: {len(existing_main_df)} existing + {len(new_main_df)} new = {len(merged_main)} total")
        print(f"   ‚Üí {merged_main_path}")
        print()
else:
    print("‚ÑπÔ∏è  Skipping merge with existing data (MERGE_WITH_EXISTING=False)")

üîÄ Merging new data with existing training data...

‚úÖ abs: 21948 existing + 114 new = 22062 total
   ‚Üí /content/drive/MyDrive/fluor_models/training_data/merged_train_abs.csv

‚úÖ em: 16833 existing + 112 new = 16945 total
   ‚Üí /content/drive/MyDrive/fluor_models/training_data/merged_train_em.csv

‚úÖ plqy: 12998 existing + 10 new = 13008 total
   ‚Üí /content/drive/MyDrive/fluor_models/training_data/merged_train_plqy.csv

‚úÖ k: 6976 existing + 74 new = 7050 total
   ‚Üí /content/drive/MyDrive/fluor_models/training_data/merged_train_k.csv



## 10. Summary & Next Steps

In [14]:
# ============================================================================
# Summary
# ============================================================================
# Lists the contents of OUTPUT_DIR and prints instructions for using the
# generated files in training.

print("=" * 70)
print("üìä SUMMARY")
print("=" * 70)

print(f"\nüìÇ Output directory: {OUTPUT_DIR}")
print(f"\nüìÅ Files created:")

# Every entry currently in OUTPUT_DIR is listed, including files left over
# from earlier runs. Each CSV is read completely just to count its rows,
# which takes a while for the large merged fingerprint files.
for f in sorted(os.listdir(OUTPUT_DIR)):
    fpath = os.path.join(OUTPUT_DIR, f)
    size = os.path.getsize(fpath)
    rows = len(pd.read_csv(fpath)) if f.endswith('.csv') else 0
    print(f"   ‚Ä¢ {f}: {rows} rows, {size/1024:.1f} KB")

print("\n" + "=" * 70)
print("üìã NEXT STEPS")
print("=" * 70)
print("""
To use this data for training:

1. **Option A: Use merged data (recommended)**
   - Copy merged_train_*.csv files to replace the original train_*.csv
   - This combines your new data with the original ~22k samples
   - Run the training notebook with the merged data

2. **Option B: Fine-tune on new data only**
   - Use new_train_*.csv files directly
   - Start from pretrained models and fine-tune
   - Smaller dataset may lead to overfitting

3. **Update training notebook paths:**
   In Fluor_RLAT_Training.ipynb, change DATA_DIR to:
   DATA_DIR = '/content/drive/MyDrive/fluor_models/training_data'
   
   And use the merged files:
   train_df = pd.read_csv(f'{DATA_DIR}/merged_train_{target}.csv')
""")

print("\n‚úÖ Done!")

üìä SUMMARY

üìÇ Output directory: /content/drive/MyDrive/fluor_models/training_data

üìÅ Files created:
   ‚Ä¢ merged_train_abs.csv: 22062 rows, 9612.7 KB
   ‚Ä¢ merged_train_em.csv: 16945 rows, 7386.9 KB
   ‚Ä¢ merged_train_k.csv: 7050 rows, 3114.5 KB
   ‚Ä¢ merged_train_plqy.csv: 13008 rows, 5712.1 KB
   ‚Ä¢ merged_train_smiles_abs.csv: 22062 rows, 44127.9 KB
   ‚Ä¢ merged_train_smiles_em.csv: 16945 rows, 33893.9 KB
   ‚Ä¢ merged_train_smiles_k.csv: 7050 rows, 14103.9 KB
   ‚Ä¢ merged_train_smiles_plqy.csv: 13008 rows, 26019.9 KB
   ‚Ä¢ merged_train_sol_abs.csv: 22062 rows, 44127.9 KB
   ‚Ä¢ merged_train_sol_em.csv: 16945 rows, 33893.9 KB
   ‚Ä¢ merged_train_sol_k.csv: 7050 rows, 14103.9 KB
   ‚Ä¢ merged_train_sol_plqy.csv: 13008 rows, 26019.9 KB
   ‚Ä¢ new_train_abs.csv: 114 rows, 51.1 KB
   ‚Ä¢ new_train_em.csv: 112 rows, 50.3 KB
   ‚Ä¢ new_train_k.csv: 74 rows, 34.6 KB
   ‚Ä¢ new_train_plqy.csv: 10 rows, 6.0 KB
   ‚Ä¢ new_train_smiles_abs.csv: 114 rows, 241.9 KB
   ‚Ä¢ new_tra

## 11. Deploy to Fluor-RLAT Data Directory

In [15]:
# ============================================================================
# Deploy merged data to Fluor-RLAT data directory
# ============================================================================
# This cell backs up existing files and copies the merged training data
# to the Fluor-RLAT/data directory for training

import shutil
from datetime import datetime

# Target directory (cloned repo's data folder)
TARGET_DIR = DATA_DIR  # './fluor_tools/Fluor-RLAT/data'

# Files to deploy (merged files ‚Üí train files)
# Key: file name in OUTPUT_DIR; value: name it receives in TARGET_DIR.
DEPLOY_FILES = {
    'merged_train_abs.csv': 'train_abs.csv',
    'merged_train_em.csv': 'train_em.csv',
    'merged_train_plqy.csv': 'train_plqy.csv',
    'merged_train_k.csv': 'train_k.csv',
    'merged_train_smiles_abs.csv': 'train_smiles_abs.csv',
    'merged_train_smiles_em.csv': 'train_smiles_em.csv',
    'merged_train_smiles_plqy.csv': 'train_smiles_plqy.csv',
    'merged_train_smiles_k.csv': 'train_smiles_k.csv',
    'merged_train_sol_abs.csv': 'train_sol_abs.csv',
    'merged_train_sol_em.csv': 'train_sol_em.csv',
    'merged_train_sol_plqy.csv': 'train_sol_plqy.csv',
    'merged_train_sol_k.csv': 'train_sol_k.csv',
}

print("=" * 70)
print("üöÄ DEPLOYING MERGED DATA TO FLUOR-RLAT")
print("=" * 70)
print(f"\nüìÇ Source: {OUTPUT_DIR}")
print(f"üìÇ Target: {TARGET_DIR}")

# Timestamp for backup suffix
# (computed once, so all backups made by one run share the same suffix)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Human-readable log entries for the report printed below.
# NOTE: `skipped` rebinds the name used by the processing cell for its list
# of skipped molecules; here it holds plain strings instead.
backed_up = []
deployed = []
skipped = []

for source_name, target_name in DEPLOY_FILES.items():
    source_path = os.path.join(OUTPUT_DIR, source_name)
    target_path = os.path.join(TARGET_DIR, target_name)
    backup_path = os.path.join(TARGET_DIR, f"{target_name}.bak")
    
    # Check if source file exists
    if not os.path.exists(source_path):
        skipped.append(f"{source_name} (source not found)")
        continue
    
    # If target exists, back it up
    if os.path.exists(target_path):
        # If .bak already exists, rename with timestamp
        # This way the first backup (<name>.bak) is never overwritten by a
        # later run of this cell.
        if os.path.exists(backup_path):
            backup_path = os.path.join(TARGET_DIR, f"{target_name}.bak.{timestamp}")
        
        shutil.move(target_path, backup_path)
        backed_up.append(f"{target_name} ‚Üí {os.path.basename(backup_path)}")
    
    # Copy the merged file to target
    # (the target was moved away above, so a failed copy leaves no train file
    # at target_path until the backup is restored manually)
    shutil.copy2(source_path, target_path)
    deployed.append(f"{source_name} ‚Üí {target_name}")

# Report
print("\n" + "-" * 70)
print("üì¶ BACKUP (existing files renamed to *.bak)")
print("-" * 70)
if backed_up:
    for item in backed_up:
        print(f"   ‚úì {item}")
else:
    print("   (no existing files to backup)")

print("\n" + "-" * 70)
print("üì§ DEPLOYED (merged files copied)")
print("-" * 70)
if deployed:
    for item in deployed:
        print(f"   ‚úì {item}")
else:
    print("   (no files deployed)")

if skipped:
    print("\n" + "-" * 70)
    print("‚ö†Ô∏è  SKIPPED")
    print("-" * 70)
    for item in skipped:
        print(f"   ‚Ä¢ {item}")

print("\n" + "=" * 70)
print(f"‚úÖ Deployment complete! {len(deployed)} files deployed, {len(backed_up)} backed up")
print("=" * 70)

# Verify deployment
# Only the four main tables are re-read; the fingerprint files are not checked.
print("\nüìä Verification - New training data sizes:")
for target_name in ['train_abs.csv', 'train_em.csv', 'train_plqy.csv', 'train_k.csv']:
    target_path = os.path.join(TARGET_DIR, target_name)
    if os.path.exists(target_path):
        rows = len(pd.read_csv(target_path))
        print(f"   ‚Ä¢ {target_name}: {rows} rows")

üöÄ DEPLOYING MERGED DATA TO FLUOR-RLAT

üìÇ Source: /content/drive/MyDrive/fluor_models/training_data
üìÇ Target: ./fluor_tools/Fluor-RLAT/data

----------------------------------------------------------------------
üì¶ BACKUP (existing files renamed to *.bak)
----------------------------------------------------------------------
   ‚úì train_abs.csv ‚Üí train_abs.csv.bak
   ‚úì train_em.csv ‚Üí train_em.csv.bak
   ‚úì train_plqy.csv ‚Üí train_plqy.csv.bak
   ‚úì train_k.csv ‚Üí train_k.csv.bak
   ‚úì train_smiles_abs.csv ‚Üí train_smiles_abs.csv.bak
   ‚úì train_smiles_em.csv ‚Üí train_smiles_em.csv.bak
   ‚úì train_smiles_plqy.csv ‚Üí train_smiles_plqy.csv.bak
   ‚úì train_smiles_k.csv ‚Üí train_smiles_k.csv.bak
   ‚úì train_sol_abs.csv ‚Üí train_sol_abs.csv.bak
   ‚úì train_sol_em.csv ‚Üí train_sol_em.csv.bak
   ‚úì train_sol_plqy.csv ‚Üí train_sol_plqy.csv.bak
   ‚úì train_sol_k.csv ‚Üí train_sol_k.csv.bak

----------------------------------------------------------------------