In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from rdkit import Chem
from rdkit.Chem import AllChem
import warnings
warnings.filterwarnings('ignore')

def smiles_to_morgan_fingerprint(smiles, radius=2, n_bits=2048):
    """
    Convert SMILES string to Morgan fingerprint

    Parameters
    ----------
    smiles : str
        SMILES text describing one molecule.
    radius : int, default 2
        Radius forwarded to RDKit's Morgan fingerprint routine.
    n_bits : int, default 2048
        Forwarded to RDKit as ``nBits``.

    Returns
    -------
    numpy.ndarray or None
        ``np.array`` of the RDKit bit vector, or ``None`` when ``smiles``
        is not a string or RDKit cannot parse it.
    """
    # A blank CSV cell is read by pandas as float NaN (and callers may also
    # pass None). Handing such a value to the RDKit parser raises instead of
    # returning None, so report it as "unparseable" the same way a bad
    # SMILES string is reported.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return np.array(fingerprint)

def load_and_process_data(filename):
    """
    Read a CSV file and build the feature matrix and target vector.

    The file is read with ``pd.read_csv``. Each entry of the
    ``canonical_smiles`` column is passed to ``smiles_to_morgan_fingerprint``;
    rows for which that returns ``None`` are dropped. Targets come from the
    ``value`` column of the rows that were kept.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the array of fingerprints and ``y`` the
        matching ``value`` entries, in file order.
    """
    table = pd.read_csv(filename)

    # Pair every row position with its fingerprint, then keep only the rows
    # whose SMILES could be converted.
    converted = [
        (row_pos, smiles_to_morgan_fingerprint(smi))
        for row_pos, smi in enumerate(table['canonical_smiles'])
    ]
    kept = [(row_pos, fp) for row_pos, fp in converted if fp is not None]

    X = np.array([fp for _, fp in kept])
    # Positional selection keeps y aligned with X regardless of the index.
    y = table['value'].iloc[[row_pos for row_pos, _ in kept]].values

    print(f"Successfully processed {len(X)} out of {len(table)} molecules")
    return X, y

def perform_cross_validation(model, X, y, model_name, cv_folds=5):
    """
    Perform k-fold cross-validation

    Both metrics are collected from a single ``cross_validate`` run, so each
    fold is fitted once and the RMSE and R² of a fold describe the same
    fitted estimator (this matters for estimators without a fixed seed).

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style regressor.
    X, y : array-like
        Features and targets to split into folds.
    model_name : str
        Label used in the printed report.
    cv_folds : int, default 5
        Number of folds for the shuffled ``KFold`` (``random_state=42``).

    Returns
    -------
    dict
        ``cv_rmse_mean``, ``cv_rmse_std``, ``cv_r2_mean``, ``cv_r2_std``
        plus the per-fold arrays ``cv_rmse_scores`` and ``cv_r2_scores``.
    """
    # Imported here so the block does not depend on the file-level import
    # list, which only brings in cross_val_score.
    from sklearn.model_selection import cross_validate

    # Define cross-validation strategy
    cv = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # One pass over the folds, scoring both metrics on the same fits
    fold_scores = cross_validate(
        model, X, y,
        cv=cv,
        scoring=('neg_mean_squared_error', 'r2'),
        n_jobs=-1,
    )

    # The scorer reports negated MSE; flip the sign before taking the root
    rmse_scores = np.sqrt(-fold_scores['test_neg_mean_squared_error'])
    r2_scores = fold_scores['test_r2']

    print(f"{model_name} Cross-Validation Results ({cv_folds}-fold):")
    print(f"  RMSE: Mean = {rmse_scores.mean():.4f}, Std = {rmse_scores.std():.4f}")
    print(f"  R²:   Mean = {r2_scores.mean():.4f}, Std = {r2_scores.std():.4f}")
    print("  Individual Fold RMSE:", ", ".join([f"{score:.4f}" for score in rmse_scores]))
    print("  Individual Fold R²:  ", ", ".join([f"{score:.4f}" for score in r2_scores]))
    print("-" * 60)

    return {
        'cv_rmse_mean': rmse_scores.mean(),
        'cv_rmse_std': rmse_scores.std(),
        'cv_r2_mean': r2_scores.mean(),
        'cv_r2_std': r2_scores.std(),
        'cv_rmse_scores': rmse_scores,
        'cv_r2_scores': r2_scores
    }

def evaluate_model(model, X_test, y_test, model_name):
    """
    Score an already-fitted model on held-out data and print a short report.

    Returns
    -------
    tuple
        ``(rmse, r2)`` computed from ``model.predict(X_test)`` against
        ``y_test``.
    """
    predictions = model.predict(X_test)

    # RMSE is the square root of sklearn's mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)

    report_lines = (
        f"{model_name} Test Set Performance:",
        f"  RMSE: {rmse:.4f}",
        f"  R²:   {r2:.4f}",
        "-" * 40,
    )
    for line in report_lines:
        print(line)

    return rmse, r2

def main(filename, cv_folds=5):
    """
    Run the full pipeline: load data, split, cross-validate, fit and test.

    The data is split 80/20 with ``random_state=42``. Three regressors are
    constructed with no arguments, cross-validated on the training portion,
    refitted on all of it and then scored on the test portion.

    Returns
    -------
    dict
        Maps each model name to a dict holding ``cv_rmse_mean``,
        ``cv_rmse_std``, ``cv_r2_mean``, ``cv_r2_std``, ``test_rmse`` and
        ``test_r2``.
    """
    print("Loading and processing data...")
    X, y = load_and_process_data(filename)

    # Hold out 20% of the rows for the final test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print(f"\nTraining set size: {X_train.shape[0]}")
    print(f"Test set size: {X_test.shape[0]}")
    print(f"Cross-validation folds: {cv_folds}")
    print()

    # Every estimator is built without arguments, i.e. library defaults
    candidates = {
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Support Vector Machine': SVR()
    }

    # Only the aggregate CV numbers are carried into the results
    cv_keys = ('cv_rmse_mean', 'cv_rmse_std', 'cv_r2_mean', 'cv_r2_std')
    rule = '=' * 60
    results = {}

    for model_name, model in candidates.items():
        print(f"\n{rule}")
        print(f"Evaluating {model_name}")
        print(rule)

        # Cross-validation sees the training portion only
        cv_summary = perform_cross_validation(
            model, X_train, y_train, model_name, cv_folds
        )

        print(f"\nTraining {model_name} on full training set...")
        model.fit(X_train, y_train)

        test_rmse, test_r2 = evaluate_model(model, X_test, y_test, model_name)

        entry = {key: cv_summary[key] for key in cv_keys}
        entry['test_rmse'] = test_rmse
        entry['test_r2'] = test_r2
        results[model_name] = entry

    return results

if __name__ == "__main__":
    # Inputs for this run
    filename = "standardized_nc_data_tp.csv"  # Change this for different files
    cv_folds = 5  # Number of cross-validation folds

    results = main(filename, cv_folds)

    banner = "=" * 70

    # Summary section
    print("\n" + banner)
    print("SUMMARY OF RESULTS")
    print(banner)

    # One formatted display row per model
    summary_data = [
        {
            'Model': model_name,
            'CV RMSE (mean±std)': f"{metrics['cv_rmse_mean']:.4f} ± {metrics['cv_rmse_std']:.4f}",
            'CV R² (mean±std)': f"{metrics['cv_r2_mean']:.4f} ± {metrics['cv_r2_std']:.4f}",
            'Test RMSE': f"{metrics['test_rmse']:.4f}",
            'Test R²': f"{metrics['test_r2']:.4f}"
        }
        for model_name, metrics in results.items()
    ]

    summary_df = pd.DataFrame(summary_data)
    print("\nModel Performance Summary:")
    print(summary_df.to_string(index=False))

    # Hyperparameter section. These lines are fixed text written here; they
    # are not read back from the estimator objects.
    print("\n" + banner)
    print("DEFAULT HYPERPARAMETERS USED")
    print(banner)

    default_hyperparameters = [
        ("K-Nearest Neighbors", [
            "n_neighbors: 5",
            "weights: 'uniform'",
            "algorithm: 'auto'",
            "leaf_size: 30",
            "p: 2 (Euclidean distance)",
            "metric: 'minkowski'",
        ]),
        ("Random Forest", [
            "n_estimators: 100",
            "criterion: 'squared_error'",
            "max_depth: None",
            "min_samples_split: 2",
            "min_samples_leaf: 1",
            "max_features: 1.0",
            "random_state: None",
        ]),
        ("Support Vector Machine", [
            "kernel: 'rbf'",
            "C: 1.0",
            "epsilon: 0.1",
            "gamma: 'scale'",
            "degree: 3",
            "coef0: 0.0",
        ]),
    ]
    for position, (heading, settings) in enumerate(default_hyperparameters):
        # Blank line between sections, none before the first or after the last
        if position:
            print()
        print(f"{heading}:")
        for setting in settings:
            print(f"  - {setting}")

    # Optional: persist the numeric results, one row per model
    results_df = pd.DataFrame.from_dict(results, orient='index')
    results_df.to_csv('model_results_with_cv.csv')
    print("\nResults saved to 'model_results_with_cv.csv'")

Loading and processing data...
Successfully processed 417 out of 417 molecules

Training set size: 333
Test set size: 84
Cross-validation folds: 5


Evaluating K-Nearest Neighbors
K-Nearest Neighbors Cross-Validation Results (5-fold):
  RMSE: Mean = 0.1309, Std = 0.0171
  R²:   Mean = 0.7329, Std = 0.0311
  Individual Fold RMSE: 0.1203, 0.1623, 0.1177, 0.1181, 0.1361
  Individual Fold R²:   0.7279, 0.7155, 0.7769, 0.7565, 0.6878
------------------------------------------------------------

Training K-Nearest Neighbors on full training set...
K-Nearest Neighbors Test Set Performance:
  RMSE: 0.1378
  R²:   0.7540
----------------------------------------

Evaluating Random Forest
Random Forest Cross-Validation Results (5-fold):
  RMSE: Mean = 0.1084, Std = 0.0100
  R²:   Mean = 0.8101, Std = 0.0320
  Individual Fold RMSE: 0.1056, 0.1229, 0.0937, 0.1042, 0.1154
  Individual Fold R²:   0.7786, 0.8334, 0.8553, 0.8120, 0.7710
------------------------------------------------------------

Trai