## Deep Learning for All Outputs of the CsPbCl3 QDs

### **Deep Learning Overview**

Deep learning is a powerful subset of machine learning that mimics the structure and function of the human brain. It uses artificial neural networks (ANNs) to uncover complex patterns in data and deliver accurate predictions.

In this project, a separate deep learning model is trained for each target variable (`size_nm`, `S_abs_nm_Y1`, and `PL`). Each model is built from the following components:

1. **Dense Layers**: Fully connected layers with ReLU activation functions to capture non-linear relationships between features.
2. **Dropout Layers**: Regularization layers that randomly ignore neurons during training to prevent overfitting.
3. **L2 Regularization**: Penalty applied to weights to improve the model’s generalization ability.
4. **Adam Optimizer**: An efficient optimization algorithm with an adaptive learning rate for faster convergence.

The model is trained using the **Mean Squared Error (MSE)** loss function to minimize the prediction error. Performance is evaluated on both training and testing datasets using the following metrics:
- **R² (Coefficient of Determination)**: Indicates how well the model explains the variance in the target variable.
- **RMSE (Root Mean Squared Error)**: The square root of the mean squared prediction error, expressed in the units of the target; it penalizes large errors more heavily than MAE.
- **MAE (Mean Absolute Error)**: Calculates the average absolute difference between observed and predicted values.


In [None]:
# Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Step 1: Load and Preprocess Data
def load_and_preprocess_data(file_path, modify=False):
    """
    Load the dataset and optionally apply feature engineering.

    Parameters:
        file_path (str): Path to the Excel file.
        modify (bool): If True, add the engineered features, drop outlier
            rows, standardize every numeric column and save the result as
            'modified_data.xlsx' in the current working directory.

    Returns:
        pd.DataFrame: The dataset exactly as read when ``modify`` is False,
        otherwise the engineered, filtered and standardized dataset.
    """
    data = pd.read_excel(file_path)

    if not modify:
        return data

    # Feature Engineering: Interaction between 'Cl_mmol' and 'Pb_mmol'
    data['Cl_Pb_interact'] = data['Cl_mmol'] * data['Pb_mmol']

    # Normalizing skewed features: log(1 + x), computed with log1p so that
    # small values do not lose precision.
    data['Cl_mmol_log'] = np.log1p(data['Cl_mmol'])

    # Remove outliers based on Z-score (population std, ddof=0).
    # pandas skips NaN when computing mean/std, so one missing value does not
    # turn a whole column's z-scores into NaN. A NaN z-score (missing cell or
    # constant column) compares False against the threshold and is therefore
    # never treated as an outlier, instead of discarding every row.
    numeric = data.select_dtypes(include=[np.number])
    z_scores = (numeric - numeric.mean()) / numeric.std(ddof=0)
    is_outlier = (z_scores.abs() >= 3).any(axis=1)

    # Explicit copy: the filtered frame is assigned into below, and writing
    # into a boolean-indexed slice is chained assignment.
    data = data.loc[~is_outlier].copy()

    # Scale numerical features (this includes any numeric target columns,
    # since every numeric column is selected).
    numerical_columns = numeric.columns
    scaler = StandardScaler()
    data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

    # Save the modified dataset
    data.to_excel("modified_data.xlsx", index=False)
    print("Modified dataset saved as 'modified_data.xlsx'.")

    return data

# Raw dataset: read straight from the Excel file with feature engineering
# switched off, so nothing is transformed and nothing is written to disk.
file_path_original = "./CsPbCl3_QDs.xlsx"
data_original = load_and_preprocess_data(file_path_original, modify=False)

# Engineered dataset: also read as-is (modify defaults to False). Nothing in
# this section writes './modified_data.xlsx', so the file has to exist already.
file_path_modified = "./modified_data.xlsx"
data_modified = load_and_preprocess_data(file_path_modified)

# Step 2: Prepare Data for Machine Learning
def prepare_ml_data(data, target_column):
    """
    Prepare the dataset for machine learning.

    Object-dtype columns are one-hot encoded, the target column is separated
    from the features, and the rows are split 80/20 into train and test sets
    with a fixed random seed (42).

    Parameters:
        data (pd.DataFrame): Dataset. It is not modified.
        target_column (str): Target variable. It must not be an object-dtype
            column, because those are replaced by their one-hot encoding.

    Returns:
        Tuple: Features (X), target (y), and train-test splits
        (X_train, X_test, y_train, y_test).
    """
    # Identify categorical columns
    categorical_columns = data.select_dtypes(include=['object']).columns

    # Start from the non-categorical part; encoded columns are appended below.
    data_encoded = data.drop(categorical_columns, axis=1)

    # Skip encoding when there is nothing to encode, rather than fitting the
    # encoder on a frame with zero columns.
    if len(categorical_columns) > 0:
        one_hot_encoder = OneHotEncoder(sparse_output=False)
        one_hot_encoded = one_hot_encoder.fit_transform(data[categorical_columns])
        one_hot_encoded_df = pd.DataFrame(
            one_hot_encoded,
            columns=one_hot_encoder.get_feature_names_out(categorical_columns),
            # Reuse the source index: concat aligns on index labels, so a fresh
            # RangeIndex would misalign rows (and introduce NaN rows) whenever
            # `data` has been filtered or otherwise re-indexed.
            index=data.index,
        )
        data_encoded = pd.concat([data_encoded, one_hot_encoded_df], axis=1)

    # Define features and target
    X = data_encoded.drop(target_column, axis=1)
    y = data_encoded[target_column]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    return X, y, X_train, X_test, y_train, y_test

# Step 3: Define and Train Deep Learning Model
def train_deep_learning_model(X_train, y_train, X_test, y_test,
                              epochs=100, batch_size=10, verbose=1):
    """
    Train a deep learning regression model and evaluate it.

    Missing feature values are imputed with the training-set column means
    (for both train and test, so no test statistics are used), features are
    standardized with a scaler fitted on the training set, and missing
    training targets are imputed with the training target mean.

    Parameters:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target.
        X_test (pd.DataFrame): Testing features.
        y_test (pd.Series): Testing target.
        epochs (int): Number of training epochs.
        batch_size (int): Mini-batch size used during training.
        verbose (int): Verbosity level passed to ``model.fit``.

    Returns:
        dict: ``predictions_train`` and ``predictions_test`` (1-D arrays, one
        value per input row) and ``metrics`` (R2, RMSE and MAE for train and
        test). Test metrics are computed only on rows whose test target is
        not missing.
    """
    # Impute with training statistics only, computed once and reused.
    feature_means = X_train.mean()

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.fillna(feature_means))
    X_test_scaled = scaler.transform(X_test.fillna(feature_means))

    # The same imputed training target is used for fitting and for the
    # training metrics.
    y_train_filled = y_train.fillna(y_train.mean())

    # Define the model
    model = Sequential([
        Dense(64, input_dim=X_train_scaled.shape[1], activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.5),
        Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.5),
        Dense(1, activation='linear')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

    # Train the model; 20% of the training rows are held out by Keras for
    # validation monitoring.
    model.fit(
        X_train_scaled,
        y_train_filled,
        epochs=epochs,
        batch_size=batch_size,
        verbose=verbose,
        validation_split=0.2,
    )

    # Predictions
    predictions_train = model.predict(X_train_scaled).flatten()
    predictions_test = model.predict(X_test_scaled).flatten()

    # Score the test set only where a target was actually observed. The
    # training target is imputed above, but a NaN left in y_test would make
    # the sklearn metric functions raise.
    observed = np.asarray(pd.notna(y_test))
    y_test_observed = np.asarray(y_test)[observed]
    predictions_test_observed = predictions_test[observed]

    # Compute performance metrics
    metrics = {
        "Train R2": r2_score(y_train_filled, predictions_train),
        "Train RMSE": np.sqrt(mean_squared_error(y_train_filled, predictions_train)),
        "Train MAE": mean_absolute_error(y_train_filled, predictions_train),
        "Test R2": r2_score(y_test_observed, predictions_test_observed),
        "Test RMSE": np.sqrt(mean_squared_error(y_test_observed, predictions_test_observed)),
        "Test MAE": mean_absolute_error(y_test_observed, predictions_test_observed)
    }

    return {
        "predictions_train": predictions_train,
        "predictions_test": predictions_test,
        "metrics": metrics
    }

# Step 4: Evaluate Targets
# One independent model is trained per target on the modified dataset.
targets = ['size_nm', 'S_abs_nm_Y1', 'PL']
results = {}

for target in targets:
    print(f"Evaluating target: {target}")

    # Prepare data
    _, _, X_train, X_test, y_train, y_test = prepare_ml_data(data_modified, target)

    # Train model and get results
    results[target] = train_deep_learning_model(X_train, y_train, X_test, y_test)

    # Keep the observed test targets next to the predictions so the plots
    # below can compare the two.
    results[target]["y_test"] = y_test.to_numpy()

    # Print metrics
    print(f"Metrics for {target}:")
    for metric, value in results[target]["metrics"].items():
        print(f"  {metric}: {value:.4f}")
    print("\n")

# Step 5: Visualization
# One row of plots per target; squeeze=False keeps `axs` two-dimensional even
# when there is a single target.
fig, axs = plt.subplots(len(targets), 2, figsize=(15, 5 * len(targets)), squeeze=False)

for i, target in enumerate(targets):
    # Observed values come from the stored test targets, not from the
    # predictions; otherwise both series coincide and residuals are all zero.
    y_test = results[target]['y_test']
    predictions_test = results[target]['predictions_test']

    # Plot 1: Observed vs Predicted
    sns.scatterplot(x=np.arange(len(y_test)), y=y_test, ax=axs[i, 0], label='Observed', color='red')
    sns.scatterplot(x=np.arange(len(predictions_test)), y=predictions_test, ax=axs[i, 0], label='Predicted', color='blue')
    axs[i, 0].set_title(f'{target} - Observed vs Predicted')

    # Plot 2: Residuals
    residuals = y_test - predictions_test
    sns.histplot(residuals, ax=axs[i, 1], kde=True, color='green')
    axs[i, 1].set_title(f'{target} - Residuals Distribution')

plt.tight_layout()
plt.show()
