## Decision Tree

### The following code shows the performance of a Decision Tree model on the training and test data

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from scipy.stats import uniform

# Read the data
excel_file = "/Users/mehmetsiddik/Desktop/CsPbCI3_modified.xlsx"
data = pd.read_excel(excel_file)

# Identify and one-hot encode categorical (object-dtype) columns.
# The encoder's default sparse output is densified with .toarray() instead of
# passing `sparse=False`: that keyword was renamed to `sparse_output` in newer
# scikit-learn releases, so this form runs on both old and new versions.
categorical_columns = data.select_dtypes(include=['object']).columns
one_hot_encoder = OneHotEncoder()
one_hot_encoded = one_hot_encoder.fit_transform(data[categorical_columns]).toarray()
one_hot_encoded_df = pd.DataFrame(
    one_hot_encoded,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns),
    index=data.index,  # keep rows aligned with `data` for the concat below
)

# Replace categorical columns with one-hot encoded columns
data_encoded = data.drop(categorical_columns, axis=1)
data_encoded = pd.concat([data_encoded, one_hot_encoded_df], axis=1)

# Target variables (each one is modelled separately)
targets = ['size_nm', 'S_abs_nm_Y1', 'PL']

# Initialize results dictionaries
results = {}      # target name -> dict of train/test R2, RMSE and MAE
predictions = {}  # target name -> held-out observations and their predictions

for target in targets:
    print(f"Evaluating target: {target}")
    # NOTE(review): only the current target is dropped, so the other two target
    # columns remain in X as predictors -- confirm this is intended.
    X = data_encoded.drop(target, axis=1)
    y = data_encoded[target]

    # Split first, so that the imputation medians and scaling ranges below are
    # learned from the training rows only (no test-set information leaks in).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Fill missing values with the training-set median
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    train_imputed = imp.fit_transform(X_train)
    test_imputed = imp.transform(X_test)
    # Features whose fitted statistic is NaN (no observed values in the
    # training rows) are discarded by the imputer; label only the kept columns.
    kept_columns = X.columns[~np.isnan(imp.statistics_)]
    X_train = pd.DataFrame(train_imputed, columns=kept_columns, index=X_train.index)
    X_test = pd.DataFrame(test_imputed, columns=kept_columns, index=X_test.index)

    # Scale features using ranges fitted on the training set
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=kept_columns, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=kept_columns, index=X_test.index)

    # Decision Tree Regressor with randomized search for hyperparameter tuning
    param_dist = {
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    dtr = DecisionTreeRegressor(random_state=42)
    random_search = RandomizedSearchCV(dtr, param_dist, cv=5, n_iter=10, random_state=42, verbose=1)
    random_search.fit(X_train, y_train)

    # Best model from random search
    best_model = random_search.best_estimator_
    predictions_train = best_model.predict(X_train)
    predictions_test = best_model.predict(X_test)

    # Store predictions
    predictions[target] = {
        'y_test': y_test,
        'predictions_test': predictions_test
    }

    # Performance metrics (computed once, then both stored and printed)
    metrics = {
        'Train R2': r2_score(y_train, predictions_train),
        'Test R2': r2_score(y_test, predictions_test),
        'Train RMSE': np.sqrt(mean_squared_error(y_train, predictions_train)),
        'Test RMSE': np.sqrt(mean_squared_error(y_test, predictions_test)),
        'Train MAE': mean_absolute_error(y_train, predictions_train),
        'Test MAE': mean_absolute_error(y_test, predictions_test)
    }
    results[target] = metrics

    # Print performance metrics
    print("Performance for train data:")
    print("R2:", metrics['Train R2'])
    print("RMSE:", metrics['Train RMSE'])
    print("MAE:", metrics['Train MAE'])

    print("Performance for test data:")
    print("R2:", metrics['Test R2'])
    print("RMSE:", metrics['Test RMSE'])
    print("MAE:", metrics['Test MAE'])
    print("\n")

print(results)


In [None]:
# Import necessary libraries for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Define figure and axes for subplots: one row per target, two panels per row.
# squeeze=False keeps `axs` two-dimensional even when there is a single target.
fig, axs = plt.subplots(len(targets), 2, figsize=(10, 10), squeeze=False)  # Adjust the figure size as needed

# Define the titles for the plots (one per target, in the same order)
titles = list(targets)

# Loop through the rows and create the plots for sample numbers and observed vs predicted
for i, target in enumerate(targets):
    y_test = predictions[target]['y_test']
    predictions_test = predictions[target]['predictions_test']
    sample_numbers = np.arange(1, len(y_test) + 1)

    # Plot (a): Sample Number vs Observed and Predicted Values
    sns.scatterplot(x=sample_numbers, y=y_test.values, ax=axs[i, 0], label='Observed', color='red', s=100)
    sns.scatterplot(x=sample_numbers, y=predictions_test, ax=axs[i, 0], label='Predicted', color='#4363d8', s=100)
    axs[i, 0].set(xlabel='Sample Number', ylabel='Values (nm)', title=f'{titles[i]}')
    axs[i, 0].legend()

    # Plot (b): Observed vs Predicted Values, coloured by residual.
    # legend=False suppresses the hue legend up front, instead of removing it
    # afterwards (which fails if no legend object was created).
    residuals = y_test.values - predictions_test
    sns.scatterplot(x=y_test.values, y=predictions_test, hue=residuals, ax=axs[i, 1], palette='Reds', s=100, legend=False)
    # Dashed 1:1 reference line spanning the observed range
    axs[i, 1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--')
    axs[i, 1].set(xlabel='Observed values (nm)', ylabel='Predicted values (nm)', title=f'{titles[i]}')

# Fit the subplots inside the figure, leaving a small margin at the top and bottom
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
