In [None]:
import sys
from pathlib import Path

from sklearn.model_selection import train_test_split

# Resolve the notebook's path; a relative Path resolves against the current
# working directory, so the kernel is expected to be started next to the notebook.
current_file_path = Path('decision_boundary.ipynb').resolve()

# Get the directory of the current file
project_dir = current_file_path.parent

# Add the project directory to sys.path.
# This has to happen BEFORE importing the project-local `data` package below,
# otherwise the path tweak cannot influence that import.
sys.path.insert(0, str(project_dir))

from data.input_data import DatasetCreator  # noqa: E402  (must follow the sys.path setup)

# Step 1: Create Datasets
dataset_creator = DatasetCreator()
blob_dataset = dataset_creator.create_blob_dataset()
circles_dataset = dataset_creator.create_make_circles_dataset()

# Step 2: Split Data into Training, Validation, and Test Sets
# Each dataset is indexed by the keys 'X' (features) and 'y' (targets).
X_blob, y_blob = blob_dataset['X'], blob_dataset['y']
X_circles, y_circles = circles_dataset['X'], circles_dataset['y']

# Blob dataset: first hold out 20% as the test set, then take 25% of the
# remaining 80% as the validation set (60/20/20 train/val/test overall).
# A fixed random_state keeps both splits reproducible.
X_blob_train_temp, X_blob_test, y_blob_train_temp, y_blob_test = train_test_split(
    X_blob, y_blob, test_size=0.2, random_state=42
)
X_blob_train, X_blob_val, y_blob_train, y_blob_val = train_test_split(
    X_blob_train_temp, y_blob_train_temp, test_size=0.25, random_state=42
)

print("Blob Dataset:")
print(f"Train set: {X_blob_train.shape}, Validation set: {X_blob_val.shape}, Test set: {X_blob_test.shape}")

# Circles dataset: same two-stage 60/20/20 split as for the blob dataset.
X_circles_train_temp, X_circles_test, y_circles_train_temp, y_circles_test = train_test_split(
    X_circles, y_circles, test_size=0.2, random_state=42
)
X_circles_train, X_circles_val, y_circles_train, y_circles_val = train_test_split(
    X_circles_train_temp, y_circles_train_temp, test_size=0.25, random_state=42
)

print("\nCircles Dataset:")
print(f"Train set: {X_circles_train.shape}, Validation set: {X_circles_val.shape}, Test set: {X_circles_test.shape}")

To calculate the shift and variation for the basic model (SVM) and the ensemble model (Random Forest), we can use the concepts of bias and variance:

1. **Shift (Bias)**: Bias measures how far off the predictions of a model are from the correct values on average.

2. **Variation (Variance)**: Variance measures how much the predictions for a given point vary between different realizations of the model.

Here's how we can estimate these for our models:

### Shift (Bias) Calculation:

For bias estimation, we use the training error as a rough proxy, assuming that the training set represents the true underlying distribution of the data. Under that assumption, the bias of each model is approximated by its error on the training set.



In [None]:

from sklearn.metrics import mean_squared_error

# Shift (bias) is approximated here by each model's error on the blob training set.
# NOTE(review): svm_model_blob / rf_model_blob are not defined in this cell —
# presumably they are fitted in an earlier cell; confirm before running.

# SVM: predict on the training data, then score against the training targets.
svm_train_predictions = svm_model_blob.predict(X_blob_train)
svm_bias = mean_squared_error(y_true=y_blob_train, y_pred=svm_train_predictions)

# Random Forest: same procedure.
rf_train_predictions = rf_model_blob.predict(X_blob_train)
rf_bias = mean_squared_error(y_true=y_blob_train, y_pred=rf_train_predictions)

print(f"Shift (Bias) for SVM: {svm_bias:.4f}")
print(f"Shift (Bias) for Random Forest: {rf_bias:.4f}")


### Variation (Variance) Calculation:

# For variance estimation, we use the difference between the test error and the training error (test minus train) as a rough proxy, as the test error reflects the variability in performance due to different datasets.


from sklearn.metrics import mean_squared_error

# Variation (variance) is approximated here as test error minus training error
# on the blob dataset; the training errors are the previously computed
# svm_bias / rf_bias values.

# SVM: score on the held-out test split and subtract the training error.
svm_test_predictions = svm_model_blob.predict(X_blob_test)
svm_test_error = mean_squared_error(y_true=y_blob_test, y_pred=svm_test_predictions)
svm_variance = svm_test_error - svm_bias

# Random Forest: same procedure.
rf_test_predictions = rf_model_blob.predict(X_blob_test)
rf_test_error = mean_squared_error(y_true=y_blob_test, y_pred=rf_test_predictions)
rf_variance = rf_test_error - rf_bias

print(f"Variation (Variance) for SVM: {svm_variance:.4f}")
print(f"Variation (Variance) for Random Forest: {rf_variance:.4f}")
