# Importing Libraries
**For more examples of what Kosh can do visit [GitHub Examples](https://github.com/LLNL/kosh/tree/stable/examples).**

In [None]:
from numbers import Number
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns
import kosh
import math
import statistics
import numpy as np
import os
import sys

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression

print(sys.argv[1])
if "-f" in sys.argv[1]:  # Running as notebook
    out_path = 'ball-bounce-metadata-machine-learning_20250205-150210'
    use_gpu = False
    %matplotlib inline
else:
    out_path = sys.argv[1]  # Running as script
    use_gpu = True

# Ensembles Initialization
database = os.path.join(out_path, 'ensembles_output.sqlite')
print(database)
datastore = kosh.connect(database)
print("Kosh is ready!")

# Printing Attributes and Features
test_rec = list(datastore.find())[1]
print('Attributes:')
print('\t',test_rec.list_attributes())
print('\n')
print('Features Sets:')
print('\t',test_rec.list_features())
time=test_rec['physics_cycle_series/time'][:]
image_path = os.path.join(out_path, 'metadata-machine-learning-ball-bounce/images')
os.makedirs(image_path, exist_ok=True)

# Loading Data

As mentioned in the `README.md`, if one is interested in predicting the final value rather than the transient behavior, one can use this metadata analysis instead of an LSTM or Transformer. Instead of loading our time series data for each run, we will load the metadata associated with each run. This metadata is stored as Kosh dataset attributes. Each dataset can have its own attributes, but we can extract all the attributes for all datasets into a Pandas DataFrame using `store.to_dataframe()`. The metadata for these runs was added in the `dsv_to_kosh.py` file in the `ingest-ball-bounce` step of the workflow.  See [kosh/examples/Example_Simulation_Workflow.ipynb](https://github.com/LLNL/kosh/blob/stable/examples/Example_Simulation_Workflow.ipynb) for more information on how to add metadata, update it, and extract it.

Below we see a dataframe of size `n_datasets x n_attributes` where `n_datasets` is how many simulations we ran in the workflow.

In [None]:
# Export the store's contents to a Pandas DataFrame and display it
df = datastore.to_dataframe()
print(df)


# Splitting Data

We will extract our features and labels of interest from the dataframe above since `store.to_dataframe()` also includes other metadata by default. The features will be the initial conditions for the simulations `['x_pos_initial', 'x_vel_initial', 'y_pos_initial', 'y_vel_initial', 'z_pos_initial', 'z_vel_initial']` and the label will be the final x position `['x_pos_final']` when the simulation finished. We use SciKit Learn's `train_test_split()` method to split the data into train, validation, and test data. We also use Seaborn's `pairplot()` to quickly plot the features and labels in the Pandas DataFrame.

In [None]:
# Columns of interest: the initial conditions are the inputs,
# the final x position is the quantity to predict
features = [
    'x_pos_initial', 'x_vel_initial',
    'y_pos_initial', 'y_vel_initial',
    'z_pos_initial', 'z_vel_initial',
]
labels = ['x_pos_final']

# Work on copies so the complete dataframe `df` is left untouched
df_original = df[features + labels].copy()
df_original_features = df_original[features].copy()
df_original_labels = df_original[labels].copy()

# Two successive splits with the same options: first hold out the test data,
# then hold out the validation data from what remains for training
split_options = {'test_size': 0.2, 'random_state': 42}
(df_train_features, df_test_features,
 df_train_labels, df_test_labels) = train_test_split(
    df_original_features, df_original_labels, **split_options)
(df_train_features, df_validation_features,
 df_train_labels, df_validation_labels) = train_test_split(
    df_train_features, df_train_labels, **split_options)

# Quick look at the training inputs and at the size of every split
print(df_train_features.head())

print(f"Train Size features: {df_train_features.shape} and labels: {df_train_labels.shape}")
print(f"Validation Size features: {df_validation_features.shape} and labels: {df_validation_labels.shape}")
print(f"Test Size features: {df_test_features.shape} and labels: {df_test_labels.shape}")

# Put inputs and target side by side so one pair plot shows all of them
df_train_temp = pd.concat([df_train_features, df_train_labels], axis='columns')
sns.pairplot(df_train_temp)
plt.show()

# Scaling the data
Scaling the data so that all the features are around the same magnitude helps the model converge faster due to how the optimizers update the weights.

In [None]:
# The scaler learns its statistics from the training features only ...
scaler = StandardScaler()
scaler.fit(df_train_features)

# ... and the very same fitted transformation is then applied to every split,
# so validation and test data never influence the scaling parameters
(df_train_features_scaled,
 df_validation_features_scaled,
 df_test_features_scaled) = (
    scaler.transform(split_features)
    for split_features in (df_train_features, df_validation_features, df_test_features)
)

# Compare the raw training features with their scaled counterpart
print("Not Scaled:\n",df_train_features.head())
print("Scaled:\n",df_train_features_scaled)

# Turning Pandas DataFrames into Matrices

Now we will convert our Pandas DataFrames into matrices so the Machine Learning algorithms can process the data.

In [None]:
# The scaled features are already in matrix form (output of the scaler),
# so they are used as-is
X_train, X_validation, X_test = (
    df_train_features_scaled,
    df_validation_features_scaled,
    df_test_features_scaled,
)

# The labels are still DataFrames and have to be converted explicitly
y_train, y_validation, y_test = (
    split_labels.to_numpy()
    for split_labels in (df_train_labels, df_validation_labels, df_test_labels)
)

# Train The Model

We will now train our model using `sklearn.linear_model.LinearRegression()` by using `sklearn.linear_model.LinearRegression.fit()`.

In [None]:
# Create the linear model, then fit it on the scaled training data
LinReg = LinearRegression()
LinReg.fit(X=X_train, y=y_train)


# Inference

Now that our model is trained, we can calculate the coefficient of determination $R^2$ with `sklearn.linear_model.LinearRegression.score()` for our train, validation, and test data. We can also see what the model will infer/predict for the final x position `['x_pos_final']` given the features `['x_pos_initial', 'x_vel_initial', 'y_pos_initial', 'y_vel_initial', 'z_pos_initial', 'z_vel_initial']` using `sklearn.linear_model.LinearRegression.predict()`. We also calculate the residuals.


In [None]:
# Model predictions for every split
y_train_predict = LinReg.predict(X_train)
y_validation_predict = LinReg.predict(X_validation)
y_test_predict = LinReg.predict(X_test)

# R^2 score of the model on every split
train_r_2 = LinReg.score(X_train, y_train)
validation_r_2 = LinReg.score(X_validation, y_validation)
test_r_2 = LinReg.score(X_test, y_test)

# Residuals: true value minus predicted value
residuals_train = y_train - y_train_predict
residuals_validation = y_validation - y_validation_predict
residuals_test = y_test - y_test_predict

# Inference Plotting

We create three plots each for the train, validation, and test data. We get a pretty high $R^2$ which is great.

* `y vs y_predict` shows perfect prediction if the data points lie along the line with slope 1. We also label it with the $R^2$ value, which indicates perfect prediction when it equals 1.
* `residuals vs y_predict` values should be close to 0 meaning predictions `y_predict` were not far off from the true values `y`
* `residual distribution` same as above but from different angle


In [None]:
#######################
# Plot the prediction #
#######################

# 3x3 grid: one column per data split (train / validation / test) and one row
# per plot type (y vs y_predict, residuals vs y_predict, residual histogram)
fig, ax = plt.subplots(nrows=3,ncols=3,figsize=(9,9))
fig.suptitle('Whole Prediction')

plot_splits = (
    ('Train', y_train, y_train_predict, residuals_train, train_r_2),
    ('Validation', y_validation, y_validation_predict, residuals_validation, validation_r_2),
    ('Test', y_test, y_test_predict, residuals_test, test_r_2),
)

for col, (split_name, y_true, y_predict, residuals, r_2) in enumerate(plot_splits):
    # The label needs both the opening and the closing "$": with an unbalanced
    # "$" matplotlib shows the raw text instead of rendering R^2 as math
    r_2_label = f"$R^2={r_2:.5f}$"

    # Row 0: true values against predictions
    ax[0,col].scatter(y_predict, y_true, label=r_2_label)
    ax[0,col].set_title(f'{split_name} y vs y_predict')

    # Row 1: residuals against predictions
    ax[1,col].scatter(y_predict, residuals, label=r_2_label)
    ax[1,col].set_title(f'{split_name} residuals vs y_predict')

    # Row 2: distribution of the residuals
    ax[2,col].hist(residuals, label=r_2_label)
    ax[2,col].set_title(f'{split_name} residual distribution')

    for row in range(3):
        ax[row,col].legend(fontsize='xx-small')

fig.tight_layout()
fig.savefig(os.path.join(image_path, 'whole_prediction.png'))