# Model Prototype

### This notebook shows how to create a baseline model pipeline and save it

##### We save the Spark Dataframe as an Iceberg Table. Iceberg is a new open table format backed by Apple, Netflix and Cloudera. 
##### In the context of ML Ops, the most anticipated feature is Time Travel i.e. the ability to reproduce the data and the schema across different versions in time
##### Finally, we create a simple PySpark pipeline and train a classifier with Keras/Tensorflow

* For a more comprehensive demo of Iceberg in CML, please visit the [Spark3 Iceberg CML Github Repository](https://github.com/pdefusco/Spark3_Iceberg_CML)
* For a more detailed introduction to CML Session, Notebooks, and Spark tips and trips please visit the [CML Total Beginner GitHub Repository](https://github.com/pdefusco/CML-Total-Beginner)
* For a more comprehensive example of the Atlas Python client mentioned below, please visit the [Atlas Client Example Notebook in the Data Integration with ML GitHub Repository](https://github.com/pdefusco/Data_Integration_wMachineLearning/blob/main/2_A_Atlas_Client_Example.ipynb)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from sklearn.datasets import make_circles
import tensorflow as tf
import pandas as pd
from helpers.plot_decision_boundary import *

In [5]:
!pip3 install petastorm

Collecting petastorm
  Using cached petastorm-0.11.3-py2.py3-none-any.whl (283 kB)
Collecting dill>=0.2.1
  Using cached dill-0.3.4-py2.py3-none-any.whl (86 kB)
Collecting future>=0.10.2
  Using cached future-0.18.2.tar.gz (829 kB)
Collecting pyarrow>=0.17.1
  Downloading pyarrow-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25.6 MB)
[K     |████████████████████████████████| 25.6 MB 4.2 MB/s eta 0:00:01
[?25hCollecting fsspec
  Using cached fsspec-2021.11.1-py3-none-any.whl (132 kB)
Collecting diskcache>=3.0.0
  Using cached diskcache-5.3.0-py3-none-any.whl (44 kB)
Collecting pyspark>=2.1.0
  Using cached pyspark-3.2.0.tar.gz (281.3 MB)
Collecting psutil>=4.0.0
  Downloading psutil-5.8.0-cp37-cp37m-manylinux2010_x86_64.whl (296 kB)
[K     |████████████████████████████████| 296 kB 109.6 MB/s eta 0:00:01
Collecting py4j==0.10.9.2
  Using cached py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
Building wheels for collected packages: future, pyspark
  Building wheel for fu

#### The Spark Session is created with the following configurations. If you get an error, ensure your CML Session is using Runtimes and Spark 3.1.

spark = SparkSession.builder.master('local[*]')\
  .config("spark.jars.packages","org.apache.iceberg:iceberg-spark3-runtime:0.12.1")\
  .config("spark.sql.extensions","org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")\
  .config("spark.sql.catalog.spark_catalog","org.apache.iceberg.spark.SparkSessionCatalog")\
  .config("spark.sql.catalog.spark_catalog.type","hive")\
  .config("spark.hadoop.fs.s3a.s3guard.ddb.region","us-east-2")\
  .config("spark.yarn.access.hadoopFileSystems","s3a://gd01-uat2/")\
  .getOrCreate()

In [2]:
import os
import tempfile
import requests

def download_mnist_libsvm(mnist_data_dir):
    mnist_data_path = os.path.join(mnist_data_dir, "mnist.bz2")
    data_url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2"
    r = requests.get(data_url)
    with open(mnist_data_path, "wb") as f:
        f.write(r.content)


def get_mnist_dir():
    # This folder is baked into the docker image
    MNIST_DATA_DIR = "/home/cdsw/data/mnist/"

    if os.path.isdir(MNIST_DATA_DIR) and os.path.isfile(os.path.join(MNIST_DATA_DIR, 'mnist.bz2')):
        return MNIST_DATA_DIR

    download_mnist_libsvm(MNIST_DATA_DIR)
    return MNIST_DATA_DIR

In [3]:
mnist_dir = get_mnist_dir()

In [6]:
import logging

from pyspark.sql import SparkSession
from petastorm.spark import SparkDatasetConverter, make_spark_converter

try:
    from pyspark.sql.functions import col
except ImportError:
    raise ImportError("This script runs with PySpark>=3.0.0")


def get_compiled_model(lr=0.001):
    from tensorflow import keras

    model = keras.models.Sequential([
        keras.layers.Flatten(input_shape=(28, 28)),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(10),
    ])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),
                  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    return model


def train(dataset, steps=1000, lr=0.001):
    model = get_compiled_model(lr=lr)
    model.fit(dataset, steps_per_epoch=steps)
    return model

  from pyarrow import LocalFileSystem


In [56]:
# Get SparkSession
spark = SparkSession.builder.master('local[*]')\
  .config("spark.jars.packages","org.apache.iceberg:iceberg-spark3-runtime:0.12.1")\
  .config("spark.sql.extensions","org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")\
  .config("spark.sql.catalog.spark_catalog","org.apache.iceberg.spark.SparkSessionCatalog")\
  .config("spark.sql.catalog.spark_catalog.type","hive")\
  .config("spark.hadoop.fs.s3a.s3guard.ddb.region","us-east-2")\
  .config("spark.yarn.access.hadoopFileSystems","s3a://gd01-uat2/")\
  .getOrCreate()

In [57]:
# Load and preprocess data using Spark
df = spark.read.format("libsvm") \
    .option("numFeatures", "784") \
    .load(mnist_dir) \
    .select(col("features"), col("label").cast("long").alias("label"))

In [62]:
df.schema

StructType(List(StructField(features,VectorUDT,true),StructField(label,LongType,true)))

In [70]:
from pyspark.ml.functions import vector_to_array

df.withColumn("xs", vector_to_array("features")).select([col("xs")[i] for i in range(784)])

DataFrame[xs[0]: double, xs[1]: double, xs[2]: double, xs[3]: double, xs[4]: double, xs[5]: double, xs[6]: double, xs[7]: double, xs[8]: double, xs[9]: double, xs[10]: double, xs[11]: double, xs[12]: double, xs[13]: double, xs[14]: double, xs[15]: double, xs[16]: double, xs[17]: double, xs[18]: double, xs[19]: double, xs[20]: double, xs[21]: double, xs[22]: double, xs[23]: double, xs[24]: double, xs[25]: double, xs[26]: double, xs[27]: double, xs[28]: double, xs[29]: double, xs[30]: double, xs[31]: double, xs[32]: double, xs[33]: double, xs[34]: double, xs[35]: double, xs[36]: double, xs[37]: double, xs[38]: double, xs[39]: double, xs[40]: double, xs[41]: double, xs[42]: double, xs[43]: double, xs[44]: double, xs[45]: double, xs[46]: double, xs[47]: double, xs[48]: double, xs[49]: double, xs[50]: double, xs[51]: double, xs[52]: double, xs[53]: double, xs[54]: double, xs[55]: double, xs[56]: double, xs[57]: double, xs[58]: double, xs[59]: double, xs[60]: double, xs[61]: double, xs[62]: 

In [63]:
spark.sql("DROP TABLE new_ice")

DataFrame[]

In [64]:
# Saving the Spark Dataframe as an Iceberg table
spark.sql("CREATE TABLE IF NOT EXISTS new_ice (features struct<type:tinyint,size:int,indices:array<int>,values:array<double>>, label BIGINT) USING iceberg")

DataFrame[]

In [65]:
df.write.format("iceberg").mode("overwrite").save("default.new_ice")

AnalysisException: Cannot write to 'spark_catalog.default.new_ice', too many data columns:
Table columns: 'features'
Data columns: 'features', 'label'

In [None]:
spark.read.format("iceberg").load("default.new_ice.snapshots").show(20, False)

In [55]:
spark.stop()

In [44]:
df.writeTo("spark_catalog.default.mnist_iceberg_cml").create()

In [46]:
spark.read.format("iceberg").load("default.mnist_iceberg_cml.snapshots").show(20, False)

AnalysisException: Table default.mnist_iceberg_cml.snapshots not found

In [41]:
spark.sql("select * from default.mnist_iceberg_cml").show()

Py4JJavaError: An error occurred while calling o387.showString.
: java.io.IOException: Can't get Master Kerberos principal for use as renewer
	at org.apache.hadoop.mapreduce.security.TokenCache.obtainTokensForNamenodesInternal(TokenCache.java:134)
	at org.apache.hadoop.mapreduce.security.TokenCache.obtainTokensForNamenodesInternal(TokenCache.java:102)
	at org.apache.hadoop.mapreduce.security.TokenCache.obtainTokensForNamenodes(TokenCache.java:81)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:217)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:328)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:442)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [36]:
# Saving the Spark Dataframe as an Iceberg table
spark.sql("CREATE TABLE IF NOT EXISTS ice_cml (features string, label bigint) USING iceberg")

df.write.format("iceberg").mode("overwrite").save("default.mnist_iceberg")

AnalysisException: Cannot write incompatible data to table 'spark_catalog.default.mnist_iceberg':
- Cannot write 'features': struct<type:tinyint,size:int,indices:array<int>,values:array<double>> is incompatible with string

In [32]:
spark.stop()

In [8]:


# Randomly split data into train and test dataset
df_train, df_test = df.randomSplit([0.9, 0.1], seed=12345)

# Set a cache directory for intermediate data.
# The path should be accessible by both Spark workers and driver.
spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
               "file:///tmp/petastorm/cache/tf-example")

converter_train = make_spark_converter(df_train)
converter_test = make_spark_converter(df_test)

def train_and_evaluate(_=None):
    import tensorflow.compat.v1 as tf  # pylint: disable=import-error

    with converter_train.make_tf_dataset() as dataset:
        dataset = dataset.map(lambda x: (tf.reshape(x.features, [-1, 28, 28]), x.label))
        model = train(dataset)

    with converter_test.make_tf_dataset(num_epochs=1) as dataset:
        dataset = dataset.map(lambda x: (tf.reshape(x.features, [-1, 28, 28]), x.label))
        hist = model.evaluate(dataset)

    return hist[1]

# Train and evaluate the model on the local machine
accuracy = train_and_evaluate()
logging.info("Train and evaluate the model on the local machine.")
logging.info("Accuracy: %.6f", accuracy)

  self._filesystem = pyarrow.localfs
Converting floating-point columns to float32
The median size 10765737 B (< 50 MB) of the parquet files is too small. Total size: 16658412 B. Increase the median file size by calling df.repartition(n) or df.coalesce(n), which might help improve the performance. Parquet files: file:///tmp/petastorm/cache/tf-example/20211204003041-appid-local-1638577702563-7cae4a6d-21d7-49ea-aa1b-7163ea42d401/part-00001-036bb370-60b7-4c80-931f-f84d2c303dbd-c000.parquet, ...
Converting floating-point columns to float32
The median size 1231397 B (< 50 MB) of the parquet files is too small. Total size: 1903223 B. Increase the median file size by calling df.repartition(n) or df.coalesce(n), which might help improve the performance. Parquet files: file:///tmp/petastorm/cache/tf-example/20211204003051-appid-local-1638577702563-dffa2649-1747-4b1a-8996-520243496975/part-00001-ae58eb09-37df-46fc-8aa2-997f1ee352ee-c000.parquet, ...




In [9]:
accuracy

0.6653121113777161

In [10]:
logging.info("Train and evaluate the model remotely on a spark worker, "
             "which can be used for distributed hyperparameter tuning.")
logging.info("Accuracy: %.6f", accuracy)

# Cleanup
converter_train.delete()
converter_test.delete()
spark.stop()

#### Just some fake data...

In [4]:
# Make 1000 examples
n_samples = 1000

# Create circles
X, y = make_circles(n_samples, 
                    noise=0.03, 
                    random_state=42)

circles = pd.DataFrame({"var1":X[:, 0], "var2":X[:, 1], "label":y})
circles.head()

Unnamed: 0,var1,var2,label
0,0.754246,0.231481,1
1,-0.756159,0.153259,1
2,-0.815392,0.173282,1
3,-0.393731,0.692883,1
4,0.442208,-0.896723,0


#### We can save the DataFrame as an Iceberg Table using Spark

In [5]:
# Creating a Spark Dataframe from the Pandas Dataframe
sparkDF=spark.createDataFrame(circles) 

In [12]:
# Saving the Spark Dataframe as an Iceberg table
spark.sql("CREATE TABLE IF NOT EXISTS ice_cml (var1 int, var2 int, label int) USING iceberg")

sparkDF.write.format("iceberg").mode("overwrite").save("default.ice_cml")

#### The table is automatically tracked by the Data Lake associated with the CML Workspace

#### To check that a new entry for the table has been added to Atlas in the Data Lake, go back to the CDP Homepage and open Data Catalog. 

#### Select the Data Lake (i.e. Cloud Environment) that your worskpace was built in. 

#### Use the Atlas Search bar at the top to browse for the table and click on it

#### Notice Atlas is tracking a lot of interesting Metadata including Table Attributes, Lineage, and a lot More. 

#### The Metadata can even be customized. [This notebook](https://github.com/pdefusco/Data_Integration_wMachineLearning/blob/main/2_A_Atlas_Client_Example.ipynb) shows how you can use the Atlas Python Client to build custom lineage flows.

#### Back to Modeling. We will use Keras and Tensorflow to build this classifier. Our data is in Spark though, so we will use Petastorm to transform the data Tensorflow-readable.

In [13]:
!pip3 install petastorm

Collecting petastorm
  Downloading petastorm-0.11.3-py2.py3-none-any.whl (283 kB)
[K     |████████████████████████████████| 283 kB 3.8 MB/s eta 0:00:01
Collecting psutil>=4.0.0
  Downloading psutil-5.8.0-cp36-cp36m-manylinux2010_x86_64.whl (291 kB)
[K     |████████████████████████████████| 291 kB 92.6 MB/s eta 0:00:01
Collecting future>=0.10.2
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 113.5 MB/s eta 0:00:01
[?25hCollecting fsspec
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 101.0 MB/s eta 0:00:01
[?25hCollecting diskcache>=3.0.0
  Downloading diskcache-5.3.0-py3-none-any.whl (44 kB)
[K     |████████████████████████████████| 44 kB 451 kB/s s eta 0:00:01
Collecting pyarrow>=0.17.1
  Downloading pyarrow-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25.6 MB)
[K     |████████████████████████████████| 25.6 MB 116.3 MB/s eta 0:00:01
Collecting dill>=0.2.1


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████████| 281.3 MB 34 kB/s 
Collecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 110.8 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: future, pyspark
  Building wheel for future (setup.py) ... [?25ldone
[?25h  Created wheel for future: filename=future-0.18.2-py3-none-any.whl size=491070 sha256=32fda17f62ac6e87eaef8a6e235595e4c7024d506fdcc3aaba3a14d39688e0fc
  Stored in directory: /home/cdsw/.cache/pip/wheels/6e/9c/ed/4499c9865ac1002697793e0ae05ba6be33553d098f3347fb94
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=1b9e3b29a52b66939ba7f5befa8ff94dbd56c39dcfbe7b8fc7924fe89509f65e
  Stored in directory: /home/cdsw/.cache/pip/wheels/e8/d9/e5/78436a0a3899d81410aeb45b200153113667f2e250f6882ada
Successfully built future pyspark
Installing collected packages

In [14]:
from petastorm.spark import SparkDatasetConverter, make_spark_converter
import tensorflow.compat.v1 as tf  # pylint: disable=import-error

In [21]:
# specify a cache dir first.

# Set a cache directory for intermediate data.
# The path should be accessible by both Spark workers and driver.
spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,"file:///tmp/petastorm/cache/tf-example")

In [None]:
#!pip3 install s3fs

In [23]:
# create a converter from `df`
# it will materialize `df` to cache dir.
converter = make_spark_converter(sparkDF)

Converting floating-point columns to float32
The median size 5033 B (< 50 MB) of the parquet files is too small. Total size: 10062 B. Increase the median file size by calling df.repartition(n) or df.coalesce(n), which might help improve the performance. Parquet files: file:///tmp/petastorm/cache/tf-example/20211203233933-appid-local-1638570042715-2c963542-b1cd-4cd6-8bf5-1297cbb97040/part-00000-db00b0de-8eba-40c6-8675-01b28dff976b-c000.parquet, ...


In [None]:
converter

In [24]:
# Create the model (same as model_7)
model = tf.keras.Sequential([
  tf.keras.layers.Dense(4, activation="relu"), # hidden layer 1, using "relu" for activation (same as tf.keras.activations.relu)
  tf.keras.layers.Dense(4, activation="relu"),
  tf.keras.layers.Dense(1, activation="sigmoid") # output layer, using 'sigmoid' for the output
])

In [27]:
# Compile the model
model.compile(loss=tf.keras.losses.binary_crossentropy,
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), # increase learning rate from 0.001 to 0.01 for faster learning
                metrics=['accuracy'])

In [None]:
# make a tensorflow dataset from `converter`
with converter.make_tf_dataset() as dataset:
    # the `dataset` is `tf.data.Dataset` object
    # we can train/evaluate model on the `dataset`
    history = model.fit(dataset)
    # when exiting the context, the reader of the dataset will be closed
    
    # Evaluate our model on the test set
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"Model loss on the test set: {loss}")
    print(f"Model accuracy on the test set: {100*accuracy:.2f}%")


# delete the cached files of the dataframe.
converter.delete()

 607779/Unknown - 524s 860us/step - loss: -900.7520 - accuracy: 0.0000e+00

In [None]:
# Visualize with a plot
import matplotlib.pyplot as plt
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu);


# Split data into train and test sets
X_train, y_train = X[:800], y[:800] # 80% of the data for the training set
X_test, y_test = X[800:], y[800:] # 20% of the data for the test set

# Check the shapes of the data
X_train.shape, X_test.shape # 800 examples in the training set, 200 examples in the test set

# Set random seed
tf.random.set_seed(42)

# Create the model (same as model_7)
model = tf.keras.Sequential([
  tf.keras.layers.Dense(4, activation="relu"), # hidden layer 1, using "relu" for activation (same as tf.keras.activations.relu)
  tf.keras.layers.Dense(4, activation="relu"),
  tf.keras.layers.Dense(1, activation="sigmoid") # output layer, using 'sigmoid' for the output
])

# Compile the model
model.compile(loss=tf.keras.losses.binary_crossentropy,
                optimizer=tf.keras.optimizers.Adam(lr=0.01), # increase learning rate from 0.001 to 0.01 for faster learning
                metrics=['accuracy'])

# Fit the model
history = model.fit(X_train, y_train, epochs=25)

# Evaluate our model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Model loss on the test set: {loss}")
print(f"Model accuracy on the test set: {100*accuracy:.2f}%")

# Plot the decision boundaries for the training and test sets
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title("Train")
plot_decision_boundary(model, X=X_train, y=y_train)
plt.subplot(1, 2, 2)
plt.title("Test")
plot_decision_boundary(model, X=X_test, y=y_test)
plt.show()

# You can access the information in the history variable using the .history attribute
pd.DataFrame(history.history)

# Plot the loss curves
pd.DataFrame(history.history).plot()
plt.title("Model training curves")


model.save('models/my_model.h5')