In [None]:
#for local device

import findspark
findspark.init()
findspark.find()

import requests
import feedparser
import pyspark
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from datetime import datetime
import pytz
from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext

# Initialize Spark
appName = "Project - Machine Learning Techniques on MQTT"
master = "local"

# Create Configuration object for Spark.
conf = pyspark.SparkConf()\
    .set('spark.driver.host','127.0.0.1')\
    .setAppName(appName)\
    .setMaster(master)


# Create Spark Context with the new configurations rather than relying on the default
sc = SparkContext.getOrCreate(conf=conf)
# You need to create SQL Context to conduct some database operations like what we will
sqlContext = SQLContext(sc)
# If you have SQL context, you create the session from the Spark Context
spark = sqlContext.sparkSession.builder.getOrCreate()


# spark = SparkSession.builder.master("local[*]").appName("sparktest").getOrCreate()

In [None]:
# For Cloud

# # import findspark
# # findspark.init()
# # findspark.find()
# import pyspark
# from pyspark.sql import SparkSession
# from pyspark import SparkContext, SQLContext

# import requests
# #import feedparser
# import pyspark
# from pyspark.sql.types import StructType, StructField, StringType, TimestampType
# from datetime import datetime
# import pytz
# from pyspark.sql import SparkSession
# from pyspark import SparkContext, SQLContext

# # Initialize Spark
# appName = "Project - Machine Learning Techniques on MQTT"
# master = "yarn"

# # Create Configuration object for Spark.
# # conf = pyspark.SparkConf()\
# #     .set('spark.driver.host','127.0.0.1')\
# #     .setAppName(appName)\
# #     .setMaster(master)

# conf = pyspark.SparkConf()\
#     .setAppName(appName)\
#     .setMaster(master)
# # Create Spark Context with the new configurations rather than relying on the default
# sc = SparkContext.getOrCreate(conf=conf)
# # You need to create SQL Context to conduct some database operations like what we will
# sqlContext = SQLContext(sc)
# # If you have SQL context, you create the session from the Spark Context
spark = sqlContext.sparkSession.builder.getOrCreate()

In [5]:
train_path = r"C:\Users\nickc\OneDrive\Documents\GitHub\course-project-option-2-nick1117\DataFolder\archive\Data\FINAL_CSV\train70_augmented.csv"
train_data = spark.read.csv(train_path, header=True, inferSchema=True)

test_path = r"C:\Users\nickc\OneDrive\Documents\GitHub\course-project-option-2-nick1117\DataFolder\archive\Data\FINAL_CSV\test30_augmented.csv"
test_data = spark.read.csv(test_path, header=True, inferSchema=True)

In [None]:
# run for cloud - will need to set up bucket

# train_path = 'gs://project-bucket-chermak/DataFolder/archive/Data/FINAL_CSV/train70_augmented.csv'
# train_data = spark.read.csv(train_path, header=True, inferSchema=True)

# test_path = 'gs://project-bucket-chermak/DataFolder/archive/Data/FINAL_CSV/test30_augmented.csv'
# test_data = spark.read.csv(test_path, header=True, inferSchema=True)

In [6]:
from pyspark.sql.functions import lit

train_df = train_data.withColumn("dataset_type", lit("train"))
test_df = test_data.withColumn("dataset_type", lit("test"))
df = train_df.union(test_df)

In [7]:
print(df.columns)

['tcp.flags', 'tcp.time_delta', 'tcp.len', 'mqtt.conack.flags', 'mqtt.conack.flags.reserved', 'mqtt.conack.flags.sp', 'mqtt.conack.val', 'mqtt.conflag.cleansess', 'mqtt.conflag.passwd', 'mqtt.conflag.qos', 'mqtt.conflag.reserved', 'mqtt.conflag.retain', 'mqtt.conflag.uname', 'mqtt.conflag.willflag', 'mqtt.conflags', 'mqtt.dupflag', 'mqtt.hdrflags', 'mqtt.kalive', 'mqtt.len', 'mqtt.msg', 'mqtt.msgid', 'mqtt.msgtype', 'mqtt.proto_len', 'mqtt.protoname', 'mqtt.qos', 'mqtt.retain', 'mqtt.sub.qos', 'mqtt.suback.qos', 'mqtt.ver', 'mqtt.willmsg', 'mqtt.willmsg_len', 'mqtt.willtopic', 'mqtt.willtopic_len', 'target', 'dataset_type']


In [8]:
new_column_names = [
    "tcp_flags",
    "tcp_time_delta",
    "tcp_length",
    "mqtt_connack_flags",
    "mqtt_connack_reserved_flags",
    "mqtt_connack_session_present",
    "mqtt_connack_return_code",
    "mqtt_connect_clean_session_flag",
    "mqtt_connect_password_flag",
    "mqtt_connect_qos_level",
    "mqtt_connect_reserved_flag",
    "mqtt_connect_retain_flag",
    "mqtt_connect_username_flag",
    "mqtt_connect_will_flag",
    "mqtt_connect_flags",
    "mqtt_duplicate_flag",
    "mqtt_header_flags",
    "mqtt_keep_alive_interval",
    "mqtt_length",
    "mqtt_message",
    "mqtt_message_id",
    "mqtt_message_type",
    "mqtt_protocol_length",
    "mqtt_protocol_name",
    "mqtt_qos_level",
    "mqtt_retain_flag",
    "mqtt_subscription_qos_level",
    "mqtt_subscription_ack_qos_level",
    "mqtt_version",
    "mqtt_will_message",
    "mqtt_will_message_length",
    "mqtt_will_topic",
    "mqtt_will_topic_length",
    "target_class",
    "dataset_type"
]

df = df.toDF(*new_column_names)

train_df = train_df.toDF(*new_column_names)
test_df = test_df.toDF(*new_column_names)
train_df = train_df.drop("dataset_type")
test_df = test_df.drop("dataset_type")

In [9]:
from pyspark.sql.functions import col, avg, count
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank
import matplotlib.pyplot as plt

In [10]:
from pyspark.ml import Pipeline, Transformer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.sql import DataFrame

col_names = new_column_names
    # Define the columns
binary_cols = [
    "mqtt_connack_reserved_flags",
    "mqtt_connack_session_present",
    "mqtt_connect_clean_session_flag",
    "mqtt_connect_password_flag",
    "mqtt_connect_reserved_flag",
    "mqtt_connect_retain_flag",
    "mqtt_connect_username_flag",
    "mqtt_connect_will_flag",
    "mqtt_duplicate_flag",
    "mqtt_qos_level",
    "mqtt_retain_flag",
    "mqtt_connect_qos_level"
]

nominal_cols = [
    "tcp_flags",
    "mqtt_connack_flags",
    "mqtt_connack_return_code",
    "mqtt_connect_flags",
    "mqtt_header_flags",
    "mqtt_message_type",
    
    'mqtt_protocol_name'
]

continuous_cols = [
    "tcp_time_delta",
    "tcp_length",
    "mqtt_keep_alive_interval",
    "mqtt_length",
    "mqtt_protocol_length",
    "mqtt_will_message_length",
    "mqtt_will_topic_length",
    "mqtt_will_topic",
    "mqtt_will_message",
    "mqtt_subscription_ack_qos_level",
    "mqtt_subscription_qos_level",
    "mqtt_message_id",
    "mqtt_message"
]

corelated_cols_to_remove = ["mqtt_connack_reserved_flags", "mqtt_connack_session_present", "mqtt_connect_qos_level", ## zero values
                            "mqtt_connect_reserved_flag", "mqtt_connect_retain_flag", "mqtt_connect_will_flag", "mqtt_message", 
                            "mqtt_subscription_qos_level", "mqtt_subscription_ack_qos_level", "mqtt_will_message", "mqtt_version",
                            "mqtt_will_message_length", "mqtt_will_topic", "mqtt_will_topic_length",'mqtt_protocol_name', "mqtt_message_id",


                            "mqtt_protool_length", "mqtt_keep_alive_interval"] #high correlation (1.00), (0.77)

class OutcomeCreater(Transformer):
    def __init__(self):
        super().__init__()

    def _transform(self, dataset):
        label_to_multiclass = udf(lambda name:
                                  0.0 if name == 'legitimate' else
                                  1.0 if name == 'slowite' else
                                  2.0 if name == 'bruteforce' else
                                  3.0 if name == 'flood' else
                                  4.0 if name == 'malformed' else
                                  5.0 if name == 'dos' else
                                  -1.0, DoubleType())
        output_df = dataset.withColumn('outcome', label_to_multiclass(col('target_class')))
        output_df = output_df.drop("target_class")
        output_df = output_df.withColumn('outcome', col('outcome').cast(DoubleType()))
        return output_df
    
class FeatureTypeCaster(Transformer):
    def __init__(self):
        super().__init__()

    def _transform(self, dataset: DataFrame):
        output_df = dataset
        for col_name in binary_cols + continuous_cols:
            output_df = output_df.withColumn(col_name, col(col_name).cast(DoubleType()))
        return output_df
    
class ColumnDropper(Transformer):
    def __init__(self, columns_to_drop=None):
        super().__init__()
        self.columns_to_drop = columns_to_drop

    def _transform(self, dataset):
        output_df = dataset
        for col_name in self.columns_to_drop:
            output_df = output_df.drop(col_name)
        return output_df

def get_preprocess_pipeline():
    stage_typecaster = FeatureTypeCaster()
    nominal_id_cols = [x+"_index" for x in nominal_cols]
    nominal_onehot_cols = [x+"_encoded" for x in nominal_cols]
    stage_nominal_indexer = StringIndexer(inputCols=nominal_cols, outputCols=nominal_id_cols)
    stage_nominal_onehot_encoder = OneHotEncoder(inputCols=nominal_id_cols, outputCols=nominal_onehot_cols)
    
    feature_cols = [col for col in continuous_cols + binary_cols + nominal_onehot_cols if col not in corelated_cols_to_remove]


    stage_vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="vectorized_features")
    stage_scaler = StandardScaler(inputCol='vectorized_features', outputCol='features')
    stage_outcome = OutcomeCreater()
    columns_to_drop = (
        nominal_cols + nominal_id_cols + nominal_onehot_cols +
        binary_cols + continuous_cols + ["vectorized_features"] + corelated_cols_to_remove
    )
    stage_column_dropper = ColumnDropper(columns_to_drop=columns_to_drop)

    pipeline = Pipeline(stages=[stage_typecaster, stage_nominal_indexer, stage_nominal_onehot_encoder,
                                stage_vector_assembler, stage_scaler, stage_outcome, stage_column_dropper])
    return pipeline


In [11]:
# Get the preprocessing pipeline
pipeline = get_preprocess_pipeline()

# Fit the pipeline to data
pipeline_model = pipeline.fit(train_df)

# Transform the data
train_df_preprocessed = pipeline_model.transform(train_df)
test_df_preprocessed = pipeline_model.transform(test_df)

# # Apply the outlier handling function
# train_df_with_outlier_handling = find_outliers(train_df_preprocessed)

# # Count rows with 4 or fewer outliers
# count_4_or_less_outliers = train_df_with_outlier_handling.filter(train_df_with_outlier_handling['total_outliers'] >= 4).count()

# print(f"Number of rows with 4 or fewer outliers: {count_4_or_less_outliers}")


In [12]:
print(train_df_preprocessed.dtypes)

[('features', 'vector'), ('outcome', 'double')]


In [13]:
# Cache the dataframes
train_df_final = train_df_preprocessed.cache()
test_df_final = test_df_preprocessed.cache()

In [14]:
train_df_final = train_df_preprocessed
test_df_final = test_df_preprocessed

In [15]:
from pyspark.sql import functions as F

train_df_final_slowmite = train_df_final.filter(col("outcome") == "1.0").count()
train_df_zeros = train_df_final.filter(F.col("outcome") == "0.0").sample(False, train_df_final_slowmite / train_df_final.filter(F.col("outcome") == "0.0").count())
train_df_final = train_df_final.filter(F.col("outcome") != "0.0")
train_df_final = train_df_final.union(train_df_zeros)


In [16]:
print(train_df_final.count())

8399438


In [19]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

evaluator = MulticlassClassificationEvaluator(labelCol="outcome", predictionCol="prediction", metricName="accuracy")

# Classifier 2: Random Forest Classifier
rf = RandomForestClassifier(featuresCol="features", labelCol="outcome", numTrees=20)

# rf_paramGrid = (ParamGridBuilder()
#                 .addGrid(rf.numTrees, [10, 20, 40])
#                 .addGrid(rf.maxDepth, [5, 10, 15])
#                 .build())

rf_paramGrid = (ParamGridBuilder()
                .addGrid(rf.numTrees, [10, 20])  # Reduced options
                .addGrid(rf.maxDepth, [5, 10])   # Reduced options
                .build())

# Set up CrossValidator for Random Forest
rf_cv = CrossValidator(estimator=rf,
                       estimatorParamMaps=rf_paramGrid,
                       evaluator=evaluator,
                       numFolds=3)

print("Training Random Forest Model...")
rf_model = rf_cv.fit(train_df_final)

print("Evaluating Random Forest Model...")
rf_predictions = rf_model.transform(test_df_final)
rf_accuracy = evaluator.evaluate(rf_predictions)
print(f"Test Accuracy of Random Forest Model: {rf_accuracy}")

best_rf_model = rf_model.bestModel

print("Best Random Forest Model Parameters:")
print(f"NumTrees: {best_rf_model.getNumTrees}")
print(f"MaxDepth: {best_rf_model.getMaxDepth()}")


Training Random Forest Model...


Py4JJavaError: An error occurred while calling o8335.cache.
: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.DirectConstructorHandleAccessor.newInstance(DirectConstructorHandleAccessor.java:62)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:502)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:486)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
py4j.ClientServerConnection.run(ClientServerConnection.java:106)
java.base/java.lang.Thread.run(Thread.java:1570)

The currently active SparkContext was created at:

(No active SparkContext.)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:122)
	at org.apache.spark.sql.SparkSession.<init>(SparkSession.scala:113)
	at org.apache.spark.sql.SparkSession.cloneSession(SparkSession.scala:278)
	at org.apache.spark.sql.SparkSession$.getOrCloneSessionWithConfigsOff(SparkSession.scala:1255)
	at org.apache.spark.sql.execution.CacheManager.getOrCloneSessionWithConfigsOff(CacheManager.scala:406)
	at org.apache.spark.sql.execution.CacheManager.cacheQuery(CacheManager.scala:121)
	at org.apache.spark.sql.execution.CacheManager.cacheQuery(CacheManager.scala:93)
	at org.apache.spark.sql.Dataset.persist(Dataset.scala:3775)
	at org.apache.spark.sql.Dataset.cache(Dataset.scala:3785)
	at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1570)


In [17]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Define two classifiers
# Classifier 1: Logistic Regression
lr = LogisticRegression(featuresCol="features", labelCol="outcome", maxIter=10)



# Set up parameter grids for tuning
lr_paramGrid = (ParamGridBuilder()
                .addGrid(lr.regParam, [0.01, 0.1, 1.0])
                .addGrid(lr.maxIter, [1, 5, 10])
                .build())

# Define evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="outcome", predictionCol="prediction", metricName="accuracy")

# Set up CrossValidator for Logistic Regression
lr_cv = CrossValidator(estimator=lr,
                       estimatorParamMaps=lr_paramGrid,
                       evaluator=evaluator,
                       numFolds=3)

# Fit models
print("Training Logistic Regression Model...")
lr_model = lr_cv.fit(train_df_final)


# Evaluate on test data
print("Evaluating Logistic Regression Model...")
lr_predictions = lr_model.transform(test_df_final)
lr_accuracy = evaluator.evaluate(lr_predictions)
print(f"Test Accuracy of Logistic Regression Model: {lr_accuracy}")


# Record the best models and parameters
best_lr_model = lr_model.bestModel

print("Best Logistic Regression Model Parameters:")
print(f"RegParam: {best_lr_model._java_obj.getRegParam()}")
print(f"MaxIter: {best_lr_model._java_obj.getMaxIter()}")

Training Logistic Regression Model...
Evaluating Logistic Regression Model...
Test Accuracy of Logistic Regression Model: 0.6043538333333334
Best Logistic Regression Model Parameters:
RegParam: 0.01
MaxIter: 10


In [18]:
from pyspark.sql.types import ArrayType, FloatType
import torch
from torch.utils.data import Dataset,DataLoader
from torch import nn
import numpy as np 
import pandas as pd

to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType())) # keep as is

train_df_final, df_validate_final = train_df_final.randomSplit([0.8, 0.2], seed=42)

df_train_pandas = train_df_final.withColumn('features', to_array('features')).toPandas()
df_validate_pandas = df_validate_final.withColumn('features', to_array('features')).toPandas()
df_test_pandas = test_df_final.withColumn('features', to_array('features')).toPandas()

x_train = torch.from_numpy(np.array(df_train_pandas['features'].values.tolist(),np.float32))
y_train = torch.from_numpy(np.array(df_train_pandas['outcome'].values.tolist(),np.int64))

x_validate = torch.from_numpy(np.array(df_validate_pandas['features'].values.tolist(),np.float32))
y_validate = torch.from_numpy(np.array(df_validate_pandas['outcome'].values.tolist(),np.int64))

x_test = torch.from_numpy(np.array(df_test_pandas['features'].values.tolist(),np.float32))
y_test = torch.from_numpy(np.array(df_test_pandas['outcome'].values.tolist(),np.int64))

class MyDataset(Dataset): 
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])

train_dataset = MyDataset(x_train, y_train)
test_dataset = MyDataset(x_test, y_test)
validate_dataset = MyDataset(x_validate, y_validate)

batch_size = 64

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False)


Py4JJavaError: An error occurred while calling o8318.collectToPython.
: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.sql.execution.SparkPlan$$anon$1._next(SparkPlan.scala:415)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.getNext(SparkPlan.scala:426)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.getNext(SparkPlan.scala:412)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.util.NextIterator.foreach(NextIterator.scala:21)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeCollect$1(SparkPlan.scala:449)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeCollect$1$adapted(SparkPlan.scala:448)
	at org.apache.spark.sql.execution.SparkPlan$$Lambda/0x0000029f58039338.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:448)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:390)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec$$Lambda/0x0000029f57f29148.apply(Unknown Source)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:418)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:390)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:4150)
	at org.apache.spark.sql.Dataset$$Lambda/0x0000029f583cf9d8.apply(Unknown Source)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4324)
	at org.apache.spark.sql.Dataset$$Lambda/0x0000029f57c36c28.apply(Unknown Source)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4322)
	at org.apache.spark.sql.Dataset$$Lambda/0x0000029f57b4e810.apply(Unknown Source)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$$$Lambda/0x0000029f57b51fe0.apply(Unknown Source)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.execution.SQLExecution$$$Lambda/0x0000029f57b4ead8.apply(Unknown Source)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)


In [None]:

class ShallowMLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.sequential = nn.Sequential(
            nn.Linear(input_dim, 20),
            nn.ReLU(),
            nn.Linear(20, 20),
            nn.ReLU(),
            nn.Linear(20, output_dim)
        )
        
    def forward(self, x):
        y = self.sequential(x)
        return y
    
class DeepMLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.sequential = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, output_dim)
        )
        
    def forward(self, x):
        y = self.sequential(x)
        return y

In [None]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs):
    best_val_accuracy = 0.0
    best_model_state = None

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0

        for x_batch, y_batch in train_loader:
            # Forward pass
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)

            # Zero gradients
            optimizer.zero_grad()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Statistics
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total_train += y_batch.size(0)
            correct_train += (predicted == y_batch).sum().item()

        train_loss = running_loss / len(train_loader)
        train_accuracy = correct_train / total_train

        # Validation phase
        model.eval()
        running_val_loss = 0.0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                outputs = model(x_batch)
                loss = criterion(outputs, y_batch)
                running_val_loss += loss.item()

                _, predicted = torch.max(outputs.data, 1)
                total_val += y_batch.size(0)
                correct_val += (predicted == y_batch).sum().item()

        val_loss = running_val_loss / len(val_loader)
        val_accuracy = correct_val / total_val

        # Save the model if it has the best validation accuracy so far
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model_state = model.state_dict()

        print(f"Epoch [{epoch+1}/{num_epochs}], "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

    # Load the best model state
    model.load_state_dict(best_model_state)
    return model, best_val_accuracy


In [None]:

def evaluate_model(model, test_loader, criterion):
    model.eval()
    correct_test = 0
    total_test = 0
    running_test_loss = 0.0

    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            running_test_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total_test += y_batch.size(0)
            correct_test += (predicted == y_batch).sum().item()

    test_loss = running_test_loss / len(test_loader)
    test_accuracy = correct_test / total_test

    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
    return test_accuracy


In [None]:
input_dim = x_train.shape[1]
output_dim = len(torch.unique(y_train))

# Hyperparameter options
learning_rates = [0.01, 0.001]
num_epochs_list = [10, 20]
batch_sizes = [32, 64]

best_val_accuracy_shallow = 0.0
best_params_shallow = {}
best_test_accuracy_shallow = 0.0

for lr in learning_rates:
    for num_epochs in num_epochs_list:
        for batch_size in batch_sizes:
            print(f"\nTraining ShallowMLP with lr={lr}, num_epochs={num_epochs}, batch_size={batch_size}")
            
            # Update DataLoaders with new batch_size
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False)
            test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
            
            # Initialize model, criterion, and optimizer
            model = ShallowMLP(input_dim, output_dim)
            criterion = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            
            # Train model using the validation set
            model, val_accuracy = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs)
            
            # Evaluate the model on the test set
            test_accuracy = evaluate_model(model, test_loader, criterion)
            
            # Record best model based on validation accuracy
            if val_accuracy > best_val_accuracy_shallow:
                best_val_accuracy_shallow = val_accuracy
                best_params_shallow = {'lr': lr, 'num_epochs': num_epochs, 'batch_size': batch_size}
                best_model_shallow = model.state_dict()
                best_test_accuracy_shallow = test_accuracy  # Save corresponding test accuracy

print("\nBest ShallowMLP Validation Accuracy: {:.4f}".format(best_val_accuracy_shallow))
print("Corresponding Test Accuracy: {:.4f}".format(best_test_accuracy_shallow))
print("Best Hyperparameters for ShallowMLP:")
print(best_params_shallow)

In [None]:
best_val_accuracy_deep = 0.0
best_params_deep = {}
best_test_accuracy_deep = 0.0

for lr in learning_rates:
    for num_epochs in num_epochs_list:
        for batch_size in batch_sizes:
            print(f"\nTraining DeepMLP with lr={lr}, num_epochs={num_epochs}, batch_size={batch_size}")

            # Update DataLoaders with new batch_size
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False)
            test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

            # Initialize model, criterion, and optimizer
            model = DeepMLP(input_dim, output_dim)
            criterion = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)

            # Train model
            model, val_accuracy = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs)

            # Evaluate on test set
            test_accuracy = evaluate_model(model, test_loader, criterion)

            # Record best model based on validation accuracy
            if val_accuracy > best_val_accuracy_deep:
                best_val_accuracy_deep = val_accuracy
                best_params_deep = {'lr': lr, 'num_epochs': num_epochs, 'batch_size': batch_size}
                best_model_deep_state = model.state_dict()
                best_test_accuracy_deep = test_accuracy

print("\nBest DeepMLP Validation Accuracy: {:.4f}".format(best_val_accuracy_deep))
print("Corresponding Test Accuracy: {:.4f}".format(best_test_accuracy_deep))
print("Best Hyperparameters for DeepMLP:")
print(best_params_deep)