In [81]:
import findspark
findspark.init()
findspark.find()

import requests
import feedparser
import pyspark
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from datetime import datetime
import pytz
from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext

# Initialize Spark
appName = "Project - Machine Learning Techniques on MQTT"
master = "local"

# Create Configuration object for Spark.
conf = pyspark.SparkConf()\
    .set('spark.driver.host','127.0.0.1')\
    .setAppName(appName)\
    .setMaster(master)

# Create Spark Context with the new configurations rather than relying on the default
sc = SparkContext.getOrCreate(conf=conf)
# You need to create SQL Context to conduct some database operations like what we will
sqlContext = SQLContext(sc)
# If you have SQL context, you create the session from the Spark Context
spark = sqlContext.sparkSession.builder.getOrCreate()



In [82]:
train_path = r"C:\Users\nickc\OneDrive\Documents\GitHub\course-project-option-2-nick1117\DataFolder\archive\Data\FINAL_CSV\train70_augmented.csv"
train_data = spark.read.csv(train_path, header=True, inferSchema=True)

test_path = r"C:\Users\nickc\OneDrive\Documents\GitHub\course-project-option-2-nick1117\DataFolder\archive\Data\FINAL_CSV\test30_augmented.csv"
test_data = spark.read.csv(test_path, header=True, inferSchema=True)

# done with data outliers
# train_data_top_5000 = train_data.limit(30000)
# test_data_top_5000 = test_data.limit(5000)

train_data_top_5000 = train_data.limit(100000)
test_data_top_5000 = test_data.limit(50000)

In [83]:
train_data_top_5000.show()

+----------+--------------+-------+-----------------+--------------------------+--------------------+---------------+----------------------+-------------------+----------------+---------------------+-------------------+------------------+---------------------+-------------+------------+-------------+-----------+--------+--------+----------+------------+--------------+--------------+--------+-----------+------------+---------------+--------+------------+----------------+--------------+------------------+----------+
| tcp.flags|tcp.time_delta|tcp.len|mqtt.conack.flags|mqtt.conack.flags.reserved|mqtt.conack.flags.sp|mqtt.conack.val|mqtt.conflag.cleansess|mqtt.conflag.passwd|mqtt.conflag.qos|mqtt.conflag.reserved|mqtt.conflag.retain|mqtt.conflag.uname|mqtt.conflag.willflag|mqtt.conflags|mqtt.dupflag|mqtt.hdrflags|mqtt.kalive|mqtt.len|mqtt.msg|mqtt.msgid|mqtt.msgtype|mqtt.proto_len|mqtt.protoname|mqtt.qos|mqtt.retain|mqtt.sub.qos|mqtt.suback.qos|mqtt.ver|mqtt.willmsg|mqtt.willmsg_len|mqtt.w

In [84]:
from pyspark.sql.functions import lit

train_df = train_data_top_5000.withColumn("dataset_type", lit("train"))
test_df = test_data_top_5000.withColumn("dataset_type", lit("test"))
df = train_df.union(test_df)

In [85]:
new_column_names = [
    "tcp_flags",
    "tcp_time_delta",
    "tcp_length",
    "mqtt_connack_flags",
    "mqtt_connack_reserved_flags",
    "mqtt_connack_session_present",
    "mqtt_connack_return_code",
    "mqtt_connect_clean_session_flag",
    "mqtt_connect_password_flag",
    "mqtt_connect_qos_level",
    "mqtt_connect_reserved_flag",
    "mqtt_connect_retain_flag",
    "mqtt_connect_username_flag",
    "mqtt_connect_will_flag",
    "mqtt_connect_flags",
    "mqtt_duplicate_flag",
    "mqtt_header_flags",
    "mqtt_keep_alive_interval",
    "mqtt_length",
    "mqtt_message",
    "mqtt_message_id",
    "mqtt_message_type",
    "mqtt_protocol_length",
    "mqtt_protocol_name",
    "mqtt_qos_level",
    "mqtt_retain_flag",
    "mqtt_subscription_qos_level",
    "mqtt_subscription_ack_qos_level",
    "mqtt_version",
    "mqtt_will_message",
    "mqtt_will_message_length",
    "mqtt_will_topic",
    "mqtt_will_topic_length",
    "target_class",
    "dataset_type"
]

df = df.toDF(*new_column_names)

train_df = train_df.toDF(*new_column_names)
test_df = test_df.toDF(*new_column_names)
train_df = train_df.drop("dataset_type")
test_df = test_df.drop("dataset_type")

In [86]:
from pyspark.sql.functions import col, avg, count
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank
import matplotlib.pyplot as plt
from pyspark.sql.functions import *

In [37]:
from functools import reduce

def column_add(a, b):
    return a + b

def find_outliers(df):
    # Identifying the numerical columns in a spark dataframe
    numeric_columns = [column[0] for column in df.dtypes if column[1] in ('int', 'double')]

    # Using the `for` loop to create new columns by identifying the outliers for each feature
    for column in numeric_columns:
        less_Q1 = 'less_Q1_{}'.format(column)
        more_Q3 = 'more_Q3_{}'.format(column)

        # Q1 : First Quartile ., Q3 : Third Quartile
        Q1 = df.approxQuantile(column, [0.25], relativeError=0)
        Q3 = df.approxQuantile(column, [0.75], relativeError=0)

        # IQR : Inter Quantile Range
        IQR = Q3[0] - Q1[0]

        # Selecting the data within -1.5*IQR to + 1.5*IQR
        less_Q1 = Q1[0] - 1.5 * IQR
        more_Q3 = Q3[0] + 1.5 * IQR

        isOutlierCol = 'is_outlier_{}'.format(column)
        df = df.withColumn(isOutlierCol, when((df[column] > more_Q3) | (df[column] < less_Q1), lit(1)).otherwise(lit(0)))

    # Selecting the specific columns created above to check for outliers
    selected_columns = [column for column in df.columns if column.startswith("is_outlier")]
    
    # Check if there are outlier columns
    if selected_columns:
        # Adding all the outlier columns into a new column "total_outliers" to see the total number of outliers
        df = df.withColumn('total_outliers', reduce(column_add, [df[col] for col in selected_columns], lit(0)))

    # Dropping the extra columns created above for a clean dataframe
    df = df.drop(*selected_columns)

    return df

In [87]:
from pyspark.ml import Pipeline, Transformer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.sql import DataFrame

col_names = new_column_names
    # Define the columns
binary_cols = [
    "mqtt_connack_reserved_flags",
    "mqtt_connack_session_present",
    "mqtt_connect_clean_session_flag",
    "mqtt_connect_password_flag",
    "mqtt_connect_reserved_flag",
    "mqtt_connect_retain_flag",
    "mqtt_connect_username_flag",
    "mqtt_connect_will_flag",
    "mqtt_duplicate_flag",
    "mqtt_retain_flag",
    "mqtt_connect_qos_level"
]

nominal_cols = [
    "tcp_flags",
    "mqtt_connack_flags",
    "mqtt_connack_return_code",
    "mqtt_connect_flags",
    "mqtt_header_flags",
    "mqtt_message_type",
    "mqtt_qos_level",
    "mqtt_version",
    "mqtt_message",
    'mqtt_protocol_name'
]

continuous_cols = [
    "tcp_time_delta",
    "tcp_length",
    "mqtt_keep_alive_interval",
    "mqtt_length",
    "mqtt_protocol_length",
    "mqtt_will_message_length",
    "mqtt_will_topic_length",
    "mqtt_will_topic",
    "mqtt_will_message",
    "mqtt_subscription_ack_qos_level",
    "mqtt_subscription_qos_level",
    "mqtt_message_id"
]

class OutcomeCreater(Transformer):
    def __init__(self):
        super().__init__()

    def _transform(self, dataset):
        label_to_multiclass = udf(lambda name:
                                  0.0 if name == 'legitimate' else
                                  1.0 if name == 'slowite' else
                                  2.0 if name == 'bruteforce' else
                                  3.0 if name == 'flood' else
                                  4.0 if name == 'malformed' else
                                  5.0 if name == 'dos' else
                                  -1.0, DoubleType())
        output_df = dataset.withColumn('outcome', label_to_multiclass(col('target_class')))
        output_df = output_df.drop("target_class")
        output_df = output_df.withColumn('outcome', col('outcome').cast(DoubleType()))
        return output_df
    
class FeatureTypeCaster(Transformer):
    def __init__(self):
        super().__init__()

    def _transform(self, dataset: DataFrame):
        output_df = dataset
        for col_name in binary_cols + continuous_cols:
            output_df = output_df.withColumn(col_name, col(col_name).cast(DoubleType()))
        return output_df
    
class ColumnDropper(Transformer):
    def __init__(self, columns_to_drop=None):
        super().__init__()
        self.columns_to_drop = columns_to_drop

    def _transform(self, dataset):
        output_df = dataset
        for col_name in self.columns_to_drop:
            output_df = output_df.drop(col_name)
        return output_df

def get_preprocess_pipeline():
    # Columns to remove
    corelated_cols_to_remove = ["mqtt_connack_reserved_flags", "mqtt_connack_session_present", "mqtt_connect_qos_level", ## zero values
                                "mqtt_connect_reserved_flag", "mqtt_connect_retain_flag", "mqtt_connect_will_flag", "mqtt_message", 
                                "mqtt_subscription_qos_level", "mqtt_subscription_ack_qos_level", "mqtt_will_message",
                                "mqtt_will_message_length", "mqtt_will_topic", "mqtt_will_topic_length",'mqtt_protocol_name',
                                
                                
                                "mqtt_protool_length", "mqtt_keep_alive_interval"] #high correlation (1.00), (0.77)

    # Stage where columns are cast as appropriate types
    stage_typecaster = FeatureTypeCaster()

    # Convert nominal columns to string type
    class NominalTypeCaster(Transformer):
        def __init__(self):
            super().__init__()

        def _transform(self, dataset):
            output_df = dataset
            for col_name in nominal_cols:
                output_df = output_df.withColumn(col_name, col(col_name).cast("string"))
            return output_df

    stage_nominal_typecaster = NominalTypeCaster()

    # Create a list of StringIndexers with handleInvalid='keep'
    stage_nominal_indexers = [
        StringIndexer(inputCol=col, outputCol=col + "_index", handleInvalid='keep') 
        for col in nominal_cols
    ]

    # Create a list of OneHotEncoders with handleInvalid='keep'
    stage_nominal_onehot_encoders = [
        OneHotEncoder(inputCol=col + "_index", outputCol=col + "_encoded", handleInvalid='keep') 
        for col in nominal_cols
    ]

    # Feature columns assembly
    feature_cols = continuous_cols + binary_cols + [col + "_encoded" for col in nominal_cols]

    # Remove correlated columns from features
    for col_name in corelated_cols_to_remove:
        if col_name in feature_cols:
            feature_cols.remove(col_name)

    stage_vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="vectorized_features")

    # Stage where we scale the features
    stage_scaler = StandardScaler(inputCol='vectorized_features', outputCol='features')

    # Stage for creating the outcome column
    stage_outcome = OutcomeCreater()

    # Removing unnecessary columns
    columns_to_drop = (
        nominal_cols + 
        [col + "_index" for col in nominal_cols] + 
        [col + "_encoded" for col in nominal_cols] + 
        binary_cols + continuous_cols + ['vectorized_features']
    )
    stage_column_dropper = ColumnDropper(columns_to_drop=columns_to_drop)

    # Connect the stages into a pipeline
    stages = (
        [stage_typecaster, stage_nominal_typecaster] + 
        stage_nominal_indexers + 
        stage_nominal_onehot_encoders + 
        [stage_vector_assembler, stage_scaler, stage_outcome, stage_column_dropper]
    )
    pipeline = Pipeline(stages=stages)

    return pipeline


In [58]:
df_with_outlier_handling = find_outliers(train_df)
df_with_outlier_handling.show(1, vertical=True)

-RECORD 0-------------------------------------
 tcp_flags                       | 0x00000018 
 tcp_time_delta                  | 1.000039   
 tcp_length                      | 13         
 mqtt_connack_flags              | 0          
 mqtt_connack_reserved_flags     | 0.0        
 mqtt_connack_session_present    | 0.0        
 mqtt_connack_return_code        | 0.0        
 mqtt_connect_clean_session_flag | 0.0        
 mqtt_connect_password_flag      | 0.0        
 mqtt_connect_qos_level          | 0.0        
 mqtt_connect_reserved_flag      | 0.0        
 mqtt_connect_retain_flag        | 0.0        
 mqtt_connect_username_flag      | 0.0        
 mqtt_connect_will_flag          | 0.0        
 mqtt_connect_flags              | 0          
 mqtt_duplicate_flag             | 0.0        
 mqtt_header_flags               | 0x00000030 
 mqtt_keep_alive_interval        | 0.0        
 mqtt_length                     | 11.0       
 mqtt_message                    | 32         
 mqtt_message

In [59]:
df_with_outlier_handling.groupby("total_outliers").count().show()

+--------------+-----+
|total_outliers|count|
+--------------+-----+
|             1| 7188|
|             0|19498|
|             2| 1163|
|             6|  258|
|             4|  930|
|             5|  769|
|             3|  116|
|             7|   75|
|             8|    3|
+--------------+-----+



In [60]:
df_with_substituted_na_and_outliers = df_with_outlier_handling.\
        filter(df_with_outlier_handling['total_Outliers']<=4)
print(df_with_substituted_na_and_outliers.count())

28895


In [61]:
df_with_numeric_target = df_with_substituted_na_and_outliers.withColumn(
    "target_class",
    when(df_with_substituted_na_and_outliers["target_class"] == "legitimate", 0.0)
    .when(df_with_substituted_na_and_outliers["target_class"] == "slowite", 1.0)
    .when(df_with_substituted_na_and_outliers["target_class"] == "bruteforce", 2.0)
    .when(df_with_substituted_na_and_outliers["target_class"] == "flood", 3.0)
    .when(df_with_substituted_na_and_outliers["target_class"] == "malformed", 4.0)
    .when(df_with_substituted_na_and_outliers["target_class"] == "dos", 5.0)
    .otherwise(None)  # In case of unexpected values, set to None or any default
)

In [62]:
df_with_numeric_target = df_with_numeric_target.drop("tcp_flags")
df_with_numeric_target = df_with_numeric_target.drop("mqtt_header_flags")
df_with_numeric_target = df_with_numeric_target.drop("mqtt_connack_flags")
df_with_numeric_target = df_with_numeric_target.drop("mqtt_message")
df_with_numeric_target = df_with_numeric_target.drop("mqtt_protocol_name")
#df_with_numeric_target = df_with_numeric_target.drop("target_class")
df_with_numeric_target = df_with_numeric_target.drop("mqtt_connect_flags")
#drop string variables
#df_with_substituted_na_and_outliers.dtypes
correlation_matrix = df_with_numeric_target.toPandas().corr()
print(correlation_matrix)

                                 tcp_time_delta  tcp_length  \
tcp_time_delta                         1.000000    0.042445   
tcp_length                             0.042445    1.000000   
mqtt_connack_reserved_flags                 NaN         NaN   
mqtt_connack_session_present                NaN         NaN   
mqtt_connack_return_code              -0.011030   -0.022957   
mqtt_connect_clean_session_flag       -0.008830   -0.018037   
mqtt_connect_password_flag                  NaN         NaN   
mqtt_connect_qos_level                      NaN         NaN   
mqtt_connect_reserved_flag                  NaN         NaN   
mqtt_connect_retain_flag                    NaN         NaN   
mqtt_connect_username_flag                  NaN         NaN   
mqtt_connect_will_flag                      NaN         NaN   
mqtt_duplicate_flag                         NaN         NaN   
mqtt_keep_alive_interval              -0.006837   -0.013966   
mqtt_length                           -0.009100    0.07

strong correlations between: 
- mqtt_connect_clean_session_flag and mqtt_keep_alive_interval (0.77) / mqtt_protocol_length (1.00)
- mqtt_version and mqtt_protocol_length                 

In [88]:
# Get the preprocessing pipeline
pipeline = get_preprocess_pipeline()

# Fit the pipeline to data
pipeline_model = pipeline.fit(train_df)

# Transform the data
train_df_preprocessed = pipeline_model.transform(train_df)
test_df_preprocessed = pipeline_model.transform(test_df)

In [None]:


# Apply the outlier handling function
train_df_with_outlier_handling = find_outliers(train_df_preprocessed)

# Count rows with 4 or fewer outliers
count_4_or_less_outliers = train_df_with_outlier_handling.filter(train_df_with_outlier_handling['total_outliers'] >= 4).count()

# Print the result
print(f"Number of rows with 4 or fewer outliers: {count_4_or_less_outliers}")

Number of rows with 4 or fewer outliers: 0


In [89]:
train_df_final = train_df_preprocessed.cache()
test_df_final = test_df_preprocessed.cache()

In [91]:
from pyspark.sql import functions as F

train_df_final_slowmite = train_df_final.filter(col("outcome") == "1.0").count()
train_df_zeros = train_df_final.filter(F.col("outcome") == "0.0").sample(False, train_df_final_slowmite / train_df_final.filter(F.col("outcome") == "0.0").count())
train_df_final = train_df_final.filter(F.col("outcome") != "0.0")
train_df_final = train_df_final.union(train_df_zeros)

In [67]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Define two classifiers
# Classifier 1: Logistic Regression
lr = LogisticRegression(featuresCol="features", labelCol="outcome", maxIter=10)

# Classifier 2: Random Forest Classifier
rf = RandomForestClassifier(featuresCol="features", labelCol="outcome", numTrees=20)

# Set up parameter grids for tuning
lr_paramGrid = (ParamGridBuilder()
                .addGrid(lr.regParam, [0.01, 0.1, 1.0])
                .addGrid(lr.maxIter, [1, 5, 10])
                .build())

# Define evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="outcome", predictionCol="prediction", metricName="accuracy")

# Set up CrossValidator for Logistic Regression
lr_cv = CrossValidator(estimator=lr,
                       estimatorParamMaps=lr_paramGrid,
                       evaluator=evaluator,
                       numFolds=3)

# Fit models
print("Training Logistic Regression Model...")
lr_model = lr_cv.fit(train_df_final)


# Evaluate on test data
print("Evaluating Logistic Regression Model...")
lr_predictions = lr_model.transform(test_df_final)
lr_accuracy = evaluator.evaluate(lr_predictions)
print(f"Test Accuracy of Logistic Regression Model: {lr_accuracy}")


# Record the best models and parameters
best_lr_model = lr_model.bestModel

print("Best Logistic Regression Model Parameters:")
print(f"RegParam: {best_lr_model._java_obj.getRegParam()}")
print(f"MaxIter: {best_lr_model._java_obj.getMaxIter()}")

Training Logistic Regression Model...
Evaluating Logistic Regression Model...
Test Accuracy of Logistic Regression Model: 0.828
Best Logistic Regression Model Parameters:
RegParam: 0.01
MaxIter: 10


In [92]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

rf_paramGrid = (ParamGridBuilder()
                .addGrid(rf.numTrees, [10, 20, 40])
                .addGrid(rf.maxDepth, [5, 10, 15])
                .build())

# Set up CrossValidator for Random Forest
rf_cv = CrossValidator(estimator=rf,
                       estimatorParamMaps=rf_paramGrid,
                       evaluator=evaluator,
                       numFolds=3)

print("Training Random Forest Model...")
rf_model = rf_cv.fit(train_df_final)

print("Evaluating Random Forest Model...")
rf_predictions = rf_model.transform(test_df_final)
rf_accuracy = evaluator.evaluate(rf_predictions)
print(f"Test Accuracy of Random Forest Model: {rf_accuracy}")

best_rf_model = rf_model.bestModel

print("Best Random Forest Model Parameters:")
print(f"NumTrees: {best_rf_model.getNumTrees}")
print(f"MaxDepth: {best_rf_model.getMaxDepth()}")


Training Random Forest Model...
Evaluating Random Forest Model...
Test Accuracy of Random Forest Model: 0.67706
Best Random Forest Model Parameters:
NumTrees: 40
MaxDepth: 15


In [75]:
from pyspark.sql.types import ArrayType, FloatType
import torch
from torch.utils.data import Dataset,DataLoader
from torch import nn
import numpy as np 
import pandas as pd

to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType())) # keep as is

train_df_final, df_validate_final = train_df_final.randomSplit([0.8, 0.2], seed=42)

df_train_pandas = train_df_final.withColumn('features', to_array('features')).toPandas()
df_validate_pandas = df_validate_final.withColumn('features', to_array('features')).toPandas()
df_test_pandas = test_df_final.withColumn('features', to_array('features')).toPandas()

x_train = torch.from_numpy(np.array(df_train_pandas['features'].values.tolist(),np.float32))
y_train = torch.from_numpy(np.array(df_train_pandas['outcome'].values.tolist(),np.int64))

x_validate = torch.from_numpy(np.array(df_validate_pandas['features'].values.tolist(),np.float32))
y_validate = torch.from_numpy(np.array(df_validate_pandas['outcome'].values.tolist(),np.int64))

x_test = torch.from_numpy(np.array(df_test_pandas['features'].values.tolist(),np.float32))
y_test = torch.from_numpy(np.array(df_test_pandas['outcome'].values.tolist(),np.int64))

class MyDataset(Dataset): 
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])

train_dataset = MyDataset(x_train, y_train)
test_dataset = MyDataset(x_test, y_test)
validate_dataset = MyDataset(x_validate, y_validate)

batch_size = 64

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False)


In [70]:

class ShallowMLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.sequential = nn.Sequential(
            nn.Linear(input_dim, 20),
            nn.ReLU(),
            nn.Linear(20, 20),
            nn.ReLU(),
            nn.Linear(20, output_dim)
        )
        
    def forward(self, x):
        y = self.sequential(x)
        return y
    
class DeepMLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.sequential = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, output_dim)
        )
        
    def forward(self, x):
        y = self.sequential(x)
        return y

In [76]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs):
    best_val_accuracy = 0.0
    best_model_state = None

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0

        for x_batch, y_batch in train_loader:
            # Forward pass
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)

            # Zero gradients
            optimizer.zero_grad()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Statistics
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total_train += y_batch.size(0)
            correct_train += (predicted == y_batch).sum().item()

        train_loss = running_loss / len(train_loader)
        train_accuracy = correct_train / total_train

        # Validation phase
        model.eval()
        running_val_loss = 0.0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                outputs = model(x_batch)
                loss = criterion(outputs, y_batch)
                running_val_loss += loss.item()

                _, predicted = torch.max(outputs.data, 1)
                total_val += y_batch.size(0)
                correct_val += (predicted == y_batch).sum().item()

        val_loss = running_val_loss / len(val_loader)
        val_accuracy = correct_val / total_val

        # Save the model if it has the best validation accuracy so far
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model_state = model.state_dict()

        print(f"Epoch [{epoch+1}/{num_epochs}], "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

    # Load the best model state
    model.load_state_dict(best_model_state)
    return model, best_val_accuracy


In [77]:

def evaluate_model(model, test_loader, criterion):
    model.eval()
    correct_test = 0
    total_test = 0
    running_test_loss = 0.0

    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            running_test_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total_test += y_batch.size(0)
            correct_test += (predicted == y_batch).sum().item()

    test_loss = running_test_loss / len(test_loader)
    test_accuracy = correct_test / total_test

    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
    return test_accuracy


In [78]:
input_dim = x_train.shape[1]
output_dim = len(torch.unique(y_train))

# Hyperparameter options
learning_rates = [0.01, 0.001]
num_epochs_list = [10, 20]
batch_sizes = [32, 64]

best_val_accuracy_shallow = 0.0
best_params_shallow = {}
best_test_accuracy_shallow = 0.0

for lr in learning_rates:
    for num_epochs in num_epochs_list:
        for batch_size in batch_sizes:
            print(f"\nTraining ShallowMLP with lr={lr}, num_epochs={num_epochs}, batch_size={batch_size}")
            
            # Update DataLoaders with new batch_size
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False)
            test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
            
            # Initialize model, criterion, and optimizer
            model = ShallowMLP(input_dim, output_dim)
            criterion = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            
            # Train model using the validation set
            model, val_accuracy = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs)
            
            # Evaluate the model on the test set
            test_accuracy = evaluate_model(model, test_loader, criterion)
            
            # Record best model based on validation accuracy
            if val_accuracy > best_val_accuracy_shallow:
                best_val_accuracy_shallow = val_accuracy
                best_params_shallow = {'lr': lr, 'num_epochs': num_epochs, 'batch_size': batch_size}
                best_model_shallow = model.state_dict()
                best_test_accuracy_shallow = test_accuracy  # Save corresponding test accuracy

print("\nBest ShallowMLP Validation Accuracy: {:.4f}".format(best_val_accuracy_shallow))
print("Corresponding Test Accuracy: {:.4f}".format(best_test_accuracy_shallow))
print("Best Hyperparameters for ShallowMLP:")
print(best_params_shallow)


Training ShallowMLP with lr=0.01, num_epochs=10, batch_size=32
Epoch [1/10], Train Loss: 0.5575, Train Acc: 0.8057, Val Loss: 0.5089, Val Acc: 0.8221
Epoch [2/10], Train Loss: 0.4836, Train Acc: 0.8283, Val Loss: 0.5022, Val Acc: 0.8219
Epoch [3/10], Train Loss: 0.4772, Train Acc: 0.8306, Val Loss: 0.4966, Val Acc: 0.8205
Epoch [4/10], Train Loss: 0.4760, Train Acc: 0.8284, Val Loss: 0.5099, Val Acc: 0.8140
Epoch [5/10], Train Loss: 0.4706, Train Acc: 0.8313, Val Loss: 0.4917, Val Acc: 0.8242
Epoch [6/10], Train Loss: 0.4710, Train Acc: 0.8316, Val Loss: 0.4915, Val Acc: 0.8228
Epoch [7/10], Train Loss: 0.4699, Train Acc: 0.8311, Val Loss: 0.4939, Val Acc: 0.8232
Epoch [8/10], Train Loss: 0.4702, Train Acc: 0.8310, Val Loss: 0.4908, Val Acc: 0.8230
Epoch [9/10], Train Loss: 0.4733, Train Acc: 0.8299, Val Loss: 0.4901, Val Acc: 0.8242
Epoch [10/10], Train Loss: 0.4688, Train Acc: 0.8310, Val Loss: 0.4928, Val Acc: 0.8242
Test Loss: 0.4758, Test Accuracy: 0.8276

Training ShallowMLP wit

In [80]:
best_val_accuracy_deep = 0.0
best_params_deep = {}
best_test_accuracy_deep = 0.0

for lr in learning_rates:
    for num_epochs in num_epochs_list:
        for batch_size in batch_sizes:
            print(f"\nTraining DeepMLP with lr={lr}, num_epochs={num_epochs}, batch_size={batch_size}")

            # Update DataLoaders with new batch_size
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False)
            test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

            # Initialize model, criterion, and optimizer
            model = DeepMLP(input_dim, output_dim)
            criterion = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)

            # Train model
            model, val_accuracy = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs)

            # Evaluate on test set
            test_accuracy = evaluate_model(model, test_loader, criterion)

            # Record best model based on validation accuracy
            if val_accuracy > best_val_accuracy_deep:
                best_val_accuracy_deep = val_accuracy
                best_params_deep = {'lr': lr, 'num_epochs': num_epochs, 'batch_size': batch_size}
                best_model_deep_state = model.state_dict()
                best_test_accuracy_deep = test_accuracy

print("\nBest DeepMLP Validation Accuracy: {:.4f}".format(best_val_accuracy_deep))
print("Corresponding Test Accuracy: {:.4f}".format(best_test_accuracy_deep))
print("Best Hyperparameters for DeepMLP:")
print(best_params_deep)


Training DeepMLP with lr=0.01, num_epochs=10, batch_size=32
Epoch [1/10], Train Loss: 0.6122, Train Acc: 0.7831, Val Loss: 0.5449, Val Acc: 0.8194
Epoch [2/10], Train Loss: 0.5046, Train Acc: 0.8239, Val Loss: 0.6445, Val Acc: 0.7653
Epoch [3/10], Train Loss: 0.5173, Train Acc: 0.8156, Val Loss: 0.5222, Val Acc: 0.8109
Epoch [4/10], Train Loss: 0.5119, Train Acc: 0.8194, Val Loss: 0.7685, Val Acc: 0.7198
Epoch [5/10], Train Loss: 0.5028, Train Acc: 0.8187, Val Loss: 0.5120, Val Acc: 0.8161
Epoch [6/10], Train Loss: 0.5158, Train Acc: 0.8190, Val Loss: 0.5166, Val Acc: 0.8159
Epoch [7/10], Train Loss: 0.4963, Train Acc: 0.8208, Val Loss: 0.5077, Val Acc: 0.8127
Epoch [8/10], Train Loss: 0.5126, Train Acc: 0.8198, Val Loss: 0.5551, Val Acc: 0.7948
Epoch [9/10], Train Loss: 0.6219, Train Acc: 0.8023, Val Loss: 0.5385, Val Acc: 0.8084
Epoch [10/10], Train Loss: 0.5154, Train Acc: 0.8188, Val Loss: 0.5307, Val Acc: 0.8180
Test Loss: 0.5085, Test Accuracy: 0.8194

Training DeepMLP with lr=0