# Safety Category: Models

In this notebook, several models are tested on the preprocessed training data.

**Models:**

- Random Forest
- Logistic Regression
- Support Vector Machine
- Neural Network

## Reading the data ##

In [16]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql import functions as F
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, LinearSVC, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Initialize Spark Session
spark = SparkSession.builder.appName("ModelComparison").getOrCreate()

# Load CSV dataset
raw_df = spark.read.csv("data/filtered_raw_data.csv", header=True, inferSchema=True)
label_df = spark.read.csv("data/label_table.csv", header=True, inferSchema=True)

# Compute magnitudes
df = raw_df.withColumn("accel_mag", F.sqrt(F.col("acceleration_x")**2 +
                                                F.col("acceleration_y")**2 +
                                                F.col("acceleration_z")**2)) \
                .withColumn("gyro_mag", F.sqrt(F.col("gyro_x")**2 +
                                               F.col("gyro_y")**2 +
                                               F.col("gyro_z")**2))

print(f"The raw data contains {df.count()} records.")

# Aggregate per interval
aggregated_df = df.groupBy("bookingid").agg(
    F.mean("speed").alias("avg_speed"),
    F.stddev("speed").alias("std_speed"),
    
    F.mean("accel_mag").alias("avg_accel_mag"),
    F.max("accel_mag").alias("max_accel_mag"),
    F.stddev("accel_mag").alias("std_accel_mag"),
    
    F.mean("gyro_mag").alias("avg_gyro_mag"),
    F.stddev("gyro_mag").alias("std_gyro_mag"),
    
    F.mean("acceleration_x").alias("avg_accel_x"),
    F.stddev("acceleration_x").alias("std_accel_x"),
    F.max("acceleration_x").alias("max_accel_x"),
    
    F.mean("acceleration_y").alias("avg_accel_y"),
    F.stddev("acceleration_y").alias("std_accel_y"),
    F.max("acceleration_y").alias("max_accel_y"),
    
    F.mean("acceleration_z").alias("avg_accel_z"),
    F.stddev("acceleration_z").alias("std_accel_z"),
    F.max("acceleration_z").alias("max_accel_z"),
    
    F.mean("gyro_x").alias("avg_gyro_x"),
    F.stddev("gyro_x").alias("std_gyro_x"),
    
    F.mean("gyro_y").alias("avg_gyro_y"),
    F.stddev("gyro_y").alias("std_gyro_y"),
    
    F.mean("gyro_z").alias("avg_gyro_z"),
    F.stddev("gyro_z").alias("std_gyro_z"),
    
    F.mean("accuracy").alias("avg_accuracy"),
    F.stddev("accuracy").alias("std_accuracy"),
    
    F.max("second").alias("second"),
)

labeled_df = aggregated_df.join(label_df, "bookingid", "left")

print(f"The labeled data contains {labeled_df.count()} records.")

df = labeled_df.fillna(0.0)

# Show data schema
df.printSchema()
print(f"Number of records: {df.count()}")

The raw data contains 1613554 records.
The labeled data contains 20000 records.
root
 |-- bookingid: long (nullable = true)
 |-- avg_speed: double (nullable = false)
 |-- std_speed: double (nullable = false)
 |-- avg_accel_mag: double (nullable = false)
 |-- max_accel_mag: double (nullable = false)
 |-- std_accel_mag: double (nullable = false)
 |-- avg_gyro_mag: double (nullable = false)
 |-- std_gyro_mag: double (nullable = false)
 |-- avg_accel_x: double (nullable = false)
 |-- std_accel_x: double (nullable = false)
 |-- max_accel_x: double (nullable = false)
 |-- avg_accel_y: double (nullable = false)
 |-- std_accel_y: double (nullable = false)
 |-- max_accel_y: double (nullable = false)
 |-- avg_accel_z: double (nullable = false)
 |-- std_accel_z: double (nullable = false)
 |-- max_accel_z: double (nullable = false)
 |-- avg_gyro_x: double (nullable = false)
 |-- std_gyro_x: double (nullable = false)
 |-- avg_gyro_y: double (nullable = false)
 |-- std_gyro_y: double (nullable = fal

## Preprocessing the data ##

In [17]:
# Drop non-feature columns
df = df.drop("bookingID")

# Ensure 'label' is integer type
df = df.withColumn("label", F.col("label").cast("integer"))

feature_cols = [
    "std_gyro_z",
    "std_accel_y",
    "std_accel_z",
    "max_accel_x",
    "avg_speed",
    "std_accel_x",
    "max_accel_mag",
    "std_speed",
    "std_gyro_mag",
    "std_gyro_x",
    "max_accel_z",
    "avg_gyro_mag",
    "std_accel_mag",
    "second",
]

In [18]:
# Convert features into a single feature vector
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

# Normalize features using StandardScaler
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)
df = scaler.fit(df).transform(df)

# Select only the 'scaled_features' and 'label' columns
df = df.select("scaled_features", "label")
df = df.withColumnRenamed("scaled_features", "features")

# Show sample processed data
df.show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                                                                                                                                                                         |label|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|[0.312753069501274,1.0242575007151387,1.1160778342632933,2.1074726651670983,2.5963556529736684,2.079176142125469,4.073721417319177,3.6537257316905993,0.4

## Split Data for Training and Testing ##

In [19]:
# Split data into train (80%) and test (20%)
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Show dataset sizes
print(f"Training Data: {train_df.count()} rows")
print(f"Test Data: {test_df.count()} rows")

Training Data: 16047 rows
Test Data: 3953 rows


## Train the Models ##
Each model is trained with train_df, and then predictions are made on test_df.

**Train Random Forest**

In [20]:
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=50)
rf_model = rf.fit(train_df)
rf_preds = rf_model.transform(test_df)

**Train Logistic Regression**

In [21]:
lr = LogisticRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(train_df)
lr_preds = lr_model.transform(test_df)

**Train Support Vector Machine (SVM)**

In [22]:
svm = LinearSVC(featuresCol="features", labelCol="label", maxIter=10)
svm_model = svm.fit(train_df)
svm_preds = svm_model.transform(test_df)

**Train Neural Network**
- The input layer size is the **number of features**.
- The output layer size is the **number of unique labels**.

In [23]:
num_features = len(feature_cols)
num_classes = df.select("label").distinct().count()

nn = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="label",
    layers=[num_features, 16, 8, num_classes],  # Example: 3 hidden layers
    blockSize=128,
    maxIter=100
)

nn_model = nn.fit(train_df)
nn_preds = nn_model.transform(test_df)

## Evaluate Models ##
To check which model performs best, we evaluate accuracy and F1-score.

In [25]:
# Initialize evaluators
evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label", metricName="f1")
evaluator_auc = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")

# Store models and predictions
models = {
    "Random Forest": rf_preds,
    "Logistic Regression": lr_preds,
    "SVM": svm_preds,
    "Neural Network": nn_preds
}

# Compute metrics for each model
for name, preds in models.items():
    acc = evaluator_acc.evaluate(preds)
    f1 = evaluator_f1.evaluate(preds)
    
    # Handle AUC calculation (skip for SVM)
    if name == "SVM":
        auc_str = "N/A (SVM)"
        print(f"{name}:")
        print(f"  Accuracy = {acc:.4f}")
        print(f"  F1-score = {f1:.4f}")
        print(f"  AUC      = {auc_str}\n")
    else:
        auc = evaluator_auc.evaluate(preds)
        print(f"{name}:")
        print(f"  Accuracy = {acc:.4f}")
        print(f"  F1-score = {f1:.4f}")
        print(f"  AUC      = {auc:.4f}\n")

Random Forest:
  Accuracy = 0.7743
  F1-score = 0.6991
  AUC      = 0.6936

Logistic Regression:
  Accuracy = 0.7657
  F1-score = 0.6854
  AUC      = 0.6153

SVM:
  Accuracy = 0.7536
  F1-score = 0.6482
  AUC      = N/A (SVM)

Neural Network:
  Accuracy = 0.7675
  F1-score = 0.6839
  AUC      = 0.6139



## Conclusion ##
Based on the evaluate results, we choose the **Random Forest** model which performs the best.