# Model Training #

Based on the comparison results, we choose to use Random Forest model to train our model.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

spark = SparkSession.builder.appName("ModelTraining").getOrCreate()

data_path = "data/safety_dataset_filtered.csv"
# data_path = "data/safety_dataset_new.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

df.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1| 4991|
|    0|15009|
+-----+-----+



In [2]:
# Drop non-feature columns
df = df.drop("bookingID")

# Ensure 'label' is integer type
df = df.withColumn("label", col("label").cast("integer"))

feature_cols = [col_name for col_name in df.columns if col_name != "label"]
print(feature_cols)

# Convert features into a single feature vector
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

# Normalize features using StandardScaler
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)
df = scaler.fit(df).transform(df)

# Select only the 'scaled_features' and 'label' columns
df = df.select("scaled_features", "label")
df = df.withColumnRenamed("scaled_features", "features")

# Split data into train (80%) and test (20%)
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Show dataset sizes
print(f"Training Data: {train_df.count()} rows")
print(f"Test Data: {test_df.count()} rows")

['Speed_perc70', 'acceleration_x_min', 'acceleration_z_std', 'Bearing_std', 'acceleration_x_std', 'Speed_std', 'acceleration_y_std', 'acceleration_z_max', 'Speed_max', 'time']
Training Data: 16052 rows
Test Data: 3948 rows


In [3]:
majority_count = train_df.filter(train_df.label == 0).count()
minority_count = train_df.filter(train_df.label == 1).count()

ratio = majority_count / minority_count
print(f"Class 0: {majority_count}, Class 1: {minority_count}, Ratio: {ratio:.2f}")

Class 0: 12024, Class 1: 4028, Ratio: 2.99


In [4]:
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100)
rf_model = rf.fit(train_df)
rf_preds = rf_model.transform(test_df)

In [5]:
def model_evaluator(preds_model):
    # Accuracy
    accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = accuracy_evaluator.evaluate(preds_model)
    print(f"Accuracy: {accuracy:.4f}")

    # Precision
    precision_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
    precision = precision_evaluator.evaluate(preds_model)
    print(f"Precision: {precision:.4f}")

    # Recall
    recall_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
    recall = recall_evaluator.evaluate(preds_model)
    print(f"Recall: {recall:.4f}")

    # F1-score
    f1_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
    f1_score = f1_evaluator.evaluate(preds_model)
    print(f"F1 Score: {f1_score:.4f}")

    auc_evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
    auc_roc = auc_evaluator.evaluate(preds_model)
    print(f"AUC-ROC: {auc_roc:.4f}")

    print("-----------------\n")

model_evaluator(rf_preds)

Accuracy: 0.7779
Precision: 0.7726
Recall: 0.7779
F1 Score: 0.7131
AUC-ROC: 0.7141
-----------------



In [7]:
model_path = "models/rf_model"

rf_model.write().overwrite().save(model_path)
print(f"Model saved successfully at {model_path}!")

Model saved successfully at models/rf_model!
