# Feature Engineering
using Random Forest for feature importance

### Load the raw dataset and label dataset

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, Imputer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.appName("FeatureEngineering").getOrCreate()

# Load your filtered raw data and label table
raw_df = spark.read.csv("data/filtered_raw_data.csv", header=True, inferSchema=True)
label_df = spark.read.csv("data/label_table.csv", header=True, inferSchema=True)

### Feature Engineering Function

In [2]:
# Compute magnitudes
df = raw_df.withColumn("accel_mag", F.sqrt(F.col("acceleration_x")**2 +
                                                F.col("acceleration_y")**2 +
                                                F.col("acceleration_z")**2)) \
                .withColumn("gyro_mag", F.sqrt(F.col("gyro_x")**2 +
                                               F.col("gyro_y")**2 +
                                               F.col("gyro_z")**2))

print(f"The raw data contains {df.count()} records.")
# df.printSchema()

# Aggregate per interval
aggregated_df = df.groupBy("bookingid").agg(
    F.mean("speed").alias("avg_speed"),
    F.stddev("speed").alias("std_speed"),
    
    F.mean("accel_mag").alias("avg_accel_mag"),
    F.max("accel_mag").alias("max_accel_mag"),
    F.stddev("accel_mag").alias("std_accel_mag"),
    
    F.mean("gyro_mag").alias("avg_gyro_mag"),
    F.stddev("gyro_mag").alias("std_gyro_mag"),
    
    F.mean("acceleration_x").alias("avg_accel_x"),
    F.stddev("acceleration_x").alias("std_accel_x"),
    F.max("acceleration_x").alias("max_accel_x"),
    
    F.mean("acceleration_y").alias("avg_accel_y"),
    F.stddev("acceleration_y").alias("std_accel_y"),
    F.max("acceleration_y").alias("max_accel_y"),
    
    F.mean("acceleration_z").alias("avg_accel_z"),
    F.stddev("acceleration_z").alias("std_accel_z"),
    F.max("acceleration_z").alias("max_accel_z"),
    
    F.mean("gyro_x").alias("avg_gyro_x"),
    F.stddev("gyro_x").alias("std_gyro_x"),
    
    F.mean("gyro_y").alias("avg_gyro_y"),
    F.stddev("gyro_y").alias("std_gyro_y"),
    
    F.mean("gyro_z").alias("avg_gyro_z"),
    F.stddev("gyro_z").alias("std_gyro_z"),
    
    F.mean("accuracy").alias("avg_accuracy"),
    F.stddev("accuracy").alias("std_accuracy"),
    
    F.max("second").alias("second"),
)

labeled_df = aggregated_df.join(label_df, "bookingid", "left")

print(f"The labeled data contains {labeled_df.count()} records.")

labeled_df = labeled_df.fillna(0.0)

labeled_df.printSchema()

The raw data contains 1613554 records.
The labeled data contains 20000 records.
root
 |-- bookingid: long (nullable = true)
 |-- avg_speed: double (nullable = false)
 |-- std_speed: double (nullable = false)
 |-- avg_accel_mag: double (nullable = false)
 |-- max_accel_mag: double (nullable = false)
 |-- std_accel_mag: double (nullable = false)
 |-- avg_gyro_mag: double (nullable = false)
 |-- std_gyro_mag: double (nullable = false)
 |-- avg_accel_x: double (nullable = false)
 |-- std_accel_x: double (nullable = false)
 |-- max_accel_x: double (nullable = false)
 |-- avg_accel_y: double (nullable = false)
 |-- std_accel_y: double (nullable = false)
 |-- max_accel_y: double (nullable = false)
 |-- avg_accel_z: double (nullable = false)
 |-- std_accel_z: double (nullable = false)
 |-- max_accel_z: double (nullable = false)
 |-- avg_gyro_x: double (nullable = false)
 |-- std_gyro_x: double (nullable = false)
 |-- avg_gyro_y: double (nullable = false)
 |-- std_gyro_y: double (nullable = fal

In [3]:
df = labeled_df.drop("bookingid")  # bookingID is just an identifier

feature_names = [c for c in df.columns if c != "label"]
label_col = "label"

assembler = VectorAssembler(
    inputCols=feature_names,
    outputCol="features"
)

### Use Random Forest

In [4]:
rf = RandomForestClassifier(labelCol=label_col, featuresCol="features", numTrees=50)

pipeline = Pipeline(stages=[assembler, rf])

model = pipeline.fit(df)

threshold = 0.01 

importances = model.stages[-1].featureImportances
feature_importance_list = list(zip(feature_names, importances))

sorted_importance = sorted(feature_importance_list, key=lambda x: x[1], reverse=True)

print("Feature Importances:")
for feature, score in sorted_importance:
    print(f"{feature}: {score}")
    
rf_selected_features = [feature for feature, score in sorted_importance if score > threshold]
print(f"Selected Features after RF: {rf_selected_features}")

Feature Importances:
second: 0.34831958610430097
std_gyro_z: 0.06976817186246109
std_accel_mag: 0.06543501155937806
std_accel_y: 0.061083091111019554
std_accel_z: 0.05973005628429935
max_accel_x: 0.05402353959920417
avg_speed: 0.05068410718992172
std_accel_x: 0.048567837480458016
max_accel_mag: 0.04542903789330278
std_speed: 0.03800659262731864
std_gyro_mag: 0.029149941772508327
std_gyro_x: 0.020732527656056106
max_accel_z: 0.019757705767866982
avg_gyro_mag: 0.019645224001540756
avg_accel_y: 0.0151691122129525
std_accuracy: 0.012650055297149984
std_gyro_y: 0.008925761135445527
max_accel_y: 0.005619092965213143
avg_accuracy: 0.005424631440508121
avg_accel_mag: 0.004690669833803138
avg_accel_z: 0.004457650062544988
avg_accel_x: 0.004171168189571563
avg_gyro_y: 0.0036084901478326387
avg_gyro_z: 0.0027978299089945887
avg_gyro_x: 0.002153107896347039
Selected Features after RF: ['second', 'std_gyro_z', 'std_accel_mag', 'std_accel_y', 'std_accel_z', 'max_accel_x', 'avg_speed', 'std_accel_x',

### Compare with Lasso (Logistic Regression + L1 Regularization)

In [5]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    elasticNetParam=1.0,  # L1 regularization (Lasso)
    regParam=0.01          # Tune this to control regularization strength
)

df_vector = assembler.transform(df)

lr_model = lr.fit(df_vector)
coeffs = lr_model.coefficients.toArray()
print("All coefficients:", coeffs)

lasso_selected_features = []

for i, coef in enumerate(coeffs):
    if __builtins__.abs(coef) > 1e-4:  # Adjust threshold as needed
        lasso_selected_features.append((feature_names[i], coef))
        
print(f"Selected Features after Lasso: {lasso_selected_features}")

All coefficients: [-0.0412745   0.07612243 -0.04248021  0.01767476  0.          0.00780605
  0.         -0.02621662  0.          0.08377648  0.          0.05047708
  0.          0.          0.13204311  0.01967786  0.          0.07719079
  0.          0.          0.          0.43727746  0.          0.
  0.        ]
Selected Features after Lasso: [('avg_speed', np.float64(-0.04127450472515698)), ('std_speed', np.float64(0.07612242920432162)), ('avg_accel_mag', np.float64(-0.04248021185555937)), ('max_accel_mag', np.float64(0.017674759630666538)), ('avg_gyro_mag', np.float64(0.007806048012756059)), ('avg_accel_x', np.float64(-0.026216619761205626)), ('max_accel_x', np.float64(0.08377648152781744)), ('std_accel_y', np.float64(0.050477082049784766)), ('std_accel_z', np.float64(0.13204310924908175)), ('max_accel_z', np.float64(0.019677855205955653)), ('std_gyro_x', np.float64(0.07719078840485713)), ('std_gyro_z', np.float64(0.4372774622602128))]


### Use Pearson Correlation as sanity check

In [6]:
correlations = {f: __builtins__.abs(df.stat.corr(f, "label")) for f in feature_names}
sorted_corr = sorted(correlations.items(), key=lambda x: x[1], reverse=True)
print("Feature Correlations with Label:")
for feature, corr in sorted_corr:
    if corr > threshold:
        print(f"{feature}: {corr}")

Feature Correlations with Label:
std_accel_z: 0.16911324954611875
std_accel_y: 0.1598660172875447
max_accel_x: 0.15547573054941072
std_accel_x: 0.1549506981637733
max_accel_mag: 0.14231543649186762
std_accel_mag: 0.13819959104593557
std_gyro_x: 0.13628694678962552
max_accel_z: 0.13581501960136239
std_gyro_z: 0.12625625561851947
std_gyro_mag: 0.1013182631444176
avg_gyro_mag: 0.08200975272605494
std_gyro_y: 0.07465966459074358
avg_speed: 0.07185067759246141
max_accel_y: 0.039036442852618605
std_speed: 0.035391407876509864
avg_accel_z: 0.033629392085236444
avg_gyro_x: 0.015643340157734675
avg_accel_mag: 0.01424314720079094


## Results

| Feature       | RF  | Lasso | Pearson | ✅ Final? | Reason                                      |
|--------------|-----|-------|---------|----------|---------------------------------------------|
| std_gyro_z     | ✔️  | ✔️    | ✔️      | ✔️        | Top feature in all methods (high importance) |
| std_accel_y    | ✔️  | ✔️    | ✔️      | ✔️        | Strong in RF, Lasso, and Pearson correlation |
| std_accel_z    | ✔️  | ✔️    | ✔️      | ✔️        | Important in all three methods              |
| max_accel_x    | ✔️  | ✔️    | ✔️      | ✔️        | High importance in RF and strong correlation |
| avg_speed      | ✔️  | ✔️    | ✔️      | ✔️        | Selected by all methods                     |
| std_accel_x    | ✔️  | ❌    | ✔️      | ✔️        | Important in RF and Pearson (Lasso missed)  |
| max_accel_mag  | ✔️  | ✔️    | ✔️      | ✔️        | Consensus across all methods                |
| std_speed      | ✔️  | ✔️    | ❌      | ✔️        | Key in RF and Lasso (Pearson weaker)        |
| std_gyro_mag   | ✔️  | ❌    | ✔️      | ✔️        | RF + Pearson agreement                      |
| std_gyro_x     | ✔️  | ✔️    | ✔️      | ✔️        | Important in all three                      |
| max_accel_z    | ✔️  | ✔️    | ✔️      | ✔️        | Selected by all methods                     |
| avg_gyro_mag   | ✔️  | ✔️    | ✔️      | ✔️        | Consensus across methods                    |
| std_accel_mag  | ✔️  | ❌    | ✔️      | ✔️        | RF + Pearson (Lasso dropped)                |
| second       | ✔️  | ❌    | ❌      | ⚠️ Maybe | Dominant in RF but might represent time     |

In [None]:
final_features = [
    "std_gyro_z",
    "std_accel_y",
    "std_accel_z",
    "max_accel_x",
    "avg_speed",
    "std_accel_x",
    "max_accel_mag",
    "std_speed",
    "std_gyro_mag",
    "std_gyro_x",
    "max_accel_z",
    "avg_gyro_mag",
    "std_accel_mag",
    "second",
]
