consider opening in colab for better output display.

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mohammad-dabash22/HAR_Analysis/blob/main/HAR.ipynb)

# Create a Spark session

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

spark

# Load and Explore the Dataset


In [4]:
df = spark.read.format("csv").load("HAR_3000.csv", header=True, inferSchema=True)

df.printSchema()

root
 |-- tBodyAcc-mean()-X: double (nullable = true)
 |-- tBodyAcc-mean()-Y: double (nullable = true)
 |-- tBodyAcc-mean()-Z: double (nullable = true)
 |-- tBodyAcc-std()-X: double (nullable = true)
 |-- tBodyAcc-std()-Y: double (nullable = true)
 |-- tBodyAcc-std()-Z: double (nullable = true)
 |-- tBodyAcc-mad()-X: double (nullable = true)
 |-- tBodyAcc-mad()-Y: double (nullable = true)
 |-- tBodyAcc-mad()-Z: double (nullable = true)
 |-- tBodyAcc-max()-X: double (nullable = true)
 |-- tBodyAcc-max()-Y: double (nullable = true)
 |-- tBodyAcc-max()-Z: double (nullable = true)
 |-- tBodyAcc-min()-X: double (nullable = true)
 |-- tBodyAcc-min()-Y: double (nullable = true)
 |-- tBodyAcc-min()-Z: double (nullable = true)
 |-- tBodyAcc-sma(): double (nullable = true)
 |-- tBodyAcc-energy()-X: double (nullable = true)
 |-- tBodyAcc-energy()-Y: double (nullable = true)
 |-- tBodyAcc-energy()-Z: double (nullable = true)
 |-- tBodyAcc-iqr()-X: double (nullable = true)
 |-- tBodyAcc-iqr()-Y: do

# Number of Unique Classes Present in the "Activity" Column

In [5]:
df.groupBy(df.Activity).count().show()

+------------------+-----+
|          Activity|count|
+------------------+-----+
|            LAYING|  537|
|WALKING_DOWNSTAIRS|  420|
|           WALKING|  496|
|          STANDING|  532|
|  WALKING_UPSTAIRS|  471|
|           SITTING|  491|
+------------------+-----+



# Dimensions of the dataset

In [8]:
num_rows = df.count()
num_columns = len(df.columns)


print(f"The dataset has {num_rows} rows and {num_columns} columns.")


The dataset has 2947 rows and 562 columns.


# Prepare the Dataset for Logistic Regression Classification

In [9]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Step 1: Assemble feature columns into a single vector column
assembler = VectorAssembler(inputCols=df.columns[:-1], outputCol="features")
data = assembler.transform(df)

# Preview the transformed data
data.show(5, truncate=False)

+-----------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+--------------+-------------------+-------------------+-------------------+----------------+----------------+----------------+--------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+--------------------------+--------------------------+--------------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------

In [10]:
# Step 2: Select relevant columns for machine learning
ML_data = data.select(data.features, data.Activity)

# Step 3: Index the 'Activity' column to convert it into numeric labels
indexer = StringIndexer(inputCol="Activity", outputCol="Activity_index")
indexed_data = indexer.fit(ML_data).transform(ML_data)

ML_data = indexed_data.drop("Activity")

ML_data.show(5)

+--------------------+--------------+
|            features|Activity_index|
+--------------------+--------------+
|[0.25717778,-0.02...|           1.0|
|[0.28602671,-0.01...|           1.0|
|[0.27548482,-0.02...|           1.0|
|[0.27029822,-0.03...|           1.0|
|[0.27483295,-0.02...|           1.0|
+--------------------+--------------+
only showing top 5 rows



# Split the Dataset into Training and Testing Sets

In [11]:
# Split the dataset into training (80%) and testing (20%) sets
train, test = ML_data.randomSplit([0.8, 0.2], seed=3)

print(f"Training set size: {train.count()} rows")
print(f"Testing set size: {test.count()} rows")


Training set size: 2335 rows
Testing set size: 612 rows


# Logistic Regression with 10-Fold Cross-Validation on Training Dataset

In [13]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Step 1: Initialize the Logistic Regression model
lr = LogisticRegression(labelCol='Activity_index', featuresCol='features')

# Step 2: Create a parameter grid for hyperparameter tuning
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.01])  # Regularization parameter
             .addGrid(lr.maxIter, [10, 20])      # Maximum number of iterations
             .build())

# Step 3: Define the evaluator for cross-validation
evaluator = MulticlassClassificationEvaluator(
    labelCol='Activity_index', predictionCol='prediction', metricName='accuracy'
)

# Step 4: Set up 10-fold cross-validation
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=10
)

# Step 5: Fit the model using the training dataset
cv_model = cv.fit(train)

# Best model parameters
best_model = cv_model.bestModel
print(f"Best Regularization Parameter: {best_model._java_obj.getRegParam()}")
print(f"Best Maximum Iterations: {best_model._java_obj.getMaxIter()}")


Best Regularization Parameter: 0.01
Best Maximum Iterations: 20


# Evaluate the Best Model on the Testing Dataset

In [14]:
results = cv_model.transform(test)
f1_score = evaluator.evaluate(results)

print(f1_score)

0.9771241830065359


# Multiclass Classification Evaluation (Accuracy, Precision, Recall, and F1-Score)


In [15]:
f1_score = evaluator.evaluate(results, {evaluator.metricName: 'f1'})
accuracy = evaluator.evaluate(results, {evaluator.metricName: 'accuracy'})
recall = evaluator.evaluate(results, {evaluator.metricName: 'weightedRecall'})
precision = evaluator.evaluate(results, {evaluator.metricName: 'weightedPrecision'})


print(f"F1-Score: {f1_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

F1-Score: 0.9770
Accuracy: 0.9771
Recall: 0.9771
Precision: 0.9776


# Random Forest

## Hyperparameter Tuning Using Grid Search

In [16]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol='Activity_index', featuresCol='features')

paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [3, 5, 7, 9]) # Maximum depth of trees
             .addGrid(rf.numTrees, [10, 15, 20, 25]) # Number of trees
             .build())


cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=10)

cv_model = cv.fit(test)

## Best Hyperparameter Combination

In [19]:
# Extract the best maxDepth and numTrees
best_max_depth = cv_model.bestModel._java_obj.getMaxDepth()
best_num_trees = cv_model.bestModel._java_obj.getNumTrees()

print(f"Best maxDepth: {best_max_depth}")
print(f"Best numTrees: {best_num_trees}")

Best maxDepth: 9
Best numTrees: 20


## Extract Top 50 Important Features

In [20]:
# Get feature importances
feature_importances = cv_model.bestModel.featureImportances

# Sort and select the top 50 feature importances
sorted_features = sorted(feature_importances, reverse=True)
top_50_val = sorted_features[:50]

print(f"Top 50 Feature Importances: {top_50_val}")

Top 50 Feature Importances: [0.0340392536909586, 0.031304461235740516, 0.027520710057555957, 0.026881108068754954, 0.021337667532532172, 0.021123681529680485, 0.020374539052035248, 0.019095512622218758, 0.01858358859792021, 0.016716484702180606, 0.013584085443148139, 0.012623928005612477, 0.012326680513316327, 0.01168947404557031, 0.011615632963694585, 0.011415288142880306, 0.011391992348358167, 0.011223422780235536, 0.010863982576466872, 0.01071812455664469, 0.010533568387627782, 0.010442795694287044, 0.01034833906562523, 0.010322936962848124, 0.010269326078298236, 0.010177732448955023, 0.01017508116088863, 0.010154979045966931, 0.010138328461672213, 0.010137944912799693, 0.010124948081434632, 0.010047218150779373, 0.009958486827782644, 0.009764034580028627, 0.009509126784228215, 0.009238461965237367, 0.00899278901004865, 0.008816475411887842, 0.008808571836094287, 0.008107429263952078, 0.00740753329074732, 0.007355350710185965, 0.007165483467961535, 0.007081867071589028, 0.0069139339

## Subset the Dataset with Top 50 Features and the Activity Column

In [21]:
top_50_features = []


for i in range(len(feature_importances)):
  if feature_importances[i] in top_50_val:
    top_50_features.append(df.columns[i])

top_50_features.append('Activity')

df = df.select(top_50_features)
df.show()

+----------------+----------------+--------------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+----------------------+----------------------+-----------------------+-------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------+------------------+-----------------------+---------------------+---------------------+---------------------+-----------------+----------------+----------------+---------------------+------------------------------+-------------------------------+--------------------+--------------------+------------------------+-----------------------------------+-----------------------------------+----------------------------------+-----------------------------------+-----------------------------------+----------------------------------+----------------------+-----------------+-----------------+----------

# Logistic Regression on the New Subset Dataset (50 Features + Activity Label)

In [22]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Step 1: Assemble the top 50 features into a single vector column
assembler = VectorAssembler(inputCols=df.columns[:-1], outputCol='features')
data = assembler.transform(df)

# Step 2: Prepare the dataset for machine learning
ML_data = data.select(data.features, data.Activity)

# Step 3: Index the 'Activity' column to numeric labels
indexer = StringIndexer(inputCol="Activity", outputCol="Activity_index")
indexed_data = indexer.fit(ML_data).transform(ML_data)


ML_data = indexed_data.drop('Activity')
train, test = ML_data.randomSplit([0.8, 0.2], seed=3)


lr = LogisticRegression(labelCol='Activity_index', featuresCol='features')

paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.01]) # Regularization parameter
             .addGrid(lr.maxIter, [10, 20])     # Maximum iterations
             .build())

evaluator = MulticlassClassificationEvaluator(labelCol='Activity_index', predictionCol='prediction')
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=10)
cv_model = cv.fit(train)

results = cv_model.transform(test)
f1_score = evaluator.evaluate(results)

print(f"The F1-score of the Logistic Regression model on the testing dataset is: {f1_score:.4f}")

The F1-score of the Logistic Regression model on the testing dataset is: 0.9182


# Compare Accuracy Between Logistic Regression Models (Full Dataset vs. Subset)

In [23]:
# Evaluate accuracy of the logistic regression model on the subset dataset
accuracy_subset = evaluator.evaluate(results, {evaluator.metricName: 'accuracy'})

print(f"Accuracy of the logistic regression model with the subset dataset: {accuracy_subset:.4f}")

Accuracy of the logistic regression model with the subset dataset: 0.9183


* The logistic regression model using the **whole dataset** (Task 7) produced better accuracy compared to the model using the **subset dataset** (Task 9).
* This could indicate that the features excluded in the subset might have carried significant predictive power or that reducing the feature set led to a loss of important information.