In [0]:
# OS Module
import os

In [0]:
# Dataset - https://github.com/WChill/HMP_Dataset
# List the top-level entries of the uploaded dataset directory. As the last
# expression in the cell, the returned list is shown as the cell output.

os.listdir("/dbfs/FileStore/tables/HMP_Dataset/") # change your filesystem path to browse files

In [0]:
# Create custom schema
from pyspark.sql.types import StructField, StructType, IntegerType
# Explicit schema for the recordings: three nullable integer columns,
# one per axis, in the order x, y, z.
custom_schema = StructType(
  [StructField(axis, IntegerType(), True) for axis in ("x", "y", "z")]
)

In [0]:
# Keep only the folders whose name contains "_".
# The isdir check skips plain files that happen to contain "_"; without it the
# later os.listdir(<dataset>/<category>) call would fail on such an entry.
# sorted() makes the order reproducible, because os.listdir returns entries in
# arbitrary order.
# Change to your file system path
filtered_file_list = sorted(
  item
  for item in os.listdir('/dbfs/FileStore/tables/HMP_Dataset')
  if '_' in item
  and os.path.isdir(os.path.join('/dbfs/FileStore/tables/HMP_Dataset', item))
)
print(filtered_file_list)

In [0]:
# Preparing our dataset

from pyspark.sql.functions import lit


# Build one DataFrame holding every recording, tagged with its activity
# ("class") and the file it came from ("source").
df = None

# Iterate in sorted order so the row order (and therefore what show() prints)
# is reproducible; os.listdir returns entries in arbitrary order.
for category in sorted(filtered_file_list):
  print(category)
  # Change to your file system path
  for file in sorted(os.listdir("/dbfs/FileStore/tables/HMP_Dataset/" + category)):
    # Directory listing goes through the local "/dbfs/..." path, while the
    # Spark reader is given the matching "dbfs:/..." URI for the same file.
    df_temp = (
      spark.read
      .format("csv")
      .option("header", False)
      .option("delimiter", " ")
      .schema(custom_schema)
      .load("dbfs:/FileStore/tables/HMP_Dataset/" + category + "/" + file)  # Change to your file system path
    )

    # Attach the label and provenance as constant columns for this file.
    df_temp = df_temp.withColumn("class", lit(category)).withColumn("source", lit(file))

    # The first file seeds the result; every later file is appended by union.
    df = df_temp if df is None else df.union(df_temp)

# Fail with a clear message instead of "'NoneType' object has no attribute 'show'"
# when the category list or every category folder is empty.
if df is None:
  raise ValueError(
    "No recording files were found under /dbfs/FileStore/tables/HMP_Dataset/; "
    "check the dataset path and the folder filter."
  )

df.show()

In [0]:
# Render the combined DataFrame with display(); the helper is not imported in
# this notebook, so it is presumably the Databricks notebook built-in.
display(df)

x,y,z,class,source
22,49,35,Brush_teeth,Accelerometer_2011_04_11_13_28_18_brush_teeth_f1-1.txt
22,49,35,Brush_teeth,Accelerometer_2011_04_11_13_28_18_brush_teeth_f1-1.txt
22,52,35,Brush_teeth,Accelerometer_2011_04_11_13_28_18_brush_teeth_f1-1.txt
22,52,35,Brush_teeth,Accelerometer_2011_04_11_13_28_18_brush_teeth_f1-1.txt
21,52,34,Brush_teeth,Accelerometer_2011_04_11_13_28_18_brush_teeth_f1-1.txt
22,51,34,Brush_teeth,Accelerometer_2011_04_11_13_28_18_brush_teeth_f1-1.txt
20,50,35,Brush_teeth,Accelerometer_2011_04_11_13_28_18_brush_teeth_f1-1.txt
22,52,34,Brush_teeth,Accelerometer_2011_04_11_13_28_18_brush_teeth_f1-1.txt
22,50,34,Brush_teeth,Accelerometer_2011_04_11_13_28_18_brush_teeth_f1-1.txt
22,51,35,Brush_teeth,Accelerometer_2011_04_11_13_28_18_brush_teeth_f1-1.txt


In [0]:
# TRANSFORMATION

In [0]:
# String indexing - machine learning algorithms do not work on text directly, so we convert our class column (a categorical value) to an integer index.
# This process is called string indexing.

In [0]:
from pyspark.ml.feature import StringIndexer

# Estimator that maps each distinct string in "class" to a numeric index stored in "classIndex".
indexer = StringIndexer(inputCol="class", outputCol="classIndex") # create indexer object
# fit() learns the label -> index mapping from df; transform() appends the "classIndex" column.
indexed = indexer.fit(df).transform(df)
indexed.show()

In [0]:
# One hot encoder for categorical features
from pyspark.ml.feature import OneHotEncoder

# NOTE: the output column name "categroyVec" is misspelled, but a later cell selects the
# column by this exact name, so rename it in both places at once or not at all.
# With the encoder's default dropLast setting, the last category index is encoded as an
# all-zero vector.
encoder = OneHotEncoder(inputCol="classIndex", outputCol="categroyVec")
# The encoder is fitted on the indexed data first; the fitted model then adds the vector column.
encoded = encoder.fit(indexed).transform(indexed)



In [0]:
# Preview the DataFrame that now carries the one-hot encoded class column.
encoded.show()

In [0]:
# Spark ML only understands vectors, so let's assemble the feature columns into a vector.

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Pack the three axis columns x, y and z into a single vector column named "features".
vectorAssembler = VectorAssembler(inputCols=["x", "y", "z"], outputCol="features") # New column features with vector
# The assembler is applied with transform() directly; no fit step is used here.
feature_vecorized = vectorAssembler.transform(encoded)

In [0]:
# Preview the DataFrame with the assembled "features" vector column.
feature_vecorized.show()

In [0]:
# Normalizer rescales each row's feature vector to unit p-norm (p=2 unless configured
# otherwise), so every sample ends up with the same vector length.
# It does NOT bring each column into a common value range; use a scaler such as
# MinMaxScaler or StandardScaler if per-feature scaling is what you need.

from pyspark.ml.feature import Normalizer

normlizer = Normalizer(inputCol="features", outputCol="feature_norm")
normalize_data = normlizer.transform(feature_vecorized)

In [0]:
# Preview the DataFrame with the added "feature_norm" column.
normalize_data.show()

In [0]:
# Create pipeline

from pyspark.ml import Pipeline
# Chain the four stage objects created in the earlier cells so the whole
# transformation runs as one unit instead of cell by cell.
pipe = Pipeline(stages=[indexer, encoder, vectorAssembler, normlizer]) # stages should be in correct sequence
# fit() runs the stages in order on the raw df and returns the fitted pipeline model.
model =pipe.fit(df)
# Apply the fitted pipeline to the same raw df; the result keeps every intermediate column.
prediction = model.transform(df)

In [0]:
# Preview the output of the full pipeline.
prediction.show()

In [0]:
# Prepare training dataset. Only select what we need.

# Keep only the one-hot class vector and the normalized feature vector.
# "categroyVec" is spelled exactly as in the OneHotEncoder's outputCol; the names must match.
df_train = prediction.select("categroyVec", "feature_norm")
df_train.show()
