In [1]:
import os
import sys
import subprocess
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
os.environ['PYSPARK_PYTHON'] = sys.executable

os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
# Create Spark session
spark = SparkSession.builder.appName("HMP Spark Loader").getOrCreate()
spark.catalog.clearCache()

In [3]:
def load_hmp_from_repo(base_path="HMP_Dataset"):
    rows = []
    for activity in os.listdir(base_path):
        activity_path = os.path.join(base_path, activity)
        if not os.path.isdir(activity_path):
            continue
        for fname in os.listdir(activity_path):
            fpath = os.path.join(activity_path, fname)
            with open(fpath, 'r') as f:
                for line in f:
                    parts = line.strip().split()
                    if len(parts) == 3:
                        x, y, z = map(float, parts)
                        rows.append(Row(activity=activity, x=x, y=y, z=z))
    return rows

rows = load_hmp_from_repo("HMP_Dataset")

schema = StructType([
    StructField("activity", StringType(), True),
    StructField("x", DoubleType(), True),
    StructField("y", DoubleType(), True),
    StructField("z", DoubleType(), True),
])

df = spark.createDataFrame(rows, schema)
df.printSchema()


root
 |-- activity: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)



In [4]:
df.show(5)


+-----------+----+----+----+
|   activity|   x|   y|   z|
+-----------+----+----+----+
|Brush_teeth|22.0|49.0|35.0|
|Brush_teeth|22.0|49.0|35.0|
|Brush_teeth|22.0|52.0|35.0|
|Brush_teeth|22.0|52.0|35.0|
|Brush_teeth|21.0|52.0|34.0|
+-----------+----+----+----+
only showing top 5 rows



In [5]:
import os
import pandas as pd

# Load a few example activity files from the HMP_Dataset for EDA
base_path = "HMP_Dataset"

# We'll sample 3 activities with 1 file each for initial EDA
sample_activities = ["Brush_teeth", "Climb_stairs", "Walk"]
sample_data = []

for activity in sample_activities:
    activity_path = os.path.join(base_path, activity)
    if os.path.isdir(activity_path):
        file_list = os.listdir(activity_path)
        if file_list:
            file_path = os.path.join(activity_path, file_list[0])
            df = pd.read_csv(file_path, sep=' ', header=None, names=['x', 'y', 'z'])
            df['activity'] = activity
            df['time'] = range(len(df))  # Time index
            sample_data.append(df)

# Combine into one DataFrame for visualization
eda_df = pd.concat(sample_data, ignore_index=True)


In [11]:
import plotly.express as px

# Downsample for speed if needed
sampled = eda_df.groupby("activity").apply(lambda df: df.sample(n=min(len(df), 200))).reset_index(drop=True)

fig = px.line(sampled, x="time", y="x", color="activity", title="X-axis Acceleration by Activity (Sampled)")
fig.show()


  sampled = eda_df.groupby("activity").apply(lambda df: df.sample(n=min(len(df), 200))).reset_index(drop=True)


In [12]:
import panel as pn
import holoviews as hv
import hvplot.pandas
pn.extension()

def plot_axis(axis='x'):
    return eda_df.hvplot.line(x='time', y=axis, groupby='activity', title=f"{axis.upper()} Time Series")

pn.interact(plot_axis, axis=['x', 'y', 'z'])


BokehModel(combine_events=True, render_bundle={'docs_json': {'f592807a-9d6b-451c-a389-35fc4ac5b639': {'version…

In [13]:
agg_df = eda_df.groupby('activity')[['x', 'y', 'z']].agg(['mean', 'std']).reset_index()
agg_df.columns = ['activity', 'x_mean', 'x_std', 'y_mean', 'y_std', 'z_mean', 'z_std']

import plotly.graph_objects as go

fig = go.Figure()
for axis in ['x', 'y', 'z']:
    fig.add_trace(go.Bar(
        name=f'{axis.upper()}',
        x=agg_df['activity'],
        y=agg_df[f'{axis}_mean'],
        error_y=dict(type='data', array=agg_df[f'{axis}_std'])
    ))

fig.update_layout(barmode='group', title="Mean ± Std of Axes by Activity", xaxis_title="Activity", yaxis_title="Mean Value")
fig.show()


In [14]:
fig = px.scatter_3d(sampled, x='x', y='y', z='z', color='activity',
                    title="3D Acceleration Scatter: x, y, z", opacity=0.6)
fig.show()


In [16]:
from sklearn.preprocessing import StandardScaler
import plotly.express as px

# Basic features
features = eda_df.groupby('activity')[['x', 'y', 'z']].agg(['mean', 'std', 'max', 'min']).reset_index()
features.columns = ['activity'] + [f"{col}_{stat}" for col, stat in features.columns.tolist()[1:]]

# Normalize for plotting
scaler = StandardScaler()
scaled = pd.DataFrame(scaler.fit_transform(features.iloc[:, 1:]), columns=features.columns[1:])
scaled['activity'] = features['activity']

px.parallel_coordinates(scaled, color=scaled.columns[0], dimensions=scaled.columns[1:], title="Parallel Coordinates of Features")


In [None]:
eda_df.groupby("activity")[["x", "y", "z"]].mean().hvplot.bar(rot=0, title="Mean Acceleration per Axis by Activity")

In [None]:
eda_df.hvplot.box(y=['x', 'y', 'z'], by='activity', height=400, title="Boxplot of Acceleration by Activity and Axis")