In [12]:
import camber

In [13]:
# Open a Spark session through Camber's connector, requesting "XSMALL" workers.
# The returned `spark` object is the entry point used below to read the dataset.
spark = camber.spark.connect(worker_size="XSMALL")

Output()

# Necessary Imports

In [14]:
import pyspark

import pandas as pd 
import numpy as np

import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt

# Load the dataset

In [15]:
# Read every JSON record under the shared Endomondo prefix into a Spark DataFrame.
# The DROPMALFORMED parse mode makes the reader skip records it cannot parse
# rather than failing the whole load.
df = (
    spark.read
    .option("mode", "DROPMALFORMED")
    .json('s3a://camber-spark-bkt/ciber-catts/shared-data/endomondoHR/')
)

In [16]:
# Print a label, then the column names and types Spark inferred from the JSON
# (no explicit schema was passed to the reader).
print('Schema overview')
df.printSchema()

Schema overview
root
 |-- altitude: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- gender: string (nullable = true)
 |-- heart_rate: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- id: long (nullable = true)
 |-- latitude: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- longitude: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- speed: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- sport: string (nullable = true)
 |-- timestamp: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- url: string (nullable = true)
 |-- userId: long (nullable = true)



In [19]:
# Preview the data; show() prints a table of the first rows with long cells truncated.
df.show()

+--------------------+------+--------------------+---------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+--------+
|            altitude|gender|          heart_rate|       id|            latitude|           longitude|               speed|        sport|           timestamp|                 url|  userId|
+--------------------+------+--------------------+---------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+--------+
|[34.8, 35.2, 33.8...|  male|[121, 128, 133, 1...|294229534|[45.6687360629439...|[16.5148551855236...|                null|         bike|[1391727581, 1391...|https://www.endom...| 4572860|
|[34.8, 34.8, 34.8...|  male|[87, 90, 92, 93, ...|289213252|[45.6685770582407...|[16.5148167964071...|                null|          run|[1390354774, 1390...|https://www.endom...| 4572860|
|[34.8, 34.6, 37.4...|  male|[109, 119, 137, 1...|28796

# Example to handle nested elements

In [6]:
from pyspark.sql.functions import col, explode, expr
from pyspark.sql.types import FloatType
import statistics

# The DataFrame is `df`; its "heart_rate" column is an array of long values (see the schema above).

# Flatten the array so that every heart-rate sample becomes its own (id, heart_value) row,
# then cast the samples to FloatType for further analysis.
run_df_heart = (
    df.select("id", explode(col("heart_rate")).alias("heart_value"))
      .withColumn("heart_value", col("heart_value").cast(FloatType()))
)


In [7]:
# Preview the exploded result: one row per heart-rate sample, keyed by the original `id`.
run_df_heart.show()

+---------+-----------+
|       id|heart_value|
+---------+-----------+
|294229534|      121.0|
|294229534|      128.0|
|294229534|      133.0|
|294229534|      137.0|
|294229534|      137.0|
|294229534|      133.0|
|294229534|      128.0|
|294229534|      130.0|
|294229534|      128.0|
|294229534|      124.0|
|294229534|      123.0|
|294229534|      120.0|
|294229534|      123.0|
|294229534|      126.0|
|294229534|      138.0|
|294229534|      149.0|
|294229534|      159.0|
|294229534|      169.0|
|294229534|      173.0|
|294229534|      173.0|
+---------+-----------+
only showing top 20 rows



👀 **Tutorial**:

`df.select(col("id"), explode(col("heart_rate")).alias("heart_value"))`: We select two columns from the DataFrame `df`: "id" and "heart_rate". The `explode()` function expands the "heart_rate" array column, creating a new row for each element in the array. The `alias()` function names the exploded column "heart_value" in the resulting DataFrame, `run_df_heart`.


For example, let's say the initial DataFrame `df` has the following structure:

```
+---+----------------+
|id |heart_rate      |
+---+----------------+
|1  |[70, 75, 80]    |
|2  |[85, 90]        |
+---+----------------+

```

After applying `df.select(col("id"), explode(col("heart_rate")).alias("heart_value"))`, the resulting DataFrame `run_df_heart` will look like this:

```
+---+------------+
|id |heart_value |
+---+------------+
|1  |70          |
|1  |75          |
|1  |80          |
|2  |85          |
|2  |90          |
+---+------------+

```


## Utilizing RDD (example)

In [8]:
# Step 1: Get the RDD behind the exploded DataFrame; its rows are indexed by
# column name ("id", "heart_value") in the next step.
rdd = run_df_heart.rdd


In [20]:
# Step 2: Map each row to an (id, heart_value) key-value pair.
# NOTE: the key is `id`, not `userId` -- in the preview above the same `userId`
# appears with several different `id`s, so the means below are per `id`.
# Null samples are dropped: the schema declares the heart_rate array with
# containsNull = true, and a None value cannot be summed.
key_value_rdd = rdd.map(lambda row: (row["id"], row["heart_value"])) \
                   .filter(lambda pair: pair[1] is not None)

# Step 3: Calculate the mean heart rate for each id.
# aggregateByKey keeps only a running (sum, count) per key and merges partial
# results per partition, instead of materializing every sample for a key the
# way groupByKey does.
mean_heart_rate_rdd = key_value_rdd.aggregateByKey(
    (0.0, 0),
    lambda acc, value: (acc[0] + value, acc[1] + 1),
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
).mapValues(lambda sum_count: sum_count[0] / sum_count[1])

# Printing the result (collect() brings every pair to the driver -- uncomment with care)
# for record_id, mean_heart_rate in mean_heart_rate_rdd.collect():
#     print(f"ID: {record_id}, Mean Heart Rate: {mean_heart_rate}")



# Exercise: Endomondo Sensor Data


The given datasets consist of exercise records from Endomondo, a fitness tracking platform. The data includes sequential sensor information like heart rate, speed, GPS coordinates, as well as details about the sport type, user gender, and weather conditions such as temperature and humidity.

Your objective is to analyze the data using Spark RDDs and/or Spark DataFrames to derive valuable insights. Examples of potential research questions include:

1. Identifying instances where a specific individual had higher or lower heart rates during particular workouts, if any.
2. Comparing the heart rate of the specific individual to other users during similar workouts.
   - Investigating whether altitude played a role in the observed heart rate differences.
   - Exploring the impact of speed on heart rate.
   - Examining the potential relationship between heart rate and temperature.

The exercise is open-ended, but your insights should be supported by concrete reasoning and patterns/trends observed in the data. 
