# Exploration of mock health tracker user data from a Coursera course
https://www.coursera.org/learn/applied-data-science-for-data-analysts

## Import necessary packages

In [7]:
import pandas as pd
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt
# plt.rcParams['figure.figsize'] = [15, 10]

## Using a feature engineered dataset
### Using average, minimum and maximum and aggregating by device_id.
The table contains data points from each device for a day.

In [8]:
# Load the feature-engineered table and key every row by its device_id.
ht_grouped = pd.read_csv(
    'data/ht_grouped_feature_engineered.csv',
    index_col='device_id',
)
# Preview the first rows as the cell's rich output.
ht_grouped.head()

Unnamed: 0_level_0,resting_heartrate_avg,resting_heartrate_min,resting_heartrate_max,active_heartrate_avg,active_heartrate_min,active_heartrate_max,bmi_avg,bmi_min,bmi_max,vo2_avg,vo2_min,vo2_max,workout_minutes_avg,workout_minutes_min,workout_minutes_max,steps_avg,steps_min,steps_max,lifestyle,bmi_change
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0003a6b8-e48b-11ea-8204-0242ac110002,82.683797,60.677755,100.121903,139.434875,120.307791,162.347827,22.398064,19.530816,24.997081,20.994012,20.283341,21.491176,5.502632,1.04693,9.32575,5171.49589,2539,7837,Sedentary,5.466266
0007a88a-e48b-11ea-8204-0242ac110002,77.732942,52.712876,97.937731,127.057153,109.049387,146.869868,25.150813,22.635269,27.933614,25.527475,24.929151,26.231702,37.216702,12.109056,66.115919,7115.591781,3731,10189,Weight Trainer,5.298346
000b9c56-e48b-11ea-8204-0242ac110002,86.511629,63.903506,104.437895,147.315731,129.551728,177.78315,19.148256,16.844741,21.339665,19.448407,18.817131,19.946847,45.000087,10.524606,84.725056,7257.693151,4223,11156,Weight Trainer,4.494924
000f916c-e48b-11ea-8204-0242ac110002,77.550541,58.418806,98.875329,129.577004,110.845517,146.73865,24.240376,21.33582,26.90915,21.401302,20.70586,22.088526,37.886069,11.40253,68.119525,7129.690411,4413,10563,Weight Trainer,5.573329
00176e5a-e48b-11ea-8204-0242ac110002,69.312448,47.355699,92.762496,167.18585,147.876166,186.283615,27.132669,23.589288,31.026467,30.939205,30.032978,32.026828,5.119427,0.856124,9.947677,5128.024658,2938,7310,Sedentary,7.437179


In [9]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale


# Fit a full PCA on the standardised numeric columns; the categorical
# `lifestyle` label is dropped because it cannot be scaled.
numeric_features = ht_grouped.drop(columns='lifestyle')
pca = PCA(random_state=42)
pca.fit(scale(numeric_features))

PCA(random_state=42)

## Principal Components and Variance Ratio

In [10]:
# Report the number of fitted components and the share of variance each explains.
component_count = pca.n_components_
variance_ratios = pca.explained_variance_ratio_
print(f"Number of Principle components: {component_count}")
print(f"Variance ratio {variance_ratios}")

Number of Principle components: 19
Variance ratio [5.92798649e-01 1.98251069e-01 7.27663899e-02 4.22246987e-02
 3.65819253e-02 2.39445983e-02 1.02326597e-02 8.14086591e-03
 7.36610731e-03 2.31194758e-03 2.11947024e-03 1.26894698e-03
 8.59132951e-04 6.18166464e-04 3.88548307e-04 1.04674256e-04
 1.69197555e-05 5.22961240e-06 9.09738060e-33]


In [11]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Scree plot (row 1) and cumulative explained variance (row 2) of the fitted PCA.
#
# Use the fitted attribute `n_components_` throughout: the constructor
# parameter `n_components` stays None when the estimator is built without it
# (as in `PCA(random_state=42)`), so it cannot be used as an axis bound.
n_fitted = pca.n_components_
bar_range = n_fitted + 1  # exclusive upper bound for the 1-based component ids
component_ids = list(range(1, bar_range))
variance_ratio = pca.explained_variance_ratio_

fig = make_subplots(rows=2, cols=1)

fig.add_trace(
    go.Bar(x=component_ids, y=variance_ratio, name="% of variance explained"),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(x=component_ids, y=np.cumsum(variance_ratio),
               name='% of cumulative variance explained'),
    row=2, col=1,
)

# Bars are centred on their integer x value, so pad the range by half a unit
# on each side; a range of [1, n] would cut the first and last bars in half.
fig.update_xaxes(title_text="Principle Components",
                 range=[0.5, n_fitted + 0.5], row=1, col=1)
fig.update_xaxes(title_text="Principle Components", row=2, col=1)
fig.update_yaxes(title_text="Explained Variance", range=[0, 1], row=1, col=1)
fig.update_yaxes(title_text="Cumulative Explained Variance", row=2, col=1)
fig.update_layout(height=600,
                  title_text="Principle Component Analysis",
                  yaxis1_tickformat='%',
                  yaxis2_tickformat='%',
                  xaxis1=dict(tickmode='linear'),
                  xaxis2=dict(tickmode='linear'))
fig.show()

From the plot above, the elbow (breaking point) is at PC4: around 90% of the variance in the data is captured by the first 4 principal components.

In [12]:
import plotly.express as px

# Project the six per-device average metrics onto their first three principal
# components and colour every device by its lifestyle label.
features = ['resting_heartrate_avg','active_heartrate_avg','bmi_avg','vo2_avg','workout_minutes_avg','steps_avg']
X = ht_grouped[features]

# Seeded like the earlier PCA(random_state=42) so the projection is the same
# on every run even if scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=3, random_state=42)
components = pca.fit_transform(scale(X))

total_var = pca.explained_variance_ratio_.sum() * 100

# Axis titles carrying each component's share of the variance. Keys are the
# stringified column positions of `components`.
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=ht_grouped['lifestyle'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels=labels,  # previously computed but never passed to the figure
)
fig.show()


### Pandas group-by example
Group the dataframe by a column and compute multiple aggregations (mean, min, max) of the other columns.
```
# Per-device mean/min/max of every numeric metric, plus one lifestyle label.
# String aggregator names are used instead of NumPy callables so the result
# column labels are always 'mean'/'min'/'max', independent of the callables'
# __name__ (which is where labels such as 'amin'/'amax' came from).
metric_columns = ['resting_heartrate', 'active_heartrate', 'bmi',
                  'vo2', 'workout_minutes', 'steps']
aggregations = {column: ['mean', 'min', 'max'] for column in metric_columns}
aggregations['lifestyle'] = ['max']

df_grouped = (
    df.groupby(['device_id'])
      .agg(aggregations)
      .rename(columns={'mean': 'avg'})
)
# Flatten the (metric, statistic) column MultiIndex into 'metric_statistic'.
df_grouped.columns = df_grouped.columns.map('_'.join)
df_grouped = df_grouped.rename(columns={"lifestyle_max": "lifestyle"})
```