# Exploring the PAMAP2 Physical Activity Monitoring Dataset

We know from the dataset description that it is divided into two sub-folders: Protocol and Optional.

## Protocol

In [1]:
import pandas as pd
import glob
import os

In [2]:
# Directory holding the Protocol ``.dat`` recordings; each file's stem
# (e.g. "subject101") becomes that file's ``subject`` label below.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path = r'C:\Users\meesh\Desktop\PAMAP2 Physical Activity Monitoring Data Set\PAMAP2_Dataset\Protocol'

# glob makes no promise about result order, so sort to keep the concatenated
# row order identical from run to run and from machine to machine.
all_files = sorted(glob.glob(os.path.join(path, "*.dat")))
if not all_files:
    # Fail early with an actionable message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError("No .dat files found in {!r}".format(path))

# Lazily read each whitespace-delimited, header-less file and tag every row
# with the file it came from.
df_from_each_file = (
    pd.read_csv(f, header=None, delimiter=r"\s+")
      .assign(subject=os.path.splitext(os.path.basename(f))[0])
    for f in all_files
)
protocol_df = pd.concat(df_from_each_file, ignore_index=True)

The first thing we will check is the size of our dataset. `shape` gives the number of rows and columns of the DataFrame.

In [3]:
# Dimensions of the concatenated Protocol frame as (rows, columns).
protocol_df.shape

(2872533, 55)

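We can use `info()` to list the dtype of each column. Note that the per-column non-null counts do not appear in the output below, because pandas omits them by default for very large frames.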

In [4]:
# Print the column-by-column summary of the Protocol frame.
# NOTE(review): the recorded output lists dtypes only; pandas skips the
# per-column non-null counts once a frame exceeds its display.max_info_rows
# option, which appears to be the case for this frame -- confirm if counts are needed.
protocol_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2872533 entries, 0 to 2872532
Data columns (total 55 columns):
0          float64
1          int64
2          float64
3          float64
4          float64
5          float64
6          float64
7          float64
8          float64
9          float64
10         float64
11         float64
12         float64
13         float64
14         float64
15         float64
16         float64
17         float64
18         float64
19         float64
20         float64
21         float64
22         float64
23         float64
24         float64
25         float64
26         float64
27         float64
28         float64
29         float64
30         float64
31         float64
32         float64
33         float64
34         float64
35         float64
36         float64
37         float64
38         float64
39         float64
40         float64
41         float64
42         float64
43         float64
44         float64
45         float64
46         floa

In [5]:
# Build 17 numbered column names ("<prefix>_1" .. "<prefix>_17") for each of
# the three sensor prefixes.
hand_lst = ['hand_{}'.format(n) for n in range(1, 18)]
chest_lst = ['chest_{}'.format(n) for n in range(1, 18)]
ankle_lst = ['ankle_{}'.format(n) for n in range(1, 18)]

# Full header: 3 leading columns + 3 * 17 sensor columns + the 'subject' tag = 55 names.
leading_cols = ['timestamp', 'activityID', 'heartrate']
df_columns = leading_cols + hand_lst + chest_lst + ankle_lst + ['subject']

In [6]:
# The files were read with header=None, so the columns carry default integer
# labels; replace them with the descriptive names in df_columns.
protocol_df.columns = df_columns

In [7]:
# Preview the first rows to check the newly assigned column names.
protocol_df.head()

Unnamed: 0,timestamp,activityID,heartrate,hand_1,hand_2,hand_3,hand_4,hand_5,hand_6,hand_7,...,ankle_9,ankle_10,ankle_11,ankle_12,ankle_13,ankle_14,ankle_15,ankle_16,ankle_17,subject
0,8.38,0,104.0,30.0,2.37223,8.60074,3.51048,2.43954,8.76165,3.35465,...,0.00925,-0.01758,-61.1888,-38.9599,-58.1438,1.0,0.0,0.0,0.0,subject101
1,8.39,0,,30.0,2.18837,8.5656,3.66179,2.39494,8.55081,3.64207,...,-0.004638,0.000368,-59.8479,-38.8919,-58.5253,1.0,0.0,0.0,0.0,subject101
2,8.4,0,,30.0,2.37357,8.60107,3.54898,2.30514,8.53644,3.7328,...,0.000148,0.022495,-60.7361,-39.4138,-58.3999,1.0,0.0,0.0,0.0,subject101
3,8.41,0,,30.0,2.07473,8.52853,3.66021,2.33528,8.53622,3.73277,...,-0.020301,0.011275,-60.4091,-38.7635,-58.3956,1.0,0.0,0.0,0.0,subject101
4,8.42,0,,30.0,2.22936,8.83122,3.7,2.23055,8.59741,3.76295,...,-0.014303,-0.002823,-61.5199,-39.3879,-58.2694,1.0,0.0,0.0,0.0,subject101


In [9]:
# Number of rows recorded for each activityID, most frequent first.
protocol_df['activityID'].value_counts()

0     929661
4     238761
17    238690
1     192523
3     189931
7     188107
2     185188
16    175353
6     164600
12    117216
13    104944
5      98199
24     49360
Name: activityID, dtype: int64

We know from the dataset description that activityID = 0 marks transient activity and should be discarded. That gives us 929661 rows to discard.

In [10]:
# Keep only the rows whose activityID is non-zero.
protocol_df = protocol_df.loc[protocol_df['activityID'] != 0]

In [11]:
# Number of rows left in the Protocol frame.
protocol_df.shape[0]

1942872

To see how many rows we have per test subject, we can use `groupby()`.

In [23]:
# Group the Protocol rows by the subject label taken from each source file name.
gp_by_subject = protocol_df.groupby(['subject'])
# count() tallies non-null 'subject' values per group; every row was given a
# subject at load time, so this is the row count per subject.
gp_by_subject['subject'].count()

subject
subject101    249957
subject102    263349
subject103    174338
subject104    231421
subject105    272442
subject106    250096
subject107    232776
subject108    262102
subject109      6391
Name: subject, dtype: int64

## Optional

In [12]:
# Directory holding the Optional ``.dat`` recordings; each file's stem
# (e.g. "subject101") becomes that file's ``subject`` label below.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path2 = r'C:\Users\meesh\Desktop\PAMAP2 Physical Activity Monitoring Data Set\PAMAP2_Dataset\Optional'

# glob makes no promise about result order, so sort to keep the concatenated
# row order identical from run to run and from machine to machine.
all_files2 = sorted(glob.glob(os.path.join(path2, "*.dat")))
if not all_files2:
    # Fail early with an actionable message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError("No .dat files found in {!r}".format(path2))

# Lazily read each whitespace-delimited, header-less file and tag every row
# with the file it came from.
df_from_each_file2 = (
    pd.read_csv(f, header=None, delimiter=r"\s+")
      .assign(subject=os.path.splitext(os.path.basename(f))[0])
    for f in all_files2
)
optional_df = pd.concat(df_from_each_file2, ignore_index=True)

In [13]:
# Print the column-by-column summary (non-null count and dtype) of the Optional frame.
optional_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977972 entries, 0 to 977971
Data columns (total 55 columns):
0          977972 non-null float64
1          977972 non-null int64
2          89285 non-null float64
3          976775 non-null float64
4          976775 non-null float64
5          976775 non-null float64
6          976775 non-null float64
7          976775 non-null float64
8          976775 non-null float64
9          976775 non-null float64
10         976775 non-null float64
11         976775 non-null float64
12         976775 non-null float64
13         976775 non-null float64
14         976775 non-null float64
15         976775 non-null float64
16         976775 non-null float64
17         976775 non-null float64
18         976775 non-null float64
19         976775 non-null float64
20         977225 non-null float64
21         977225 non-null float64
22         977225 non-null float64
23         977225 non-null float64
24         977225 non-null float64
25         977225

We can see that some of the data is missing: many columns have fewer than 977972 non-null values.

In [14]:
# Dimensions of the concatenated Optional frame as (rows, columns).
optional_df.shape

(977972, 55)

In [15]:
# Apply the same descriptive column names used for the Protocol frame; both
# frames have 55 columns.
optional_df.columns = df_columns

In [16]:
# Preview the first rows to check the newly assigned column names.
optional_df.head()

Unnamed: 0,timestamp,activityID,heartrate,hand_1,hand_2,hand_3,hand_4,hand_5,hand_6,hand_7,...,ankle_9,ankle_10,ankle_11,ankle_12,ankle_13,ankle_14,ankle_15,ankle_16,ankle_17,subject
0,10.03,0,105.0,33.3125,1.99964,6.94837,5.08845,2.62244,7.73457,4.64028,...,0.009471,0.003633,-43.8463,-2.97306,45.0822,1.0,0.0,0.0,0.0,subject101
1,10.04,0,,33.3125,1.8099,6.45729,5.16424,2.19725,6.93762,5.06528,...,0.070031,0.001804,-43.8385,-2.55481,46.8195,1.0,0.0,0.0,0.0,subject101
2,10.05,0,,33.3125,1.82756,5.93151,5.78208,2.01408,6.28989,5.44441,...,-0.00217,0.012096,-43.61,-3.00084,46.5653,1.0,0.0,0.0,0.0,subject101
3,10.06,0,,33.3125,1.7576,5.78202,5.97397,1.83344,5.83858,5.92881,...,-0.008545,0.007356,-43.5199,-1.99822,45.585,1.0,0.0,0.0,0.0,subject101
4,10.07,0,,33.3125,1.5067,6.20407,6.27669,1.71564,5.82465,6.27627,...,0.036445,-0.004679,-43.6258,-2.43575,45.5812,1.0,0.0,0.0,0.0,subject101


Removing activityID = 0

In [17]:
# Keep only the rows whose activityID is non-zero.
optional_df = optional_df.loc[optional_df['activityID'] != 0]

In [18]:
# Number of rows left in the Optional frame.
optional_df.shape[0]

782081

After removing activityID = 0, we are left with 782081 rows

To get the number of rows per subject:

In [19]:
# Group the Optional rows by the subject label taken from each source file name.
grouped_by_subject = optional_df.groupby(['subject'])

In [22]:
# Non-null 'subject' values per group, i.e. the number of rows for each subject.
grouped_by_subject['subject'].count()

subject
subject101    219368
subject105    139371
subject106    112277
subject108    152192
subject109    158873
Name: subject, dtype: int64