# **Initial setup**

In [None]:
from google.colab import drive
import pandas as pd
import gc as gc

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive; the data paths
# used later in this notebook live under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Location of the training data on the mounted Google Drive.
file_path_training = "/content/gdrive/MyDrive/Project_NAPS/NAPS/data/Zzzs_train_multi.parquet"
# Read the Parquet file into a pandas DataFrame (pandas was imported as `pd`).
# NOTE(review): this assignment rebinds `pd` from the pandas module to the loaded
# DataFrame. After this cell has run, module-level calls such as `pd.read_parquet`
# no longer resolve, so re-running this cell without restarting the kernel fails.
# Later cells rely on `pd` being the DataFrame, so renaming it has to be done in
# all of them together.
pd = pd.read_parquet(file_path_training)

# Preview the first five rows.
print(pd.head())

      series_id  step                 timestamp     anglez    enmo  awake
0  08db4255286f     0  2018-11-05T10:00:00-0400 -30.845301  0.0447      1
1  08db4255286f     1  2018-11-05T10:00:05-0400 -34.181801  0.0443      1
2  08db4255286f     2  2018-11-05T10:00:10-0400 -33.877102  0.0483      1
3  08db4255286f     3  2018-11-05T10:00:15-0400 -34.282101  0.0680      1
4  08db4255286f     4  2018-11-05T10:00:20-0400 -34.385799  0.0768      1


# **Extracting Features**

In [None]:
def make_features(pd, periods=20):
    """Add hour-of-day, lagged-difference and rolling-window features.

    Parameters
    ----------
    pd : pandas.DataFrame
        Input rows with at least the columns ``series_id``, ``timestamp``,
        ``anglez`` and ``enmo``. The parameter keeps its historical name for
        backward compatibility even though it shadows the usual pandas alias;
        pandas itself is imported locally under another name.
    periods : int, default 20
        Row lag used for the differences and window length used for the
        rolling statistics.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; the input is not modified.
        ``timestamp`` is converted to timezone-naive UTC, ``anglez`` is
        replaced by its absolute value, and these columns are appended in
        order: ``hour``, ``<signal>_diff``, ``<signal>_rolling_{mean,max,std}``
        and ``<signal>_diff_rolling_{mean,max}`` for ``anglez`` and ``enmo``.
        All difference and rolling columns are stored as float16.

    Notes
    -----
    Every difference, rolling window and NaN fill is computed within a single
    ``series_id``, so values never leak across the boundary between two
    recordings. A series too short to produce any valid value for a feature
    (for example fewer than ``periods`` rows for a rolling window) keeps NaN
    in that feature.
    """
    # The argument is named `pd`, so inside this function `pd.to_datetime` would
    # be looked up on the DataFrame and fail. Import pandas under a distinct name.
    import pandas as pandas_lib

    # reset_index returns a new object, so the column assignments below do not
    # touch the caller's frame.
    frame = pd.reset_index(drop=True)

    # Parse the ISO timestamps (which carry a UTC offset), normalise to UTC and
    # drop the timezone so downstream code sees naive datetimes.
    frame["timestamp"] = pandas_lib.to_datetime(frame["timestamp"], utc=True).dt.tz_convert(None)
    frame["hour"] = frame["timestamp"].dt.hour

    # Only the magnitude of the arm angle is used as a feature.
    frame["anglez"] = frame["anglez"].abs()

    series_ids = frame["series_id"]

    def _per_series(values, compute):
        """Apply `compute` to each series_id group of `values`; result is float16."""
        return values.groupby(series_ids).transform(compute).astype("float16")

    def _lagged_diff(values):
        """Difference over `periods` rows; leading NaNs take the first valid value."""
        return values.diff(periods=periods).bfill()

    def _rolling(stat):
        """Build a centered rolling `stat` whose edge NaNs are back- then forward-filled."""
        def compute(values):
            window = values.rolling(periods, center=True)
            return getattr(window, stat)().bfill().ffill()
        return compute

    signals = ("anglez", "enmo")

    # Column creation order is kept identical to the historical layout so that
    # files written from this frame keep the same column order.
    for signal in signals:
        frame[f"{signal}_diff"] = _per_series(frame[signal], _lagged_diff)

    for stat in ("mean", "max", "std"):
        for signal in signals:
            frame[f"{signal}_rolling_{stat}"] = _per_series(frame[signal], _rolling(stat))

    for stat in ("mean", "max"):
        for signal in signals:
            frame[f"{signal}_diff_rolling_{stat}"] = _per_series(
                frame[f"{signal}_diff"], _rolling(stat)
            )

    return frame

# Feature column names: "hour", then for each raw signal (anglez, enmo) the signal
# itself followed by the rolling and difference columns that make_features adds.
features = (
    ["hour"]
    + [
        "anglez",
        "anglez_rolling_mean", "anglez_rolling_max", "anglez_rolling_std",
        "anglez_diff",
        "anglez_diff_rolling_mean", "anglez_diff_rolling_max",
    ]
    + [
        "enmo",
        "enmo_rolling_mean", "enmo_rolling_max", "enmo_rolling_std",
        "enmo_diff",
        "enmo_diff_rolling_mean", "enmo_diff_rolling_max",
    ]
)


In [None]:
# Build the engineered features for every row of the loaded frame. At this point
# `pd` is the DataFrame (not the pandas module), and the result replaces it.
pd = make_features(pd)

# Persist the featurized frame to Drive as CSV, written 1000 rows at a time.
pd.to_csv(
    "/content/gdrive/MyDrive/Project_NAPS/NAPS/data/processed-data.csv",
    chunksize=1000,
)

# Preview the first ten rows.
print(pd.head(10))

      series_id  step           timestamp     anglez    enmo  awake  hour  \
0  08db4255286f     0 2018-11-05 14:00:00  30.845301  0.0447      1    14   
1  08db4255286f     1 2018-11-05 14:00:05  34.181801  0.0443      1    14   
2  08db4255286f     2 2018-11-05 14:00:10  33.877102  0.0483      1    14   
3  08db4255286f     3 2018-11-05 14:00:15  34.282101  0.0680      1    14   
4  08db4255286f     4 2018-11-05 14:00:20  34.385799  0.0768      1    14   
5  08db4255286f     5 2018-11-05 14:00:25  34.925598  0.0511      1    14   
6  08db4255286f     6 2018-11-05 14:00:30  30.513399  0.1073      1    14   
7  08db4255286f     7 2018-11-05 14:00:35  30.509399  0.0649      1    14   
8  08db4255286f     8 2018-11-05 14:00:40  32.880600  0.0485      1    14   
9  08db4255286f     9 2018-11-05 14:00:45  34.674999  0.0462      1    14   

   anglez_diff  enmo_diff  anglez_rolling_mean  enmo_rolling_mean  \
0    -5.980469   0.018402            30.765625            0.05838   
1    -5.980469

In [None]:
# Show the first ten rows of the featurized frame again (same call as at the end
# of the previous cell).
print(pd.head(10))

      series_id  step           timestamp     anglez    enmo  awake  hour  \
0  08db4255286f     0 2018-11-05 14:00:00  30.845301  0.0447      1    14   
1  08db4255286f     1 2018-11-05 14:00:05  34.181801  0.0443      1    14   
2  08db4255286f     2 2018-11-05 14:00:10  33.877102  0.0483      1    14   
3  08db4255286f     3 2018-11-05 14:00:15  34.282101  0.0680      1    14   
4  08db4255286f     4 2018-11-05 14:00:20  34.385799  0.0768      1    14   
5  08db4255286f     5 2018-11-05 14:00:25  34.925598  0.0511      1    14   
6  08db4255286f     6 2018-11-05 14:00:30  30.513399  0.1073      1    14   
7  08db4255286f     7 2018-11-05 14:00:35  30.509399  0.0649      1    14   
8  08db4255286f     8 2018-11-05 14:00:40  32.880600  0.0485      1    14   
9  08db4255286f     9 2018-11-05 14:00:45  34.674999  0.0462      1    14   

   anglez_diff  enmo_diff  anglez_rolling_mean  enmo_rolling_mean  \
0    -5.980469   0.018402            30.765625            0.05838   
1    -5.980469