In [1]:


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown
import gc



In [2]:
train_events = pd.read_csv("data/train_events.csv")

In [3]:


series_has_NaN = train_events.groupby('series_id')['step'].apply(lambda x: x.isnull().any())
series_has_NaN.value_counts()



step
True     240
False     37
Name: count, dtype: int64

In [4]:


no_NaN_series = series_has_NaN[~series_has_NaN].index.tolist()
no_NaN_series



['08db4255286f',
 '0a96f4993bd7',
 '0cfc06c129cc',
 '1087d7b0ff2e',
 '10f8bc1f7b07',
 '18b61dd5aae8',
 '29c75c018220',
 '31011ade7c0a',
 '3452b878e596',
 '349c5562ee2c',
 '3664fe9233f9',
 '483d6545417f',
 '55a47ff9dc8a',
 '5acc9d63b5fd',
 '5f94bb3e1bed',
 '655f19eabf1e',
 '67f5fc60e494',
 '72bbd1ac3edf',
 '76237b9406d5',
 '7822ee8fe3ec',
 '89bd631d1769',
 '8e32047cbc1f',
 '939932f1822d',
 '9ee455e4770d',
 'a596ad0b82aa',
 'a9a2f7fac455',
 'a9e5f5314bcb',
 'af91d9a50547',
 'b364205aba43',
 'c535634d7dcd',
 'c6788e579967',
 'c68260cc9e8f',
 'ca730dbf521d',
 'd150801f3145',
 'd25e479ecbb7',
 'd515236bdeec',
 'd5e47b94477e']

In [5]:


# also drop these two "truncated" events series seen in EDA:
no_NaN_series.remove('31011ade7c0a') # incomplete events data
no_NaN_series.remove('a596ad0b82aa') # incomplete events data



In [6]:
def get_train_series(series):
    train_series = pd.read_parquet("data/train_series.parquet", filters=[('series_id','=',series)])
    train_events = pd.read_csv("data/train_events.csv").query('series_id == @series')
    
    train_events = train_events.dropna()
    train_events["step"]  = train_events["step"].astype("int")
    train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})

    train = pd.merge(train_series, train_events[['step','awake']], on='step', how='left')
    train["awake"] = train["awake"].bfill(axis ='rows')
    # final section:
    # train_events.groupby('series_id').tail(1)["event"].unique()
    # Result: the last event is always a "wakeup"
    train['awake'] = train['awake'].fillna(1) # awake
    train["awake"] = train["awake"].astype("int")
    return(train)

In [7]:
smaller_train_data = []

for series_id in no_NaN_series:
    train = get_train_series(series_id)
    smaller_train_data.append(train)

  train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})
  train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})
  train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})
  train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})
  train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})
  train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})
  train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})
  train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})
  train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})
  train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})
  train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})
  train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})
  train_events["awake"] = train_events["

In [8]:


Zzzs_train = pd.concat(smaller_train_data).reset_index(drop=True)
Zzzs_train["series_id"].nunique()



Zzzs_train.to_parquet('processed/Zzzs_train.parquet')



35

In [9]:

train_series_names = pd.read_parquet("data/train_series.parquet", columns=['series_id'])
series_id_list = train_series_names['series_id'].unique().tolist()

train_events = train_events.dropna()
events_series_id_list = train_events['series_id'].unique().tolist()

base_series = list(set(series_id_list) - set(events_series_id_list))

# list out the series:
base_series



['0f9e60a8e56d',
 'c7b1283bb7eb',
 '89c7daa72eee',
 '390b487231ce',
 'a3e59c2ce3f6',
 'c5d08fc3e040',
 'e11b9d69f856',
 '2fc653ca75c7']

In [10]:
def get_train_series_base(series):
    train = pd.read_parquet("data/train_series.parquet", filters=[('series_id','=',series)])
    train["awake"] = 2
    # just to make sure
    train["awake"] = train["awake"].astype("int")
    return(train)

In [11]:
base_train_data = []

for series_id in base_series:
    train = get_train_series_base(series_id)
    base_train_data.append(train)

In [12]:


Zzzs_base = pd.concat(base_train_data).reset_index(drop=True)
# combine the 35 series from before with the 8 series that have no event data
Zzzs_train_multi = pd.concat([Zzzs_train,Zzzs_base]).reset_index(drop=True)

In [13]:


Zzzs_train_multi.to_parquet('processed/Zzzs_train_multi.parquet')



In [14]:
Zzzs_train_multi.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake
0,08db4255286f,0,2018-11-05T10:00:00-0400,-30.845301,0.0447,1
1,08db4255286f,1,2018-11-05T10:00:05-0400,-34.181801,0.0443,1
2,08db4255286f,2,2018-11-05T10:00:10-0400,-33.877102,0.0483,1
3,08db4255286f,3,2018-11-05T10:00:15-0400,-34.282101,0.068,1
4,08db4255286f,4,2018-11-05T10:00:20-0400,-34.385799,0.0768,1
