In [1]:
import os
import sys
import time
import pickle
import numpy as np
import pandas as pd
from dateutil import tz
from datetime import datetime,timedelta

In [2]:
with open('empirical_steps.pickle', 'rb') as handle:
  empirical_steps = pickle.load(handle)

In [3]:
## 1. smooth to a certain Hz
def smooth_data(data,hz):
  t = datetime.fromtimestamp(data['timestamp'][0]/1000)
  hr = t.hour
  t0 = datetime.timestamp(t-timedelta(minutes=t.minute,seconds=t.second,microseconds=t.microsecond))*1000
  t_seq = np.array(data['timestamp'])
  ## first locate those active bouts (let min interval be s seconds)
  num = np.floor((t_seq - t0)/(1/hz*1000))
  j = 0; i = 1
  new_data = []
  while i<(len(t_seq)-1):
    if num[i]==num[j]:
      i = i + 1
      if i == len(t_seq):
        index = np.arange(j,i)
        mean_x = np.mean(data['x'][index])
        mean_y = np.mean(data['y'][index])
        mean_z = np.mean(data['z'][index])
        new_data.append([t0+1/hz*1000*num[i],mean_x,mean_y,mean_z])
    else: 
      index = np.arange(j,i)
      mean_x = np.mean(data['x'][index])
      mean_y = np.mean(data['y'][index])
      mean_z = np.mean(data['z'][index])
      new_data.append([t0+1/hz*1000*num[i],mean_x,mean_y,mean_z])
      j = i
      i = i+1
  new_data = np.array(new_data)
  mag = np.sqrt(new_data[:,1]**2+new_data[:,2]**2+new_data[:,3]**2)
  stamp = new_data[:,0]
  return hr,t0,stamp,mag

In [4]:
## 2. minute-wise step estimation funciton
def step_est_min(stamp,mag,t0,hz,q,c):
  if np.mean(mag)>8:
    g = 9.8
  else:
    g = 1
  h = max(np.percentile(mag,q),c*g)
  output = []
  for i in range(60):
    index = (stamp>=t0+i*60*1000)*(stamp<t0+(i+1)*60*1000)
    sub_mag = mag[index]
    sub_stamp = stamp[index]
    if len(sub_mag)<=1:
      output.append([i,0,0,0])
    else:
      step = 0
      current = min(sub_stamp)-350
      for j in np.arange(1,len(sub_mag)):
        if(sub_mag[j]>=h and sub_stamp[j]>=current+350):
          step = step + 1
          current = sub_stamp[j]
      on_time = len(sub_mag)/hz
      output.append([i,on_time,step,np.floor(step*60/on_time)])
  output = pd.DataFrame(np.array(output))
  output.columns = ["min","active_s","step_obs","step_infer"]
  return output

In [5]:
## 3. check if there exists any walk within an interval
def nearby_walk(data,k,h):
  walk = (np.array(data['step_infer'])>h)*1
  record = (np.array(data['active_s'])>0)*1
  nearby = walk
  active = record
  n = data.shape[0]
  for i in np.arange(1,k+1):
    nearby = nearby + np.concatenate((walk[np.arange(i,n)],np.zeros(i))) + np.concatenate((np.zeros(i), walk[np.arange(0,n-i)]))
    active = active + np.concatenate((record[np.arange(i,n)],np.zeros(i))) + np.concatenate((np.zeros(i), record[np.arange(0,n-i)]))
  final = (np.array(active)>=1)*1
  final[(active>=1)*(nearby==0)] = 1
  final[(active>=1)*(nearby>=1)] = 2
  return final

In [6]:
## 4. impute the steps based on the output of nearby_walk
def imp_steps(output,final,hr):
  steps = np.zeros(output.shape[0])
  for i in range(output.shape[0]):
    if np.array(output['active_s'])[i]>5:
      steps[i] = np.array(output['step_infer'])[i]
    elif final[i] == 0:
      r = np.random.choice(range(len(empirical_steps['no_records'][hr])),1)
      steps[i] = empirical_steps['no_records'][hr][r]
    elif final[i] == 1:
      r = np.random.choice(range(len(empirical_steps['non_walk'][hr])),1)
      steps[i] = empirical_steps['non_walk'][hr][r]
    else:
      r = np.random.choice(range(len(empirical_steps['walk'][hr])),1)
      steps[i] = empirical_steps['walk'][hr][r]
  return(sum(steps))    

In [7]:
## 5. put everything together when read in a raw csv file
def imp_hour(path,hz,q,c,k,h):
  data = pd.read_csv(path)
  hr,t0,stamp,mag = smooth_data(data,hz)
  output = step_est_min(stamp,mag,t0,hz,q,c)
  final = nearby_walk(output,k,h)
  step = imp_steps(output,final,hr)
  return t0,step

In [8]:
hz = 10; q=75; c=1.05; k=60; h=60
path = "C:/Users/glius/Downloads/beiwe_data/1s5wlcm6/accelerometer/2018-01-19 17_00_00.csv"
t0,step = imp_hour(path,hz,q,c,k,h)
step

573.0

In [9]:
datetime.fromtimestamp(t0/1000)

datetime.datetime(2018, 1, 19, 12, 0)

In [42]:
## 6. read in all the data from the folder one by one, and fill in the hours without records
def hourly_step_count(data_path,output_path,hz,q,c,k,h):
  for i in os.listdir(data_path):
    result0 = []
    t_vec = []
    patient_path = data_path+"/"+i+"/accelerometer/"
    for j in os.listdir(patient_path):
      path = patient_path + j
      t0,step = imp_hour(path,hz,q,c,k,h)
      t1 = datetime.fromtimestamp(t0/1000)
      t_vec.append(t0)
      result0.append([t1.year,t1.month,t1.day,t1.hour,step])
    t_vec = np.array(t_vec)
    nrow = int((t_vec[-1]-t_vec[0])/(1000*60*60)+1)
    result1 = []
    m = 0
    for k in range(nrow):
      current_t = t_vec[0] + 1000*60*60*k
      if current_t == t_vec[m]:
        result1.append(result0[m])
        m = m + 1
      else:
        stamp = datetime.fromtimestamp(current_t/1000)
        r = np.random.choice(range(len(empirical_steps['no_records'][stamp.hour])),60)
        step = sum(empirical_steps['no_records'][stamp.hour][r])
        result1.append([stamp.year,stamp.month,stamp.day,stamp.hour,step])
    result1 = pd.DataFrame(np.array(result1))
    result1.columns = ["year","month","day","hour","steps"]
    result1.to_csv(output_path + "/" + i + ".csv", index=False)

In [43]:
## test
data_path = "C:/Users/glius/Downloads/test_data"
output_path = "C:/Users/glius/Downloads/test_output"
hourly_step_count(data_path,output_path,hz,q,c,k,h)

In [48]:
## At last, test the time used in each function
def time_decompose(path,hz,q,c,k,h):
  s0 = time.time()
  data = pd.read_csv(path)
  s1 = time.time()
  hr,t0,stamp,mag = smooth_data(data,hz)
  s2 = time.time()
  output = step_est_min(stamp,mag,t0,hz,q,c)
  final = nearby_walk(output,k,h)
  step = imp_steps(output,final,hr)
  s3 = time.time()
  return s1-s0,s2-s1,s3-s2

time_table=[]
for i in os.listdir(data_path):
  patient_path = data_path+"/"+i+"/accelerometer/"
  for j in os.listdir(patient_path):
    path = patient_path + j
    g1,g2,g3 = time_decompose(path,hz,q,c,k,h)
    time_table.append([g1,g2,g3])
time_table = np.array(time_table)
time_table

array([[3.19137573e-02, 3.35702538e+00, 6.68208599e-02],
       [1.09705925e-02, 3.65123916e+00, 6.38616085e-02],
       [1.59580708e-02, 4.14887238e+00, 5.88762760e-02],
       [9.97328758e-03, 5.42555094e-01, 1.28618002e-01],
       [8.97622108e-03, 1.08712173e+00, 1.30622387e-01],
       [3.98921967e-03, 3.23136806e-01, 1.37631416e-01],
       [5.98478317e-03, 2.36367702e-01, 1.03722572e-01],
       [1.69551373e-02, 2.42750788e+00, 9.07573700e-02],
       [8.97598267e-03, 2.52923822e+00, 7.08105564e-02],
       [3.98945808e-03, 1.42219830e+00, 8.47733021e-02],
       [4.98771667e-03, 1.00132227e+00, 1.05715752e-01],
       [8.97574425e-03, 2.98901033e+00, 1.66554451e-01],
       [1.49598122e-02, 5.42649078e+00, 5.98359108e-03],
       [1.29656792e-02, 5.15225339e+00, 1.29661560e-02],
       [4.98151779e-03, 1.29052687e+00, 3.04184914e-01],
       [3.98969650e-03, 5.17616987e-01, 4.84703064e-01],
       [8.97622108e-03, 7.09103823e-01, 3.24133873e-01],
       [7.97820091e-03, 7.77921

In [50]:
time_table[:,1]/np.sum(time_table,axis=1)

array([0.97142897, 0.97991659, 0.98228231, 0.7965323 , 0.88620177,
       0.69528054, 0.68299549, 0.95751355, 0.96941902, 0.9412541 ,
       0.90044883, 0.94453217, 0.99615536, 0.9949921 , 0.80673393,
       0.51437143, 0.68038222, 0.70909239, 0.9864016 , 0.9859762 ,
       0.77935332, 0.87220272, 0.96713076, 0.97702244, 0.93904372,
       0.81411637, 0.87676095, 0.83952946, 0.89838058, 0.93746917,
       0.90449403, 0.9787398 , 0.92713816, 0.95285636, 0.94375916])