In [1]:
import os
import sys
import time
import itertools
import numpy as np
import pandas as pd
from dateutil import tz
from scipy import fftpack, stats
from itertools import chain
from datetime import datetime,timedelta

In [2]:
# Load one raw accelerometer CSV for a single user to develop the functions below on.
# NOTE(review): hard-coded absolute Windows path -- the notebook only re-runs on a
# machine with this exact drive layout; consider a configurable data directory.
path = "F:/DATA/hope/1s5wlcm6/accelerometer/2018-01-20 17_00_00.csv"
data = pd.read_csv(path)
# Preview the first ten rows (columns: timestamp, UTC time, accuracy, x, y, z).
data.head(10)

Unnamed: 0,timestamp,UTC time,accuracy,x,y,z
0,1516467868973,2018-01-20T17:04:28.973,3,-0.186753,5.569083,7.245076
1,1516467869113,2018-01-20T17:04:29.113,3,-0.383084,5.674431,7.292961
2,1516467869133,2018-01-20T17:04:29.133,3,-0.565049,5.944984,7.72393
3,1516467869157,2018-01-20T17:04:29.157,3,-0.536318,5.89231,7.829279
4,1516467869158,2018-01-20T17:04:29.158,3,-0.837996,5.937801,8.032792
5,1516467869160,2018-01-20T17:04:29.160,3,-0.67758,5.719923,8.1549
6,1516467869175,2018-01-20T17:04:29.175,3,-0.399844,5.478101,7.917867
7,1516467869315,2018-01-20T17:04:29.315,3,-0.232245,5.310502,7.728719
8,1516467869325,2018-01-20T17:04:29.325,3,-0.395055,5.523592,8.097438
9,1516467869333,2018-01-20T17:04:29.333,3,-0.66082,5.494861,8.209969


In [3]:
## 1. smooth to a certain Hz
hz = 10
def smooth_data(data,hz):
  """Average the acceleration magnitude into fixed-width time bins.

  Parameters
  ----------
  data : pandas.DataFrame
    Must provide "timestamp" (epoch milliseconds), "x", "y" and "z" columns.
  hz : number
    Target rate; every bin is 1/hz*1000 milliseconds wide.

  Returns
  -------
  stamp0 : float
    Epoch seconds of the first sample, floored to a whole hour.
  stamp1 : list
    [year, month, day, hour] of the first sample as reported by
    datetime.fromtimestamp, i.e. in the local time zone of the machine.
  t_active : float
    Minutes spanned by consecutive-sample gaps shorter than 5 seconds.
  t : numpy.ndarray
    Start offset in milliseconds (relative to the earliest sample) of every
    bin that contains at least one sample.
  mag : numpy.ndarray
    Mean of sqrt(x**2+y**2+z**2) over the samples of each of those bins.

  Raises
  ------
  ValueError
    If data has no rows.
  """
  if len(data) == 0:
    raise ValueError("smooth_data needs at least one accelerometer sample")
  t = np.asarray(data["timestamp"])
  # Positional access: data['timestamp'][0] is a label lookup and breaks on
  # frames whose index does not contain the label 0 (e.g. after filtering).
  first_ms = t[0]
  first = datetime.fromtimestamp(first_ms/1000)
  stamp1 = [first.year,first.month,first.day,first.hour]
  stamp0 = np.floor(first_ms/1000/60/60)*60*60
  x = np.asarray(data["x"])
  y = np.asarray(data["y"])
  z = np.asarray(data["z"])
  mag = np.sqrt(x**2+y**2+z**2)
  # Gaps of 5 s or more are treated as the sensor being off and are not counted.
  t_diff = np.diff(t)
  t_active = np.sum(t_diff[t_diff<5*1000])/1000/60  ## in minute
  bin_ms = 1/hz*1000
  bin_id = np.floor((t - t.min())/bin_ms)  ## bin
  # One pass over the samples: map each sample to its bin, then sum and count
  # per bin instead of re-scanning the whole array for every distinct bin.
  bins,inverse = np.unique(bin_id,return_inverse=True)
  inverse = inverse.ravel()
  b = np.bincount(inverse,weights=mag)/np.bincount(inverse)
  return stamp0,stamp1,t_active,bins*bin_ms,b

In [4]:
# Bin the sample file loaded above at `hz` bins per second; `t` and `mag` feed the cells below.
stamp0,stamp1,t_active,t,mag = smooth_data(data,hz)

In [34]:
## 2. step estimation function
q = 85
c = 1.05
def step_est(t,mag,t_active,q,c):
  """Count threshold crossings of the magnitude signal and scale them by active time.

  Parameters
  ----------
  t : sequence of numbers
    Bin times; a new step is only accepted 350 of these units after the
    previous one (milliseconds when t comes from smooth_data).
  mag : sequence of numbers
    Magnitude per bin, indexed in parallel with t.
  t_active : number
    Divisor for the raw count; the result is count/t_active*60.
  q : number
    Percentile of mag used as the data-driven threshold.
  c : number
    Multiplier on the gravity constant giving the minimum threshold.

  Returns
  -------
  int
    Truncated value of count/t_active*60.
  """
  # The mean decides which gravity constant applies: 9.8 when the signal
  # averages above 8, otherwise 1 (presumably m/s^2 vs. g units -- confirm).
  gravity = 9.8 if np.mean(mag)>8 else 1
  threshold = max(np.percentile(mag,q),c*gravity)
  n_steps = 0
  last_step_t = -350
  for idx,t_now in enumerate(t):
    if not (mag[idx]>=threshold and t_now>=last_step_t+350):
      continue
    n_steps += 1
    last_step_t = t_now
  return int(n_steps/t_active*60)

In [6]:
# Step estimate for the sample file, using the q and c currently defined in the kernel.
step_est(t,mag,t_active,q,c)

1857

In [28]:
## 3. other statistics from accelerometer data
def acc_stats(mag,hz):
  """Summary statistics of a binned magnitude signal.

  Parameters
  ----------
  mag : numpy.ndarray
    Magnitude per bin.
  hz : number
    Bin rate used to scale the spectrum.

  Returns
  -------
  list
    [mean, standard deviation, mean absolute first difference,
     spectral energy, scipy.stats.entropy of the signal].
  """
  # A mean below 8 is rescaled by 9.8 (presumably converting g to m/s^2 -- confirm).
  signal = 9.8*mag if np.mean(mag)<8 else mag
  n_bins = len(signal)
  spectrum = np.abs(fftpack.fft(signal))/hz
  energy = sum(spectrum**2)*hz/n_bins**2
  # stats.entropy normalises its input to sum to 1 before computing the entropy.
  return [
    np.mean(signal),
    np.std(signal),
    np.mean(np.abs(signal[1:]-signal[:-1])),
    energy,
    stats.entropy(signal),
  ]

In [29]:
# [mean_mag, sd_mag, cur_len, energy, entropy] for the sample file.
others = acc_stats(mag,hz)
others

[9.90077849773792,
 0.8550554806154826,
 0.4424653997168446,
 9.875653473620146,
 8.677132504256651]

In [9]:
## wrap up everything
hz = 10
q = 75
c = 1.05
def GetAccStats(stamp0,stamp1,t_active,t,mag,hz,q,c):
  """Assemble one feature row for a single binned recording.

  The row is laid out as
  [stamp0, stamp1[0], stamp1[1], stamp1[2], stamp1[3], t_active, steps,
   stats[0], stats[1], stats[2], stats[3], stats[4]]
  where steps comes from step_est and stats from acc_stats.

  Returns
  -------
  numpy.ndarray
    The 12 values above as a single array.
  """
  step_rate = step_est(t,mag,t_active,q,c)
  features = acc_stats(mag,hz)
  row = [stamp0]
  row.extend(stamp1[k] for k in range(4))
  row.extend([t_active,step_rate])
  row.extend(features[k] for k in range(5))
  return np.array(row)

In [10]:
GetAccStats(stamp0,stamp1,t_active,t,mag,hz,q,c)
## hour_timestamp,year,month,day,hour,active_min,steps,mean_mag,sd_mag,curve_length,energy,entropy

array([1.51646760e+09, 2.01800000e+03, 1.00000000e+00, 2.00000000e+01,
       1.20000000e+01, 9.88225000e+00, 1.85700000e+03, 9.90077850e+00,
       8.55055481e-01, 1.86141304e+00, 9.87565347e+00, 8.67713250e+00])

In [11]:
path = "F:/DATA/hope/1s5wlcm6/accelerometer"
def patient_stats(path,hz,q,c):
  """Build the feature table for every recording file in one directory.

  Parameters
  ----------
  path : str
    Directory whose entries are each read with pandas.read_csv.
  hz, q, c : numbers
    Passed through to smooth_data / GetAccStats.

  Returns
  -------
  numpy.ndarray
    One GetAccStats row per file whose active time exceeds 1 minute,
    ordered by the row's first column (the hour timestamp). Empty when no
    file qualifies.
  """
  result = []
  # os.listdir returns entries in arbitrary order; iterate in a fixed order
  # so repeated runs visit the files identically.
  for name in sorted(os.listdir(path)):
    data = pd.read_csv(os.path.join(path,name))
    if data.empty:
      # A header-only file has no first sample for smooth_data to read.
      continue
    stamp0,stamp1,t_active,t,mag = smooth_data(data,hz)
    if t_active>1:
      result.append(GetAccStats(stamp0,stamp1,t_active,t,mag,hz,q,c))
  # Downstream gap filling walks the rows as a time series, so guarantee
  # chronological order regardless of how the files are named.
  result.sort(key=lambda row: row[0])
  return np.array(result)

In [12]:
# Feature rows for every file under `path` that has more than one active minute.
p_stats = patient_stats(path,hz,q,c)
p_stats

array([[ 1.51637760e+09,  2.01800000e+03,  1.00000000e+00, ...,
        -1.11184866e+00,  9.92051264e+00,  8.01004097e+00],
       [ 1.51638120e+09,  2.01800000e+03,  1.00000000e+00, ...,
         9.13242009e-03,  9.81261037e+00,  8.09626621e+00],
       [ 1.51638480e+09,  2.01800000e+03,  1.00000000e+00, ...,
         1.90438247e+00,  9.75395378e+00,  8.23239017e+00],
       ...,
       [ 1.52704440e+09,  2.01800000e+03,  5.00000000e+00, ...,
        -1.26262626e-02,  9.95223647e+00,  8.62044175e+00],
       [ 1.52704800e+09,  2.01800000e+03,  5.00000000e+00, ...,
         1.69934641e+00,  1.00295012e+01,  6.12904960e+00],
       [ 1.52705520e+09,  2.01800000e+03,  5.00000000e+00, ...,
         5.17063082e-03,  1.01197642e+01,  8.26049044e+00]])

In [13]:
def hour_range(h1,h2):
  """Hours of the day within h2 hours of h1, wrapping around midnight.

  Parameters
  ----------
  h1 : int
    Centre hour (0-23).
  h2 : int
    Half-width of the window in hours.

  Returns
  -------
  numpy.ndarray
    Hours h1-h2 .. h1+h2. When the window crosses midnight the two pieces are
    returned with the piece that runs up to hour 23 placed relative to the
    piece that starts at 0 as follows: overflow past 23 -> [0.., ..23];
    underflow below 0 -> [..23, 0..].
  """
  lo = h1-h2
  hi = h1+h2
  if hi > 23:
    # window spills past 23: early-morning part first, then the evening part
    return np.append(np.arange(0,hi-24+1),np.arange(lo,24))
  if lo < 0:
    # window starts before 0: late-evening part first, then the morning part
    return np.append(np.arange(lo+24,24),np.arange(0,hi+1))
  return np.arange(lo,hi+1)

In [14]:
def check_exist(a1,a2):
  """Element-wise membership test of a1 in a2.

  Parameters
  ----------
  a1 : indexable sequence
    Values to look up.
  a2 : numpy.ndarray
    Values to look up against.

  Returns
  -------
  numpy.ndarray of bool
    Entry i is True when a1[i] equals at least one element of a2.
  """
  hits = [sum(a1[pos]==a2)>0 for pos in range(len(a1))]
  return np.array(hits,dtype=bool)

In [15]:
def fill_missing(p_stats):
  """Insert a synthetic row for every hour missing between the first and last row.

  Parameters
  ----------
  p_stats : numpy.ndarray
    2-D table, one row per observed hour, in chronological order. Column 0
    is the hour timestamp in epoch seconds, column 4 the hour of day, and
    columns 6-11 the statistics that are borrowed for missing hours.

  Returns
  -------
  numpy.ndarray
    Table with one row per hour from p_stats[0,0] to p_stats[-1,0]. Observed
    rows are copied unchanged. A missing hour gets its own timestamp and
    local-time year/month/day/hour, 0 in column 5, and columns 6-11 copied
    from a randomly chosen donor row.

  Notes
  -----
  Donors are drawn with np.random.randint; call np.random.seed beforehand
  for reproducible output.
  """
  full_stats = []
  start_t = p_stats[0,0]
  end_t = p_stats[-1,0]
  k = int((end_t - start_t)/3600 + 1)
  j = 0
  for i in range(k):
    current_t = start_t + 3600*i
    if current_t == p_stats[j,0]:
      full_stats.append(p_stats[j,:])
      j = j + 1
    else:
      t = datetime.fromtimestamp(current_t)
      same_hour = p_stats[:,4]==t.hour
      if sum(same_hour)>15:
        # enough observations at exactly this hour of day
        candidates = p_stats[same_hour,:]
      else:
        # widen to +/- 2 hours, and to every row if that is still too few
        index = check_exist(p_stats[:,4],hour_range(t.hour,2))
        if sum(index)<15:
          index = np.arange(p_stats.shape[0])
        candidates = p_stats[index,:]
      # Pick a donor ROW: the bound must be the number of rows (shape[0]).
      # Using the column count raised IndexError for small candidate sets and
      # otherwise restricted the draw to the first 12 candidates.
      r = np.random.randint(candidates.shape[0])
      newline = np.concatenate(([current_t,t.year,t.month,t.day,t.hour,0],candidates[r,6:12]))
      full_stats.append(newline)
  return np.array(full_stats)

In [16]:
# Hourly table with every hour between the first and last observed row present.
full_stats = fill_missing(p_stats)

In [17]:
def hour2day(full_stats):
  """Collapse an hourly table into one row per day.

  Parameters
  ----------
  full_stats : numpy.ndarray
    2-D hourly table whose column 0 is a timestamp in epoch seconds.

  Returns
  -------
  numpy.ndarray
    One row per day: columns 1-3 of the day's first hourly row followed by
    the column-wise sum of columns 5-11 over that day's rows.

  Notes
  -----
  Days are 86400-second blocks of column 0. The day containing the first
  row is skipped unless that row falls exactly on a block boundary, whereas
  the day containing the last row is always included, even when partial.
  """
  day_index = full_stats[:,0]/(60*60*24)
  first_day = np.ceil(day_index[0])
  last_day = np.floor(day_index[-1])
  rows = []
  for day in np.arange(first_day,last_day+1):
    in_day = (day_index>=day)&(day_index<day+1)
    block = full_stats[in_day]
    date_part = block[0,1:4]
    totals = np.sum(block[:,np.arange(5,12)],axis=0)
    rows.append(np.concatenate((date_part,totals)))
  return np.array(rows)

In [18]:
# Daily rows: year, month, day followed by the per-day sums of the hourly columns.
daily_stats = hour2day(full_stats)
daily_stats

array([[ 2.01800000e+03,  1.00000000e+00,  1.90000000e+01, ...,
         5.43333170e+00,  2.36172419e+02,  1.82514679e+02],
       [ 2.01800000e+03,  1.00000000e+00,  2.00000000e+01, ...,
        -5.07990820e+00,  2.37181949e+02,  1.83807498e+02],
       [ 2.01800000e+03,  1.00000000e+00,  2.10000000e+01, ...,
        -1.50715261e-01,  2.38409308e+02,  1.85536268e+02],
       ...,
       [ 2.01800000e+03,  5.00000000e+00,  2.00000000e+01, ...,
         7.62287036e+00,  2.35182381e+02,  1.81215428e+02],
       [ 2.01800000e+03,  5.00000000e+00,  2.10000000e+01, ...,
        -4.43458949e+00,  2.38894523e+02,  1.84876468e+02],
       [ 2.01800000e+03,  5.00000000e+00,  2.20000000e+01, ...,
         4.06484975e-01,  6.91247694e+01,  5.22532643e+01]])

In [33]:
_HOURLY_COLUMNS = ["year","month","day","hour","active_min","steps","mean_mag","sd_mag",
                   "cur_len","energy","entropy"]
_DAILY_COLUMNS = ["year","month","day","active_min","steps","mean_mag","sd_mag",
                  "cur_len","energy","entropy"]

def _write_hourly_csv(full_stats,dest_path):
  """Write the hourly table, minus its leading timestamp column, to dest_path."""
  hourly = pd.DataFrame(np.asarray(full_stats)[:,1:],columns=_HOURLY_COLUMNS)
  hourly.to_csv(dest_path,index=False)

def _write_daily_csv(daily_stats,dest_path):
  """Write the daily table to dest_path (header only when there are no rows)."""
  daily = np.asarray(daily_stats).reshape(-1,len(_DAILY_COLUMNS))
  pd.DataFrame(daily,columns=_DAILY_COLUMNS).to_csv(dest_path,index=False)

def summarize_acc(input_path,output_path,option,hz=10,q=75,c=1.05):
  """Write hourly and/or daily accelerometer summaries for every user folder.

  Parameters
  ----------
  input_path : str
    Directory with one sub-directory per user; a user is processed when
    <input_path>/<user>/accelerometer is a directory.
  output_path : str
    Destination directory (created when missing). With option "both" the
    files go to <output_path>/hourly and <output_path>/daily instead.
  option : {"hourly", "daily", "both"}
    Which summaries to write.
  hz, q, c : numbers
    Passed through to patient_stats.

  Raises
  ------
  ValueError
    If option is not one of the three supported values.

  Output files are named <user>_hourly_acc.csv and <user>_daily_acc.csv.
  Progress is reported on stdout.
  """
  if option not in ("hourly","daily","both"):
    raise ValueError('option must be "hourly", "daily" or "both", got %r' % (option,))
  if option == "both":
    hourly_dir = output_path+"/hourly"
    daily_dir = output_path+"/daily"
  else:
    hourly_dir = daily_dir = output_path
  # exist_ok lets the function be re-run against the same output directory.
  os.makedirs(hourly_dir,exist_ok=True)
  os.makedirs(daily_dir,exist_ok=True)
  for user in sorted(os.listdir(input_path)):
    sys.stdout.write( "Processing data from "+ user  + '\n')
    acc_path = input_path + "/" + user +"/accelerometer"
    if os.path.isdir(acc_path):
      p_stats = patient_stats(acc_path,hz,q,c)
      if len(p_stats) == 0:
        # Nothing to interpolate from; skip instead of aborting the batch.
        sys.stdout.write( "No usable accelerometer data for "+ user + '\n')
      else:
        full_stats = fill_missing(p_stats)
        if option in ("daily","both"):
          _write_daily_csv(hour2day(full_stats),daily_dir + "/" + user + "_daily_acc.csv")
        if option in ("hourly","both"):
          _write_hourly_csv(full_stats,hourly_dir + "/" + user + "_hourly_acc.csv")
    sys.stdout.write( "Done" + '\n')