In [57]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import os
import time

import torch
import torch.nn as nn
import torch.nn.functional as F

In [59]:
# Parse every per-recording CSV into `data`.
# label format: PID-GA-FEMALE_FLAG  (FEMALE_FLAG is 1, 0, or 'NA' when the file has no 'Sex' row)
# value format: [list of timestamps, list of activity values]
data = {}

print('Loading data into dict:')
counter = 0
tic = time.time()
for folder in glob.glob('/home/ngr/gdrive/wearables/data/MOD_1000_Woman_Activity_Data/*'):
    for file in glob.glob(os.path.join(folder, '*GA*.csv')):
        dt = pd.read_csv(file)

        # The parsing below expects "<PID>_GA<int>.csv" or "<PID>-GA<int>.csv"
        # (spaces in the file name are ignored).
        f = os.path.split(file)[1]
        f = f.replace(' ', '')
        if '_' in f:
            pid, f = f.split('_GA')
        elif '-' in f:
            pid, f = f.split('-GA')
        else:
            # Without this branch `pid` would silently keep the previous file's value.
            print('.. skipping {}: cannot parse PID/GA from file name'.format(file))
            continue
        GA = int(f.split('.csv')[0])

        # Sex is stored as a metadata row whose first-column ('UserID') value is 'Sex'.
        sex_rows = dt.loc[dt['UserID'] == 'Sex']
        if sex_rows.shape[0] == 0:
            sex_female = 'NA'
        else:
            sex_female = 1 if 'fem' in sex_rows.iloc[:, 1].values[0].lower() else 0

        # The time series starts at the first row whose first column looks like
        # a date, i.e. a string containing exactly two '/'.
        # Reset per file so a file without such a row cannot reuse a stale offset.
        row_idx_start = None
        for i, row0 in enumerate(dt.iloc[:, 0]):
            if isinstance(row0, str) and row0.count('/') == 2:
                row_idx_start = i
                break
        if row_idx_start is None:
            print('.. skipping {}: no date-stamped rows found'.format(file))
            continue

        # Keep only rows where date, time and activity are all present.
        rows = dt.iloc[row_idx_start:, [0, 1, 2]].dropna()
        t = pd.to_datetime(
            rows.iloc[:, 0].astype(str) + ' ' + rows.iloc[:, 1].astype(str),
            format='%m/%d/%Y %I:%M:%S %p',
        )
        activity = rows.iloc[:, 2]  # MW counts
        data['{}-{}-{}'.format(pid, GA, sex_female)] = [t.to_list(), activity.to_list()]
        counter += 1

        if counter % 100 == 0:
            print('.. through {} graphs in {:.0f}-s'.format(counter, time.time() - tic))

print('\n... {} graphs loaded in {:.1f}-min'.format(counter, (time.time() - tic)/60))

Through 100 graphs in 35-s


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Through 200 graphs in 73-s
Through 300 graphs in 99-s
Through 400 graphs in 124-s
Through 500 graphs in 148-s
Through 600 graphs in 168-s
Through 700 graphs in 196-s
Through 800 graphs in 217-s
Through 900 graphs in 264-s
Through 1000 graphs in 330-s
Through 1100 graphs in 387-s
Through 1200 graphs in 457-s
Through 1300 graphs in 523-s
Through 1400 graphs in 602-s
Through 1500 graphs in 631-s
Through 1600 graphs in 655-s
Through 1700 graphs in 675-s
Through 1800 graphs in 704-s
Through 1900 graphs in 721-s
Through 2000 graphs in 742-s
Through 2100 graphs in 762-s
Through 2200 graphs in 782-s
Through 2300 graphs in 803-s
Through 2400 graphs in 837-s
Through 2500 graphs in 858-s
Through 2600 graphs in 877-s


In [62]:
def pkl_that(data, fname, verbose=True):
    '''
    Pickle `data` to `fname`, creating the parent directory tree if needed.

    Arguments:
      data: any picklable object
      fname (str): destination file path
      verbose (bool): if True, print the path and elapsed seconds

    Returns:
      None
    '''
    import pickle
    out_dir = os.path.dirname(fname)
    # A bare file name has an empty directory part; there is nothing to create.
    if out_dir:
        # makedirs handles nested missing directories and an already-existing one.
        os.makedirs(out_dir, exist_ok=True)
    tic = time.time()
    # The context manager closes the file; no explicit close() is required.
    with open(fname, 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
    if verbose:
        print('Wrote data to {} in {:.0f}-s'.format(fname, time.time() - tic))
    return None

In [63]:
# Persist the parsed `data` dict (label -> [timestamps, activity]) as a pickle.
pkl_that(data, '/home/ngr/gdrive/wearables/data/processed/MOD_1000_Woman_Activity_Data.pkl')

Wrote data to /home/ngr/gdrive/wearables/data/processed/MOD_1000_Woman_Activity_Data.pkl in 224-s


In [145]:
def tx2coo(data, verbose=True):
    '''
    Pack the activity series stored in `data` into one sparse matrix.

    Row i holds the i-th series (in dict iteration order), left-aligned at
    column 0; positions past the end of a shorter series are left unset.

    Arguments:
      data (dict): label -> [timestamps, activity values]; only element 1 is
        used, and each value is converted with float()
      verbose (bool): if True, print progress every 100 series

    Returns:
      list (pid-GA-femaleflag), torch.sparse_coo (m samples x n timepoints)
    '''
    y = []
    idx_chunks, val_chunks = [], []
    max_tn = 0
    if verbose:
        print('Loading time-series in dict to torch sparse tensor (coo format)...')
        tic = time.time()
    for i, (k, v) in enumerate(data.items()):
        n = len(v[1])
        max_tn = max(max_tn, n)  # widest series sets the column count
        y.append(k)
        # Collect per-series pieces and join them once after the loop;
        # concatenating inside the loop recopies everything on each iteration.
        idx_chunks.append(np.array([np.full(n, i, dtype=np.int64),
                                    np.arange(n, dtype=np.int64)]))
        val_chunks.append(np.array([float(ii) for ii in v[1]], dtype=np.float64))  # v[0] are datetimes
        if verbose and (i + 1) % 100 == 0:
            print('... through {} graphs in {:.0f}-s'.format(i + 1, time.time() - tic))
    if idx_chunks:
        X_idx = np.concatenate(idx_chunks, 1)
        X_val = np.concatenate(val_chunks)
    else:
        # Empty input: return an empty 0 x 0 sparse tensor.
        X_idx = np.empty((2, 0), dtype=np.int64)
        X_val = np.empty(0, dtype=np.float64)
    return y, torch.sparse_coo_tensor(torch.tensor(X_idx), torch.tensor(X_val), (len(y), max_tn))
        

In [None]:
# Build the label list `y` and the sparse samples-by-timepoints tensor `X` from `data`.
y, X = tx2coo(data)

Loading time-series in dict to torch sparse tensor (coo format)...
... through 101 graphs in 6-s
... through 201 graphs in 24-s
... through 301 graphs in 51-s
... through 401 graphs in 86-s
... through 501 graphs in 127-s
... through 601 graphs in 174-s
... through 701 graphs in 229-s
... through 801 graphs in 292-s
... through 901 graphs in 363-s
... through 1001 graphs in 442-s
... through 1101 graphs in 526-s
... through 1201 graphs in 618-s


In [None]:
def ystr2df(y):
    '''
    Split 'PID-GA-FEMALE_FLAG' label strings into a metadata table.

    Arguments:
      y (iterable of str): labels of the form '<pid>-<GA>-<flag>', where
        <flag> is an integer or the literal 'NA'

    Returns:
      pd.DataFrame with columns 'pid' (str), 'GA' (int) and 'female'
      (int; NaN where the flag is 'NA', which makes the column float)
    '''
    pid, GA, female = [], [], []
    for yy in y:
        # Split from the right so a PID that itself contains '-' stays intact.
        pid_i, GA_i, female_i = yy.rsplit('-', 2)
        pid.append(pid_i)
        GA.append(int(GA_i))
        # 'NA' marks an unknown sex flag and cannot be parsed as an int.
        female.append(np.nan if female_i == 'NA' else int(female_i))
    dt = pd.DataFrame({'pid': pid, 'GA': GA, 'female': female})
    return dt


In [None]:
# Turn the label strings into a DataFrame with 'pid', 'GA' and 'female' columns.
md = ystr2df(y)

In [None]:
# Save the metadata table and the sparse tensor together in one pickle.
pkl_that({'y':md, 'X':X}, '/home/ngr/gdrive/wearables/data/processed/MOD1k_GAactigraphy_torchsparse.pkl')