In [None]:
import numpy as np 
import pandas as pd 
import os

# Root folders of the three dataset splits.
TRAIN_PATH = '/kaggle/input/datafile/train'
TEST_PATH = '/kaggle/input/datafile1/test'
VAL_PATH = '/kaggle/input/datavalidation/val'


def _index_files(root):
    """Map each file name found anywhere under ``root`` to its full path.

    Directories are walked recursively; if the same file name occurs in
    several sub-directories, the one visited last wins.
    """
    return {
        name: os.path.join(folder, name)
        for folder, _, names in os.walk(root)
        for name in names
    }


train_datas = _index_files(TRAIN_PATH)
test_datas = _index_files(TEST_PATH)
val_datas = _index_files(VAL_PATH)

# Number of files discovered per split (shown as the cell output).
len(train_datas), len(test_datas), len(val_datas)

- Sampling rate: 20 Hz = 20 readings per second.
- Activity time: 3 minutes (180 seconds).
- 180 s × 20 readings/s = 3600 rows per activity per subject.

In [None]:
import numpy as np


def window(width, overlap, max_idx):
    """Yield ``(start, end)`` bounds of fixed-width, overlapping windows.

    Parameters
    ----------
    width : int
        Number of positions covered by each window.
    overlap : float
        Fraction of a window shared with the next one; must satisfy
        ``0 <= overlap < 1``.
    max_idx : int
        Exclusive upper limit for ``end`` (typically ``len(data)``).

    Yields
    ------
    tuple of (int, int)
        Half-open ``[start, end)`` bounds, suitable for ``data[start:end]``.
        Only complete windows are produced; a trailing partial window is
        dropped.

    Raises
    ------
    ValueError
        If ``overlap`` is outside ``[0, 1)``. Because this is a generator,
        the error surfaces when iteration starts, not when it is called.
    """
    if overlap < 0.0 or overlap >= 1.:
        raise ValueError("overlap needs to be a number between 0 and 1")

    # Advance by the non-overlapping part of the window, but always by at
    # least one position so the loop is guaranteed to terminate.
    step = max(int((1 - overlap) * width), 1)

    start = 0
    # ``end`` is exclusive, so a window ending exactly at ``max_idx`` is still
    # fully inside the data and must be emitted (hence ``<=``).
    while start + width <= max_idx:
        yield start, start + width
        start += step

        
def window_df(df, width, overlap):
    """Yield successive row slices of ``df``, one per sliding window.

    The window bounds come from ``window(width, overlap, len(df))``; each
    yielded item is ``df[start:end]`` for one of those bounds.
    """
    bounds = window(width, overlap, len(df))
    yield from (df[lo:hi] for lo, hi in bounds)


def standardize(df):
    """Return ``df`` shifted by its mean and scaled by its standard deviation.

    Both statistics are taken from ``df`` itself via ``df.mean()`` and
    ``df.std()``.
    """
    centered = df - df.mean()
    spread = df.std()
    return centered / spread


def zero_cross_rate(series):
    """Return the share of positions at which the sign of ``series`` changes.

    A change is counted wherever two consecutive values have different
    ``np.sign`` results; the count is divided by ``len(series)``.
    """
    signs = np.sign(series)
    sign_changed = np.diff(signs) != 0
    return sign_changed.sum() / len(series)

In [2]:
# Readings per second delivered by the sensors (20 Hz).
SAMPLING_RATE = 20
# Columns holding the three signal axes.
DATA_COLS = ["x", "y", "z"]
# Column holding the activity label.
TARGET_COL = ["target"]

# Accepted activity codes: the letters A through S, with N left out.
VALID_TARGETS = [
    chr(code) for code in range(ord("A"), ord("S") + 1) if chr(code) != "N"
]
print(len(VALID_TARGETS))

18


In [1]:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis
import os
import pickle
from collections import defaultdict

# Window length in seconds; generate_features multiplies it by SAMPLING_RATE
# to get the window width in rows.
N_SECONDS = 5
# Fraction of each window shared with the next one (passed to window_df as
# its `overlap` argument).
OVERLAP = 0.5

def _window_features(sample):
    """Compute every per-window feature for one slice of signal data.

    Parameters
    ----------
    sample : pandas.DataFrame
        One window of rows containing the ``DATA_COLS`` columns.

    Returns
    -------
    dict
        Feature name -> scalar value, inserted in the column order of the
        final feature table.
    """
    axes = sample[DATA_COLS]
    feats = {}

    # --- time-domain statistics -------------------------------------------
    means = axes.mean()
    for axis in DATA_COLS:
        feats[axis + '_mean'] = means[axis]

    stds = axes.std()
    for axis in DATA_COLS:
        feats[axis + '_std'] = stds[axis]

    # Peak-to-peak range of each axis within the window.
    ranges = axes.max() - axes.min()
    for axis in DATA_COLS:
        feats[axis + '_max_min'] = ranges[axis]

    # Pairwise correlations between axes (xy, xz, yz for x/y/z columns).
    corrs = axes.corr()
    for pos, first in enumerate(DATA_COLS):
        for second in DATA_COLS[pos + 1:]:
            feats[first + second + '_corr'] = corrs.loc[first, second]

    # Row-wise root-mean-square across the axes, summarised over the window.
    rms = np.sqrt(np.mean(np.square(axes), axis=1))
    feats['rms_mean'] = rms.mean()
    feats['rms_std'] = rms.std()

    for axis in DATA_COLS:
        feats[axis + '_zcr'] = zero_cross_rate(sample[axis])

    kurtoses = kurtosis(axes)
    for pos, axis in enumerate(DATA_COLS):
        feats[axis + '_kurtosis'] = kurtoses[pos]

    # --- frequency-domain statistics --------------------------------------
    # Magnitude spectrum of each axis (real-input FFT).
    spectra = {axis: np.abs(np.fft.rfft(sample[axis])) for axis in DATA_COLS}

    # Index of the strongest FFT bin (a bin index, not a frequency in Hz).
    for axis in DATA_COLS:
        feats[axis + '_freq_max'] = np.argmax(spectra[axis])
    for axis in DATA_COLS:
        feats[axis + '_fft_max'] = spectra[axis].max()
    for axis in DATA_COLS:
        feats[axis + '_fft_mean'] = spectra[axis].mean()
    for axis in DATA_COLS:
        feats[axis + '_fft_std'] = spectra[axis].std()
    for axis in DATA_COLS:
        feats[axis + '_fft_kurtosis'] = kurtosis(spectra[axis])

    return feats


def generate_features(df):
    """Build a table of windowed features for every valid target in ``df``.

    Rows whose ``target`` is not in ``VALID_TARGETS`` are discarded. The
    remaining rows are grouped by target; each group's ``DATA_COLS`` are
    standardized over the whole group and then cut into windows of
    ``N_SECONDS * SAMPLING_RATE`` rows with ``OVERLAP`` fractional overlap.
    One output row is produced per window.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw readings with a ``target`` column and the ``DATA_COLS`` columns.

    Returns
    -------
    pandas.DataFrame
        A ``target`` column followed by the feature columns. The index
        restarts at 0 for every target group. An empty DataFrame is returned
        when no group yields a complete window.
    """
    master_valid = df[df['target'].isin(VALID_TARGETS)]
    width = N_SECONDS * SAMPLING_RATE

    feature_matrix = []

    for target, group in master_valid.groupby('target'):
        signals = standardize(group[DATA_COLS])

        rows = []
        for sample in window_df(signals, width=width, overlap=OVERLAP):
            row = {'target': target}
            row.update(_window_features(sample))
            rows.append(row)

        # A group shorter than one window produces no rows; skip it instead
        # of trying to build a frame from the bare target label.
        if rows:
            feature_matrix.append(pd.DataFrame(rows))

    # pd.concat refuses an empty list, so handle "nothing to report" here.
    if not feature_matrix:
        return pd.DataFrame()

    return pd.concat(feature_matrix)


In [None]:
def preprocess_data(data_paths):
    """Read every raw file in ``data_paths`` and return one feature table.

    Parameters
    ----------
    data_paths : dict
        Maps a file name to the path of that file. The name is split on
        ``'_'``: part 1 must be an integer id, part 2 is stored as the sensor
        and part 3 (up to the first ``'.'``) as the device.

    Returns
    -------
    pandas.DataFrame
        The concatenated output of ``generate_features`` for all files, with
        ``idx``, ``sensor`` and ``device`` columns added. An empty DataFrame
        is returned when ``data_paths`` is empty.

    Raises
    ------
    IndexError, ValueError
        If a file name does not follow the expected pattern. This is checked
        before the file is read so no work is wasted on a bad name.
    """
    feature_dfs = []
    for txt_file, path in data_paths.items():
        # Parse the metadata first so a malformed name fails fast.
        txt_hints = txt_file.split('_')
        idx = int(txt_hints[1])
        sensor = txt_hints[2]
        device = txt_hints[3].split('.')[0]

        print("Pre-Processing {} ...".format(path))
        df = pd.read_csv(path, names=['id', 'target', 'timestamp', 'x', 'y', 'z'])
        # The last field may carry a ';' line terminator. Going through str()
        # keeps this working when pandas already parsed the value as a number.
        df['z'] = df['z'].map(lambda value: float(str(value).strip(';')))

        fdf = generate_features(df)
        fdf['idx'] = idx
        fdf['sensor'] = sensor
        fdf['device'] = device
        feature_dfs.append(fdf)

    # pd.concat refuses an empty list, so return an empty table explicitly.
    if not feature_dfs:
        return pd.DataFrame()

    return pd.concat(feature_dfs)

In [None]:
# Build the feature table for all training files and save it as a pickle
# in the current working directory.
train_data = preprocess_data(train_datas)
output_file = "train_data.pkl"
with open(output_file, 'wb') as f:
    pickle.dump(train_data, f)

In [None]:
# Build the feature table for all test files and save it as a pickle
# in the current working directory.
test_data = preprocess_data(test_datas)

output_file = "test_data.pkl"
with open(output_file, 'wb') as f:
    pickle.dump(test_data, f)

In [None]:
# Build the feature table for all validation files and save it as a pickle
# in the current working directory.
val_data = preprocess_data(val_datas)
output_file = "val_data.pkl"
with open(output_file, 'wb') as f:
    pickle.dump(val_data, f)

In [None]:
!ls