In [1]:
import numpy as np
import pandas as pd
import glob
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier as RFC, ExtraTreesClassifier as ETC
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, StratifiedKFold

# Baseline Model

In [2]:
def segmentation(df, overlap_rate, time_window):
    """Cut one recording into fixed-length, optionally overlapping windows.

    Missing values are filled first (interpolation, then forward fill, then
    zeros for whatever is still missing). A window of ``time_window`` rows is
    then slid over the filled frame; a trailing remainder shorter than one
    window is discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        The recording; rows are treated as consecutive samples.
    overlap_rate : float
        Fraction of a window shared with the next window, in ``[0, 1)``.
    time_window : int
        Number of rows per window (at least 1).

    Returns
    -------
    list of pandas.DataFrame
        One frame per window, each with ``time_window`` rows and a fresh
        0-based index. Empty when ``df`` has fewer than ``time_window`` rows.

    Raises
    ------
    ValueError
        If ``time_window`` is smaller than 1 or ``overlap_rate`` is outside
        ``[0, 1)``.
    """
    if time_window < 1:
        raise ValueError("time_window must be at least 1, got %r" % (time_window,))
    if not 0 <= overlap_rate < 1:
        raise ValueError("overlap_rate must be in [0, 1), got %r" % (overlap_rate,))

    # Convert the overlap rate into the stride of the sliding window.
    # round(..., 9) removes binary floating-point noise before truncating
    # (e.g. (1 - 0.9) * 10 evaluates to 0.9999999999999998), and max(1, ...)
    # guarantees the window always advances.
    step = max(1, int(round((1 - overlap_rate) * time_window, 9)))

    # Fill gaps: interpolate, then forward fill, then zeros for the rest.
    filled = df.interpolate().ffill().fillna(0)

    # Slice by position (iloc) so the result does not depend on the frame
    # carrying a default 0..n-1 index.
    return [
        filled.iloc[start:start + time_window].reset_index(drop=True)
        for start in range(0, len(filled) - time_window + 1, step)
    ]

In [19]:
# Segment every training recording into 500-row windows with 50% overlap.
# glob returns paths in arbitrary order, so sort them to keep the segment
# order (and therefore the CV fold membership computed from it) reproducible.
data_list = []
for file in tqdm(sorted(glob.glob("../TrainData/*/*/*.csv"))):
    tempdf = pd.read_csv(file)
    data_list.extend(segmentation(tempdf, 0.5, 500))

  0%|          | 0/151 [00:00<?, ?it/s]

In [3]:
def get_features(x_data):
    """Summarise every column of a segment with six statistics.

    For each column, in column order, the following values are appended to
    the result: population standard deviation (ddof=0), mean, maximum,
    minimum, median and variance.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Segment whose columns are the signals to summarise.

    Returns
    -------
    list
        Flat list holding six values per column.
    """
    features = []
    for name in x_data.columns.tolist():
        column = x_data[name]
        features.extend([
            column.std(ddof=0),   # standard deviation
            np.average(column),   # mean
            np.max(column),       # maximum
            np.min(column),       # minimum
            np.median(column),    # median
            np.var(column),       # variance
        ])
    # Skewness and kurtosis are not part of the feature set.
    return features

In [13]:
# Build the feature matrix and the label vector, one entry per segment.
features_list, label_list = [], []
for j in tqdm(range(len(data_list))):
    segment = data_list[j]

    # Keep only the signal columns; the metadata columns are not features.
    x_data = segment.drop(columns=["subject_id", "activity"])
    features_list.append(get_features(x_data))

    # The label is read from the first row of the last column.
    label_list.append(segment.iloc[0, -1])

  0%|          | 0/3573 [00:00<?, ?it/s]

In [20]:
# Baseline random forest. random_state pins the forest's randomness so the
# cross-validated score can be reproduced on a re-run.
rf = RFC(n_estimators=300, n_jobs=-1, random_state=1)

In [23]:
# 10-fold stratified CV repeated 3 times (30 fits); the fixed seed makes the splits repeatable.
# NOTE(review): folds are drawn per segment, and the segments were cut with 50% overlap,
# so neighbouring windows that share samples can end up on both sides of a split and the
# accuracy may be optimistic -- consider grouping by recording/subject; confirm.
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
n_scores = cross_val_score(
    rf,
    features_list,
    label_list,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    error_score='raise',
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.8min finished


In [25]:
# Mean accuracy across all CV fits, followed by the individual fold scores.
(n_scores.mean(), n_scores)

(0.8262024735406269,
 array([0.79329609, 0.82960894, 0.82681564, 0.85434174, 0.83753501,
        0.86834734, 0.79831933, 0.81792717, 0.83473389, 0.8487395 ,
        0.83798883, 0.79888268, 0.84078212, 0.82352941, 0.82913165,
        0.85154062, 0.80112045, 0.82913165, 0.81232493, 0.82072829,
        0.79050279, 0.81005587, 0.84357542, 0.85154062, 0.82633053,
        0.79551821, 0.82913165, 0.83753501, 0.85154062, 0.79551821]))

# Tune the baseline

In [4]:
def train_and_evaluate(model, overlap_rate, segment_size, shuffle=False, n_splits=10, n_repeats=3,
                       data_glob="../TrainData/*/*/*.csv"):
    """Segment the training data, extract features and cross-validate ``model``.

    Parameters
    ----------
    model : estimator
        Classifier handed to ``cross_val_score``.
    overlap_rate : float
        Overlap between consecutive windows, forwarded to ``segmentation``.
    segment_size : int
        Window length in rows, forwarded to ``segmentation``.
    shuffle : bool, default False
        If True, use ``RepeatedStratifiedKFold`` (``n_splits`` x ``n_repeats``,
        ``random_state=1``); otherwise a single unshuffled ``StratifiedKFold``.
    n_splits : int, default 10
        Number of folds.
    n_repeats : int, default 3
        Number of repetitions; only used when ``shuffle`` is True.
    data_glob : str, default "../TrainData/*/*/*.csv"
        Glob pattern selecting the CSV recordings to load.

    Returns
    -------
    numpy.ndarray
        Accuracy of each cross-validation fit.

    Raises
    ------
    ValueError
        If no segment could be produced (no file matched ``data_glob`` or
        every recording is shorter than ``segment_size``).
    """
    print("Loading the dataset...")
    data_list = []
    # glob returns paths in arbitrary order; sorting keeps the sample order,
    # and with it the fold membership, identical from run to run.
    for file in tqdm(sorted(glob.glob(data_glob))):
        data_list.extend(segmentation(pd.read_csv(file), overlap_rate, segment_size))
    if not data_list:
        raise ValueError(
            "No segments were produced from %r with segment_size=%r" % (data_glob, segment_size)
        )

    print("Extracting the features...")
    features_list = []
    label_list = []
    for segment in tqdm(data_list):
        # Keep only the signal columns; the metadata columns are not features.
        x_data = segment.drop(columns=["subject_id", "activity"])
        features_list.append(get_features(x_data))
        # The label is read from the first row of the last column
        # (presumably "activity" -- verify the CSV column order).
        label_list.append(segment.iloc[0, -1])

    print("Training the model...")
    # NOTE(review): folds are drawn per segment, so with overlap_rate > 0
    # windows sharing samples can fall on both sides of a split; scores may be
    # optimistic. Consider a group-aware splitter keyed on recording/subject.
    if shuffle:
        cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=1)
    else:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=False)
    n_scores = cross_val_score(model, features_list, label_list, scoring='accuracy', cv=cv,
                               n_jobs=-1, error_score='raise')
    return n_scores

In [17]:
# Random forest on 500-row windows, 50% overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=500, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:10<00:00, 14.03it/s]


Extracting the features...


100%|██████████| 3573/3573 [01:33<00:00, 38.32it/s]


Training the model...
0.6927053502965432


In [18]:
# Random forest on 500-row windows, 50% overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=500, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:10<00:00, 14.80it/s]


Extracting the features...


100%|██████████| 3573/3573 [01:38<00:00, 36.44it/s]


Training the model...
0.8279736475595826


In [12]:
# Random forest on 500-row windows, no overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=500, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:09<00:00, 16.54it/s]


Extracting the features...


100%|██████████| 1818/1818 [00:43<00:00, 41.44it/s]


Training the model...
0.730457571894036


In [11]:
# Random forest on 500-row windows, no overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=500, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:09<00:00, 16.68it/s]


Extracting the features...


100%|██████████| 1818/1818 [00:43<00:00, 41.68it/s]


Training the model...
0.6689089915609252


In [19]:
# Random forest on 800-row windows, 50% overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=800, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:09<00:00, 15.89it/s]


Extracting the features...


100%|██████████| 2143/2143 [00:52<00:00, 40.80it/s]


Training the model...
0.8727667898282985


In [20]:
# Random forest on 800-row windows, 50% overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=800, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:10<00:00, 13.84it/s]


Extracting the features...


100%|██████████| 2143/2143 [01:11<00:00, 30.18it/s]


Training the model...
0.7531514888067812


In [9]:
# Random forest on 800-row windows, no overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=800, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:08<00:00, 16.91it/s]


Extracting the features...


100%|██████████| 1113/1113 [00:27<00:00, 40.81it/s]


Training the model...
0.7451603389103388


In [13]:
# Random forest on 800-row windows, no overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=800, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:08<00:00, 16.79it/s]


Extracting the features...


100%|██████████| 1113/1113 [00:27<00:00, 40.81it/s]


Training the model...
0.703555341055341


In [21]:
# Random forest on 1000-row windows, 50% overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=1000, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:11<00:00, 13.67it/s]


Extracting the features...


100%|██████████| 1667/1667 [00:48<00:00, 34.31it/s]


Training the model...
0.8768427001418851


In [22]:
# Random forest on 1000-row windows, 50% overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=1000, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:12<00:00, 11.97it/s]


Extracting the features...


100%|██████████| 1667/1667 [01:01<00:00, 27.05it/s]


Training the model...
0.7684907293846044


In [6]:
# Random forest on 1000-row windows, no overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=1000, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:09<00:00, 16.78it/s]


Extracting the features...


100%|██████████| 865/865 [00:21<00:00, 40.09it/s]


Training the model...
0.7622739018087856


In [14]:
# Random forest on 1000-row windows, no overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=1000, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:09<00:00, 16.53it/s]


Extracting the features...


100%|██████████| 865/865 [00:21<00:00, 40.55it/s]


Training the model...
0.7317963111467523


In [25]:
# Random forest on 1200-row windows, 50% overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=1200, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:10<00:00, 13.74it/s]


Extracting the features...


100%|██████████| 1359/1359 [00:41<00:00, 32.58it/s]


Training the model...
0.8638779956427014


In [26]:
# Random forest on 1200-row windows, 50% overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=1200, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:10<00:00, 14.30it/s]


Extracting the features...


100%|██████████| 1359/1359 [00:37<00:00, 36.65it/s]


Training the model...
0.7880718954248366


In [8]:
# Random forest on 1200-row windows, no overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=1200, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:08<00:00, 17.17it/s]


Extracting the features...


100%|██████████| 713/713 [00:18<00:00, 39.22it/s]


Training the model...
0.774237089201878


In [15]:
# Random forest on 1200-row windows, no overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=1200, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:08<00:00, 17.07it/s]


Extracting the features...


100%|██████████| 713/713 [00:23<00:00, 30.31it/s]


Training the model...
0.7491001564945228


In [27]:
# Random forest on 1500-row windows, 50% overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=1500, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:08<00:00, 17.35it/s]


Extracting the features...


100%|██████████| 1037/1037 [00:25<00:00, 40.89it/s]


Training the model...
0.8678864824495893


In [28]:
# Random forest on 1500-row windows, 50% overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=1500, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:10<00:00, 14.86it/s]


Extracting the features...


100%|██████████| 1037/1037 [00:27<00:00, 37.23it/s]


Training the model...
0.7907580283793876


In [7]:
# Random forest on 1500-row windows, no overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=1500, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:08<00:00, 17.38it/s]


Extracting the features...


100%|██████████| 554/554 [00:14<00:00, 38.59it/s]


Training the model...
0.7706385281385284


In [16]:
# Random forest on 1500-row windows, no overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=1500, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:11<00:00, 12.92it/s]


Extracting the features...


100%|██████████| 554/554 [00:15<00:00, 35.69it/s]


Training the model...
0.7454220779220779


In [30]:
# Random forest on 1800-row windows, 50% overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=1800, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:09<00:00, 15.85it/s]


Extracting the features...


100%|██████████| 842/842 [00:24<00:00, 34.02it/s]


Training the model...
0.8634407096171802


In [31]:
# Random forest on 1800-row windows, 50% overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=1800, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:08<00:00, 18.37it/s]


Extracting the features...


100%|██████████| 842/842 [00:19<00:00, 42.97it/s]


Training the model...
0.7957002801120449


In [32]:
# Random forest on 1800-row windows, no overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=1800, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:08<00:00, 18.55it/s]


Extracting the features...


100%|██████████| 465/465 [00:10<00:00, 43.76it/s]


Training the model...
0.7554887449892076


In [33]:
# Random forest on 1800-row windows, no overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=1800, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:08<00:00, 16.86it/s]


Extracting the features...


100%|██████████| 465/465 [00:12<00:00, 36.53it/s]


Training the model...
0.7525901942645699


In [34]:
# Random forest on 2000-row windows, 50% overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=2000, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:11<00:00, 13.71it/s]


Extracting the features...


100%|██████████| 714/714 [00:19<00:00, 36.39it/s]


Training the model...
0.8861306729264476


In [35]:
# Random forest on 2000-row windows, 50% overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=2000, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:09<00:00, 16.23it/s]


Extracting the features...


100%|██████████| 714/714 [00:19<00:00, 37.26it/s]


Training the model...
0.8333528951486697


In [36]:
# Random forest on 2000-row windows, no overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=2000, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:08<00:00, 18.52it/s]


Extracting the features...


100%|██████████| 390/390 [00:09<00:00, 43.08it/s]


Training the model...
0.7623931623931626


In [37]:
# Random forest on 2000-row windows, no overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=2000, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:08<00:00, 18.78it/s]


Extracting the features...


100%|██████████| 390/390 [00:10<00:00, 38.82it/s]


Training the model...
0.7384615384615385


In [38]:
# Random forest on 2500-row windows, 50% overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=2500, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:10<00:00, 14.49it/s]


Extracting the features...


100%|██████████| 546/546 [00:17<00:00, 31.77it/s]


Training the model...
0.9023120089786757


In [39]:
# Random forest on 2500-row windows, 50% overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=2500, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:09<00:00, 15.73it/s]


Extracting the features...


100%|██████████| 546/546 [00:15<00:00, 34.37it/s]


Training the model...
0.8589562289562289


In [40]:
# Random forest on 2500-row windows, no overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=2500, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:09<00:00, 15.75it/s]


Extracting the features...


100%|██████████| 315/315 [00:09<00:00, 32.91it/s]


Training the model...
0.785383064516129


In [42]:
# Random forest on 2500-row windows, no overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=2500, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:09<00:00, 16.22it/s]


Extracting the features...


100%|██████████| 315/315 [00:08<00:00, 36.53it/s]


Training the model...
0.7691532258064516


In [47]:
# Random forest on 3000-row windows, 50% overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=3000, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:09<00:00, 15.23it/s]


Extracting the features...


100%|██████████| 403/403 [00:12<00:00, 31.08it/s]


Training the model...
0.9098373983739838


In [48]:
# Random forest on 3000-row windows, 50% overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0.5, segment_size=3000, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:08<00:00, 17.15it/s]


Extracting the features...


100%|██████████| 403/403 [00:12<00:00, 31.01it/s]


Training the model...
0.866280487804878


In [49]:
# Random forest on 3000-row windows, no overlap; shuffled repeated stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=3000, shuffle=True)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:10<00:00, 13.75it/s]


Extracting the features...


100%|██████████| 238/238 [00:06<00:00, 35.13it/s]


Training the model...
0.839975845410628


In [50]:
# Random forest on 3000-row windows, no overlap; unshuffled stratified k-fold CV.
# random_state pins the forest's randomness so re-running this cell reproduces the score.
model = RFC(n_estimators=300, n_jobs=-1, random_state=1)
n_scores = train_and_evaluate(model, overlap_rate=0, segment_size=3000, shuffle=False)
print(n_scores.mean())

Loading the dataset...


100%|██████████| 151/151 [00:07<00:00, 18.93it/s]


Extracting the features...


100%|██████████| 238/238 [00:05<00:00, 39.91it/s]


Training the model...
0.8228260869565217
