<a href="https://colab.research.google.com/github/nazanaza2970/BENTO-PACKAGING-ACTIVITY-RECOGNITION-CHALLENGE/blob/naza/Bento.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from scipy import stats
import glob
import os
from tqdm import tqdm, trange
from sklearn.ensemble import RandomForestClassifier as RFC, ExtraTreesClassifier as ETC
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from scipy.fftpack import fft

# Normal Model

In [None]:
def segmentation(df, overlap_rate, time_window):
    seg_data = []
    #convert overlap rate to step for sliding window
    overlap = int((1 - overlap_rate)*time_window)
    # interpolate
    df = df.interpolate().ffill().fillna(0)
    for i in range(0, len(df)-time_window+1, overlap):
        seg_data.append(df.loc[i:i+time_window-1, :].copy().reset_index(drop=True)) 
    return seg_data

In [None]:
data_list = []
for file in tqdm(glob.glob("/content/drive/MyDrive/Competitions/2021/Bento 2021/Data/TrainData/*/*/*.csv")):
    tempdf = pd.read_csv(file) 
    data_list.extend(segmentation(tempdf, 0.75, 3000))

100%|██████████| 151/151 [00:10<00:00, 14.33it/s]


In [None]:
def get_features(x_data):
    features = []
    cols = x_data.columns.tolist()
    #Calculate features (STD, Average, Max, Min, median, variance) for each data columns X Y Z 
    for k in cols:
        features.append(x_data[k].std(ddof=0))
        features.append(np.average(x_data[k]))
        features.append(np.max(x_data[k]))
        features.append(np.min(x_data[k]))
        features.append(np.median(x_data[k]))                                
        features.append(np.var(x_data[k]))
#         #skewness
#         features.append(stats.skew(x_data[k]))
#         #kutosis
#         features.append(stats.kurtosis(x_data[k]))

        # fd = np.abs(fft(np.array(x_data[k])))**2
        # features.append(stats.skew(fd))
        # features.append(stats.kurtosis(fd))

        # features.append(fd.std(ddof=0))
        # features.append(np.average(fd))
        # features.append(np.max(fd))
        # features.append(np.min(fd))
        # features.append(np.median(fd))                                
        # features.append(np.var(fd))


    return features

In [None]:
features_list = []
label_list = []
for j in trange(0,len(data_list)):
    #extract only xyz columns
    x_data = data_list[j].drop(columns=["subject_id","activity"])

    #Get features and label for each elements
    features_list.append(get_features(x_data))
    label_list.append(data_list[j].iloc[0, -1])

100%|██████████| 736/736 [00:46<00:00, 15.85it/s]


In [None]:
rf = RFC(n_estimators=300,n_jobs=-1)

In [None]:
cv = StratifiedKFold(n_splits=10, shuffle=False)
n_scores = cross_val_score(rf, features_list, label_list, scoring='accuracy', cv=cv, 
                            n_jobs=-1, error_score='raise', verbose=1)
print(n_scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


0.9034431691965938


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.3min finished


In [None]:
n_scores.mean(), n_scores

(0.9034431691965938,
 array([0.94594595, 0.87837838, 0.91891892, 0.85135135, 0.91891892,
        0.97297297, 0.90410959, 0.83561644, 0.8630137 , 0.94520548]))

# Model with transformations

In [None]:
transforms = list()
transforms.append(('mms', MinMaxScaler()))
transforms.append(('ss', StandardScaler()))
transforms.append(('rs', RobustScaler()))
transforms.append(('qt', QuantileTransformer(n_quantiles=100, output_distribution='normal')))
transforms.append(('kbd', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')))
transforms.append(('pca', PCA(n_components=7)))
transforms.append(('svd', TruncatedSVD(n_components=7)))

In [None]:
fu = FeatureUnion(transforms)
model = RFC(n_estimators=300,n_jobs=-1)
steps = list()
steps.append(('fu', fu))
steps.append(('m', model))
pipeline = Pipeline(steps=steps)

In [None]:
cvpipe = StratifiedKFold(n_splits=10, shuffle=False)
n_scores_pipe = cross_val_score(pipeline, features_list, label_list, scoring='accuracy', cv=cvpipe, 
                            n_jobs=-1, error_score='raise', verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   45.2s finished


In [None]:
n_scores_pipe.mean(), n_scores_pipe

(0.9088115512773047,
 array([0.90540541, 0.89189189, 0.94594595, 0.86486486, 0.95945946,
        1.        , 0.93150685, 0.84931507, 0.84931507, 0.89041096]))

# Model with Quaternion

In [None]:
q = pd.read_feather("/content/drive/MyDrive/Competitions/2021/Bento 2021/Data/quats.feather")
cols = list(q.columns )
q2 = []
for i in tqdm(cols):
    col = np.stack(q[i])
    q2.append(col[:,0])
    q2.append(col[:,1])
    q2.append(col[:,2])
    q2.append(col[:,3])

q2 = np.transpose(np.array(q2))
# q2 = np.append(q2,q2[0]).reshape(1786500,60)
q2.shape

100%|██████████| 15/15 [00:20<00:00,  1.38s/it]


(950634, 60)

In [None]:
labels = []
for root, dirs, files in os.walk("/content/drive/MyDrive/Competitions/2021/Bento 2021/Data/TrainData"):
     for file in tqdm(files):
        tempdf = pd.read_csv(os.path.join(root,file))
        labels.extend(tempdf.iloc[:,-1])

0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 5/5 [00:00<00:00, 20.79it/s]
100%|██████████| 5/5 [00:00<00:00, 19.95it/s]
100%|██████████| 5/5 [00:00<00:00, 20.84it/s]
100%|██████████| 5/5 [00:00<00:00, 20.00it/s]
100%|██████████| 5/5 [00:00<00:00, 20.13it/s]
100%|██████████| 5/5 [00:00<00:00, 19.21it/s]
100%|██████████| 5/5 [00:00<00:00, 18.20it/s]
100%|██████████| 5/5 [00:00<00:00, 16.81it/s]
100%|██████████| 5/5 [00:00<00:00, 16.59it/s]
100%|██████████| 5/5 [00:00<00:00, 17.02it/s]
0it [00:00, ?it/s]
100%|██████████| 5/5 [00:00<00:00, 15.07it/s]
100%|██████████| 5/5 [00:00<00:00, 15.53it/s]
100%|██████████| 5/5 [00:00<00:00, 17.52it/s]
100%|██████████| 5/5 [00:00<00:00, 15.32it/s]
100%|██████████| 5/5 [00:00<00:00, 17.55it/s]
100%|██████████| 5/5 [00:00<00:00, 18.70it/s]
100%|██████████| 5/5 [00:00<00:00, 16.35it/s]
100%|██████████| 5/5 [00:00<00:00, 18.05it/s]
100%|██████████| 6/6 [00:00<00:00, 19.03it/s]
0it [00:00, ?it/s]
100%|██████████| 6/6 [00:00<00:00, 13.32it/s]
0it 

In [None]:
q2 = np.append(q2,q2[0]).reshape(950635,60)
len(labels)

950635

In [None]:
def windows(arr,labels,overlap_rate,window):
    seg_data = []
    label = []
    labels = np.array(labels)
    #convert overlap rate to step for sliding window
    overlap = int((1 - overlap_rate)*window)
    # interpolate
    # df = df.interpolate().ffill().fillna(0)
    for i in range(0, arr.shape[0]-window+1, overlap):
        seg_data.append(arr[i:i+window])
        label.append(int(stats.mode(labels[i:i+window]).mode)) 
    return seg_data,label

In [None]:
segs,annots = windows(q2,labels,0.75,3000)

In [None]:
features = []
for i in segs:
    features.append(get_features(pd.DataFrame(i)))

In [None]:
rfq = RFC(n_estimators=1200,n_jobs=-1)

In [None]:
cvq = StratifiedKFold(n_splits=10, shuffle=False)
n_scores = cross_val_score(rfq, np.nan_to_num(features), annots, scoring='accuracy', cv=cvq, 
                            n_jobs=-1, error_score='raise', verbose=1)
print(n_scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


0.8259530058742657


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.9min finished


In [None]:
n_scores

array([0.77165354, 0.85826772, 0.84251969, 0.82677165, 0.88888889,
       0.83333333, 0.84126984, 0.76190476, 0.79365079, 0.84126984])

# With speed and acc

In [None]:
data = pd.read_feather("/content/drive/MyDrive/Competitions/2021/Bento 2021/Data/streams.feather")
quats = pd.read_feather("/content/drive/MyDrive/Competitions/2021/Bento 2021/Data/quats.feather")

In [None]:
cols = list(quats.columns )
q2 = []
for i in tqdm(cols):
    col = np.stack(quats[i])
    q2.append(col[:,0])
    q2.append(col[:,1])
    q2.append(col[:,2])
    q2.append(col[:,3])

q2 = np.transpose(np.array(q2))
# q2 = np.append(q2,q2[0]).reshape(1786500,60)
q2 = pd.DataFrame(np.append(q2,q2[0]).reshape(950635,60))

In [None]:
X = pd.concat([data.drop(columns = ['subject_id','activity']),q2],axis = 1)
y = data['activity']

In [None]:
seg = []
labels = []
for i in tqdm(range(0,len(X)-3001,2250)):
    seg.append(X.iloc[i:i+3000])
    labels.append(stats.mode(y[i:i+3000])[0][0])

In [None]:
rf = RFC(n_estimators=300,n_jobs=-1)
cv = StratifiedKFold(n_splits=10, shuffle=False)
n_scores = cross_val_score(rf, seg, labels, scoring='accuracy', cv=cv, 
                            n_jobs=-1, error_score='raise', verbose=1)
print(n_scores.mean())