In [None]:
import ast
import glob
import io
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import signal
from scipy.interpolate import CubicSpline
from scipy.stats import iqr
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

In [None]:
def computeEuclideanDistance(b1, b2):
    """Return the column-wise Euclidean distance between two coordinate arrays.

    Rows 0, 1 and 2 of ``b1`` and ``b2`` are read as the three coordinate
    components; one distance is produced for every column.
    """
    delta_0 = b1[0, :] - b2[0, :]
    delta_1 = b1[1, :] - b2[1, :]
    delta_2 = b1[2, :] - b2[2, :]
    squared_norm = delta_0**2 + delta_1**2 + delta_2**2
    return np.sqrt(squared_norm)

def computeEuclideanDistanceAll(p1,p2):
    """Return ``sqrt((p1 - p2)**2)`` element by element.

    No axis is reduced, so the result has the broadcast shape of
    ``p1 - p2`` and holds the magnitude of every individual difference.
    """
    squared_delta = (p1 - p2) ** 2
    return np.sqrt(squared_delta)

In [None]:
def readData(path, n_joints=17):
    """Read every ``*txt`` file of one split and return point-to-centroid distances.

    Files are looked up under ``challenge/task2/<path>/``.  Each file is parsed
    as a header-less comma-separated table and transposed.  Column 0 of the
    transposed frame is not used; columns ``1 + 3*k`` .. ``3 + 3*k`` are taken
    as the three coordinates of point ``k``.  Missing values are replaced by
    the column median.  For every row the centroid is the mean of all first /
    second / third coordinates found from column 1 onward, and the output
    holds the Euclidean distance of the first ``n_joints`` points from it.

    The file stem is split on ``_``.  With exactly four fields they are stored
    as ``person, type, label, exercise``; otherwise only the first and the
    last field are stored (``person, exercise``).

    Args:
        path: name of the sub-directory of ``challenge/task2/`` to read.
        n_joints: number of coordinate triples to convert into distances.

    Returns:
        A DataFrame with the metadata columns followed by ``n_joints``
        distance columns, one row per row of the transposed input files.
        An empty DataFrame is returned when no file matches.

    Raises:
        ValueError: if a file has fewer than ``1 + 3 * n_joints`` columns
            after transposition, or a file-name field is not an integer.
    """
    data_path = 'challenge/task2/' + path + '/'
    files = glob.glob(data_path + '*txt')
    tot_lun = len(files)
    chunks = []

    for number, file in enumerate(files, start=1):
        sys.stdout.write("\r computed {%d/ %d}" % (number, tot_lun))
        sys.stdout.flush()

        df = pd.read_csv(file, sep=',', decimal='.', header=None).T
        df = df.fillna(df.median())

        coords = df.iloc[:, 1:1 + 3 * n_joints].to_numpy(dtype=float)
        if coords.shape[1] != 3 * n_joints:
            raise ValueError(
                "%s: expected at least %d coordinate columns, found %d"
                % (file, 3 * n_joints, coords.shape[1])
            )
        coords = coords.reshape(len(df), n_joints, 3)

        # The centroid does not depend on the joint, so it is computed once
        # per file instead of once per joint.
        centroid = np.column_stack([
            df.iloc[:, 1::3].mean(axis=1),
            df.iloc[:, 2::3].mean(axis=1),
            df.iloc[:, 3::3].mean(axis=1),
        ])
        diff = np.sqrt(((coords - centroid[:, np.newaxis, :]) ** 2).sum(axis=2))

        # os.path handles the separator of the running platform; splitting on
        # a literal backslash only worked for Windows-style paths.
        stem = os.path.splitext(os.path.basename(file))[0]
        info = stem.split('_')
        if len(info) == 4:
            meta = [int(info[0]), int(info[1]), int(info[2]), int(info[-1])]
        else:
            meta = [int(info[0]), int(info[-1])]
        meta = np.tile(np.array(meta), (len(diff), 1))
        chunks.append(np.concatenate((meta, diff), axis=1))

    print()
    if not chunks:
        return pd.DataFrame([])
    # A single concatenation avoids re-copying the accumulated array per file.
    return pd.DataFrame(np.concatenate(chunks, axis=0))

In [None]:
%%time
# Parse every recording of the 'training' split into one DataFrame (see readData).
X_train = readData('training')

In [None]:
%%time
# Parse every recording of the 'validation' split into one DataFrame (see readData).
X_val = readData('validation')

In [None]:
%%time
# Parse every recording of the 'test' split into one DataFrame (see readData).
X_test = readData('test')

In [None]:
# Give the leading metadata columns readable names; every other column keeps
# its positional label.  Train/validation carry four metadata columns, the
# test table only two.
for _frame, _names in (
    (X_train, ("person", "type", "label", "exercise")),
    (X_val, ("person", "type", "label", "exercise")),
    (X_test, ("person", "exercise")),
):
    _mapping = {_frame.columns[_pos]: _label for _pos, _label in enumerate(_names)}
    _frame.rename(columns=_mapping, inplace=True)

In [None]:
# Cache the three tables as ';'-separated CSV files, header included and
# without the index column.
for _frame, _target in (
    (X_train, 'challenge/task2/train.csv'),
    (X_val, 'challenge/task2/val.csv'),
    (X_test, 'challenge/task2/test.csv'),
):
    _frame.to_csv(_target, sep=';', decimal='.', index=False, header=True)

In [None]:
# Display the training table as a quick visual sanity check.
X_train

In [None]:
from scipy.interpolate import interp1d, CubicSpline

def sampleSameLenght(X,lenght):
    """Resample every (person, exercise) group of ``X`` to the same number of rows.

    Columns named ``person``, ``type``, ``label`` or ``exercise`` are treated
    as metadata: the value of the group's first row is repeated.  Every other
    column is interpolated with a cubic spline over the group's index values
    and evaluated at ``lenght`` evenly spaced positions between the smallest
    and the largest index of the group.

    Metadata columns are recognised by name rather than by a fixed count, so
    a table with only ``person`` and ``exercise`` in front (the test split)
    no longer has its first signal column overwritten by a constant, and the
    ``exercise`` identifier is copied instead of being interpolated.

    Args:
        X: DataFrame with at least the columns ``person`` and ``exercise``.
            The index of every group must be strictly increasing, as
            required by ``CubicSpline``.
        lenght: number of rows per group in the output; ``None`` selects the
            most frequent group size.

    Returns:
        ``(resampled, lenght)`` where ``resampled`` has the columns of ``X``
        and ``lenght`` rows per group, and ``lenght`` is the value used.
    """
    meta_names = ('person', 'type', 'label', 'exercise')
    groups = X.groupby([X.person, X.exercise])
    if lenght is None:
        lenght = int(groups.size().mode().iloc[0])

    n_cols = X.shape[1]
    is_meta = [name in meta_names for name in X.columns]
    resampled = []

    for idg, (name, g) in enumerate(groups):
        sys.stdout.write("\r computed {%d/ %d}" % (idg+1, groups.ngroups) )
        sys.stdout.flush()

        block = np.zeros((lenght, n_cols))
        x = g.index.to_numpy()
        x_new = np.linspace(np.min(x), np.max(x), lenght)

        for i in range(n_cols):
            if is_meta[i]:
                # identifiers are constant within a recording: copy, never interpolate
                block[:, i] = g.iloc[0, i]
            else:
                block[:, i] = CubicSpline(x, g.iloc[:, i])(x_new)

        resampled.append(block)

    print()
    if not resampled:
        return pd.DataFrame(columns=X.columns), lenght
    return pd.DataFrame(np.vstack(resampled), columns=X.columns), lenght

In [None]:
%%time
# Resample every (person, exercise) group of the training table to 120 rows;
# `dim` receives that length and is reused for the validation and test tables.
X_train, dim = sampleSameLenght(X_train,120)

In [None]:
%%time
# Resample the validation table to the same per-group length as the training table.
X_val, _ = sampleSameLenght(X_val,dim)

In [None]:
%%time
# Resample the test table to the same per-group length as the training table.
X_test, _ = sampleSameLenght(X_test,dim)

In [None]:
def extractFeatures(features):
    """Summarise one window of signals as a flat feature vector.

    Missing values are treated as 0 (on a copy; the caller's frame is left
    untouched).  With ``c`` columns the returned vector is laid out as:

    * mean, variance, kurtosis, skewness             (``c`` values each)
    * upper-triangular pairwise correlations         (``c*(c-1)/2`` values)
    * mean absolute deviation, standard error, energy (root of the sum of
      squares), inter-quartile range, min, max        (``c`` values each)
    * slope of a degree-1 fit against the frame index (``c`` values)
    * seven frequency descriptors of the LAST column only: ``|FFT[0]|``,
      ``np.angle`` of the first sample, the largest non-DC magnitude, the
      position of that magnitude within ``FFT[1:]``, and the kurtosis,
      skewness and mean of the non-DC magnitudes.

    Only the last column contributes frequency descriptors.  That matches
    what the previous implementation emitted (it looped over all columns but
    appended the values once, after the loop) and keeps the vector length
    unchanged for callers that rely on it.
    NOTE(review): per-column frequency descriptors were probably intended;
    enabling them changes the vector length and needs a coordinated change
    in every consumer.
    NOTE(review): the second descriptor is the angle of the raw first sample
    (0 or pi for real data), not a spectral phase -- confirm the intent.

    Args:
        features: DataFrame with one row per time step and one column per signal.

    Returns:
        1-D float array with ``11*c + c*(c-1)/2 + 7`` entries.
    """
    # fillna on a copy: an in-place fill would write into the slice handed in
    # by the caller.
    features = features.fillna(0)
    n_signals = features.shape[1]
    values = features.values

    mean = features.mean().values
    var = features.var().values
    kurt = features.kurtosis().values
    skew = features.skew().values
    corr = features.corr().values[np.triu_indices(n_signals, k=1)]
    # DataFrame.mad() was removed in pandas 2.0; this is the same statistic.
    mad = (features - features.mean()).abs().mean().values
    sem = features.sem().values
    energy = np.sqrt(np.einsum('ij,ij->j', values, values))
    iqr_ = iqr(values, axis=0)
    mi = features.min().values
    ma = features.max().values
    slopes = features.apply(lambda col: np.polyfit(features.index, col, 1)[0]).values

    last_signal = features.iloc[:, -1].to_numpy()
    spectrum = np.fft.fft(last_signal)
    magnitude = np.abs(spectrum[1:])
    magnitude_stats = pd.Series(magnitude)
    frq_info = np.asarray([
        np.abs(spectrum[0]),
        np.angle(last_signal)[0],
        np.max(magnitude),
        # dominant bin is chosen on the magnitude; argmax on the complex
        # values orders them by real part and picks an unrelated bin
        np.argmax(magnitude),
        magnitude_stats.kurtosis(),
        magnitude_stats.skew(),
        magnitude_stats.mean(),
    ])

    return np.concatenate((mean, var, kurt, skew, corr, mad, sem, energy,
                           iqr_, mi, ma, slopes, frq_info))

def featureEngineering(X,test=False):
    """Turn every recording into one row of windowed summary features.

    Rows are grouped by ``(person, exercise)`` when ``test`` is true and by
    ``(person, label, exercise)`` otherwise.  The first 2 (test) or 4
    (train/validation) columns are metadata, copied from the group's first
    row; the remaining columns are the signals.

    Each group of ``n`` rows is cut into windows of ``round(n / 2)`` rows
    that start every ``round(round(n / 2) / 2)`` rows, for start positions
    below ``n - step``.  ``extractFeatures`` is applied to every window and
    the results are concatenated after the metadata.

    The width of the result is derived from the data instead of being
    hard-coded, so it follows the number of signals and windows.

    Args:
        X: resampled DataFrame; all groups are expected to have the same length.
        test: ``True`` for tables without ``type`` and ``label`` columns.

    Returns:
        2-D float array with one row per group.

    Raises:
        ValueError: if the groups do not all produce the same number of features.
    """
    n_meta = 2 if test else 4
    if test:
        groups = X.groupby([X.person, X.exercise])
    else:
        groups = X.groupby([X.person, X.label, X.exercise])
    n_groups = groups.ngroups

    rows = []
    for i, (_, g) in enumerate(groups, start=1):
        sys.stdout.write("\r computed {%d/ %d}" % (i, n_groups) )
        sys.stdout.flush()

        labels = g.iloc[0, :n_meta].values
        features = g.iloc[:, n_meta:]

        mid = round(len(features)/2)
        step = round(mid/2)
        windows = [
            extractFeatures(features.iloc[j:j+mid, :])
            for j in range(0, len(features)-step, step)
        ]
        rows.append(np.concatenate([labels] + windows, axis=0))
    print()

    if not rows:
        return np.zeros((0, n_meta))
    if len({len(row) for row in rows}) != 1:
        raise ValueError(
            "groups produced feature vectors of different lengths; "
            "resample all recordings to a common length first"
        )
    return np.vstack(rows).astype(float)

In [None]:
%%time
# Replace the resampled training table by one feature row per (person, label, exercise) group.
X_train = featureEngineering(X_train)

In [None]:
%%time
# Replace the resampled validation table by one feature row per (person, label, exercise) group.
X_val = featureEngineering(X_val)

In [None]:
%%time
# Replace the resampled test table by one feature row per (person, exercise) group.
X_test = featureEngineering(X_test, test=True)

In [None]:
# Wrap the feature matrices in DataFrames, then name their leading metadata
# columns; the feature columns keep their positional labels.
X_train, X_val, X_test = (pd.DataFrame(_matrix) for _matrix in (X_train, X_val, X_test))

for _frame, _names in (
    (X_train, ("person", "type", "label", "exercise")),
    (X_val, ("person", "type", "label", "exercise")),
    (X_test, ("person", "exercise")),
):
    _mapping = {_frame.columns[_pos]: _label for _pos, _label in enumerate(_names)}
    _frame.rename(columns=_mapping, inplace=True)

In [None]:
# Cache the three feature tables as ';'-separated CSV files, header included
# and without the index column.
for _frame, _target in (
    (X_train, 'challenge/task2/train_feat.csv'),
    (X_val, 'challenge/task2/val_feat.csv'),
    (X_test, 'challenge/task2/test_feat.csv'),
):
    _frame.to_csv(_target, sep=';', decimal='.', index=False, header=True)

In [None]:
# Reload the cached feature tables.  All three files live in
# 'challenge/task2/', the directory the feature tables are written to; the
# train and test paths previously lacked the 'challenge/' prefix.
X_train = pd.read_csv('challenge/task2/train_feat.csv', sep=';', decimal='.',header=0)
X_val = pd.read_csv('challenge/task2/val_feat.csv', sep=';', decimal='.',header=0)
X_test = pd.read_csv('challenge/task2/test_feat.csv', sep=';', decimal='.',header=0)

In [None]:
def norm(X, test=False):
    """Standardise the feature columns separately for every person.

    Rows are grouped by the ``person`` column; within each group every
    feature column is centred and scaled with a ``StandardScaler`` fitted on
    that group only.  Metadata columns are left untouched: the first 2
    columns when ``test`` is true (``person, exercise``), the first 4
    otherwise (``person, type, label, exercise``).  The train branch used to
    start scaling at column 3, which standardised the ``exercise`` identifier
    as if it were a feature.

    Args:
        X: feature table whose leading columns are the metadata listed above.
        test: ``True`` for tables without ``type`` and ``label`` columns.

    Returns:
        A new DataFrame with the columns of ``X``; rows are ordered by person.
    """
    n_meta = 2 if test else 4
    scaled_groups = []
    for _, g in X.groupby('person'):
        values = g.values.copy()  # never write into the caller's frame
        scaler = preprocessing.StandardScaler()
        values[:, n_meta:] = scaler.fit_transform(values[:, n_meta:])
        scaled_groups.append(values)

    ret = pd.DataFrame(np.vstack(scaled_groups))
    ret.columns = X.columns
    return ret

In [None]:
%%time
# Standardise the training features per person (see norm).
X_train = norm(X_train)

In [None]:
%%time
# Standardise the validation features per person (see norm).
X_val = norm(X_val)

In [None]:
%%time
# Standardise the test features per person; test=True marks the two-column metadata layout.
X_test = norm(X_test,test=True)

In [None]:

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score

from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek

In [None]:
def leave_1_out(X, random_state=None):
    """Yield leave-one-person-out folds with a class-balanced training part.

    For every distinct value of ``X['person']`` (in sorted order) the rows of
    that person form the test fold.  The remaining rows are undersampled
    without replacement so that every ``label`` keeps as many rows as the
    rarest label.

    The folds are expressed as POSITIONAL row numbers, which is what
    scikit-learn's ``cv`` iterables must contain.  Index labels only coincide
    with positions for a default ``RangeIndex``; on a concatenated frame with
    repeated labels they selected the wrong rows.

    Args:
        X: DataFrame with the columns ``person`` and ``label``.
        random_state: optional seed for the undersampling; ``None`` draws a
            fresh random subset on every call.

    Yields:
        ``(train_positions, test_positions)`` as integer NumPy arrays.
    """
    persons = X['person'].to_numpy()
    labels = X['label'].to_numpy()
    positions = np.arange(len(X))
    rng = np.random.default_rng(random_state)

    for person in np.unique(persons[~pd.isna(persons)]):
        held_out = persons == person
        test_positions = positions[held_out]
        train_positions = positions[~held_out]

        train_labels = labels[train_positions]
        classes, counts = np.unique(train_labels, return_counts=True)
        n_keep = counts.min()
        balanced = np.concatenate([
            rng.choice(train_positions[train_labels == cls], size=n_keep, replace=False)
            for cls in classes
        ])

        yield balanced, test_positions

In [None]:
%%time
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Random-forest model selection on the training table, scored on the
# validation table.
#
# Columns 0-3 are metadata (person, type, label, exercise); the model only
# sees the columns after them.  Slicing from column 3 fed the exercise
# identifier to the classifier and produced one column more than the test
# table offers after its two metadata columns.
first_feature = 4
train_features = X_train.iloc[:, first_feature:]
val_features = X_val.iloc[:, first_feature:]

n_features = train_features.shape[1]
sqrt1_3 = round(pow(n_features,1/3))   # n_features ** (1/3)
sqrt3_4 = round(pow(n_features,2/3))   # n_features ** (2/3)
# 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by
# scikit-learn >= 1.3.
params = ['sqrt',sqrt1_3,sqrt3_4]

param_grid = {
    'rf__max_features':params,
    'rf__min_samples_leaf':[1,3,5]
}


rf = RandomForestClassifier(random_state=42,n_estimators=1000)
# NOTE: the step called 'smote' is a RandomUnderSampler, not SMOTE.
pipeline = imbpipeline(steps = [['var_tresh',VarianceThreshold()],
                                ['smote', RandomUnderSampler(random_state=42)],
                                ['scaler', StandardScaler()],
                                ['rf', rf]]
                       )
# leave-one-person-out folds over the training table
cv = leave_1_out(X_train)

clf = GridSearchCV(pipeline, param_grid, cv=cv, scoring=('balanced_accuracy'), n_jobs=32)
clf.fit(train_features, X_train.label)
print(clf.score(val_features, X_val.label))

y_pred = clf.predict(val_features)
print(accuracy_score(X_val.label, y_pred), balanced_accuracy_score(X_val.label, y_pred))
print(f1_score(X_val.label, y_pred))
print(matthews_corrcoef(X_val.label, y_pred))


In [None]:
import xgboost as xgb

In [None]:
%%time
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE, ADASYN
from scipy import stats
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import SelectFromModel

# XGBoost model selection on the training table, scored on the validation
# table.
#
# Columns 0-3 are metadata (person, type, label, exercise); the model only
# sees the columns after them.  Slicing from column 3 fed the exercise
# identifier to the classifier.
first_feature = 4
train_features = X_train.iloc[:, first_feature:]
val_features = X_val.iloc[:, first_feature:]

param_grid = {
      'model__eta': [0.01,0.05,0.1,0.3],
      'model__max_depth': [3,5,7,9],
      'model__gamma': [0,0.1,0.2,0.3,0.4,0.5],
      }


model = xgb.XGBClassifier(n_estimators=1000)

# NOTE: the step called 'smote' is a RandomUnderSampler, not SMOTE.
pipeline = imbpipeline(steps = [['var_tresh',VarianceThreshold()],
                                ['smote', RandomUnderSampler(random_state=42)],
                                ['scaler', StandardScaler()],
                                ['model', model]]
                       )
# leave-one-person-out folds over the training table
cv = leave_1_out(X_train)

clf = GridSearchCV(pipeline, param_grid, cv=cv, scoring=('balanced_accuracy'), n_jobs=4,verbose=1)
clf.fit(train_features, X_train.label)

y_pred = clf.predict(val_features)
print('ACC:' , accuracy_score(X_val.label, y_pred))
print('B_ACC:', balanced_accuracy_score(X_val.label, y_pred))
print('f1:', f1_score(X_val.label, y_pred), '->',f1_score(X_val.label, y_pred,average=None))
print('MCC:', matthews_corrcoef(X_val.label, y_pred))


In [None]:
# Show the pipeline that the grid search refit with its best parameter combination.
clf.best_estimator_

In [None]:
# Merge training and validation rows for the final fit.  ignore_index gives
# the merged table a fresh 0..n-1 index; without it the row labels of both
# parts repeat, and label-based row selection picks rows from both tables.
X_learning = pd.concat([X_train,X_val],axis=0,ignore_index=True)
X_learning.shape

Random Forest predictions

In [None]:
%%time
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Final random-forest fit on training + validation rows, then prediction on
# the test table.
#
# The learning table has four metadata columns (person, type, label,
# exercise) and the test table two (person, exercise).  Both slices below
# start at the first feature column so the fitted pipeline and the test
# matrix have the same width; slicing the learning table from column 3 kept
# the exercise identifier and made predict() reject the test matrix.
learning_features = X_learning.iloc[:, 4:]
test_features = X_test.iloc[:, 2:]

n_features = learning_features.shape[1]
sqrt1_3 = round(pow(n_features,1/3))   # n_features ** (1/3)
sqrt3_4 = round(pow(n_features,2/3))   # n_features ** (2/3)
# 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by
# scikit-learn >= 1.3.
params = ['sqrt',sqrt1_3,sqrt3_4]

param_grid = {
    'rf__max_features':params,
    'rf__min_samples_leaf':[1,3,5]
}


rf = RandomForestClassifier(random_state=42,n_estimators=1000)
# NOTE: the step called 'smote' is a RandomUnderSampler, not SMOTE.
pipeline = imbpipeline(steps = [['var_tresh',VarianceThreshold()],
                                ['smote', RandomUnderSampler(random_state=42)],
                                ['scaler', StandardScaler()],
                                ['rf', rf]]
                       )
# Folds are built on a 0..n-1 index so that the yielded row labels are valid
# positions in learning_features even if X_learning carries repeated labels.
cv = leave_1_out(X_learning.reset_index(drop=True))

clf = GridSearchCV(pipeline, param_grid, cv=cv, scoring=('balanced_accuracy'), n_jobs=32)
clf.fit(learning_features, X_learning.label)
rf_pred = clf.predict(test_features)

XGBoost predictions

In [None]:
%%time
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE, ADASYN
from scipy import stats
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import SelectFromModel

# Final XGBoost fit on training + validation rows, then prediction on the
# test table.
#
# The learning table has four metadata columns (person, type, label,
# exercise) and the test table two (person, exercise).  Both slices below
# start at the first feature column so the fitted pipeline and the test
# matrix have the same width.
learning_features = X_learning.iloc[:, 4:]
test_features = X_test.iloc[:, 2:]

param_grid = {
      'model__eta': [0.01,0.05,0.1,0.3],
      'model__max_depth': [3,5,7,9],
      'model__gamma': [0,0.1,0.2,0.3,0.4,0.5],
      }


model = xgb.XGBClassifier(n_estimators=1000)

# NOTE: the step called 'smote' is a RandomUnderSampler, not SMOTE.
pipeline = imbpipeline(steps = [['var_tresh',VarianceThreshold()],
                                ['smote', RandomUnderSampler(random_state=42)],
                                ['scaler', StandardScaler()],
                                ['model', model]]
                       )
# The folds must describe the table that is fitted (X_learning); they were
# built from X_train, so validation rows never took part in the search.
# reset_index keeps the yielded row labels valid as positions.
cv = leave_1_out(X_learning.reset_index(drop=True))

clf = GridSearchCV(pipeline, param_grid, cv=cv, scoring=('balanced_accuracy'), n_jobs=4,verbose=1)
clf.fit(learning_features, X_learning.label)
xgboost_pred = clf.predict(test_features)

