In [1]:
# Install tsflex and seglearn
!pip install tsflex --no-index --find-links=file:///kaggle/input/time-series-tools
!pip install seglearn --no-index --find-links=file:///kaggle/input/time-series-tools

Looking in links: file:///kaggle/input/time-series-tools
Processing /kaggle/input/time-series-tools/tsflex-0.3.0-py3-none-any.whl
Installing collected packages: tsflex
Successfully installed tsflex-0.3.0
[0mLooking in links: file:///kaggle/input/time-series-tools
Processing /kaggle/input/time-series-tools/seglearn-1.2.5-py3-none-any.whl
Installing collected packages: seglearn
Successfully installed seglearn-1.2.5
[0m

In [2]:
import numpy as np
import pandas as pd
from sklearn import *
import glob
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from os import path
from pathlib import Path
from seglearn.feature_functions import base_features, emg_features
from tsflex.features import FeatureCollection, MultipleFeatureDescriptors
from tsflex.features.integrations import seglearn_feature_dict_wrapper
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.base import clone
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import LabelEncoder
import os
from sklearn.utils import resample
import gc
import sys

In [3]:
from numpy.random import seed
seed(1123) 
import random as pyrandom
pyrandom.seed(1123)

In [4]:
root = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/'

train = glob.glob(path.join(root, 'train/**/**'))
# train2 = glob.glob(path.join(root, 'unlabeled/**'))
test = glob.glob(path.join(root, 'test/**/**'))

subjects = pd.read_csv(path.join(root, 'subjects.csv'))
tasks = pd.read_csv(path.join(root, 'tasks.csv'))
events = pd.read_csv(path.join(root, 'events.csv'))

tdcsfog_metadata = pd.read_csv(path.join(root, 'tdcsfog_metadata.csv'))
defog_metadata = pd.read_csv(path.join(root, 'defog_metadata.csv')) 

tdcsfog_metadata['Module'] = 'tdcsfog'
defog_metadata['Module'] = 'defog'

full_metadata = pd.concat([tdcsfog_metadata, defog_metadata])

In [5]:
best_params_ = {'colsample_bytree': 0.5282057895135501,
 'learning_rate': 0.22659963168004743,
 'max_depth': 8,
 'min_child_weight': 3.1233911067827616,
 'n_estimators': 291,
 'subsample': 0.9961057796456088,
 'objective':'multiclass',
  'metric':'multi_logloss',
  'num_class' : 4

#   'device':'gpu'
    
                
 }
# from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def custom_average_precision(y_true, y_pred):
    score = average_precision_score(np.round(y_true), y_pred)
    return 'average_precision', score, True

def custom_average_precision1(y_true, y_pred):
    score = average_precision_score(np.round(y_true), y_pred)
    return score

# class LGBMMultiOutputClassifier(MultiOutputClassifier):
#     def fit(self, X, y, eval_set=None, **fit_params):
#         self.estimators_ = [clone(self.estimator) for _ in range(y.shape[1])]
        
#         for i, estimator in enumerate(self.estimators_):
#             if eval_set:
#                 fit_params['eval_set'] = [(eval_set[0], eval_set[1][:, i])]
#             estimator.fit(X, y[:, i], **fit_params)
        
#         return self
    
#     def predict_proba(self, X):
#         predictions = np.hstack([estimator.predict_proba(X)[:, 1].reshape(-1, 1) for estimator in self.estimators_])
#         return predictions

In [6]:
basic_feats = MultipleFeatureDescriptors(
    functions=seglearn_feature_dict_wrapper(base_features()),
    series_names=['AccV', 'AccML', 'AccAP'],
    windows=[1000],
    strides=[800],
)
basic_feats

emg_feats = emg_features()
del emg_feats['simple square integral'] # is same as abs_energy (which is in base_features)
emg_feats

emg_feats = MultipleFeatureDescriptors(
    functions=seglearn_feature_dict_wrapper(emg_feats),
    series_names=['AccV', 'AccML', 'AccAP'],
    windows=[1000],
    strides=[800],
)

fc = FeatureCollection([basic_feats, emg_feats])

In [7]:
trainnt_files = glob.glob(path.join(root, 'train/notype/**')) 

def reader2(file):
#     try:        
        df = pd.read_csv(file)
        path_split = file.split('/')         
        dataset = Path(file).parts[-2]  
        df['Id'] = path_split[-1].split('.')[0]
        idv=df['Id'][0]       
        df['Time_frac2']=(df.index/df.index.max()).values
#         df['Module'] = dataset      

        df['Flag']= df['Event']        
         
#         # Reindex based on time
#         df = df.set_index('Time')
        
        # Extract specific columns
        columns_to_select = ['Time_frac2','AccV', 'AccML', 'AccAP', 'Flag']
        try:
            df = df[columns_to_select]  

            df['Module'] = dataset
            df['Id'] = path_split[-1].split('.')[0]  
            df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin").astype(np.float32)
            df = df.merge(df_feats, how="left", left_index=True, right_index=True)   

            df.fillna(method="ffill", inplace=True)

            return df
        except: 
            print("error at df, ignored",idv)
            pass
#         print("file name exception ",file)
#           pass

trainnt = pd.concat([reader2(f) for f in tqdm(trainnt_files)]).fillna(0)

# cols = [c for c in train.columns if c not in ['Id','Subject','Module', 'Time', 'StartHesitation', 'Turn' , 'Walking', 'Valid', 'Task','Event']]
# pcols = ['StartHesitation', 'Turn' , 'Walking']
# scols = ['Id', 'StartHesitation', 'Turn' , 'Walking']
trainnt=trainnt.reset_index(drop=True)

trainnt["total_accl"]= np.sqrt(np.square(trainnt.AccV) + np.square(trainnt.AccML) + np.square(trainnt.AccAP))
trainnt["abs_delta1"] = (trainnt["total_accl"] - trainnt["total_accl"].shift(100)) * 1000
trainnt["abs_delta2"] = (trainnt["total_accl"] - trainnt["total_accl"].shift(1000)) * 1000
trainnt["abs_delta1"] = trainnt["abs_delta1"].fillna(0)
trainnt["abs_delta2"] = trainnt["abs_delta2"].fillna(0)

# print("columns in train data ",len(train.columns))
print("columns in trainnt data ",len(trainnt.columns))
# trainnt.columns

  0%|          | 0/46 [00:00<?, ?it/s]

columns in trainnt data  67


In [8]:
trainnt

Unnamed: 0,Time_frac2,AccV,AccML,AccAP,Flag,Module,Id,AccAP__abs_energy__w=1000,AccAP__abs_sum__w=1000,AccAP__emg_var__w=1000,...,AccV__skew__w=1000,AccV__slope_sign_changes__w=1000,AccV__std__w=1000,AccV__var__w=1000,AccV__waveform_length__w=1000,AccV__willison_amplitude__w=1000,AccV__zero_crossing__w=1000,total_accl,abs_delta1,abs_delta2
0,0.000000,-0.991926,-0.119916,0.050087,0,notype,1e8d55d48d,150.447662,356.825500,0.150598,...,0.509972,352.0,0.067781,0.004594,2.985554,999.0,999.0,1.000403,0.000000,0.000000
1,0.000005,-0.994243,-0.118624,0.049909,0,notype,1e8d55d48d,150.447662,356.825500,0.150598,...,0.509972,352.0,0.067781,0.004594,2.985554,999.0,999.0,1.002537,0.000000,0.000000
2,0.000010,-0.995840,-0.118602,0.048774,0,notype,1e8d55d48d,150.447662,356.825500,0.150598,...,0.509972,352.0,0.067781,0.004594,2.985554,999.0,999.0,1.004063,0.000000,0.000000
3,0.000015,-0.995865,-0.121627,0.048090,0,notype,1e8d55d48d,150.447662,356.825500,0.150598,...,0.509972,352.0,0.067781,0.004594,2.985554,999.0,999.0,1.004417,0.000000,0.000000
4,0.000020,-0.992330,-0.122146,0.048878,0,notype,1e8d55d48d,150.447662,356.825500,0.150598,...,0.509972,352.0,0.067781,0.004594,2.985554,999.0,999.0,1.001013,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10251109,0.999991,-0.850114,-0.156777,-0.499272,0,notype,434732a5e3,59.884552,132.604736,0.202998,...,-0.617458,62.0,0.042921,0.001842,2.219893,295.0,295.0,0.998271,-48.695882,-7.514825
10251110,0.999993,-0.847296,-0.161428,-0.496935,0,notype,434732a5e3,59.884552,132.604736,0.202998,...,-0.617458,62.0,0.042921,0.001842,2.219893,295.0,295.0,0.995447,-60.865013,-19.761007
10251111,0.999995,-0.846237,-0.164099,-0.495441,0,notype,434732a5e3,59.884552,132.604736,0.202998,...,-0.617458,62.0,0.042921,0.001842,2.219893,295.0,295.0,0.994237,-61.825752,-32.318259
10251112,0.999998,-0.843425,-0.164062,-0.495419,0,notype,434732a5e3,59.884552,132.604736,0.202998,...,-0.617458,62.0,0.042921,0.001842,2.219893,295.0,295.0,0.991827,-61.299421,-38.923812


In [9]:
def endtransform(df):
    
    df=pd.DataFrame(df)
    ones_indices = np.where(df == 1)[0]  # Get indices of ones
     
    indx={}
    k=0
    diff=0
    nex=0
    ind={}
    j=0
    for i in range(len(ones_indices)):
        indx['start']=ones_indices[k]         
        prev = ones_indices[i]        
        nex = ones_indices[i+1] if i < len(ones_indices) - 1 else len(df)         
        diff = nex -prev        

        if diff> 1:  
            indx['end']=prev
            ind[j]=(indx['start'],indx['end'])
            j+=1             
            k= i+1 

    for key in ind.keys():
        k=0.8
        p=0.1
        start=ind[key][0]
        end = ind[key][1] 
        dur=end-start
        df[start:start+round(dur*p)]=df[start:start+round(dur*p)]*k
        df[end+1 - round(dur*p):end+1]=df[end+1 - round(dur*p):end+1]*k
    return df


In [10]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split

# model1 = lgb.LGBMClassifier() 

best_params_ = {'colsample_bytree': 0.5282057895135501,
 'learning_rate': 0.22659963168004743,
 'max_depth': 8,
 'min_child_weight': 3.1233911067827616,
 'n_estimators': 291,
 'subsample': 0.9961057796456088,
 

#   'device':'gpu'
    
                
 }
model2 = lgb.LGBMRegressor(**best_params_)  

# train['Flag'] = endtransform(train['Flag'])
# trainnt['Flag']= endtransform(trainnt['Flag'])

# train3=train.drop(['Time_frac','t_group','Visit','Test','Medication','Id','Subject','Module', 
#                    's_group','label'],axis=1).sample(n=1000000, random_state=1123)
trainnt=trainnt.drop(['Module','Id'],axis=1).sample(n=200000,random_state=1123,weights='Flag')
# train4 = pd.concat([train3,trainnt])
# trainnt=trainnt.sample(n=200000,random_state=1123)

y = trainnt['Flag']
X = trainnt.drop(['Flag'],axis=1)

print(y)

ntcols = X.columns.values
print("nt cols ",ntcols)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1123)

# # Perform random undersampling on the training data
# rus = RandomUnderSampler(random_state=1123)
# X_train_res, y_train_res = rus.fit_resample(X_train, y_train) 

y_train  = endtransform(y_train) 

# Train the model on the undersampled training data
# model1.fit(X_train_res, y_train_res)
model2.fit(X_train, y_train)

print("shape of X_train ",X_train.shape)

# Predict on the test data
# y_pred = model1.predict(X_test)
y_pred2 = model2.predict(X_test) 

mse = mean_squared_error(y_test,y_pred2)

print("mse for prediction ",mse)



4936388     1
10085909    1
6158704     1
1172234     1
8870260     1
           ..
4965077     1
8913374     1
6461023     1
6435056     1
8020036     1
Name: Flag, Length: 200000, dtype: int64
nt cols  ['Time_frac2' 'AccV' 'AccML' 'AccAP' 'AccAP__abs_energy__w=1000'
 'AccAP__abs_sum__w=1000' 'AccAP__emg_var__w=1000' 'AccAP__kurt__w=1000'
 'AccAP__maximum__w=1000' 'AccAP__mean__w=1000' 'AccAP__mean_abs__w=1000'
 'AccAP__mean_crossings__w=1000' 'AccAP__median__w=1000'
 'AccAP__minimum__w=1000' 'AccAP__mse__w=1000'
 'AccAP__root_mean_square__w=1000' 'AccAP__skew__w=1000'
 'AccAP__slope_sign_changes__w=1000' 'AccAP__std__w=1000'
 'AccAP__var__w=1000' 'AccAP__waveform_length__w=1000'
 'AccAP__willison_amplitude__w=1000' 'AccAP__zero_crossing__w=1000'
 'AccML__abs_energy__w=1000' 'AccML__abs_sum__w=1000'
 'AccML__emg_var__w=1000' 'AccML__kurt__w=1000' 'AccML__maximum__w=1000'
 'AccML__mean__w=1000' 'AccML__mean_abs__w=1000'
 'AccML__mean_crossings__w=1000' 'AccML__median__w=1000'
 'AccML__

In [11]:
import sys
sys.getsizeof(trainnt)

60000016

In [12]:
del trainnt
gc.collect()

20

In [13]:
subjects.loc[subjects['Subject'] == 'fe5d84', 'Sex'] = 'F' 

seed = 1123
cluster_size = 10

subjects['Sex'] = subjects['Sex'].factorize()[0]
subjects = subjects.fillna(0).groupby('Subject').median()
subjects['s_group'] = cluster.KMeans(n_clusters = cluster_size, random_state = seed).fit_predict(subjects[subjects.columns[1:]])
new_names = {'Visit':'s_visit','Age':'s_age','YearsSinceDx':'s_years','UPDRSIII_On':'s_on','UPDRSIII_Off':'s_off','NFOGQ':'s_NFOGQ', 'Sex': 's_sex'}
subjects = subjects.rename(columns = new_names)
subjects



Unnamed: 0_level_0,s_visit,s_age,s_sex,s_years,s_on,s_off,s_NFOGQ,s_group
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00f674,1.5,63.0,0.0,27.0,37.0,39.5,25.0,1
02bc69,0.0,69.0,0.0,4.0,21.0,0.0,22.0,0
040587,1.5,75.0,0.0,26.0,49.5,72.0,22.5,7
056372,2.0,69.0,0.0,13.0,44.0,50.0,22.0,4
07285e,0.0,58.0,0.0,1.0,18.0,26.0,10.0,6
...,...,...,...,...,...,...,...,...
f686f0,0.0,61.0,0.0,7.0,44.0,0.0,24.0,8
f80507,1.0,57.0,0.0,2.0,12.0,0.0,0.0,5
fa8764,0.0,60.0,1.0,7.0,30.0,0.0,19.0,0
fba3a3,1.0,65.0,1.0,8.0,28.0,0.0,0.0,5


In [14]:
tasks['Duration'] = tasks['End'] - tasks['Begin']
tasks = pd.pivot_table(tasks, values=['Duration'], index=['Id'], columns=['Task'], aggfunc='sum', fill_value=0)
tasks.columns = [c[1] for c in tasks.columns]
tasks = tasks.reset_index()
tasks['t_group'] = cluster.KMeans(n_clusters = cluster_size, random_state = seed).fit_predict(tasks[tasks.columns[1:]])



In [15]:
# merge the subjects with the metadata
metadata_w_subjects = full_metadata.merge(subjects, how='left', on='Subject').copy()
features = metadata_w_subjects.columns
metadata_w_subjects['Medication'] = metadata_w_subjects['Medication'].factorize()[0]
metadata_w_subjects

Unnamed: 0,Id,Subject,Visit,Test,Medication,Module,s_visit,s_age,s_sex,s_years,s_on,s_off,s_NFOGQ,s_group
0,003f117e14,4dc2f8,3,2.0,0,tdcsfog,0.0,68.0,1.0,9.0,17.0,15.0,15.0,6
1,009ee11563,f62eec,4,2.0,0,tdcsfog,0.0,71.0,0.0,10.0,42.0,0.0,24.0,8
2,011322847a,231c3b,2,2.0,0,tdcsfog,0.0,67.0,0.0,12.0,27.0,28.0,19.0,2
3,01d0fe7266,231c3b,2,1.0,1,tdcsfog,0.0,67.0,0.0,12.0,27.0,28.0,19.0,2
4,024418ba39,fa8764,19,3.0,0,tdcsfog,0.0,60.0,1.0,7.0,30.0,0.0,19.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,f3a921edee,1a778d,1,,1,defog,1.5,65.0,0.0,7.0,50.0,59.5,24.5,7
966,f40e8c6ebe,575c60,1,,1,defog,1.0,28.0,0.0,4.0,54.0,50.0,25.0,1
967,f8ddbdd98d,107712,1,,0,defog,1.0,82.0,1.0,11.0,38.0,42.0,21.0,4
968,f9efef91fb,5d9cae,2,,1,defog,1.5,72.0,0.5,14.0,22.5,39.0,16.0,2


In [16]:
# basic_feats = MultipleFeatureDescriptors(
#     functions=seglearn_feature_dict_wrapper(base_features()),
#     series_names=['AccV', 'AccML', 'AccAP'],
#     windows=[2000],
#     strides=[1000],
# )
# basic_feats

In [17]:
# emg_feats = emg_features()
# del emg_feats['simple square integral'] # is same as abs_energy (which is in base_features)
# emg_feats

In [18]:
# emg_feats = MultipleFeatureDescriptors(
#     functions=seglearn_feature_dict_wrapper(emg_feats),
#     series_names=['AccV', 'AccML', 'AccAP'],
#     windows=[5000],
#     strides=[5000],
# )

In [19]:
# fc = FeatureCollection([basic_feats, emg_feats])

In [20]:
# def reader(file):
# #     try:        
#         df = pd.read_csv(file)
#         path_split = file.split('/')         
#         dataset = Path(file).parts[-2]  
#         df['Id'] = path_split[-1].split('.')[0]
#         idv=df['Id'][0]       
#         df['Time_frac2']=(df.index/df.index.max()).values
# #         df['Module'] = dataset        
        
#         if dataset!='notype':
#             df['Flag']= np.any(df[['StartHesitation','Turn','Walking']] == 1, axis=1).astype(int)
        
# #         print("in df ",idv)
        
#         if events[events.Id==idv].empty==False:
            
# #             ms=(events.Id==idv) & (events.Type=='StartHesitation')
# #             mt=(events.Id==idv) & (events.Type=='Turn')
# #             mw=(events.Id==idv) & (events.Type=='Walking')
            
#             if dataset=='defog':
#                 start =  round(events[events.Id==idv].Init.min() * 100)
#                 end = round(events[events.Id==idv].Completion.max() * 100)
                
#             else:
#                 start =  round(events[events.Id==idv].Init.min() * 128)
#                 end = round(events[events.Id==idv].Completion.max() * 128)
                
#             dur = end - start
# #             print("dur is ",dur)
#             start = round(max(start - 2 * dur,0))
#             end   = round(end + 2 * dur)               
#             if end >= df.Time.max(): end=df.Time.max()                
#             indx = list(np.arange(start,end)) 
#             df = df.loc[indx] 
            
#             ev = events[events.Id==idv] 
            
# #             print("in df ",idv)
            
#             if dataset!='notype':
#                 for i,row in ev.iterrows():

#                     if dataset=="defog":
#                         start_r = round(row.Init * 100)
#                         end_r = round(row.Completion * 100)

#                     else:
#                         start_r = round(row.Init * 128)
#                         end_r = round(row.Init * 128) 

#                     durr = end_r - start_r

#                     if start_r!=0:               
#     #                     print("dafdsf \n",df.loc[start_r:(start_r+round(0.1*durr)),:])
#                         df.loc[start_r:(start_r+round(0.2*durr)),'StartHesitation'] = df.loc[start_r:(start_r+round(0.2*durr)),'StartHesitation'] * 0.8
#                         df.loc[(end_r+1 - round(0.2*durr)):end_r+1,'StartHesitation'] = df.loc[(end_r+1 - round(0.2*durr)):end_r+1,'StartHesitation'] * 0.8

#                         df.loc[start_r:(start_r+round(0.2*durr)),'Turn'] = df.loc[start_r:(start_r+round(0.2*durr)),'Turn'] * 0.8
#                         df.loc[(end_r+1 - round(0.2*durr)):end_r+1,'Turn'] = df.loc[(end_r+1 - round(0.2*durr)):end_r+1,'Turn'] * 0.8

#                         df.loc[start_r:(start_r+round(0.2*durr)),'Walking'] = df.loc[start_r:(start_r+round(0.2*durr)),'Walking'] * 0.8
#                         df.loc[(end_r+1 - round(0.2*durr)):end_r+1,'Walking'] = df.loc[(end_r+1 - round(0.2*durr)):end_r+1,'Walking'] * 0.8

#         # Reindex based on time
#         df = df.set_index('Time')
        
#         # Extract specific columns
#         columns_to_select = ['Time_frac2','AccV', 'AccML', 'AccAP', 'StartHesitation', 'Turn', 'Walking','Flag']
#         try:
#             df = df[columns_to_select]  

#             df['Module'] = dataset
#             df['Id'] = path_split[-1].split('.')[0]

# #             # this is done because the speeds are at different rates for the datasets
# #             if dataset == 'tdcsfog':
# #                 df.AccV = df.AccV / 9.80665
# #                 df.AccML = df.AccML / 9.80665
# #                 df.AccAP = df.AccAP / 9.80665

#             df['Time_frac']=(df.index/df.index.max()).values

#             df = pd.merge(df, tasks[['Id','t_group']], how='left', on='Id').fillna(-1)

#             df = pd.merge(df, metadata_w_subjects[['Id','Subject', 'Visit','Test','Medication','s_group']], how='left', on='Id').fillna(-1)

#             df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin").astype(np.float32)
#             df = df.merge(df_feats, how="left", left_index=True, right_index=True)

    

#             df.fillna(method="ffill", inplace=True)

#     #             print(" df completed ",idv)

#             return df
#         except: 
#             print("error at df, ignored",idv)
#             pass
# #         print("file name exception ",file)
# #           pass

# train = pd.concat([reader(f) for f in tqdm(train)]).fillna(0); print(train.shape)
# cols = [c for c in train.columns if c not in ['Id','Subject','Module', 'Time', 'StartHesitation', 'Turn' , 'Walking', 'Valid', 'Task','Event']]
# pcols = ['StartHesitation', 'Turn' , 'Walking']
# scols = ['Id', 'StartHesitation', 'Turn' , 'Walking']
# train=train.reset_index(drop=True)

In [21]:
def reader(file):
#     try:        
        df = pd.read_csv(file)
        path_split = file.split('/')         
        dataset = Path(file).parts[-2]  
        df['Id'] = path_split[-1].split('.')[0]
        idv=df['Id'][0]       
        df['Time_frac2']=(df.index/df.index.max()).values
        df['annot']=1
        if dataset!='notype':
            df['Flag']= np.any(df[['StartHesitation','Turn','Walking']] == 1, axis=1).astype(int)
        
#         print("in df ",idv)
        
        if events[events.Id==idv].empty==False:
            
#             ms=(events.Id==idv) & (events.Type=='StartHesitation')
#             mt=(events.Id==idv) & (events.Type=='Turn')
#             mw=(events.Id==idv) & (events.Type=='Walking')
            
            if dataset=='defog':
                df['annot']=0
                df.loc[df[(df.Valid==True) & (df.Task==True)].index.values,'annot']=1
                start =  round(events[events.Id==idv].Init.min() * 100)
                end = round(events[events.Id==idv].Completion.max() * 100)
                
            else:
                start =  round(events[events.Id==idv].Init.min() * 128)
                end = round(events[events.Id==idv].Completion.max() * 128)
                
            dur = end - start
#             print("dur is ",dur)
            start = round(max(start - 2 * dur,0))
            end   = round(end + 2 * dur)               
            if end >= df.Time.max(): end=df.Time.max()                
            indx = list(np.arange(start,end)) 
            df = df.loc[indx] 
        try:
            df['None'] = (df['StartHesitation'] == 0) & (df['Turn'] == 0) & (df['Walking'] == 0)
    # Convert the boolean values to integers (0 and 1)
            df['None'] = df['None'].astype(int)    

            for index, row in df.iterrows():
                for col in ['None','StartHesitation','Turn','Walking']:
                    if row[col] == 1:
                        df.at[index, 'label'] = col
                        break

            df['label'] = df['label'].astype(str)

        # # Perform label encoding on the 'Label' column

    #             except:
    #                 pass

    # # # Print the resulting DataFrame


    #             ev = events[events.Id==idv] 

    #             print("in df ",idv)

    #             if dataset!='notype':
    #                 for i,row in ev.iterrows():

    #                     if dataset=="defog":
    #                         start_r = round(row.Init * 100)
    #                         end_r = round(row.Completion * 100)

    #                     else:
    #                         start_r = round(row.Init * 128)
    #                         end_r = round(row.Init * 128) 

    #                     durr = end_r - start_r

    #                     if start_r!=0:               
    #     #                     print("dafdsf \n",df.loc[start_r:(start_r+round(0.1*durr)),:])
    #                         df.loc[start_r:(start_r+round(0.2*durr)),'StartHesitation'] = df.loc[start_r:(start_r+round(0.2*durr)),'StartHesitation'] * 0.8
    #                         df.loc[(end_r - round(0.2*durr)):end_r,'StartHesitation'] = df.loc[(end_r - round(0.2*durr)):end_r,'StartHesitation'] * 0.8

    #                         df.loc[start_r:(start_r+round(0.2*durr)),'Turn'] = df.loc[start_r:(start_r+round(0.2*durr)),'Turn'] * 0.8
    #                         df.loc[(end_r - round(0.2*durr)):end_r,'Turn'] = df.loc[(end_r - round(0.2*durr)):end_r,'Turn'] * 0.8

    #                         df.loc[start_r:(start_r+round(0.2*durr)),'Walking'] = df.loc[start_r:(start_r+round(0.2*durr)),'Walking'] * 0.8
    #                         df.loc[(end_r - round(0.2*durr)):end_r,'Walking'] = df.loc[(end_r - round(0.2*durr)):end_r,'Walking'] * 0.8

            # Reindex based on time
            df = df.set_index('Time')

            # Extract specific columns
            columns_to_select = ['Time_frac2','annot','AccV', 'AccML', 'AccAP', 'label','Flag'] #'annot',
    #         try:
            df = df[columns_to_select]  

            df['Module'] = dataset
            df['Id'] = path_split[-1].split('.')[0]

    #             # this is done because the speeds are at different rates for the datasets
    #             if dataset == 'tdcsfog':
    #                 df.AccV = df.AccV / 9.80665
    #                 df.AccML = df.AccML / 9.80665
    #                 df.AccAP = df.AccAP / 9.80665

            df['Time_frac']=(df.index/df.index.max()).values



            df = pd.merge(df, tasks[['Id','t_group']], how='left', on='Id').fillna(-1)


            df = pd.merge(df, metadata_w_subjects[['Id','Subject', 'Visit','Test','Medication','s_group']], how='left', on='Id').fillna(-1)

            df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin").astype(np.float32)
            df = df.merge(df_feats, how="left", left_index=True, right_index=True)

        #         # stride
        #         df["Stride"] = df["AccV"] + df["AccML"] + df["AccAP"]

        #         # step
        #         df["Step"] = np.sqrt(abs(df["Stride"]))

            df.fillna(method="ffill", inplace=True)

        #             print(" df completed ",idv)

            return df
        except: 
            print("error at df, ignored",idv)
            pass
#         print("file name exception ",file)
#           pass

train = pd.concat([reader(f) for f in tqdm(train)]).fillna(0); print(train.shape)
cols = [c for c in train.columns if c not in ['Id','Subject','Module', 'Time', 'label', 'Valid', 'Task','Event']]
pcols = ['label'] #['StartHesitation', 'Turn' , 'Walking']
scols = ['Id', 'StartHesitation', 'Turn' , 'Walking']
train=train.reset_index(drop=True)

  0%|          | 0/970 [00:00<?, ?it/s]

  return stats.kurtosis(X, axis=1)
  return stats.skew(X, axis=1)
  return stats.skew(X, axis=1)
  return stats.kurtosis(X, axis=1)
  return stats.skew(X, axis=1)
  return stats.kurtosis(X, axis=1)


error at df, ignored 1e8d55d48d
error at df, ignored 89e9ed32d1
error at df, ignored e5a0e226fe
error at df, ignored 1b3bc93401
error at df, ignored 34b979fc28
error at df, ignored 9cd837fd53
error at df, ignored 60f28aa837
error at df, ignored 02ab235146
error at df, ignored 6a886a3bb8
error at df, ignored 339c0cc15f
error at df, ignored af02b83cbf
error at df, ignored 71dd8ce20d
error at df, ignored b1f5aa1b77
error at df, ignored 296c84448e
error at df, ignored 46cdfe23ea
error at df, ignored 2cc3c30645
error at df, ignored 8228c6fdee
error at df, ignored 847ebc0a8d
error at df, ignored 2acdf5a450
error at df, ignored dd589529b5
error at df, ignored ad8e83242a
error at df, ignored d2ecd51a66
error at df, ignored 9506859311
error at df, ignored 0a900ed8a2
error at df, ignored affdf8553f
error at df, ignored 72853af746
error at df, ignored cb82000108
error at df, ignored 6214414fff
error at df, ignored 2054f1d5df
error at df, ignored 6614bf9767
error at df, ignored 6a18e3751f
error at

In [22]:
train["total_accl"]= np.sqrt(np.square(train.AccV) + np.square(train.AccML) + np.square(train.AccAP))
train["abs_delta1"] = (train["total_accl"] - train["total_accl"].shift(100)) * 1000
train["abs_delta2"] = (train["total_accl"] - train["total_accl"].shift(1000)) * 1000
train["abs_delta1"] = train["abs_delta1"].fillna(0)
train["abs_delta2"] = train["abs_delta2"].fillna(0)

In [23]:
# train['Flag']= np.any(train[['StartHesitation','Turn','Walking']] == 1, axis=1).astype(int)
cols = cols + ['total_accl','abs_delta1','abs_delta2'] 
cols

['Time_frac2',
 'annot',
 'AccV',
 'AccML',
 'AccAP',
 'Flag',
 'Time_frac',
 't_group',
 'Visit',
 'Test',
 'Medication',
 's_group',
 'AccAP__abs_energy__w=1000',
 'AccAP__abs_sum__w=1000',
 'AccAP__emg_var__w=1000',
 'AccAP__kurt__w=1000',
 'AccAP__maximum__w=1000',
 'AccAP__mean__w=1000',
 'AccAP__mean_abs__w=1000',
 'AccAP__mean_crossings__w=1000',
 'AccAP__median__w=1000',
 'AccAP__minimum__w=1000',
 'AccAP__mse__w=1000',
 'AccAP__root_mean_square__w=1000',
 'AccAP__skew__w=1000',
 'AccAP__slope_sign_changes__w=1000',
 'AccAP__std__w=1000',
 'AccAP__var__w=1000',
 'AccAP__waveform_length__w=1000',
 'AccAP__willison_amplitude__w=1000',
 'AccAP__zero_crossing__w=1000',
 'AccML__abs_energy__w=1000',
 'AccML__abs_sum__w=1000',
 'AccML__emg_var__w=1000',
 'AccML__kurt__w=1000',
 'AccML__maximum__w=1000',
 'AccML__mean__w=1000',
 'AccML__mean_abs__w=1000',
 'AccML__mean_crossings__w=1000',
 'AccML__median__w=1000',
 'AccML__minimum__w=1000',
 'AccML__mse__w=1000',
 'AccML__root_mean_sq

In [24]:
numerical_columns = train.select_dtypes(include=[np.number]).columns

# Convert integer columns to int8 data type
integer_columns = train[numerical_columns].select_dtypes(include=['int']).columns
train[integer_columns] = train[integer_columns].astype('int8')

# Convert float columns to float32 data type
float_columns = train[numerical_columns].select_dtypes(include=['float']).columns
train[float_columns] = train[float_columns].astype('float32')

sys.getsizeof(train)

9141491486

In [25]:
# import os

# directory = '/kaggle/working/dat'

# # Create directory if it does not exist
# if not os.path.exists(directory):
#     os.makedirs(directory)
    
# os.chdir(r'/kaggle/working/dat')

# train.to_parquet('train_dat.parquet')

# os.chdir(r'/kaggle/working/')

In [26]:
# os.chdir(r'/kaggle/working/')

In [27]:
# del train
# import gc
# gc.collect()

In [28]:
print("cols \n",cols)
print("ntcols \n",ntcols)
label_encoder = LabelEncoder()
train['label'] = label_encoder.fit_transform(train['label'])
classes = label_encoder.classes_
encoded_numbers = label_encoder.transform(classes)

# Print the classes and encoded numbers
for cls, num in zip(classes, encoded_numbers):
    print(f"Class: {cls}, Encoded Number: {num}")

cols 
 ['Time_frac2', 'annot', 'AccV', 'AccML', 'AccAP', 'Flag', 'Time_frac', 't_group', 'Visit', 'Test', 'Medication', 's_group', 'AccAP__abs_energy__w=1000', 'AccAP__abs_sum__w=1000', 'AccAP__emg_var__w=1000', 'AccAP__kurt__w=1000', 'AccAP__maximum__w=1000', 'AccAP__mean__w=1000', 'AccAP__mean_abs__w=1000', 'AccAP__mean_crossings__w=1000', 'AccAP__median__w=1000', 'AccAP__minimum__w=1000', 'AccAP__mse__w=1000', 'AccAP__root_mean_square__w=1000', 'AccAP__skew__w=1000', 'AccAP__slope_sign_changes__w=1000', 'AccAP__std__w=1000', 'AccAP__var__w=1000', 'AccAP__waveform_length__w=1000', 'AccAP__willison_amplitude__w=1000', 'AccAP__zero_crossing__w=1000', 'AccML__abs_energy__w=1000', 'AccML__abs_sum__w=1000', 'AccML__emg_var__w=1000', 'AccML__kurt__w=1000', 'AccML__maximum__w=1000', 'AccML__mean__w=1000', 'AccML__mean_abs__w=1000', 'AccML__mean_crossings__w=1000', 'AccML__median__w=1000', 'AccML__minimum__w=1000', 'AccML__mse__w=1000', 'AccML__root_mean_square__w=1000', 'AccML__skew__w=1000

In [29]:
# from sklearn.utils import resample

kfold = GroupKFold(5)
groups=kfold.split(train, groups=train.Subject)

regs = []
cvs1 = []
cvs2 = []
# pcols =  ['StartHesitation', 'Turn' , 'Walking']
pcols = ['label']
for _, (tr_idx, te_idx) in enumerate(tqdm(groups, total=5, desc="Folds")): 
    
    tr_idx = pd.Series(tr_idx).sample(n=2000000,random_state=1123).values 
    
    classifier = lgb.LGBMClassifier(**best_params_)

    x_train = train.loc[tr_idx, cols]
    x_train1=x_train[ntcols]   
    x_train['Flag'] = pd.Series(model2.predict(x_train1))
    x_train = x_train.to_numpy()
    
#     x_train['Flag'] = endtransform(x_train['Flag'])
    y_train = train.loc[tr_idx, pcols].to_numpy()     
     
    x_test = train.loc[te_idx, cols]
    x_test1 = x_test[ntcols]
    x_test['Flag']=pd.Series(model2.predict(x_test1))
    x_test = x_test.to_numpy()
#     x_test['Flag'] = endtransform(x_test['Flag'])
    y_test = train.loc[te_idx, pcols].to_numpy() 

    classifier.fit(
        x_train, y_train,
        eval_set=(x_test, y_test),
#         eval_metric=custom_average_precision,
        early_stopping_rounds=25,
        verbose = 0,
    )
    
    regs.append(classifier) 
    
#     # Make predictions on the test set
#     y_pred = model.predict(x_test)

#     # Convert the predicted probabilities to class labels
#     y_pred_labels = y_pred.argmax(axis=1)

#     # Calculate evaluation metrics
#     # Print the confusion matrix
#     cm = confusion_matrix(y_test, y_pred_labels)
#     print("Confusion Matrix:")
#     print(cm)

#     from sklearn.metrics import classification_report
#     classification_report=classification_report(y_test,y_pred_labels)
#     print(classification_report)


#     from sklearn.metrics import balanced_accuracy_score 
#     balanced_acc2 = balanced_accuracy_score(y_test, y_pred_labels)
    
# #     print("balanced_accuracy ",balanced_acc2)
# #     accuracy = accuracy_score(y_test, y_pred_labels)
#     precision = precision_score(y_test, y_pred_labels, average='macro')
# #     recall = recall_score(y_test, y_pred_labels, average='macro')
#     f1 = f1_score(y_test, y_pred_labels, average='macro')
# #     balanced_accuracy_score = balanced_accuracy_score(y_test,y_pred_labels)

# #     print("Accuracy:", accuracy)
# #     print("Precision:", precision)
# #     print("Recall:", recall)
# #     print("F1-score:", f1)
# #     print("Balanced accuracy ",balaced_accuracy_score)
# #     print("classification report \n",classification_report)
    
#     cv1 = f1_score(y_test, y_pred_labels, average='macro')
#     cv2 = precision_score(y_test, y_pred_labels, average='macro')
    
# #     cv = custom_average_precision1(y_test, classifier.predict(x_test)) #.clip(0.0,1.0))
    
#     cvs1.append(cv1)
#     cvs2.append(cv2)
    
# print(cvs1)
# print(cvs2)
# print(np.mean(cvs1))
# print(np.mean(cvs2))

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [30]:
# train=None
# trainnt=None
# gc.collect()

In [31]:
sub = pd.read_csv(path.join(root, 'sample_submission.csv'))
submission = []

pcols =  ['StartHesitation', 'Turn' , 'Walking']

for f in test:
    df = pd.read_csv(f)
    df['Time_frac2']=(df.index/df.index.max()).values
    df.set_index('Time', drop=True, inplace=True)

    df['Id'] = f.split('/')[-1].split('.')[0]
    df['annot']=1

    dataset = Path(f).parts[-2]
    
     
            
    df['Time_frac']=(df.index/df.index.max()).values
    df = pd.merge(df, tasks[['Id','t_group']], how='left', on='Id').fillna(-1)

    df = pd.merge(df, metadata_w_subjects[['Id','Subject', 'Visit','Test','Medication','s_group']], how='left', on='Id').fillna(-1)
    df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin")
    df = df.merge(df_feats, how="left", left_index=True, right_index=True)
    
    
    df["total_accl"]= np.sqrt(np.square(df.AccV) + np.square(df.AccML) + np.square(df.AccAP))
    df["abs_delta1"] = (df["total_accl"] - df["total_accl"].shift(100)) * 1000
    df["abs_delta2"] = (df["total_accl"] - df["total_accl"].shift(1000)) * 1000
    df["abs_delta1"].fillna(0)
    df["abs_delta2"].fillna(0)
    df.fillna(method="ffill", inplace=True)
#     df_classify=df[ntcols]
    yp=model2.predict(df[ntcols]) 
    
    
    df['Flag']=pd.Series(yp)
#     df['Flag'] = endtransform(df['Flag'])
    df.fillna(0)     
        
    res_vals = []
    
    for i_fold in range(5):
        
        pred = regs[i_fold].predict_proba(df[cols])[:,1:] 
        res_vals.append(np.expand_dims(np.round(pred, 3), axis = 2))
        
    res_vals = np.mean(np.concatenate(res_vals, axis = 2), axis = 2)
    res = pd.DataFrame(res_vals, columns=pcols)
    
    df = pd.concat([df,res], axis=1)
    df['Id'] = df['Id'].astype(str) + '_' + df.index.astype(str)
    submission.append(df[scols])
    
submission = pd.concat(submission)
submission = pd.merge(sub[['Id']], submission, how='left', on='Id').fillna(0.0)
os.chdir(r'/kaggle/working/')
submission[scols].to_csv('submission.csv', index=False)
submission

Unnamed: 0,Id,StartHesitation,Turn,Walking
0,003f117e14_0,0.0010,0.0050,0.0008
1,003f117e14_1,0.0010,0.0050,0.0008
2,003f117e14_2,0.0010,0.0050,0.0008
3,003f117e14_3,0.0010,0.0050,0.0008
4,003f117e14_4,0.0010,0.0050,0.0008
...,...,...,...,...
286365,02ab235146_281683,0.0006,0.0122,0.0020
286366,02ab235146_281684,0.0006,0.0122,0.0020
286367,02ab235146_281685,0.0006,0.0122,0.0020
286368,02ab235146_281686,0.0006,0.0122,0.0020


In [32]:
# sub = pd.read_csv(path.join(root, 'sample_submission.csv'))
# submission = []

# # pcols =  ['None','StartHesitation', 'Turn' , 'Walking']
# pcols =  ['StartHesitation', 'Turn' , 'Walking']

# for f in test:
#     df = pd.read_csv(f)
#     df['Time_frac2']=(df.index/df.index.max()).values
#     df.set_index('Time', drop=True, inplace=True)

#     df['Id'] = f.split('/')[-1].split('.')[0]

#     dataset = Path(f).parts[-2]
        
# #     if dataset == 'tdcsfog':
# #         df.AccV = df.AccV / 9.80665
# #         df.AccML = df.AccML / 9.80665
# #         df.AccAP = df.AccAP / 9.80665
            
#     df['Time_frac']=(df.index/df.index.max()).values
#     df = pd.merge(df, tasks[['Id','t_group']], how='left', on='Id').fillna(-1)

#     df = pd.merge(df, metadata_w_subjects[['Id','Subject', 'Visit','Test','Medication','s_group']], how='left', on='Id').fillna(-1)
#     df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin")
#     df = df.merge(df_feats, how="left", left_index=True, right_index=True)
    
    
#     df["total_accl"]= np.sqrt(np.square(df.AccV) + np.square(df.AccML) + np.square(df.AccAP))
#     df["abs_delta1"] = (df["total_accl"] - df["total_accl"].shift(100)) * 1000
#     df["abs_delta2"] = (df["total_accl"] - df["total_accl"].shift(1000)) * 1000
#     df["abs_delta1"].fillna(0)
#     df["abs_delta2"].fillna(0)
#     df.fillna(method="ffill", inplace=True)
     
#     yp=model2.predict(ntcols)
#     df['Flag']=pd.Series(yp)
# #     df['Flag'] = endtransform(df['Flag'])
#     df.fillna(0)    
        
#     res_vals = []
    
#     for i_fold in range(5):
        
#         pred = regs[i_fold].predict_proba(df[cols])[:,1:] #.clip(0.0,1.0)
# #         print(pred)
# #         dfsd
#         res_vals.append(np.expand_dims(np.round(pred, 3), axis = 2))
        
        
#     res_vals = np.mean(np.concatenate(res_vals, axis = 2), axis = 2)
#     res = pd.DataFrame(res_vals, columns=pcols)
    
#     df = pd.concat([df,res], axis=1)
#     df['Id'] = df['Id'].astype(str) + '_' + df.index.astype(str)
# #     scols1 = ['StartHesitation', 'Turn' , 'Walking','None','Id']
#     submission.append(df[scols])
    
# submission = pd.concat(submission)

# # max_column = submission[['StartHesitation','Turn','Walking','None']].idxmax(axis=1)

# # # Set values of other columns to 0
# # submission.loc[max_column != 'StartHesitation', 'StartHesitation'] = 0
# # submission.loc[max_column != 'Turn', 'Turn'] = 0
# # submission.loc[max_column != 'Walking', 'Walking'] = 0
# # submission.loc[max_column != 'None', 'None'] = 0

# # # Threshold value
# # threshold = 0.5

# # submission['StartHesitation'][submission['StartHesitation'] >= threshold] = 1
# # submission['Turn'][submission['Turn'] >= threshold] = 1
# # submission['Walking'][submission['Walking'] >= threshold] = 1
# # submission['None'][submission['None'] >= threshold] = 1

# # scols = ['Id', 'StartHesitation', 'Turn' , 'Walking']


# submission = pd.merge(sub[['Id']], submission, how='left', on='Id').fillna(0.0)
# os.chdir(r'/kaggle/working/')
# submission[scols].to_csv('submission.csv', index=False)
# # submission

In [33]:
submission['StartHesitation'].sum(),submission['Turn'].sum(),submission['Walking'].sum()
# (3037.7040000000006, 13867.645600000002, 5489.886600000002)
# (2666.168200000001, 9083.0386, 2668.4824)

(405.18399999999997, 18716.004599999997, 5163.162800000001)

In [34]:
# [0.3651458691967257, 0.20678542041783934, 0.06170097111594116, 0.15119806424902058, 0.06150458186906638]
# 0.16926698136971866

# [0.1262944734525422, 0.13564939054645525, 0.031860946725975, 0.08222518609558244, 0.03200343657865203]
# 0.08160668667984138

# [0.13026471957413796, 0.05112257329301665, 0.027278221087166526, 0.09745428203630054, 0.026953597619237216]
# 0.06661467872197177