In [1]:
import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
import seaborn  as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
import sklearn  as skl
from numpy import asarray

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline      
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

set_config(display='diagram') # Useful for display the pipeline
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier
from sklearn.svm           import SVC

In [2]:
# Read the raw dataset into `data`.
# NOTE(review): absolute Windows path - only resolves on the original machine.
dataset_csv = r'D:\apple_watch_tracker\Datasets\dataset_0.5sec.csv'
data = pd.read_csv(dataset_csv)

In [3]:
# Swap the raw `user` identifiers for ordinal codes, then order the rows by
# that code.
oe = OrdinalEncoder()
ct = data['user'].to_numpy()
user_codes = oe.fit_transform(ct.reshape(-1, 1))  # the encoder wants a 2-D column
data['user'] = user_codes
data = data.sort_values(by='user')
data

Unnamed: 0.1,Unnamed: 0,id,time,activityrecognition#0,activityrecognition#1,android.sensor.accelerometer#mean,android.sensor.accelerometer#min,android.sensor.accelerometer#max,android.sensor.accelerometer#std,android.sensor.game_rotation_vector#mean,...,sound#mean,sound#min,sound#max,sound#std,speed#mean,speed#min,speed#max,speed#std,target,user
62584,62584,84508,630.0,,100.0,15.892297,15.892297,15.892297,3.606631,0.838142,...,89.730876,89.730876,89.730876,,0.75,0.75,0.75,,Walking,0.0
21106,21106,8609,234.0,,100.0,10.142241,10.142241,10.142241,0.007413,0.840593,...,89.770732,89.770732,89.770732,,13.50,13.50,13.50,,Car,0.0
21105,21105,22422,1727.0,,8.0,10.033811,9.701954,10.334343,0.317356,0.890862,...,89.794108,89.794108,89.794108,,20.50,20.50,20.50,,Car,0.0
21104,21104,11240,1151.0,,100.0,10.044192,9.843447,10.144604,0.173851,0.772161,...,,,,,32.50,32.50,32.50,,Car,0.0
21103,21103,7828,289.0,,85.0,8.514446,8.514446,8.514446,0.062750,0.734319,...,89.738523,89.738523,89.738523,,12.25,12.25,12.25,,Car,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37690,37690,129745,309.0,,8.0,10.805771,10.805771,10.805771,0.102640,,...,71.595672,71.595672,71.595672,,,,,,Train,12.0
37689,37689,129760,324.0,,8.0,10.389200,10.389200,10.389200,0.019366,,...,68.089832,68.089832,68.089832,,,,,,Train,12.0
37688,37688,131014,1578.0,,8.0,10.718977,10.704910,10.733045,0.019895,,...,70.312179,70.312179,70.312179,,,,,,Train,12.0
37699,37699,130782,1346.0,,8.0,10.551876,10.463580,10.686567,0.118512,,...,68.048667,68.048667,68.048667,,,,,,Train,12.0


In [4]:
def col_names_change(data):
    """Drop bookkeeping columns and give the rest flat ``<name>_<stat>`` names.

    After the three bookkeeping columns are removed, the remaining columns are
    renamed purely by position:

    * 0-1   : renamed to ``time`` and ``activityrecognition_1``
    * 2-57  : dotted names such as ``android.sensor.gravity#mean``; only the
      third dotted component is kept, giving ``gravity_mean``
    * 58-65 : undotted names such as ``sound#mean``, giving ``sound_mean``
    * 66-67 : renamed to ``target`` and ``user``

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing ``'Unnamed: 0'``, ``'id'`` and
        ``'activityrecognition#0'`` followed by the layout described above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three columns removed, the remaining columns
        renamed and a fresh 0..n-1 index. The frame passed in is not modified.

    Raises
    ------
    KeyError
        If one of the three columns to drop is missing.
    IndexError
        If a column in a renamed range does not have the expected
        ``a.b.name#stat`` / ``name#stat`` shape.
    ValueError
        If the number of remaining columns differs from the number of
        generated names.
    """
    # drop() without inplace returns a new frame, so the caller's object keeps
    # its columns.
    trimmed = data.drop(columns=['Unnamed: 0', 'id', 'activityrecognition#0'])

    new_names = ['time', 'activityrecognition_1']

    # 'android.sensor.gravity#mean' -> 'gravity_mean'
    for raw_name in trimmed.columns[2:58]:
        parts = raw_name.split('.')[2].split('#')
        new_names.append(f'{parts[0]}_{parts[1]}')

    # 'sound#mean' -> 'sound_mean'
    for raw_name in trimmed.columns[58:66]:
        parts = raw_name.split('#')
        new_names.append(f'{parts[0]}_{parts[1]}')

    new_names.append('target')
    new_names.append('user')

    # reset_index(drop=True) gives the same positional index the old
    # from_records(values) round trip produced, without pushing every column
    # through an object array.
    renamed = trimmed.reset_index(drop=True)
    renamed.columns = new_names

    return renamed

In [5]:
# Apply the column clean-up and rebind `data` to the renamed frame it returns.
data = col_names_change(data)

In [6]:
# Remove these summary columns from the frame before modelling.
unused_columns = [
    'pressure_mean', 'pressure_max', 'pressure_min', 'pressure_std',
    'sound_std',
    'speed_std',
    'step_counter_mean', 'step_counter_std',
    'light_std',
    'proximity_mean', 'proximity_min', 'proximity_max', 'proximity_std',
    'step_counter_min', 'step_counter_max',
]
data.drop(columns=unused_columns, inplace=True)


In [7]:
# Encode the class labels as integers. The lookup goes through dict.get, so a
# label that is missing from the mapping comes back as None.
target_codes = {'Bus': 0, 'Car': 1, 'Still': 2, 'Train': 3, 'Walking': 4}
data['target'] = data['target'].apply(target_codes.get)

In [8]:
# One frame per encoded target class (0-4). `.copy()` makes each an
# independent frame, so the column assignments done while filling NaNs no
# longer raise SettingWithCopyWarning.
df0, df1, df2, df3, df4 = (data[data['target'] == code].copy() for code in range(5))

In [9]:
def fill_nan_values(tsdf):
    """Fill missing values with the mean of the row's own ``target`` class.

    For every column except ``target``, each NaN is replaced by the mean of
    that column over the rows sharing the same ``target`` value. For a frame
    holding a single class this equals filling with the column mean.

    Parameters
    ----------
    tsdf : pandas.DataFrame
        Frame with a ``target`` column; the other columns must support
        ``mean``.

    Returns
    -------
    pandas.DataFrame
        A filled copy. The frame passed in is not modified, which also avoids
        SettingWithCopyWarning when a slice is passed. A column that is
        entirely NaN within a class stays NaN for that class.

    Notes
    -----
    NOTE(review): the fill value depends on the label, so running this before
    a train/test split lets label information reach the test features.
    """
    filled = tsdf.copy()
    by_class = tsdf.groupby('target')
    for column in tsdf.columns:
        if column == 'target':
            # The grouping key itself is never imputed.
            continue
        # transform('mean') returns one value per row (its class mean), so a
        # frame with several classes is filled per class rather than with
        # the first class's mean.
        class_mean = by_class[column].transform('mean')
        filled[column] = tsdf[column].fillna(class_mean)
    return filled

# Impute each per-class frame on its own, in class order.
df0, df1, df2, df3, df4 = map(fill_nan_values, (df0, df1, df2, df3, df4))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [10]:
# Stack the five imputed per-class frames back into one frame and report its
# dimensions.
class_frames = [df0, df1, df2, df3, df4]
data = pd.concat(class_frames)
data.shape

(62585, 53)

In [11]:
# Preview the re-assembled frame (rows are grouped by class at this point).
data

Unnamed: 0,time,activityrecognition_1,accelerometer_mean,accelerometer_min,accelerometer_max,accelerometer_std,game_rotation_vector_mean,game_rotation_vector_min,game_rotation_vector_max,game_rotation_vector_std,...,rotation_vector_max,rotation_vector_std,sound_mean,sound_min,sound_max,speed_mean,speed_min,speed_max,target,user
13713,821.0,100.0,9.580663,9.580663,9.580663,0.012232,0.801618,0.801618,0.801618,0.000162,...,0.928278,0.000081,85.121360,85.121360,85.121360,8.000000,8.000000,8.000000,0,0.0
13714,13.0,100.0,10.958598,9.868953,12.048243,1.540991,0.779063,0.779063,0.779063,0.004365,...,0.482034,0.008963,83.843193,83.843144,83.843242,5.341548,5.341548,5.341548,0,0.0
13715,12.0,100.0,10.958598,9.868953,12.048243,1.540991,0.779063,0.779063,0.779063,0.004365,...,0.452277,0.004023,83.843193,83.843144,83.843242,5.341548,5.341548,5.341548,0,0.0
13716,11.0,100.0,8.841528,6.835641,10.847415,2.836753,0.779063,0.779063,0.779063,0.004365,...,0.452277,0.004023,83.843193,83.843144,83.843242,5.341548,5.341548,5.341548,0,0.0
13717,10.0,100.0,11.618934,9.715548,13.385976,1.839009,0.779063,0.779063,0.779063,0.004365,...,0.417588,0.004023,83.843193,83.843144,83.843242,5.341548,5.341548,5.341548,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62145,483.0,100.0,10.265969,8.937641,11.594297,1.878540,0.870339,0.862206,0.876766,0.015529,...,0.972661,0.016987,73.757710,73.757710,73.757710,1.178197,1.178197,1.178197,4,12.0
62146,418.0,100.0,14.321415,14.321415,14.321415,3.729311,0.870339,0.862206,0.876766,0.015529,...,0.879428,0.004941,79.568839,79.567896,79.569832,1.178197,1.178197,1.178197,4,12.0
62147,546.0,100.0,25.482379,25.482379,25.482379,1.884572,0.870339,0.862206,0.876766,0.015529,...,0.854721,0.029765,79.568839,79.567896,79.569832,1.178197,1.178197,1.178197,4,12.0
62529,854.0,85.0,13.730206,13.730206,13.730206,1.836736,0.870339,0.862206,0.876766,0.015529,...,0.996823,0.001330,78.116997,78.116997,78.116997,1.178197,1.178197,1.178197,4,12.0


In [12]:
# Order the rows by user code, then discard the column so it does not end up
# among the model features.
data = data.sort_values(by='user').drop(columns=['user'])

In [13]:
# Snapshot the six min/max columns that have to survive the pruning below.
accelerometer_min = data['accelerometer_min'].to_numpy()
accelerometer_max = data['accelerometer_max'].to_numpy()
gyroscope_uncalibrated_min = data['gyroscope_uncalibrated_min'].to_numpy()
gyroscope_uncalibrated_max = data['gyroscope_uncalibrated_max'].to_numpy()
linear_acceleration_min = data['linear_acceleration_min'].to_numpy()
linear_acceleration_max = data['linear_acceleration_max'].to_numpy()

# Remove every column whose name contains 'min' or 'max' ...
extreme_columns = data.filter(regex='min|max').columns
data = data.drop(columns=extreme_columns)

# ... then put the saved six back at the end of the frame, in this order.
data = data.assign(
    accelerometer_min=accelerometer_min,
    accelerometer_max=accelerometer_max,
    gyroscope_uncalibrated_min=gyroscope_uncalibrated_min,
    gyroscope_uncalibrated_max=gyroscope_uncalibrated_max,
    linear_acceleration_min=linear_acceleration_min,
    linear_acceleration_max=linear_acceleration_max,
)
data


Unnamed: 0,time,activityrecognition_1,accelerometer_mean,accelerometer_std,game_rotation_vector_mean,game_rotation_vector_std,gravity_mean,gravity_std,gyroscope_mean,gyroscope_std,...,rotation_vector_std,sound_mean,speed_mean,target,accelerometer_min,accelerometer_max,gyroscope_uncalibrated_min,gyroscope_uncalibrated_max,linear_acceleration_min,linear_acceleration_max
13713,821.0,100.0,9.580663,0.012232,0.801618,1.617893e-04,9.806650,5.216953e-07,0.037269,0.016188,...,0.000081,85.121360,8.000000,0,9.580663,9.580663,0.036633,0.036633,0.067885,0.067885
13104,31.0,100.0,9.645943,0.206842,0.739330,7.427495e-03,9.806650,4.209336e-07,0.248027,0.034275,...,0.002077,85.603369,33.705530,3,9.645943,9.645943,0.220782,0.220782,0.237493,0.237493
13105,1381.0,100.0,9.697037,0.033686,0.986589,4.705820e-06,9.806650,1.079752e-08,0.004636,0.005330,...,0.000032,88.017664,33.705530,3,9.697037,9.697037,0.031598,0.031598,0.060151,0.060151
13106,599.0,100.0,10.062870,0.067896,0.966962,1.132248e-04,9.806650,7.184205e-08,0.023576,0.015889,...,0.000323,89.756620,33.705530,3,10.014861,10.110879,0.032029,0.032029,0.051883,0.051883
13107,2728.0,15.0,10.172925,0.026775,0.990579,4.727893e-07,9.806651,4.146969e-07,0.004143,0.001013,...,0.000014,78.007343,33.705530,3,10.172925,10.172925,0.036818,0.036818,0.022213,0.022213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62289,1121.0,8.0,10.652345,0.145613,0.762776,1.539067e-03,9.806650,3.715563e-07,0.077557,0.003418,...,0.000095,51.174171,33.705530,3,10.441941,10.777754,0.063161,0.086416,0.702425,0.702425
62288,1079.0,8.0,10.622010,0.069927,0.762776,1.539067e-03,9.806651,2.531796e-07,0.070117,0.003679,...,0.000070,50.343918,33.705530,3,10.622010,10.622010,0.063161,0.086416,0.842117,0.842117
62287,755.0,8.0,10.514805,0.303703,0.762776,1.539067e-03,9.806650,2.803891e-07,0.069321,0.020483,...,0.000299,74.830910,33.705530,3,10.168924,10.737847,0.063161,0.086416,0.654386,0.654386
62298,601.0,8.0,10.473649,0.065375,0.762776,1.539067e-03,9.806650,4.575759e-07,0.094089,0.015840,...,0.000472,70.472328,33.705530,3,10.473649,10.473649,0.063161,0.086416,0.841839,0.841839


In [14]:
# Positional split on the current row order: the first 26000 rows become the
# test set, everything after them the training set.
# NOTE(review): the rows were ordered by user earlier, so this is presumably
# meant as a per-user hold-out - confirm that 26000 falls on a user boundary.
data_test, data_train = data.iloc[:26000], data.iloc[26000:]

# Features are every column except the label.
x_train, y_train = data_train.drop(columns=['target']), data_train['target']
x_test, y_test = data_test.drop(columns=['target']), data_test['target']

In [15]:
# Sanity check: list the class codes present in the training labels.
y_train.unique()

array([2, 4, 1, 0, 3], dtype=int64)

In [16]:
# Sanity check: list the class codes present in the test labels.
y_test.unique()

array([0, 3, 1, 2, 4], dtype=int64)

In [17]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same transformation to both feature sets.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

# Grid search with GridSearchCV

In [18]:
# NOTE(review): disabled hyperparameter search, kept for reference only.
# The estimator below is an LGBMClassifier, but min_samples_split,
# min_samples_leaf and bootstrap look like RandomForestClassifier options -
# confirm the grid against the estimator before re-enabling this cell.
# from sklearn.model_selection import GridSearchCV
# rfc = LGBMClassifier()

# params = {'n_estimators'        : [int(x) for x in np.linspace(start = 200, stop = 500, num = 10)],
#           'max_depth'           : [int(x) for x in np.linspace(10, 110, num = 11)],
#           'min_samples_split'   : [2, 5, 10],
#           'min_samples_leaf'    :[1, 2, 4],
#           'bootstrap'           : [True, False]
#            }
# rfc_cv = GridSearchCV(rfc, params)
# rfc_cv.fit(x_train, y_train)

In [19]:
# Classifiers to benchmark, keyed by the name shown in the results table.
# Only LightGBM is active; the commented entries are alternatives that can be
# switched back on.
lightgbm_settings = dict(
    n_estimators=165,
    num_leaves=45,
    max_depth=10,
    boosting_type='goss',
    learning_rate=0.15,
)

tree_classifiers = {
    #"Decision Tree": DecisionTreeClassifier(),
    #"Extra Trees": ExtraTreesClassifier(n_estimators=100),
    #"Random Forest": RandomForestClassifier(n_estimators=500, min_samples_split=8, min_samples_leaf=8, max_depth=85, bootstrap=False),
    #"AdaBoost": AdaBoostClassifier(n_estimators=100),
    #"Skl GBM": GradientBoostingClassifier(n_estimators=100),
    #"Skl HistGBM": HistGradientBoostingClassifier(max_iter=100),
    #"XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(**lightgbm_settings),
    #"CatBoost":  CatBoostClassifier(n_estimators=100),
}

In [20]:
# Wrap every estimator in a single-step scikit-learn pipeline.
tree_classifiers = dict(
    (name, make_pipeline(model)) for name, model in tree_classifiers.items()
)

# Empty score table; the evaluation cell adds one row per fitted model.
results = pd.DataFrame({column: [] for column in ('Model', 'Accuracy', 'Bal Acc.', 'Time')})

In [21]:
# Spread of the encoded labels. Not used in this cell; kept because other
# cells may read it.
rang = abs(y_train.max()) + abs(y_train.min())

# Fit every candidate on the training split, time the fit, and score the
# predictions on the held-out split. Rows are collected in a plain list and
# turned into a frame once, because DataFrame.append was removed in
# pandas 2.0 and grew the frame row by row.
score_rows = []
for model_name, model in tree_classifiers.items():

    start_time = time.perf_counter()  # monotonic clock, unaffected by system clock changes
    model.fit(x_train, y_train)
    total_time = time.perf_counter() - start_time

    pred = model.predict(x_test)

    score_rows.append({"Model":    model_name,
                       "Accuracy": accuracy_score(y_test, pred)*100,
                       "Bal Acc.": balanced_accuracy_score(y_test, pred)*100,
                       "Time":     total_time})

# Keep any rows already present in `results` (same accumulation as before),
# but skip concatenating with an empty frame, which pandas warns about.
new_results = pd.DataFrame(score_rows, columns=results.columns)
if results.empty:
    results = new_results
else:
    results = pd.concat([results, new_results], ignore_index=True)

# Rank by accuracy (best first), number the rows from 1, and render the two
# percentage columns as bars on a 0-100 scale.
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,LightGBM,83.057692,81.888601,1.878054
