## Import dependencies

In [169]:
import pandas as pd
import seaborn as sns
import time

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, BaggingClassifier
from sklearn.ensemble import RandomForestClassifier



from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV

## Read data

In [170]:
X_train = pd.read_csv("../data/X_train.csv",header=[0,1],index_col=0)
X_test = pd.read_csv("../data/X_test.csv",header=[0,1],index_col=0)
y_train = pd.read_csv("../data/y_train.csv",index_col=0)
y_test = pd.read_csv("../data/y_test.csv",index_col=0)

In [171]:
def find_na(df):
    
    features_to_drop = []
    
    for i in df.columns:
        
        if df[i].isna().sum() > len(df)*0.2:
            features_to_drop.append(i)
    
    return features_to_drop

In [172]:
def drop_features(df,features_to_drop):
    
    df.drop(features_to_drop, axis = 1, inplace = True)
    
    return df

### Drop nas

#### Train data

In [173]:
features_to_drop = find_na(X_train)
X_train = drop_features(X_train,features_to_drop)

#### Test data 

In [174]:
features_to_drop = find_na(X_test)
X_test = drop_features(X_test,features_to_drop)

## Build pipeline

### Select classifier 

First, I test some classifier, than I can choose one for parameter optimization to get better results.

In [176]:
knc = KNeighborsClassifier()
lr = LogisticRegression(solver="newton-cg")
svc = SVC()
rfc = RandomForestClassifier()
nbc = MultinomialNB()
gbc = GradientBoostingClassifier()
bc = BaggingClassifier()

ml_models = models = {"knc": knc, "lr": lr, "svc": svc, "rfc": rfc,
                      'nb' : MultinomialNB(), 'gbc' : GradientBoostingClassifier(),
                       'bc' : BaggingClassifier()}

In [177]:
def initial_model(X_train, X_test, y_train, y_test, ml_models):
    
    
    
    for model in ml_models:
          
        if model != 'nb':
            
            pipe = Pipeline([("imputer", SimpleImputer(fill_value = 0)),
                     ('scaler', StandardScaler()), (model, ml_models[model])])
        else:
            
            pipe = Pipeline([("imputer", SimpleImputer(fill_value = 0)),
                     ('minmax',MinMaxScaler()), (model, ml_models[model])])
        
        start_train = time.time()
        pipe.fit(X_train, y_train)
        train_time = round(time.time()-start_train,1)
        
        accuracy = accuracy_score(y_test,pipe.predict(X_test))
        recall = recall_score(y_test,pipe.predict(X_test), average=None)
        precision = precision_score(y_test,pipe.predict(X_test), average=None)
        f1 = f1_score(y_test,pipe.predict(X_test), average=None)
        
        print('*'*20)
        print(f'Scores of {model}:')
        print('*'*20)
        print(f'Train time: {train_time}s')
        print(f'Accuracy score: {accuracy}')
        print('\n')
        print(f'Recall score: {recall}')
        print('\n')
        print(f'Precision score: {precision}')
        print('\n')
        print(f'F1 score: {f1}')
        print('\n')
    

In [178]:
initial_model(X_train,X_test,y_train["activity"],y_test["activity"],ml_models)

********************
Scores of knc:
********************
Train time: 0.7s
Accuracy score: 0.948269711023903


Recall score: [0.99307159 0.99240506 0.98060942 0.83936652 0.93644068 0.99789916
 0.97142857 0.84615385 0.87804878 0.85294118 0.875      0.775     ]


Precision score: [0.97505669 0.98492462 0.99159664 0.92982456 0.86328125 0.99789916
 0.91891892 1.         0.87804878 0.80555556 0.875      0.86111111]


F1 score: [0.98398169 0.98865069 0.98607242 0.882283   0.89837398 0.99789916
 0.94444444 0.91666667 0.87804878 0.82857143 0.875      0.81578947]


********************
Scores of lr:
********************
Train time: 11.1s
Accuracy score: 0.9721726721369961


Recall score: [0.99769053 0.99493671 0.99445983 0.91628959 0.95338983 0.99789916
 0.94285714 1.         0.92682927 0.97058824 0.9375     0.9       ]


Precision score: [0.98855835 0.99746193 1.         0.94847775 0.92592593 0.99789916
 1.         1.         0.92682927 0.94285714 0.86538462 0.97297297]


F1 score: [0.99310345 

## Final model

In the initial test, in the previous section I tried out some of the classification models in sklearn. There are some model with really high accuracy. Most of the models without any parameter tuning, show us a high accuracy. The fastest model was the `Naive Bayes` but the accuracy wasn't so high. The Random Forest Classifier has the highest accuracy score is, and the calculation speed is not so bad. I will test out this model. 