In [17]:
import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger('tsfresh').setLevel(logging.ERROR)
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.impute import SimpleImputer
from tsfresh import extract_features, select_features,feature_selection
from tsfresh.utilities.dataframe_functions import impute
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression as LR
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_auc_score as AUC, accuracy_score as accuracy
from sklearn.svm import SVC as SVM
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.gaussian_process import GaussianProcessClassifier as GPC
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier as DTC

In [2]:
def _read_labelled_csvs(prefix, label, n_files=5):
    """Read ``<prefix>1.csv`` .. ``<prefix><n_files>.csv`` and tag each row with ``label``.

    Parameters
    ----------
    prefix : str
        Path prefix of the numbered CSV files (no header row is expected).
    label : int
        Value stored in the ``'y'`` column of every row that is read.
    n_files : int, default 5
        Number of consecutively numbered files to read, starting at 1.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, each with an extra ``'y'`` column.
    """
    frames = []
    for file_no in range(1, n_files + 1):
        path = prefix + str(file_no) + '.csv'
        try:
            # Current pandas spelling: malformed rows are reported and skipped.
            frame = pd.read_csv(path, header=None, on_bad_lines='warn')
        except TypeError:
            # Older pandas releases do not know `on_bad_lines`; use the
            # legacy keyword there.
            frame = pd.read_csv(path, header=None, error_bad_lines=False)
        frame['y'] = label
        frames.append(frame)
    return frames


def train():
    """Build the labelled data set, extract features and evaluate the classifiers.

    Steps:
      1. read the five meal files (label 1) and the five no-meal files (label 0);
      2. replace missing values through ``impute_data``;
      3. rename the label column to ``'target'``;
      4. run ``feature_extract``, which writes ``data/features_file.csv``;
      5. run ``classifier``, which reads that file and prints the CV scores.

    Returns
    -------
    None
    """
    frames = (_read_labelled_csvs('data/mealData', 1)
              + _read_labelled_csvs('data/Nomeal', 0))
    # Concatenate once instead of growing a frame inside the loop.
    result = pd.concat(frames)

    # Files may differ in width, in which case concat places the extra feature
    # columns *after* 'y'. Force the label to be the last column so that the
    # positional rename below always hits it.
    ordered = [col for col in result.columns if col != 'y'] + ['y']
    result = result[ordered]

    # Imputing for NaN value removal (impute_data hands back a bare array,
    # so the frame is rebuilt with positional integer column labels).
    result = pd.DataFrame(impute_data(result))

    # Renaming the last (label) column to 'target'
    result.columns = list(result.columns[:-1]) + ['target']

    # Extracting features and writing them into files
    feature_extract(result, 'data/features_file.csv')
    # Cross fold validation and training models
    classifier()
    
    
    
    
    

In [3]:
def feature_extract(result, filename):
    """Extract tsfresh features, keep the relevant ones and write them to disk.

    Parameters
    ----------
    result : pandas.DataFrame
        One row per sample; must contain a ``'target'`` column holding the
        label. Every other column is treated as one time step of the series
        (the column label is used as the sort key). The frame is not modified.
    filename : str
        Destination CSV for the selected features plus a ``'y'`` label column.

    Returns
    -------
    pandas.DataFrame
        The selected feature columns followed by ``'y'``.

    Side effects
    ------------
    Writes ``filename`` and ``data/features_name.csv`` (the column names of
    the returned frame).

    Raises
    ------
    AssertionError
        If NaN values remain in the feature matrix after tsfresh's ``impute``.
    """
    y = result['target']
    # Drop the label on a copy so the caller's frame is left untouched.
    series_wide = result.drop('target', axis=1)

    # Wide -> long: one row per (sample id, time step) as expected by tsfresh.
    long_form = series_wide.stack()
    long_form.index.names = ['id', 'time']
    long_form = long_form.reset_index()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        f = extract_features(long_form, column_id="id", column_sort="time")

    # tsfresh's impute works in place on the feature matrix.
    impute(f)
    assert f.isnull().sum().sum() == 0

    relevance = feature_selection.relevance.calculate_relevance_table(f, y, ml_task='auto', n_jobs=2, chunksize=None, test_for_binary_target_binary_feature='fisher', test_for_binary_target_real_feature='mann', test_for_real_target_binary_feature='mann', test_for_real_target_real_feature='kendall', fdr_level=0.05, hypotheses_independent=False)
    relevant_names = relevance.loc[relevance['relevant'], 'feature']

    # Explicit copy: adding 'y' must not write into a selection of `f`.
    selected = f[relevant_names].copy()
    selected['y'] = y

    selected.to_csv(filename, index=None)
    pd.DataFrame(selected.columns).to_csv("data/features_name.csv", index=None)
    return selected
    

In [4]:
def impute_data(result):
    """Replace every NaN in ``result`` with the mean of its column.

    Parameters
    ----------
    result : pandas.DataFrame or array-like
        Two-dimensional numeric data.

    Returns
    -------
    numpy.ndarray
        The imputed values; any column labels of the input are not carried over.
    """
    # Hand scikit-learn the bare values: a frame whose column labels mix
    # integers and strings (e.g. 0..n plus 'y') is rejected by recent
    # scikit-learn releases during feature-name validation.
    values = result.values if isinstance(result, pd.DataFrame) else result
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    return imp_mean.fit_transform(values)

In [23]:
def classifier():
    """Cross-validate every model from ``classi_vals`` on the stored features.

    Reads ``data/features_file.csv``, uses column ``'y'`` as the label and all
    remaining columns as inputs, runs 5-fold cross-validation per model and
    prints the mean accuracy, macro precision, macro recall and macro F1.
    """
    frame = pd.read_csv("data/features_file.csv")
    models, model_names = classi_vals()
    metrics = ['accuracy','precision_macro', 'recall_macro','f1_macro']

    # The feature/label split is the same for every model.
    inputs = frame.loc[:, frame.columns != 'y']
    labels = frame['y']
    summary = "Average score are {}, Average Precision is {},Average Recall is {}, Average f1 score is {}"

    for position, model in enumerate(models):
        print("Classifer : {}".format(model_names[position]))
        cv_result = cross_validate(model, inputs, labels, cv = 5, scoring = metrics)
        # cross_validate stores each metric under the key 'test_<metric>'.
        averages = [np.mean(cv_result['test_' + metric]) for metric in metrics]
        print(summary.format(*averages))

In [15]:
def classi_vals(random_state=42):
    """Return the estimators to compare together with their display names.

    Parameters
    ----------
    random_state : int, default 42
        Seed passed to every estimator here that accepts one and uses
        randomness during fitting, so repeated runs give the same scores.

    Returns
    -------
    (list, list of str)
        Unfitted estimators and, in the same order, their display names.
    """
    # Name and estimator are kept side by side so the two lists cannot drift
    # out of step.
    named_models = [
        ("Logistic Regression(Standard Scalar)",
         make_pipeline(StandardScaler(), LR())),
        ("Logistic Regression(MinMax Scalar)",
         make_pipeline(MinMaxScaler(), LR())),
        ("SVM",
         make_pipeline(MinMaxScaler(),
                       SVM(gamma='auto', probability=True, random_state=random_state))),
        ("RandomForest",
         RF(n_estimators=100, min_samples_leaf=5, random_state=random_state)),
        ("Stocahastic gradient descent",
         SGD(random_state=random_state)),
        ("Gaussian Process Classifier",
         GPC(1.0 * RBF(1.0))),
        ("Ada boost",
         ABC(random_state=random_state)),
        ("Naive bayes",
         NB()),
        ("Decision Trees",
         DTC(max_depth=5, random_state=random_state)),
    ]
    classifiers = [model for _, model in named_models]
    classifiers_names = [name for name, _ in named_models]
    return classifiers, classifiers_names

In [24]:
# Run the whole pipeline: read the meal / no-meal CSVs under data/, write the
# extracted features to data/features_file.csv and print the cross-validation
# scores of every classifier.
train()

Feature Extraction: 100%|██████████| 29/29 [00:06<00:00,  4.53it/s]


Classifer : Logistic Regression(Standard Scalar)
Average score are 0.6006311803071743, Average Precision is 0.6017359785430016,Average Recall is 0.6001411564625851, Average f1 score is 0.5970754282575271
Classifer : Logistic Regression(MinMax Scalar)
Average score are 0.6193141173995371, Average Precision is 0.6216131393300928,Average Recall is 0.6190561224489796, Average f1 score is 0.6154333019854679
Classifer : SVM
Average score are 0.6439511887229118, Average Precision is 0.6471401242213606,Average Recall is 0.6430425170068028, Average f1 score is 0.6336955952648907
Classifer : RandomForest
Average score are 0.615148327372186, Average Precision is 0.6180659273258893,Average Recall is 0.6148452380952381, Average f1 score is 0.6094266128944321
Classifer : Stocahastic gradient descent
Average score are 0.5185356616873553, Average Precision is 0.31708281437191943,Average Recall is 0.5153486394557824, Average f1 score is 0.38316026916409396
Classifer : Gaussian Process Classifier
Averag