In [1]:
import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger('tsfresh').setLevel(logging.ERROR)
import pandas as pd
import numpy as np
from pprint import pprint
from joblib import dump, load
from sklearn.impute import SimpleImputer
from tsfresh import extract_features, select_features,feature_selection
from tsfresh.utilities.dataframe_functions import impute
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression as LR
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_auc_score as AUC, accuracy_score as accuracy
from sklearn.svm import SVC as SVM
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.gaussian_process import GaussianProcessClassifier as GPC
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier as DTC

In [2]:
def train():
    """Run the whole pipeline: load the CSVs, impute, extract features, train models.

    Reads ``data/mealData1.csv`` .. ``data/mealData5.csv`` (labelled 1) and
    ``data/Nomeal1.csv`` .. ``data/Nomeal5.csv`` (labelled 0), all parsed without
    a header row. Missing readings are filled by ``impute_data``, the frame is
    handed to ``feature_extract`` (which writes ``data/features_file.csv``), and
    ``classifier`` is then called to fit and cross-validate the models.

    Returns:
        None.
    """
    frames = []
    for label, prefix in ((1, 'data/mealData'), (0, 'data/Nomeal')):
        for file_no in range(1, 6):
            # `error_bad_lines=False` was removed in pandas 2.0. `on_bad_lines='warn'`
            # (pandas >= 1.3) keeps the same behaviour: report malformed rows, skip them.
            frame = pd.read_csv(prefix + str(file_no) + '.csv', header=None, on_bad_lines='warn')
            frame['y'] = label
            frames.append(frame)

    # Concatenate once instead of re-copying the accumulated frame on every iteration.
    combined = pd.concat(frames, ignore_index=True)

    # Split the label off BEFORE imputing:
    #  * if the files differ in width, concat places the extra reading columns after
    #    'y', so "rename the last column to target" would pick a reading column;
    #  * recent scikit-learn rejects frames whose column names mix ints and strings.
    labels = combined.pop('y')

    # impute_data hands back the imputer's array output, so rebuild a frame with
    # positional column names 0..k-1 for the readings.
    readings = pd.DataFrame(impute_data(combined))
    # Kept as float: the label previously went through the imputer and came back as float.
    readings['target'] = labels.to_numpy(dtype=float)

    # Extract the features and write them to disk ...
    feature_extract(readings, 'data/features_file.csv')
    # ... then cross-validate and train the models from that file.
    classifier()
    
    
    
    
    

In [3]:
def feature_extract(result, filename, n_features=10, names_filename="data/features_name.csv"):
    """Extract tsfresh features from `result`, keep the relevant ones and save them.

    Args:
        result: DataFrame with a ``target`` column; every other column is treated
            as one time step of the row's series (rows become ``id``, columns
            become ``time`` after stacking). The frame is not modified.
        filename: CSV path for the selected features plus a trailing ``y`` column.
        n_features: Maximum number of relevant features to keep, taken in the
            order of the relevance table. Defaults to 10.
        names_filename: CSV path that receives the names of the kept features.

    Returns:
        DataFrame holding the kept feature columns followed by ``y``.

    Raises:
        AssertionError: if NaNs remain in the feature matrix after tsfresh's ``impute``.
    """
    y = result['target']
    # Drop into a new frame so the caller's DataFrame keeps its 'target' column.
    wide = result.drop(columns='target')

    # Long format expected by tsfresh: one row per (id, time) pair; the value column is named 0.
    long_format = wide.stack().rename_axis(['id', 'time']).reset_index()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        f = extract_features(long_format, column_id="id", column_sort="time")
    impute(f)  # tsfresh's impute works in place
    assert f.isnull().sum().sum() == 0

    relevance = feature_selection.relevance.calculate_relevance_table(
        f, y,
        ml_task='auto',
        n_jobs=2,
        chunksize=None,
        test_for_binary_target_binary_feature='fisher',
        test_for_binary_target_real_feature='mann',
        test_for_real_target_binary_feature='mann',
        test_for_real_target_real_feature='kendall',
        fdr_level=0.05,
        hypotheses_independent=False,
    )
    relevant_names = relevance.loc[relevance['relevant'].astype(bool), 'feature']

    # .copy() gives an independent frame, so adding 'y' below is not a chained
    # assignment on a slice of `f`.
    selected = f[list(relevant_names)[:n_features]].copy()

    pd.DataFrame(selected.columns).to_csv(names_filename, index=False)

    selected['y'] = y  # aligned on the row index (the series id)
    selected.to_csv(filename, index=False)
    return selected
    

In [4]:
def impute_data(result):
    """Fill every NaN in `result` with the mean of its column.

    Returns whatever ``SimpleImputer.transform`` produces for `result`
    (an array rather than a DataFrame by default, so column labels are lost).
    """
    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    fitted = mean_imputer.fit(result)
    return fitted.transform(result)

In [5]:
def classifier(features_path="data/features_file.csv", models_dir="models"):
    """Fit, persist and cross-validate every model returned by ``classi_vals``.

    Args:
        features_path: CSV holding the feature columns and a ``y`` label column.
        models_dir: Directory that receives one ``<name>.joblib`` file per model;
            created if it does not exist.

    For each model this fits on the full table, dumps the fitted estimator, then
    runs 5-fold cross-validation and prints the mean accuracy, macro precision,
    macro recall and macro F1.

    Returns:
        None.
    """
    import os  # local import: only needed here, for the output directory

    data = pd.read_csv(features_path)
    print("The shape of data is {}".format(data.shape))

    # Select features by name once, so fitting and cross-validation always see
    # the same columns regardless of where 'y' sits in the file.
    X = data.drop(columns='y')
    y = data['y']

    # joblib.dump does not create missing directories.
    os.makedirs(models_dir, exist_ok=True)

    classifiers, classifiers_name = classi_vals()
    scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    for name, clf in zip(classifiers_name, classifiers):
        print("Classifer : {}".format(name))
        clf.fit(X, y)
        dump(clf, os.path.join(models_dir, name + '.joblib'))
        scores = cross_validate(clf, X, y, cv=5, scoring=scoring)
        print("Average score are {}, Average Precision is {},Average Recall is {}, Average f1 score is {}".format(np.mean(scores['test_accuracy']),np.mean(scores['test_precision_macro']),np.mean(scores['test_recall_macro']),np.mean(scores['test_f1_macro'])))

In [6]:
def classi_vals():
    """Build the candidate models.

    Returns:
        A pair ``(classifiers, classifiers_names)`` of equally long lists; the
        name at position ``i`` labels the estimator at position ``i``.
    """
    # Each display name sits next to its estimator so the two lists cannot drift apart.
    named_models = [
        ("Logistic Regression(Standard Scalar)", make_pipeline(StandardScaler(), LR())),
        ("Logistic Regression(MinMax Scalar)", make_pipeline(MinMaxScaler(), LR())),
        ("SVM", make_pipeline(MinMaxScaler(), SVM(gamma='auto', probability=True))),
        ("RandomForest", RF(n_estimators=100, min_samples_leaf=5)),
        ("Stocahastic gradient descent", SGD(random_state=42)),
        ("Gaussian Process Classifier", GPC(1.0 * RBF(1.0))),
        ("Ada boost", ABC()),
        ("Naive bayes", NB()),
        ("Decision Trees", DTC(max_depth=5)),
    ]
    classifiers = [model for _, model in named_models]
    classifiers_names = [label for label, _ in named_models]
    return classifiers, classifiers_names

In [7]:
# Run the full pipeline: load the CSVs, extract features, then fit and cross-validate every model.
train()

Feature Extraction: 100%|██████████| 29/29 [00:06<00:00,  4.37it/s]


The shape of data is (486, 11)
Classifer : Logistic Regression(Standard Scalar)
Average score are 0.5945297706711551, Average Precision is 0.59620638912483,Average Recall is 0.5946071428571428, Average f1 score is 0.5923481932372564
Classifer : Logistic Regression(MinMax Scalar)
Average score are 0.6295813170629077, Average Precision is 0.6314499304735774,Average Recall is 0.6294285714285714, Average f1 score is 0.6258452502618141
Classifer : SVM
Average score are 0.6316010940458658, Average Precision is 0.6323713001973872,Average Recall is 0.6313044217687075, Average f1 score is 0.6286000854078783
Classifer : RandomForest
Average score are 0.6233115926783084, Average Precision is 0.6251651994330393,Average Recall is 0.6228078231292516, Average f1 score is 0.6210958098190076
Classifer : Stocahastic gradient descent
Average score are 0.5184935830002104, Average Precision is 0.4539312748421791,Average Recall is 0.5199472789115647, Average f1 score is 0.40874572803559345
Classifer : Gauss

In [47]:
# Reload the feature table (the path train() passes to feature_extract) for interactive inspection.
data = pd.read_csv("data/features_file.csv")

In [58]:
# Preview the first rows of the feature table.
data.head()

Unnamed: 0,0__spkt_welch_density__coeff_2,"0__fft_coefficient__coeff_1__attr_""abs""",0__variance,0__standard_deviation,0__autocorrelation__lag_2,0__autocorrelation__lag_1,0__partial_autocorrelation__lag_1,"0__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2",0__autocorrelation__lag_3,"0__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.2",...,"0__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.6","0__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""rvalue""","0__change_quantiles__f_agg_""var""__isabs_False__qh_1.0__ql_0.8","0__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""stderr""","0__agg_linear_trend__f_agg_""mean""__chunk_len_10__attr_""stderr""",0__cid_ce__normalize_False,0__index_mass_quantile__q_0.6,0__large_standard_deviation__r_0.30000000000000004,"0__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_13__w_2",y
0,4400.873903,609.257683,867.8,29.458445,0.801863,0.913209,0.913209,75.84,0.674887,7.6,...,4.5,0.216795,19.0,32.313123,27.886018,46.162756,0.6,1.0,15.043549,1.0
1,8740.14161,814.553186,1707.266667,41.319084,0.842073,0.934989,0.934989,17.61,0.722966,5.3,...,5.25,-0.65213,54.0,20.036346,34.641016,45.738387,0.633333,0.0,19.072014,1.0
2,14365.42749,1109.645519,3192.712222,56.50409,0.936196,0.978882,0.978882,47.4,0.873854,11.0,...,3.75,0.038011,2.6875,55.882651,2.655811,45.188494,0.433333,1.0,29.153469,1.0
3,5917.460252,689.838444,1236.026667,35.157171,0.89992,0.96347,0.96347,27.76,0.799895,5.2,...,6.0,-0.239465,20.8,24.669784,21.13102,34.452866,0.533333,1.0,0.285149,1.0
4,1639.481005,537.283682,921.183333,30.351002,0.885352,0.942781,0.942781,8.24,0.82005,5.6,...,1.4,0.110434,5.673469,7.981549,8.313844,23.811762,0.533333,1.0,7.276075,1.0


In [55]:
# Keep every column except the last one (the 'y' label that feature_extract appends).
main = data.iloc[:,:-1]

In [57]:
# Preview the feature-only frame.
main.head()

Unnamed: 0,0__spkt_welch_density__coeff_2,"0__fft_coefficient__coeff_1__attr_""abs""",0__variance,0__standard_deviation,0__autocorrelation__lag_2,0__autocorrelation__lag_1,0__partial_autocorrelation__lag_1,"0__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2",0__autocorrelation__lag_3,"0__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.2",...,"0__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_2__w_10","0__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.6","0__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""rvalue""","0__change_quantiles__f_agg_""var""__isabs_False__qh_1.0__ql_0.8","0__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""stderr""","0__agg_linear_trend__f_agg_""mean""__chunk_len_10__attr_""stderr""",0__cid_ce__normalize_False,0__index_mass_quantile__q_0.6,0__large_standard_deviation__r_0.30000000000000004,"0__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_13__w_2"
0,4400.873903,609.257683,867.8,29.458445,0.801863,0.913209,0.913209,75.84,0.674887,7.6,...,437.465754,4.5,0.216795,19.0,32.313123,27.886018,46.162756,0.6,1.0,15.043549
1,8740.14161,814.553186,1707.266667,41.319084,0.842073,0.934989,0.934989,17.61,0.722966,5.3,...,134.103838,5.25,-0.65213,54.0,20.036346,34.641016,45.738387,0.633333,0.0,19.072014
2,14365.42749,1109.645519,3192.712222,56.50409,0.936196,0.978882,0.978882,47.4,0.873854,11.0,...,405.580147,3.75,0.038011,2.6875,55.882651,2.655811,45.188494,0.433333,1.0,29.153469
3,5917.460252,689.838444,1236.026667,35.157171,0.89992,0.96347,0.96347,27.76,0.799895,5.2,...,424.430023,6.0,-0.239465,20.8,24.669784,21.13102,34.452866,0.533333,1.0,0.285149
4,1639.481005,537.283682,921.183333,30.351002,0.885352,0.942781,0.942781,8.24,0.82005,5.6,...,333.021392,1.4,0.110434,5.673469,7.981549,8.313844,23.811762,0.533333,1.0,7.276075
