# EMG data for gestures

In [48]:
## 0. Importing libraries

# Stop warnings from scikit-learn
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

#==== Importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
# from sklearn.model_selection import GridSearchCV



In [None]:
## 1. defining functions for main programm - part 1

def getdata(): #========= importing data from different files
    
    filename = 'E:\\ML\\ML test 1 - UCI database - EMG data for gestures Data Set\\train_data_set2.csv'
    dataset = pd.read_csv(filename, sep = ';')
    X = dataset.iloc[:,:-1].values
    y = dataset.iloc[:,-1].values
    y.reshape((y.shape[0],1))
    
    print('Data loaded...')

    return X,y

def prepdata(X,y): #==== preparing data for Model
    
    #--- Splitting the dataset into the Training set and Test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 123)
    y.reshape((y.shape[0],1))

    #--- Feature Scaling
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    print('Data prepared...')

    return X_train, X_test, y_train, y_test

def makeandrunmodel(X_train, X_test, y_train, y_test): #==== Preparing and fitting Model

    #--- Logistic Regression
    
    classifier = RandomForestClassifier(n_estimators = 4, criterion = 'entropy', random_state = 123)
    classifier.fit(X_train,y_train)

    #--- Predicting the Test set results
    y_pred = classifier.predict(X_test)
    print('Model compiled and fitted...')
    
    return y_pred, classifier

def showresults(y_test, y_pred, y): #=== presenting results
    
    print('\nResults: ')
    #--- Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)

    #--- Calculate ratio
    ttp = 0
    for i in range(int(y.max())+1):
        tp = cm[i,i]
        n = np.sum(cm[:,i])
        acc = tp/n
        ttp += cm[i,i]
        print('{}) {} There are {} predictions of {} which is {:0.2f}% of the total amount.'.format(i+1,cm[i], tp, i, acc*100))
    print('Total accuracy of {:0.2f}%\n'.format(ttp/sum(sum(cm))*100))

In [49]:
# MAIN PROGRAMM - part 1

X,y = getdata() # load data
X_train, X_test, y_train, y_test = prepdata(X,y) # prep data
y_pred, classifier = makeandrunmodel(X_train, X_test, y_train, y_test) # compile and run model
showresults(y_test, y_pred, y) # show results

Data loaded...
Data prepared...
Model compiled and fitted...

Results: 
1) [620   6   5   0   0   2] There are 620 predictions of 0 which is 85.28% of the total amount.
2) [ 12 542  85   9   4  43] There are 542 predictions of 1 which is 62.08% of the total amount.
3) [ 34 131 963  28   6  70] There are 963 predictions of 2 which is 77.16% of the total amount.
4) [ 24  55  58 798  68  56] There are 798 predictions of 3 which is 73.08% of the total amount.
5) [ 12  13  24 169 237  11] There are 237 predictions of 4 which is 71.17% of the total amount.
6) [ 25 126 113  88  18 438] There are 438 predictions of 5 which is 70.65% of the total amount.
Total accuracy of 73.53%



In [52]:
# Defining functions for additional programm: randomsearch for the best parameters

def makegrid():
    # Number of trees in random forest
    n_estimators = [2, 8, 32, 64, 256, 512, 1024]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10, 20]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1,2,4,8]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    
    return random_grid

def randomsearchfit(classifier, random_grid, X_train, y_train):
    
    # search across 100 different combinations, and use all available cores
    classifier_random = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    # Fit the random search model
    classifier_random.fit(X_train, y_train)
    
    print(classifier_random.best_params_)
    
    return classifier_random.best_estimator_

def runbasemodel(X_train, y_train, X_test, y_test):
    base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
    base_model.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    print('Results of basemodel')
    showresults(y_test, y_pred, y)

def runbestmodel(best_random, X_train, y_train, X_test, y_test):
    y_pred = best_random.predict(X_test)
    print('Results of best model')
    showresults(y_test, y_pred, y)

In [53]:
# ADDITIONAL PROGRAMM - randomsearch for the best parameters

random_grid = makegrid()  # define the grid to be searched
best_random = randomsearchfit(classifier, random_grid, X_train, y_train)  # randomsearchfit with best parameters as output
runbasemodel(X_train, y_train, X_test, y_test)  # accuracy of a basemodel for comparison
runbestmodel(best_random, X_train, y_train, X_test, y_test)  # accuracy of a model with the best parameters according to the search


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 20.3min finished


{'n_estimators': 512, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 70, 'bootstrap': False}
Results of basemodel

Results: 
1) [620   6   5   0   0   2] There are 620 predictions of 0 which is 85.28% of the total amount.
2) [ 12 542  85   9   4  43] There are 542 predictions of 1 which is 62.08% of the total amount.
3) [ 34 131 963  28   6  70] There are 963 predictions of 2 which is 77.16% of the total amount.
4) [ 24  55  58 798  68  56] There are 798 predictions of 3 which is 73.08% of the total amount.
5) [ 12  13  24 169 237  11] There are 237 predictions of 4 which is 71.17% of the total amount.
6) [ 25 126 113  88  18 438] There are 438 predictions of 5 which is 70.65% of the total amount.
Total accuracy of 73.53%

Results of best model

Results: 
1) [624   3   6   0   0   0] There are 624 predictions of 0 which is 98.11% of the total amount.
2) [  3 630  43   1   1  17] There are 630 predictions of 1 which is 89.87% of the total amount.
3) 

In [None]:
## Next step can be a gridsearch with the module 'GridSearchCV' to narrow down the best parameters.