In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import copy

### Paste data set 

In [None]:
df = pd.read_pickle("data/interim/03_data_features.pkl")

In [None]:
df.info()

In [None]:
# Create a training and test set
df_train = df.drop(["participant", "category", "set"], axis=1)

X = df_train.drop(["label"], axis=1)
y = df_train["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
print(f'Train: {X_train.shape,y_train.shape} Test: {X_test.shape, y_test.shape}')

In [None]:
# Split feature subsets

basic_features = ["acc_x", "acc_y", "acc_z", "gyr_x", "gyr_y", "gyr_z"]
square_features = ["acc_r", "gyr_r"]
pca_features = ["pca_1", "pca_2", "pca_3"]
time_features = [f for f in df_train.columns if "_temp_" in f]
freq_features = [f for f in df_train.columns if (("_freq" in f) or ("_pse" in f))]
cluster_features = ["cluster"]

print("Basic Feature", len(basic_features))
print("Square Feature", len(square_features))
print("PCA Feature", len(pca_features))
print("Time Feature", len(time_features))
print("Frequency Feature", len(freq_features))
print("Cluster Feature", len(cluster_features))

feature_set_1 = list(set(basic_features))
feature_set_2 = list(set(basic_features + square_features + pca_features))
feature_set_3 = list(set(feature_set_2 + time_features))
feature_set_4 = list(set(feature_set_3 + freq_features + cluster_features))


In [None]:
def decision_tree(
        train_X,
        train_y,
        test_X,
        min_samples_leaf=50,
        criterion="gini",
        print_model_details=False,
        export_tree_path="Example_graphs/Chapter7/",
        export_tree_name="tree.dot",
        gridsearch=True,
    ):
        # Create the model
        if gridsearch:
            tuned_parameters = [
                {
                    "min_samples_leaf": [2, 10, 50, 100, 200],
                    "criterion": ["gini", "entropy"],
                }
            ]
            dtree = GridSearchCV(
                DecisionTreeClassifier(), tuned_parameters, cv=5, scoring="accuracy"
            )
        else:
            dtree = DecisionTreeClassifier(
                min_samples_leaf=min_samples_leaf, criterion=criterion
            )

        # Fit the model

        dtree.fit(train_X, train_y.values.ravel())

        if gridsearch and print_model_details:
            print(dtree.best_params_)

        if gridsearch:
            dtree = dtree.best_estimator_

        # Apply the model
        pred_prob_training_y = dtree.predict_proba(train_X)
        pred_prob_test_y = dtree.predict_proba(test_X)
        pred_training_y = dtree.predict(train_X)
        pred_test_y = dtree.predict(test_X)
        frame_prob_training_y = pd.DataFrame(
            pred_prob_training_y, columns=dtree.classes_
        )
        frame_prob_test_y = pd.DataFrame(pred_prob_test_y, columns=dtree.classes_)

        return pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y

In [62]:
#def forward_selection(max_features, X_train, y_train):
        # Start with no features.
max_features = 2
ordered_features = []
ordered_scores = []
selected_features = []
dtree = decision_tree
prev_best_perf = 0

# Select the appropriate number of features.
for i in range(0, max_features):
    print(i)
    print(ordered_features)
    # Determine the features left to select.
    features_left = list(set(X_train.columns) - set(selected_features))
    print(len(features_left))
    best_perf = 0
    best_attribute = ""

    # For all features we can still select...
    for f in features_left:
        temp_selected_features = copy.deepcopy(selected_features)
        temp_selected_features.append(f)

        # Determine the accuracy of a decision tree learner if we were to add
        # the feature.
        (
            pred_y_train,
            pred_y_test,
            prob_training_y,
            prob_test_y,
        ) = dtree(
            X_train[temp_selected_features],
            y_train,
            X_train[temp_selected_features],
        )
        perf = accuracy_score(y_train, pred_y_train)
        

        # If the performance is better than what we have seen so far (we aim for high accuracy)
        # we set the current feature to the best feature and the same for the best performance.
        if perf > best_perf:
            best_perf = perf
            best_feature = f
    # We select the feature with the best performance.
    selected_features.append(best_feature)
    prev_best_perf = best_perf
    ordered_features.append(best_feature)
    ordered_scores.append(best_perf)
        #return selected_features, ordered_features, ordered_scores

0
[]
117
0.7114788004136504 acc_r_freq_2.5_Hz_ws_14
0.7445708376421923 acc_r_pse
0.755601516718373 acc_y_freq_0.357_Hz_ws_14
0.7690451568424681 gyr_z_freq_1.786_Hz_ws_14
0.8300586004825922 pca_2
0.8579800068941744 acc_x
0.8876249569114099 pca_1
1
['pca_1']
116
0.9283005860048259 acc_r_freq_2.5_Hz_ws_14
0.9372630127542226 acc_y_pse
0.9386418476387453 acc_y_freq_0.357_Hz_ws_14
0.9617373319544984 pca_2
0.9762150982419855 duration


In [None]:
# Perform forward feature selection using simple decision tree

max_features = 10

selected_features, ordered_features, ordered_scores = forward_selection(
    max_features, X_train, y_train
)

In [64]:
print("Training neural network,")

Training neural network,
