# Clean Model

This notebook is used to train a combined model with the clustering and the Random forest Model. The purpose of this model is to have a single Object which can then be nicely integrated to the webservice.

# Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from tslearn.clustering import TimeSeriesKMeans

# Load Data
Since Ronny and I use the same preprocessing, I used the preprocessed data, which he exported.

In [3]:
train = np.load('../train.npz')
test = np.load('../test.npz')

X_train = train['X']
y_train = train['y']
X_test = test['X']
y_test = test['y']

# Data

In [4]:
X_train.shape

(33887, 43, 13)

As we can see we work with 33887 samples to train our model on. Each sample is a matrix of size 43x13, where the 13 indicates the number of features we have per sample and the 43 are the number of measurement points in each sample (43 points ~ 2.5 Seconds).

The 13 features are divied into 4 sensors, with every sensor measuring 3 Axis except the last one, which is records a fourth axis.
The sensors used Are:
Accerometer, Magnetometer, Gyroscope, and an Orientation Sensor.

# Model

I decided to go with a bit more of a special Model, which first fits a Dynamic Time Warping Clustering model on each feature, and then uses the resulting distances to the centroids as input for a random Forest Classifier. I also expanded the capabilities a bit, so that one can add commen statistical parameters like var, min, max... to the data before it goes into the random forest.

In [13]:
class ClusteringForest():
    def __init__(self, combineSensors=True, useEuclidCombineDistance=True, useStatisialParameters=False, useAllActivities=True, seed=420):
        self.combineOnSensor = combineSensors
        self.useEuclidCombineDistance = useEuclidCombineDistance
        self.useStatisialParameters = useStatisialParameters
        self.useAllActivities = useAllActivities
        self.seed = seed
        self.numberOfActivities = 7 if useAllActivities else 4
        self.numberOfSensors = 4 if combineSensors else 13
        self.models = None
        self.best_estimator = None
    

    def fit(self, X, y, maxIter=5, n_jobs=-1):
        # preprocess the data as needed
        preprocessedX = self.preprocess_X(X)

        self.fit_time_series_clustering(preprocessedX, maxIter=maxIter, n_jobs=n_jobs)
        distances = self.get_distances(preprocessedX)

        self.fit_random_forest(distances, y, n_jobs=n_jobs)
        return self

    def fit_time_series_clustering(self, preprocessedX, maxIter=5, n_jobs=-1):
        self.models = []
        for i in range(4):
            print("fitting model number {}".format(i))
            model = TimeSeriesKMeans(n_clusters=self.numberOfActivities, metric="dtw",
                                    max_iter=maxIter, random_state=self.seed, verbose=False, n_jobs=n_jobs)  # use number of activities as number of clusters
            model.fit(preprocessedX[:, :, i])
            self.models.append(model)

    def plot_centroids(self, names, size = (20, 20)):
        fig, ax = plt.subplots(self.numberOfActivities, self.numberOfSensors, figsize=size)
        # model names
        # plot the centroids for each model
        for i in range(self.numberOfSensors):
            for j in range(self.numberOfActivities):
                ax[j, i].plot(self.models[i].cluster_centers_[j])
                
            # set the tiltle per column of subplot
            ax[0, i].set_title(names[i])

    def fit_random_forest(self, distances, y_train, n_jobs=-1):
        random_forest = RandomForestClassifier(random_state=self.seed)
        # create random forest grid search
        param_grid = {
            'n_estimators': [100, 300, 500],
            'max_depth': [15, 30, 45],
            'min_samples_split': [2, 6],
        }

        # run gridSerach
        grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5, n_jobs=-n_jobs, verbose=5)
        grid_search.fit(distances, y_train)
        self.best_estimator = grid_search.best_estimator_

    def preprocess_X(self, X):
        if (self.combineOnSensor):
            if (self.useEuclidCombineDistance):
                pre_pros = self.combineAxisOnSensorsEuclidLength(X)
            else:
                pre_pros = self.combineAxisOnSensors(X)
        else:
            pre_pros = X
        return pre_pros

    def predict(self, X):
        if (self.best_estimator is None):
            return None
        
        prepros = self.preprocess_X(X)

        # stack distances horizontally
        distances = self.get_distances(prepros)

        return self.best_estimator.predict(distances)

    def get_distances(self, X):
        distances = []
        for i in range(len(self.models)):
            distances.append(self.models[i].transform(X[:, :, i]))

        distances = np.hstack(distances)
        if (self.useStatisialParameters):
            # stack statistical parameters to distances
            distances = np.hstack((distances, ClusteringForest.calclateStatical_measurements(X)))
        return distances

    def score(self, X, y):
        if (self.best_estimator is None):
            return None
        else:
            prepros = self.preprocess_X(X)
            distances = self.get_distances(prepros)
            return self.best_estimator.score(distances, y)
    
    @staticmethod
    def calclateStatical_measurements(X):
        # calculate mean, min, max and std for each sensor time series
        statsMeasurements = []
        for j in range(X.shape[0]): # for each sample
            statsMeasurements.append([]) # add empty list for each sample
            for i in range(X.shape[-1]): # for each sensor
                statsMeasurements[j].append(np.mean(X[j, :, i]))
                statsMeasurements[j].append(np.min(X[j, :, i]))
                statsMeasurements[j].append(np.max(X[j, :, i]))
                statsMeasurements[j].append(np.std(X[j, :, i]))
        return statsMeasurements
        

    @staticmethod
    def combineAxisOnSensors(input):
        s, n, f = input.shape
        output = np.zeros((s, n, 4))
        # for each sample, sum rows 0-2
        output[:, :, 0] = input[:, :, 0] + input[:, :, 1] + input[:, :, 2]
        # for each sample, sum rows 3-5
        output[:, :, 1] = input[:, :, 3] + input[:, :, 4] + input[:, :, 5]
        # for each sample, sum rows 6-8
        output[:, :, 2] = input[:, :, 6] + input[:, :, 7] + input[:, :, 8]
        # for each sample, sum rows 9-12
        output[:, :, 3] = input[:, :, 9] + input[:, :, 10] + input[:, :, 11] + input[:, :, 12]
        return output

    @staticmethod
    def combineAxisOnSensorsEuclidLength(input):
        s, n, _ = input.shape
        output = np.zeros((s, n, 4))
        # for each sample, sum rows 0-2
        output[:, :, 0] = np.sqrt(input[:, :, 0] ** 2 + input[:, :, 1] ** 2 + input[:, :, 2] ** 2)
        # for each sample, sum rows 3-5
        output[:, :, 1] = np.sqrt(input[:, :, 3] ** 2 + input[:, :, 4] ** 2 + input[:, :, 5] ** 2)
        # for each sample, sum rows 6-8
        output[:, :, 2] = np.sqrt(input[:, :, 6] ** 2 + input[:, :, 7] ** 2 + input[:, :, 8] ** 2)
        # for each sample, sum rows 9-12
        output[:, :, 3] = np.sqrt(input[:, :, 9] ** 2 + input[:, :, 10] ** 2 + input[:, :, 11] ** 2 + input[:, :, 12] ** 2)
        return output
    

# Fit the Model

Now we want to fit the model, for the we use our wrapper class and train it right there. To test the implementation we just use a very small portion of the data.

In [14]:
# randomly select 20% of samples to train on
def getTrainingSamples(X, y, seed=0):
    np.random.seed(seed)
    indices = np.random.permutation(X.shape[0])
    training_idx = indices[:int(X.shape[0] * 0.02)]
    return X[training_idx], y[training_idx]

small_X, small_y = getTrainingSamples(X_train, y_train, seed=0)

basemodel = ClusteringForest(useStatisialParameters=True)
basemodel.fit(small_X, small_y)

fitting model number 0
fitting model number 1
fitting model number 2
fitting model number 3




Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV 2/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV 3/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV 4/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV 5/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV 1/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=   0.5s
[CV 2/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=   0.5s
[CV 3/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=   0.5s
[CV 4/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=   0.5s
[CV 5/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=   0.5s
[CV 1/5] END max_depth=15, min_samples_split=2, n_estimators=500; total time=   0.9s
[CV 

<__main__.ClusteringForest at 0x7fd6c12cc8e0>

In [15]:
basemodel.score(X_test, y_test)



0.7707743153918791

In [16]:
basemodel.predict(X_test)



array([1, 3, 5, ..., 6, 5, 6])

Now that we see the model works, let it run with all samples and different parameters and see how it performs.

In [17]:
import pickle
# safe model to file
with open('basemodel.pkl', 'wb') as f:
    pickle.dump(basemodel, f)


# Train the real Models

Now that we know our model works, we are ready to train it on the whole data. I will just train a bunch of models with different parameters and then choose the one with the best score. Normally we would need to use an extra validation set to choose the best model on becasue our decision now could be biased towards the test set. However since our test set is very large and the diffrences in accuracy are not all to small, the probability of choosing the wrong model becuase it overfits the test set vanishes.

In [23]:
euclid_no_stats_model = ClusteringForest(useStatisialParameters=False, useEuclidCombineDistance=True)
euclid_no_stats_model.fit(X_train, y_train)
print(euclid_no_stats_model.score(X_test, y_test))

# save file as pickle
with open('euclid_no_stats_model.pkl', 'wb') as f:
    pickle.dump(euclid_no_stats_model, f)

fitting model number 0
fitting model number 1
fitting model number 2
fitting model number 3




Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=   9.1s
[CV 2/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=   9.2s
[CV 3/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=   9.1s
[CV 4/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=   9.1s
[CV 5/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=   9.2s
[CV 1/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  27.5s
[CV 2/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  27.2s
[CV 3/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  27.1s
[CV 4/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  27.1s
[CV 5/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  27.1s
[CV 1/5] END max_depth=15, min_samples_split=2, n_estimators=500; total time=  45.1s
[CV 



0.846081208687441


In [24]:
euclid_with_stats_model = ClusteringForest(useStatisialParameters=True, useEuclidCombineDistance=True)
euclid_with_stats_model.fit(X_train, y_train)
print(euclid_with_stats_model.score(X_test, y_test))

# save file as pickle
with open('euclid_with_stats_model.pkl', 'wb') as f:
    pickle.dump(euclid_with_stats_model, f)

fitting model number 0
fitting model number 1
fitting model number 2
fitting model number 3




Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=  10.6s
[CV 2/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=  10.7s
[CV 3/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=  10.6s
[CV 4/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=  10.6s
[CV 5/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=  10.6s
[CV 1/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  31.9s
[CV 2/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  32.1s
[CV 3/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  32.1s
[CV 4/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  32.2s
[CV 5/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  31.9s
[CV 1/5] END max_depth=15, min_samples_split=2, n_estimators=500; total time=  53.2s
[CV 



0.8884560906515581


In [18]:
normal_with_stats_model = ClusteringForest(useStatisialParameters=True, useEuclidCombineDistance=False)
normal_with_stats_model.fit(X_train, y_train)
print(normal_with_stats_model.score(X_test, y_test))

# save file as pickle
with open('normal_with_stats_model.pkl', 'wb') as f:
    pickle.dump(normal_with_stats_model, f)

fitting model number 0
fitting model number 1
fitting model number 2
fitting model number 3




Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=  11.8s
[CV 2/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=  11.6s
[CV 3/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=  11.6s
[CV 4/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=  11.5s
[CV 5/5] END max_depth=15, min_samples_split=2, n_estimators=100; total time=  11.6s
[CV 1/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  34.8s
[CV 2/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  34.8s
[CV 3/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  34.7s
[CV 4/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  34.7s
[CV 5/5] END max_depth=15, min_samples_split=2, n_estimators=300; total time=  34.7s
[CV 1/5] END max_depth=15, min_samples_split=2, n_estimators=500; total time=  57.8s
[CV 



0.9147780925401322


Looks like we found the best model. This will now be integrated in the webservice via pickle file.