In [13]:
import turicreate as tc
import numpy as np

In [14]:
tc.__version__

'6.4.1'

In [15]:
from pymongo import MongoClient

In [16]:
# Connect to MongoDB with no explicit host (the `db` repr shown further down
# reports localhost:27017). serverSelectionTimeoutMS=50 makes server
# selection give up after 50 ms instead of the driver's default wait.
client = MongoClient(serverSelectionTimeoutMS=50)
# Handle to the `turidatabase` database; every query below goes through it.
db = client.turidatabase

In [39]:
# Query the `labeledinstances` collection for dsid 5. The cell only displays
# the resulting Cursor object (see output), not the matching documents.
# NOTE(review): every other cell in this notebook works with dsid 4.
db.labeledinstances.find({'dsid': 5})

<pymongo.cursor.Cursor at 0x7f9b2bed18d0>

In [40]:
# Display the database handle to confirm the host and client options in use.
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=50), 'turidatabase')

In [41]:
def get_dataset_data(dsid):
    """Fetch every labeled instance of dataset ``dsid`` from MongoDB.

    Reads the ``labeledinstances`` collection of the module-level ``db``
    handle, keeping documents whose ``dsid`` field equals ``dsid``.

    Returns a dict with two entries:
      * ``'target'``   -- list of each document's ``label`` value, in cursor order
      * ``'sequence'`` -- ``np.array`` built from each document's ``feature``
        values, every value converted with ``float``
    """
    cursor = db.labeledinstances.find({"dsid": dsid})

    sequences, targets = [], []
    for doc in cursor:
        # Coerce each stored feature value to a Python float.
        sequences.append(list(map(float, doc['feature'])))
        targets.append(doc['label'])

    return {'target': targets, 'sequence': np.array(sequences)}


def get_dataset_sframe(dsid):
    """Return dataset ``dsid`` as a Turi Create ``SFrame``.

    Thin wrapper: the dict produced by ``get_dataset_data(dsid)`` (keys
    ``'target'`` and ``'sequence'``) is handed to ``tc.SFrame`` unchanged.
    """
    return tc.SFrame(data=get_dataset_data(dsid))


In [42]:
# Load dataset 4 from MongoDB as an SFrame with `target` and `sequence` columns.
dsid_4_sframe = get_dataset_sframe(4)

## Create a default model

In [43]:
# Baseline: let Turi Create try its available classifier types on the raw
# features and keep the one with the best validation accuracy (see the
# PROGRESS output below).
# NOTE(review): `moel` looks like a typo for `model`; no cell shown here reads it.
moel = tc.classifier.create(dsid_4_sframe, target='target')

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: The following methods are available for this type of problem.
PROGRESS: BoostedTreesClassifier, RandomForestClassifier, DecisionTreeClassifier, LogisticClassifier
PROGRESS: The returned model will be chosen according to validation accuracy.


PROGRESS: Model selection based on validation accuracy:
PROGRESS: ---------------------------------------------
PROGRESS: BoostedTreesClassifier          : 0.851063829787234
PROGRESS: RandomForestClassifier          : 0.8297872340425532
PROGRESS: DecisionTreeClassifier          : 0.7446808510638298
PROGRESS: LogisticClassifier              : 0.8085106382978723
PROGRESS: ---------------------------------------------
PROGRESS: Selecting BoostedTreesClassifier based on validation set performance.


In [44]:
from sklearn.decomposition import PCA

# Experiment: reduce the feature vectors with PCA before handing them to the
# same Turi Create auto-classifier used for the baseline above.
data = get_dataset_data(4)
X = data['sequence']
# NOTE(review): 150 components is hard-coded; presumably the data has at
# least 150 features and 150 rows -- confirm against the dataset.
pca = PCA(n_components=150)
pca.fit(X) # fit data and then transform it
X_pca = pca.transform(X)

# Replace the raw features in the dict with their PCA projection.
data['sequence'] = X_pca
s_frame = tc.SFrame(data=data)

# Overwrites the baseline model stored in `moel` by the previous cell.
moel = tc.classifier.create(s_frame, target='target')

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: The following methods are available for this type of problem.
PROGRESS: BoostedTreesClassifier, RandomForestClassifier, DecisionTreeClassifier, LogisticClassifier
PROGRESS: The returned model will be chosen according to validation accuracy.


PROGRESS: Model selection based on validation accuracy:
PROGRESS: ---------------------------------------------
PROGRESS: BoostedTreesClassifier          : 0.8191489361702128
PROGRESS: RandomForestClassifier          : 0.7978723404255319
PROGRESS: DecisionTreeClassifier          : 0.7127659574468085
PROGRESS: LogisticClassifier              : 0.7340425531914894
PROGRESS: ---------------------------------------------
PROGRESS: Selecting BoostedTreesClassifier based on validation set performance.


## Trying a new approach

In [45]:
from sklearn import __version__ as sklearn_version
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Split data for Model development

In [46]:
# Re-read dataset 4 so X holds the raw (non-PCA) feature matrix again.
data = get_dataset_data(4)
X = data['sequence']

# Map each rotation label string to an integer class id (0-11) for scikit-learn.
encode_rotation = {'x90':0,
                  'xNeg90':1,
                  'x180':2,
                  'xNeg180':3,
                  'y90':4,
                  'yNeg90':5,
                  'y180':6,
                  'yNeg180':7,
                  'z90':8,
                  'zNeg90':9,
                  'z180':10,
                  'zNeg180':11}

# Integer-encoded targets; a label missing from the mapping raises KeyError.
y = np.array([ encode_rotation[s] for s in data['target']])

In [47]:
# Hold out 20% of the samples for evaluation, stratified on the class label so
# both parts keep the same class proportions. The split is seeded (same seed
# as the classifiers and the K-fold splitter) so the accuracies reported
# below can be reproduced on a re-run.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1
)

In [48]:
def get_mlp_1(x, y, ephocs=150, hidden_layer_sizes=(50, 25)):
    """Train and return a scikit-learn ``MLPClassifier`` fitted on ``(x, y)``.

    Parameters
    ----------
    x : array-like
        Feature matrix, passed unchanged to ``MLPClassifier.fit``.
    y : array-like
        Target labels, passed unchanged to ``MLPClassifier.fit``.
    ephocs : int, default 150
        Forwarded as ``max_iter`` (the solver's iteration cap). The spelling
        is kept as-is because callers pass it by keyword.
    hidden_layer_sizes : tuple of int, default (50, 25)
        Width of each hidden layer. The default reproduces the original
        two-layer network; pass a different tuple to try other depths
        without duplicating this function.

    Returns
    -------
    MLPClassifier
        The fitted classifier.
    """
    clf = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
                        activation='relu', # compare to sigmoid
                        solver='adam',
                        alpha=1e-4, # L2 penalty
                        batch_size='auto', # min of 200, num_samples
                        learning_rate='constant',
                        #learning_rate_init=0.2, # only SGD
                        #power_t=0.5,    # only SGD
                        max_iter=ephocs,
                        shuffle=True,
                        random_state=1, # fixed seed for weight init / shuffling
                        tol=1e-9, # for stopping
                        verbose=False,
                        warm_start=False,
                        momentum=0.9, # only SGD
                        #nesterovs_momentum=True, # only SGD
                        early_stopping=False,
                        validation_fraction=0.1, # only if early_stop is true
                        beta_1=0.9, # adam decay rate of moment
                        beta_2=0.999, # adam decay rate of moment
                        epsilon=1e-08) # adam numerical stabilizer
    clf.fit(x, y)
    return clf

In [49]:
# Fit the two-hidden-layer MLP on the training split and report how long it took.
%time clf = get_mlp_1(x_train, y_train)

CPU times: user 4.37 s, sys: 6.34 s, total: 10.7 s
Wall time: 1.62 s




In [50]:

# Accuracy on the very samples the model was fitted on. This is training
# accuracy, not validation accuracy, so it is labeled as such; compare it with
# the held-out score in the next cell to gauge overfitting.
yhat = clf.predict(x_train)
print('Training Acc:',accuracy_score(y_train, yhat))

Validation Acc: 0.9993297587131368


In [51]:
# Accuracy on the 20% split that was held out from fitting.
yhat = clf.predict(x_test)
print('Validation Acc:',accuracy_score(yhat,y_test))

Validation Acc: 0.9144385026737968


## Perform Stratified K-Fold Cross-Validation

In [52]:
from sklearn.model_selection import StratifiedKFold
# Rebuild the full feature matrix and integer targets for cross-validation.
# This repeats the dataset-4 query and uses `encode_rotation` from the earlier cell.
data = get_dataset_data(4)
X = data['sequence']
y = np.array([ encode_rotation[s] for s in data['target']])

In [53]:
# 10-fold stratified split created without a shuffle or seed argument.
# NOTE(review): `kfold` is not consumed by any cell shown here; the next cell
# builds its own shuffled, seeded splitter instead.
kfold = StratifiedKFold(n_splits=10).split(X, y)

In [54]:
# 10-fold stratified cross-validation of the first MLP architecture.
# Shuffling is seeded (random_state=1) so the fold assignment is repeatable.
skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

accs = []  # held-out accuracy of each fold, in fold order
for train, test in skf.split(X,y):
    # `train` / `test` index the rows of X and y belonging to this fold.
    # Note: this overwrites the x_train/x_test/y_train/y_test (and `clf`)
    # created by the earlier single-split cells.
    x_train, x_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    
    # A new classifier is trained for each fold, with a higher iteration cap
    # (500) than the function's default of 150.
    clf = get_mlp_1(x_train, y_train, ephocs=500)
    yhat = clf.predict(x_test)
    accs.append(accuracy_score(yhat, y_test))
    
# Report the per-fold accuracies.
for a in accs:
    print("Acc:", a)



Acc: 0.8502673796791443
Acc: 0.8609625668449198
Acc: 0.8983957219251337
Acc: 0.8609625668449198
Acc: 0.93048128342246
Acc: 0.8235294117647058
Acc: 0.8978494623655914
Acc: 0.8817204301075269
Acc: 0.9193548387096774
Acc: 0.8924731182795699




# Attempt 2 with a Deeper Network

In [34]:
def get_mlp_2(x, y, ephocs=150):
    """Train and return the deeper ``MLPClassifier`` variant on ``(x, y)``.

    Uses four hidden layers of sizes 100, 50, 25 and 10; every other
    hyper-parameter matches the values used by ``get_mlp_1``.

    ``x`` and ``y`` are passed unchanged to ``MLPClassifier.fit``;
    ``ephocs`` is forwarded as ``max_iter``. Returns the fitted classifier.
    """
    settings = dict(
        hidden_layer_sizes=(100, 50, 25, 10),
        activation='relu',         # compare to sigmoid
        solver='adam',
        alpha=1e-4,                # L2 penalty
        batch_size='auto',         # min of 200, num_samples
        learning_rate='constant',
        max_iter=ephocs,
        shuffle=True,
        random_state=1,
        tol=1e-9,                  # for stopping
        verbose=False,
        warm_start=False,
        momentum=0.9,              # only SGD
        early_stopping=False,
        validation_fraction=0.1,   # only if early_stop is true
        beta_1=0.9,                # adam decay rate of moment
        beta_2=0.999,              # adam decay rate of moment
        epsilon=1e-08,             # adam numerical stabilizer
    )
    network = MLPClassifier(**settings)
    network.fit(x, y)
    return network

In [35]:
%%time
# Evaluate the deeper network on a fresh stratified 80/20 split. The split is
# seeded so the reported accuracy can be reproduced on a re-run.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1
)
clf = get_mlp_2(x_train, y_train)
yhat = clf.predict(x_test)
# accuracy_score takes (y_true, y_pred).
print('Validation Acc:',accuracy_score(y_test, yhat))

Validation Acc: 0.8823529411764706
CPU times: user 9.04 s, sys: 11.6 s, total: 20.6 s
Wall time: 2.98 s




In [55]:
import pickle
# Persist the most recently trained classifier. `with` blocks make sure the
# file handles are closed (and the dump flushed) before the file is reopened.
with open('./model1_4.save', 'wb') as model_file:
    pickle.dump(clf, model_file)

# Reload the file written just above. pickle.load executes code embedded in
# the file, so only ever load model files from a trusted source.
with open('./model1_4.save', 'rb') as model_file:
    clf1_2 = pickle.load(model_file)

# Score with the *reloaded* model so this cell actually verifies the
# save/load round-trip (the original predicted with the in-memory `clf`).
yhat = clf1_2.predict(x_test)
print('Validation Acc:',accuracy_score(yhat,y_test))

Validation Acc: 0.8924731182795699


# Results


The deeper model did not give a meaningful increase in accuracy, so we are going to continue with the first model.