In [1]:
import turicreate as tc
import numpy as np

In [2]:
tc.__version__

'6.4.1'

In [3]:
from pymongo import MongoClient

In [4]:
client = MongoClient(serverSelectionTimeoutMS=50)
db = client.turidatabase

In [5]:
db.labeledinstances.find({'dsid': 4})

<pymongo.cursor.Cursor at 0x7fc8100c2350>

In [6]:
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=50), 'turidatabase')

In [7]:
def get_dataset_data(dsid):
    features = []
    labels = []
    
    for a in db.labeledinstances.find({"dsid": dsid}):
        features.append([float(val) for val in a['feature']])
        labels.append(a['label'])
        +
    data = {'target': labels, 'sequence':np.array(features)}
    
    return data


def get_dataset_sframe(dsid):
    data = get_dataset_data(dsid)
    return tc.SFrame(data=data)


In [8]:
dsid_4_sframe = get_dataset_sframe(4)

## Create a defualt model

In [9]:
moel = tc.classifier.create(dsid_4_sframe, target='target')

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: The following methods are available for this type of problem.
PROGRESS: BoostedTreesClassifier, RandomForestClassifier, DecisionTreeClassifier, LogisticClassifier
PROGRESS: The returned model will be chosen according to validation accuracy.


PROGRESS: Model selection based on validation accuracy:
PROGRESS: ---------------------------------------------
PROGRESS: BoostedTreesClassifier          : 0.7956989247311828
PROGRESS: RandomForestClassifier          : 0.7096774193548387
PROGRESS: DecisionTreeClassifier          : 0.5698924731182796
PROGRESS: LogisticClassifier              : 0.7526881720430108
PROGRESS: ---------------------------------------------
PROGRESS: Selecting BoostedTreesClassifier based on validation set performance.


In [10]:
from sklearn.decomposition import PCA

data = get_dataset_data(4)
X = data['sequence']
pca = PCA(n_components=150)
pca.fit(X) # fit data and then transform it
X_pca = pca.transform(X)

data['sequence'] = X_pca
s_frame = tc.SFrame(data=data)

moel = tc.classifier.create(s_frame, target='target')

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: The following methods are available for this type of problem.
PROGRESS: BoostedTreesClassifier, RandomForestClassifier, DecisionTreeClassifier, LogisticClassifier
PROGRESS: The returned model will be chosen according to validation accuracy.


PROGRESS: Model selection based on validation accuracy:
PROGRESS: ---------------------------------------------
PROGRESS: BoostedTreesClassifier          : 0.8279569892473119
PROGRESS: RandomForestClassifier          : 0.8172043010752689
PROGRESS: DecisionTreeClassifier          : 0.7204301075268817
PROGRESS: LogisticClassifier              : 0.7634408602150538
PROGRESS: ---------------------------------------------
PROGRESS: Selecting BoostedTreesClassifier based on validation set performance.


## Trying new appraoch

In [11]:
from sklearn import __version__ as sklearn_version
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Split data for Model development

In [12]:
data = get_dataset_data(4)
X = data['sequence']

encode_rotation = {'x90':0,
                  'xNeg90':1,
                  'x180':2,
                  'xNeg180':3,
                  'y90':4,
                  'yNeg90':5,
                  'y180':6,
                  'yNeg180':7,
                  'z90':8,
                  'zNeg90':9,
                  'z180':10,
                  'zNeg180':11}

y = np.array([ encode_rotation[s] for s in data['target']])

In [13]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [14]:
def get_mlp_1(x, y, ephocs=150):
    clf = MLPClassifier(hidden_layer_sizes=(50, 25), 
                        activation='relu', # compare to sigmoid
                        solver='adam',
                        alpha=1e-4, # L2 penalty
                        batch_size='auto', # min of 200, num_samples
                        learning_rate='constant', 
                        #learning_rate_init=0.2, # only SGD
                        #power_t=0.5,    # only SGD
                        max_iter=ephocs, 
                        shuffle=True, 
                        random_state=1, 
                        tol=1e-9, # for stopping
                        verbose=False, 
                        warm_start=False, 
                        momentum=0.9, # only SGD
                        #nesterovs_momentum=True, # only SGD
                        early_stopping=False, 
                        validation_fraction=0.1, # only if early_stop is true
                        beta_1=0.9, # adam decay rate of moment
                        beta_2=0.999, # adam decay rate of moment
                        epsilon=1e-08) # adam numerical stabilizer
    clf.fit(x, y)
    return clf

In [15]:
%time clf = get_mlp_1(x_train, y_train)

CPU times: user 9.43 s, sys: 1.08 s, total: 10.5 s
Wall time: 1.36 s




In [16]:

yhat = clf.predict(x_train)
print('Validation Acc:',accuracy_score(yhat,y_train))

Validation Acc: 0.9993211133740665


In [17]:
yhat = clf.predict(x_test)
print('Validation Acc:',accuracy_score(yhat,y_test))

Validation Acc: 0.8617886178861789


## Perform Strat K Fold

In [18]:
from sklearn.model_selection import StratifiedKFold
data = get_dataset_data(4)
X = data['sequence']
y = np.array([ encode_rotation[s] for s in data['target']])

In [19]:
kfold = StratifiedKFold(n_splits=10).split(X, y)

In [20]:
skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

accs = []
for train, test in skf.split(X,y):
    x_train, x_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    
    clf = get_mlp_1(x_train, y_train, ephocs=500)
    yhat = clf.predict(x_test)
    accs.append(accuracy_score(yhat, y_test))
    
for a in accs:
    print("Acc:", a)



Acc: 0.9135135135135135
Acc: 0.8702702702702703
Acc: 0.9130434782608695
Acc: 0.8532608695652174
Acc: 0.8641304347826086
Acc: 0.8967391304347826
Acc: 0.9184782608695652
Acc: 0.8532608695652174
Acc: 0.8478260869565217
Acc: 0.8913043478260869




# Attempt 2 with a Deeper Network

In [21]:
def get_mlp_2(x, y, ephocs=150):
    clf = MLPClassifier(hidden_layer_sizes=(100, 50, 25, 10), 
                        activation='relu', # compare to sigmoid
                        solver='adam', 
                        alpha=1e-4, # L2 penalty
                        batch_size='auto', # min of 200, num_samples
                        learning_rate='constant', 
                        #learning_rate_init=0.2, # only SGD
                        #power_t=0.5,    # only SGD
                        max_iter=ephocs, 
                        shuffle=True, 
                        random_state=1, 
                        tol=1e-9, # for stopping
                        verbose=False, 
                        warm_start=False, 
                        momentum=0.9, # only SGD
                        #nesterovs_momentum=True, # only SGD
                        early_stopping=False, 
                        validation_fraction=0.1, # only if early_stop is true
                        beta_1=0.9, # adam decay rate of moment
                        beta_2=0.999, # adam decay rate of moment
                        epsilon=1e-08) # adam numerical stabilizer
    clf.fit(x, y)
    return clf

In [31]:
%%time
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
clf = get_mlp_2(x_train, y_train)
yhat = clf.predict(x_test)
print('Validation Acc:',accuracy_score(yhat,y_test))

Validation Acc: 0.8401084010840109
CPU times: user 17.6 s, sys: 2.23 s, total: 19.8 s
Wall time: 2.71 s




In [33]:
import pickle
pickle.dump(clf, open('.\model1_4.save', 'wb'))

clf1_2 = pickle.load(open('.\model1_4.save', 'rb'))

yhat = clf.predict(x_test)
print('Validation Acc:',accuracy_score(yhat,y_test))

Validation Acc: 0.8401084010840109


### predict 1

In [43]:
x = x_train[1]

x = x.reshape(1, -1)
yhat = clf1_2.predict(x)
yhat[0]

0

In [23]:
%%time
skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

accs = []
for train, test in skf.split(X,y):
    x_train, x_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    
    clf = get_mlp_2(x_train, y_train, ephocs=500)
    yhat = clf.predict(x_test)
    accs.append(accuracy_score(yhat, y_test))
    
for a in accs:
    print("Acc:", a)
print("Mean:",np.mean(accs))



Acc: 0.8918918918918919
Acc: 0.8702702702702703
Acc: 0.9021739130434783
Acc: 0.8858695652173914
Acc: 0.875
Acc: 0.8695652173913043
Acc: 0.907608695652174
Acc: 0.8695652173913043
Acc: 0.8478260869565217
Acc: 0.8858695652173914
Mean: 0.8805640423031728
CPU times: user 10min 53s, sys: 1min 14s, total: 12min 8s
Wall time: 1min 35s


