# Extract features, retrain Sherlock and generate predictions.

### Necessary imports

In [2]:
import pandas as pd
import numpy as np
import sys
import pickle
import matplotlib.pyplot as plt
from ast import literal_eval
from sklearn.metrics import f1_score
sys.path.append("..")

In [3]:
from src.features.build_features import build_features
from src.deploy.train_sherlock import train_sherlock
from src.deploy.predict_sherlock import predict_sherlock

  '{0}.{1}.{2}'.format(*version.hdf5_built_version_tuple)


### Load small raw data sample and corresponding labels

In [4]:
# Both CSV files are header-less and keyed by their first column, so they
# are read with one shared set of parser options.
csv_options = dict(sep=',', index_col=0, header=None)
data = pd.read_csv('../data/raw/test_values.csv', **csv_options)
labs = pd.read_csv('../data/raw/test_labels.csv', **csv_options)

In [5]:
# Preview the raw values: a single column whose entries are still list literals stored as strings.
data.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
20368,"['Central Missouri', 'unattached', 'unattached..."
664102,"[95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, ..."
366813,"['Katie Crews', 'Christian Hiraldo', 'Alex Est..."
530567,"['Christian', 'Non-Christian', 'Unreported', '..."
176253,"['AAF-McQuay Canada Inc.', 'AAF-McQuay Canada ..."


In [27]:
# Parse each stringified list in the first column into a real Python list.
# This cell rebinds `data` from a DataFrame to a Series, so the type guard keeps
# it safe to re-run: a second execution would otherwise fail on `.iloc[:, 0]`,
# which is not valid indexing for a Series.
if isinstance(data, pd.DataFrame):
    data = data.iloc[:, 0].apply(literal_eval)

In [27]:
# Preview again after parsing: each entry is now a Python list rather than a string.
data.head()

0
20368     [Central Missouri, unattached, unattached, Kan...
664102    [95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, ...
366813    [Katie Crews, Christian Hiraldo, Alex Estrada,...
530567    [Christian, Non-Christian, Unreported, Jewish,...
176253    [AAF-McQuay Canada Inc., AAF-McQuay Canada Inc...
Name: 1, dtype: object

In [24]:
# Preview the labels: one label string per index value.
labs.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
20368,affiliation
664102,weight
366813,jockey
530567,religion
176253,company


### Extract features of raw data samples from dataframes and save preprocessed data

In [28]:
# Inspect the first parsed sample: a plain list of strings.
data.values[0]

['Central Missouri',
 'unattached',
 'unattached',
 'Kansas State University',
 'unattached',
 'North Dakota State',
 'Nike']

In [60]:
# Compute the feature representation for every sample, and flatten the
# single-column label frame into a 1-D array to use as training targets.
X_train = build_features(data.values)
y_train = labs.values.flatten()
print('Extracted features.')

In [56]:
# Persist the extracted features and the label array, one pickle file each.
for target_path, payload in (
    ('../data/processed/X_train.data', X_train),
    ('../data/processed/y_train.data', y_train),
):
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f)

In [57]:
# Reload the features and labels from the pickle files written by the save cell.
# pickle.load can execute arbitrary code, so only point these paths at files
# you produced yourself.
with open('../data/processed/X_train.data', 'rb') as f:
    X_train = pickle.load(f)
    
with open('../data/processed/y_train.data', 'rb') as f:
    y_train = pickle.load(f)

### Train Sherlock on new data

In [53]:
# For simplicity the training set is also passed as the validation set, so the
# validation metrics reported during fitting are not a held-out estimate.
# The last argument is the id under which the retrained model is referenced
# when predicting later in this notebook.
train_sherlock(X_train, y_train, X_train, y_train, 'retrain_minimal_sample') 
print('Trained new model.')

Successfully loaded and compiled model, now fitting.
Train on 1001 samples, validate on 1001 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100


Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Retrained Sherlock.
Trained new model.


### Generate predictions with the retrained model

In [58]:
# Predict labels using the retrained model (with nn_id retrain_minimal_sample).
# Note that the predictions are made on the same samples the model was trained on.
predicted_labels = predict_sherlock(X_train, 'retrain_minimal_sample')
print('Predicted labels: ', predicted_labels, 'true labels: ', y_train)

Predicted labels:  ['team' 'weight' 'jockey' ... 'company' 'plays' 'jockey'] true labels:  ['affiliation' 'weight' 'jockey' ... 'company' 'plays' 'continent']


In [59]:
# Weighted F1 computed on the training samples themselves, so this is not an
# estimate of performance on unseen data.
f1_score(y_train, predicted_labels, average='weighted')

0.8086928777494169