## 1. Import all the packages

In [1]:
import numpy as np
import math
import pandas as pd
import pickle, os, math
from keras.models import Sequential
from keras.callbacks import Callback
from keras.layers import Dense
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

Using TensorFlow backend.


## 2. Import Data

Import the data from the data folder. This step only uses the data for surviving in the rain of topedoes for the fighter. 

In [2]:
data_dir = os.path.join(os.curdir, 'Data', '200000_sur', 'basic_data_pics.pkl')
with open(data_dir, 'rb') as in_file:
    ot = pickle.load(in_file)
data_pics = ot['data']
target_pics = ot['target']
data_pics.shape

(200036, 24)

## 3. Preprocess Data

The first raw data is unbalanced with classes. The ratio of classes 0 and 1 are almost 1:10. Very unbalanced. 

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    data_pics, target_pics, test_size=0.1, random_state=152)
X_train.shape
X_train_train, X_vali, y_train_train, y_vali = train_test_split(
    X_train, y_train, test_size=0.3, random_state=15545)

In [4]:
print('class 0 has ' + str(len(y_train.index[y_train[0] == 0].tolist())) + ' points')
print('class 1 has ' + str(len(y_train.index[y_train[0] == 1].tolist())) + ' points')

class 0 has 15061 points
class 1 has 164971 points


Create balanced classes sample data, each class has 15061 data points

In [5]:
index_0 = y_train.index[y_train[0] == 0].tolist()
print(len(index_0))
index_1 = y_train.index[y_train[0] != 0].tolist()
index_1_comparable_to_0 = np.random.choice(index_1, math.floor(len(index_0) * 1))
print(len(index_1_comparable_to_0))
samples = np.concatenate([index_0, index_1_comparable_to_0])
print(len(samples))

15061
15061
30122


`small_data` and `small_target` are the balanced data. All variables with `_small` is the balanced result.

In [6]:
small_data = data_pics.iloc[samples, :]
small_target = target_pics.iloc[samples, :]

X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
    small_data, small_target, test_size=0.3, random_state=152)
X_train_train_small, X_vali_small, y_train_train_small, y_vali_small = train_test_split(
    X_train_small, y_train_small, test_size=0.3, random_state=152)

## 4. Pretraining Several Models

For the scikit models, target (y) has to be ravelled.

In [7]:
y_train_small_m = np.ravel(y_train_small)
y_test_small_m = np.ravel(y_test_small)
y_train_m = np.ravel(y_train)
y_test_m = np.ravel(y_test)

### 4.1 SVC with scikit-learn

In [8]:
clf_1 = SVC(C=10.0, gamma='auto', verbose=True)
clf_1.fit(X_train_small, y_train_small_m)
clf_1.score(X_test_small, y_test_small_m)

[LibSVM]

0.9391390948323559

In [9]:
cross_val_score(clf_1, X_train_small, y_train_small_m, cv=6)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

array([0.943101  , 0.93454752, 0.93625498, 0.93284007, 0.94023904,
       0.94280023])

### 4.2 MLPC with scikit-learn

In [10]:
mlpc_1 = MLPClassifier(hidden_layer_sizes=(50, 20),
                       alpha=0.15, max_iter=1000, batch_size=5000,
                       verbose=False, learning_rate_init=0.01, tol=1e-5,
                       learning_rate='adaptive')

mlpc_1.fit(X_train_small, y_train_small_m)
mlpc_1.score(X_test_small, y_test_small_m)

0.8528272656855151

In [11]:
cross_val_score(mlpc_1, X_train_small, y_train_small_m, cv=5)

array([0.86744131, 0.81550866, 0.88617501, 0.66137064, 0.68840408])

### 4.3 NN with Keras

In [12]:
model_NNK = Sequential()
model_NNK.add(Dense(units=12, activation='relu', input_dim=24))
model_NNK.add(Dense(units=6, activation='relu'))
model_NNK.add(Dense(units=1, activation='sigmoid'))
model_NNK.compile(loss='mean_squared_error',
              optimizer='rmsprop',
              metrics=['accuracy'])

model_NNK.fit(X_train_train_small, y_train_train_small,
          validation_data=(X_vali_small, y_vali_small), 
          epochs=20, batch_size=256, verbose=1)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 14759 samples, validate on 6326 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x258df4e0>

In [13]:
loss_and_metrics = model_NNK.evaluate(X_test_small, y_test_small, batch_size=128)
print(loss_and_metrics)
y_predict = model_NNK.predict(X_test_small, batch_size=None, verbose=0)
print(np.sum(y_predict) / len(y_predict))

[0.08583185275901375, 0.8903397143421361]
0.46987817444948543


## 5. Selecting models

the SVC model apparently has a higher accuracy, grid search is used to find the best parameters.

In [14]:
parameters = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']}

In [15]:
from sklearn.model_selection import GridSearchCV
clf_2 = SVC(C=10.0, gamma='auto', verbose=True)
svc_vc = GridSearchCV(clf_2, parameters, cv=6, refit=True)
svc_vc.fit(X_train_small, y_train_small_m)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

GridSearchCV(cv=6, error_score='raise',
       estimator=SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [16]:
svc_vc.cv_results_



{'mean_fit_time': array([ 6.52448571,  6.30663069,  6.09377599, 12.40724055,  7.36623653,
        60.76290898]),
 'std_fit_time': array([0.41663224, 0.25077265, 0.19768908, 0.56153832, 0.44656144,
        3.58722979]),
 'mean_score_time': array([0.69106905, 0.34920148, 0.53855387, 0.36386979, 0.46838025,
        0.36653662]),
 'std_score_time': array([0.02388407, 0.01261624, 0.04772186, 0.02875132, 0.02333083,
        0.02490056]),
 'param_C': masked_array(data=[0.1, 0.1, 1, 1, 10, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.1, 'kernel': 'rbf'},
  {'C': 0.1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'l

In [17]:
clf_1.score(X_test_small, y_test_small_m)

0.9391390948323559

In [18]:
import joblib
svc_best = svc_vc.best_estimator_
joblib.dump(svc_best, 'model_svc_survive.joblib')
svc_best_loaded = joblib.load('model_svc_survive.joblib')

In [22]:
svc_best_loaded.predict([X_test_small.iloc[0, :]])

array([0], dtype=int64)