# Prepare Dataset

Load data from file.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the passenger table from a CSV path relative to the working directory.
df = pd.read_csv('data/titanic.csv')
# Bare last expression so the notebook renders the loaded DataFrame.
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings and Spouses Aboard,Parents and Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


Define features and target names.

In [2]:
# Columns routed to the numeric preprocessing branch.
numeric_features = ['Age', 'Fare']
# Columns routed to the categorical preprocessing branch.
categorical_features =  ['Pclass', 'Sex']
# Column holding the label to predict.
target = 'Survived'

Filter the dataset by creating **X** and **y**, where **X** holds the feature columns and **y** holds the target column.

In [3]:
# Feature matrix: the numeric columns followed by the categorical ones.
X = df[numeric_features + categorical_features]
# Double brackets keep the target as a single-column DataFrame rather than a Series.
y = df[[target]]

Transform dataset features.

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

class Transformer(ColumnTransformer):
    """A default transformer for numeric and categorical features.

    Numeric columns are imputed with the median and standardized;
    categorical columns are imputed with the most frequent value and
    one-hot encoded (categories unseen during fit are ignored).

    Args:
        num (list, optional): The list of numeric features. When omitted,
            the notebook-level ``numeric_features`` list is used.
        cat (list, optional): The list of categorical features. When
            omitted, the notebook-level ``categorical_features`` list is used.
    """
    def __init__(self, num=None, cat=None):
        # Store the arguments unmodified under their own names: scikit-learn
        # reads constructor parameters back as attributes in get_params()
        # and clone().
        self.num = num
        self.cat = cat
        super().__init__(transformers=[
            (
                'num',
                Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler()),
                ]),
                # Fall back to the notebook-level list so Transformer() keeps working.
                numeric_features if num is None else num
            ),
            (
                'cat',
                Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('onehot', OneHotEncoder(handle_unknown='ignore')),
                ]),
                # Fall back to the notebook-level list so Transformer() keeps working.
                categorical_features if cat is None else cat
            )
        ])

# Fit the preprocessing pipelines on X and transform it in a single step.
# NOTE(review): this fits the imputers and the scaler on the full dataset,
# before the train/test split performed later in the notebook, so statistics
# from the eventual test rows influence the training features. Consider
# splitting first and fitting on the training portion only.
trans = Transformer()
X_trans = trans.fit_transform(X)
# Bare last expression so the notebook shows the transformed matrix's shape.
X_trans.shape

(887, 7)

Split the dataset into training and test sets.

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing. The fixed seed makes the split, and
# every metric computed from it, repeatable on a fresh kernel run.
X_train, X_test, y_train, y_test = train_test_split(
    X_trans, y, test_size=0.33, random_state=42
)

Train a new model.

In [6]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so that training gives the same model on every run.
clf = RandomForestClassifier(random_state=42)
# y_train is a single-column DataFrame; flatten it to a 1-D array of labels.
clf.fit(X_train, y_train.values.ravel())

RandomForestClassifier()

Make predictions on the test data.

In [7]:
# Predict a class label for every row of the held-out test matrix.
y_pred = clf.predict(X_test)
# Bare last expression so the notebook renders the predicted label array.
y_pred

array([1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0])

Score the model.

In [8]:
from sklearn.metrics import accuracy_score
# Compare the predicted labels against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
# Report the score rounded to two decimal places.
print('The model accuracy: %.2f' % accuracy)

The model accuracy: 0.82



Convert and save the model into ONNX format.

In [9]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Describe the model input: a float tensor with a dynamic batch dimension.
# The column count comes from the training matrix instead of a hard-coded
# number, so the export stays correct if the feature set changes.
n_features = X_train.shape[1]
initial_type = [
    ('float_input', FloatTensorType([None, n_features]))
]

# Convert the fitted classifier and write the serialized ONNX graph to disk.
onx = convert_sklearn(clf, initial_types=initial_type)
with open("titanic.onnx", "wb") as f:
    f.write(onx.SerializeToString())

# Load & Test Model

In [10]:
# Compute the prediction with ONNX Runtime
import onnxruntime as rt
import numpy

# Load the exported model and resolve the names of its first input and output.
sess = rt.InferenceSession("titanic.onnx")
input_name, label_name = sess.get_inputs()[0].name, sess.get_outputs()[0].name

# Only one output is requested, so the returned list holds exactly one array.
# The test matrix is cast to float32 before being fed to the session.
(y_onnx,) = sess.run([label_name], {input_name: X_test.astype(numpy.float32)})

# Count the positions where the scikit-learn and ONNX predictions disagree.
miss = sum(
    1
    for sklearn_label, onnx_label in zip(y_pred, y_onnx)
    if sklearn_label != onnx_label
)

print('The total number of testing samples: %d' % len(y_pred))
print('The number of samples misclassified by the ONNX model: %d' % miss)

The total number of testing samples: 293
The number of samples misclassified by the ONNX model: 2
