# Data preparation and model training

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelBinarizer, LabelEncoder, OneHotEncoder, PowerTransformer
from sklearn.model_selection import train_test_split, KFold
import xgboost as xgb
import pickle

In [2]:
df = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [3]:
df_new = df.drop(columns=['Name'])

In [4]:
df_new.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [5]:
# Could also use sth. like LabelEncoder...
df_new['Sex'] = pd.get_dummies(df_new['Sex'])

#LabelEncoder
#le = LabelEncoder()
#le.fit_transform(df_new['Sex'])
df_new.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,0,22.0,1,0,7.25
1,1,1,1,38.0,1,0,71.2833
2,1,3,1,26.0,0,0,7.925
3,1,1,1,35.0,1,0,53.1
4,0,3,0,35.0,0,0,8.05


In [6]:
#X=df_new[['Pclass', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']].to_numpy()
X=df_new.drop(columns=['Survived']).to_numpy()
y=df_new['Survived'].to_numpy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
print(X)

[[ 3.      0.     22.      1.      0.      7.25  ]
 [ 1.      1.     38.      1.      0.     71.2833]
 [ 3.      1.     26.      0.      0.      7.925 ]
 ...
 [ 3.      1.      7.      1.      2.     23.45  ]
 [ 1.      0.     26.      0.      0.     30.    ]
 [ 3.      0.     32.      0.      0.      7.75  ]]


In [8]:
print(X_scaled)

[[ 0.83052363 -0.74026551 -0.52936601  0.42990395 -0.4749808  -0.50358635]
 [-1.56127657  1.35086667  0.60426454  0.42990395 -0.4749808   0.78341245]
 [ 0.83052363  1.35086667 -0.24595837 -0.47585568 -0.4749808  -0.49001959]
 ...
 [ 0.83052363  1.35086667 -1.59214465  0.42990395  2.00330136 -0.17798419]
 [-1.56127657 -0.74026551 -0.24595837 -0.47585568 -0.4749808  -0.04633641]
 [ 0.83052363 -0.74026551  0.17915309 -0.47585568 -0.4749808  -0.4935369 ]]


In [9]:
x_train, x_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3)
all_results = {}

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

clf = DecisionTreeClassifier(random_state=1)
clf = clf.fit(x_train, y_train)

In [11]:
print("Test data: ",clf.score(x_test, y_test))

Test data:  0.7528089887640449


In [12]:
from sklearn.model_selection import cross_val_score
clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X_scaled, y, cv=10)
print("CV mean: ", scores.mean())

all_results['Decision Tree'] = scores.mean()

CV mean:  0.7791368743615935


In [13]:
from sklearn.ensemble import RandomForestClassifier
est = RandomForestClassifier(n_estimators=10)
est.fit(x_train, y_train)
est.score(x_test, y_test)

0.7490636704119851

In [14]:
clf = RandomForestClassifier(n_estimators=10)
scores = cross_val_score(clf, X_scaled, y, cv=10)
print("CV mean: ", scores.mean())

all_results['Random Forest'] = scores.mean()

CV mean:  0.814032175689479


In [15]:
%%time
from xgboost.sklearn import XGBClassifier, DMatrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

#parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
parameters = {'n_estimators': [x for x in range(5, 11, 5)], 'max_depth':[x for x in range(1,4)], 'learning_rate': [round(x, 2) for x in np.arange(0.01, 0.11, 0.01)]}
xgb = XGBClassifier()
clf = GridSearchCV(xgb, parameters, cv=10)
clf.fit(X_scaled, y)
#print(clf.best_estimator_)
print(clf.best_score_)
all_results['XG Boost'] = clf.best_score_
#xgb = XGBClassifier(n_estimators=10, max_depth=1, learning_rate=0.1, objective='binary:logistic')
#cv_scores = cross_val_score(xgb, scaled, labels, cv=10)
#print(cv_scores.mean())

0.8286516853932584
CPU times: user 2.7 s, sys: 54.2 ms, total: 2.76 s
Wall time: 2.76 s


In [16]:
from sklearn import svm
from sklearn.pipeline import Pipeline

KERNELS = ['linear', 'poly', 'rbf', 'sigmoid']
for kernel in KERNELS:
    svc = svm.SVC(kernel=kernel, C=1.0)#.fit(x_train, y_train)
    cv_scores = cross_val_score(svc, X_scaled, y, cv=10)
    print("Kernel: {}, accuracy: {}".format(kernel, cv_scores.mean()))
    all_results[str('SVM-'+ kernel)] = cv_scores.mean()
    
svc = svm.SVC(kernel='rbf', C=1.0)
pipeline = Pipeline([('scaler', scaler), ('svc', svc)])
model = pipeline.fit(x_train, y_train)

Kernel: linear, accuracy: 0.7857507660878447
Kernel: poly, accuracy: 0.81511746680286
Kernel: rbf, accuracy: 0.8252936670071502
Kernel: sigmoid, accuracy: 0.7024004085801839


In [17]:
from sklearn.neighbors import KNeighborsClassifier
d = {}
for i in range(1,50):
    knn = KNeighborsClassifier(n_neighbors=i)
    cv_scores = cross_val_score(knn, X_scaled, y, cv=10)
    d[str(i)]=cv_scores.mean()
    
max_key = max(d, key=d.get)
print("Best K: {} with accuracy: {}".format(max_key, d[max_key]))
all_results[str('KNN-'+max_key)] = d[max_key]

Best K: 31 with accuracy: 0.8207865168539327


In [18]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
cv_scores = cross_val_score(lr, X_scaled, y, cv=10)
print(cv_scores.mean())
all_results['Logistic Regression'] = cv_scores.mean()

0.7971016343207354


In [19]:
max_key = max(all_results, key=all_results.get)
print("Best Algorithm: {} with an accuracy of: {}".format(max_key, all_results[max_key]))
print("\nSee the all results here:\n")
for key, value in sorted(all_results.items(), reverse=True, key=lambda item: item[1]):
    print("%s: %s" % (key, value))

Best Algorithm: XG Boost with an accuracy of: 0.8286516853932584

See the all results here:

XG Boost: 0.8286516853932584
SVM-rbf: 0.8252936670071502
KNN-31: 0.8207865168539327
SVM-poly: 0.81511746680286
Random Forest: 0.814032175689479
Logistic Regression: 0.7971016343207354
SVM-linear: 0.7857507660878447
Decision Tree: 0.7791368743615935
SVM-sigmoid: 0.7024004085801839


# --------------------

# Deploying a model

In [20]:
###
# Beim CP4Dv3.0.1 sollte eigentlich watson-machine-learning-client-V4 vorinstalliert sein und es sollte alles "problemlos" funktionieren!
# Falls es dennoch Probleme geben sollte, mal "vorsichtshalber" alles deinstallieren und nur V4 installieren...
# WICHTIG: Falls irgendwas uninstalled oder neuinstalled wurde den Jupyter Kernel neustarten!
###

#!pip uninstall watson-machine-learning-client-V4 -y
#!pip uninstall watson-machine-learning-client -y
#!pip uninstall ibm-watson-machine-learning -y
#!pip install watson-machine-learning-client-V4

In [21]:
#Option 1 - Curl:
#!curl -k -X GET https://zen-cpd-zen.apps.edb-bde1.cecc.ihost.com/v1/preauth/validateAuth -u admin:password

#Option 2 - Python requests:
import requests, json
from requests.auth import HTTPBasicAuth

s = requests.Session()
res = s.get('https://zen-cpd-zen.apps.edb-b59f.cecc.ihost.com/v1/preauth/validateAuth', auth=HTTPBasicAuth('admin', 'password'), verify=False)
res = json.loads(res.text)
token = res['accessToken']
print(token)

eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VybmFtZSI6ImFkbWluIiwicm9sZSI6IkFkbWluIiwicGVybWlzc2lvbnMiOlsiYWRtaW5pc3RyYXRvciIsImNhbl9wcm92aXNpb24iLCJtYW5hZ2VfY2F0YWxvZyJdLCJzdWIiOiJhZG1pbiIsImlzcyI6IktOT1hTU08iLCJhdWQiOiJEU1giLCJ1aWQiOiIxMDAwMzMwOTk5IiwiYXV0aGVudGljYXRvciI6ImRlZmF1bHQiLCJpYXQiOjE2MDA0MzM4NDIsImV4cCI6MTYwMDQ3NzAwNn0.jY1tDXo_8wEJsdew8PnSqkBu8yU5LFlfzHXoW5d0esbmvz7nbIQKYhN6Nh4TJrajq3K5nK1my-sIiwIORzFMD6xsmqmwuK5lnOIw5yLRZ9qAuB8rckZzKOwrYC80j9VQQpe_rc5F5ch8xR5CEoAdPZc7_VHMpyuS0J_WeLmCEL1yaPw4liHlQ0ZhDDFqES2b5JaoF84XGTyuWIX7eRKcxMWsWHAPVT_RFpAvQ6YmX2q9wy4saN45PR5V2ddV_uHaRSN77DqIz__l0ASDiOpp5KL2_bql6xyHwf7DhS4TNIErAhjANwSNc3QBTFbYhAQxEk0-S3oyiEdNk2iHkRwEaA




In [22]:
wml_credentials = {
    "token": token,
    "instance_id" : "wml_local",
    "url"         : "https://zen-cpd-zen.apps.edb-b59f.cecc.ihost.com",
    "version": "3.0.1"
}

from watson_machine_learning_client import WatsonMachineLearningAPIClient
client = WatsonMachineLearningAPIClient(wml_credentials)

## List and/or create space

In [23]:
client.spaces.list()

------------------------------------  ---------  ------------------------
GUID                                  NAME       CREATED
79201026-8364-435e-8962-05c3db970d9b  dev_space  2020-09-18T06:29:56.237Z
------------------------------------  ---------  ------------------------


In [24]:
if not client.spaces.get_details():
    print("Create resources!")
    space_details = client.spaces.store(meta_props={client.spaces.ConfigurationMetaNames.NAME: "dev_space2"})
    space_id = client.spaces.get_uid(space_details)
else:
    print("Resource vorhanden!")
    for sp in client.spaces.get_details()['resources']:
        print(sp,"\n")
    print("Using '{}' as default space".format(client.spaces.get_details()['resources'][0]['metadata']['name']))
    space_id = client.spaces.get_details()['resources'][0]['metadata']['id']

Resource vorhanden!
{'metadata': {'name': 'dev_space', 'role': 'Admin', 'guid': '79201026-8364-435e-8962-05c3db970d9b', 'id': '79201026-8364-435e-8962-05c3db970d9b', 'created_at': '2020-09-18T06:29:56.237Z', 'owner': '1000330999', 'href': '/v4/spaces/79201026-8364-435e-8962-05c3db970d9b'}, 'entity': {'name': 'dev_space'}} 

Using 'dev_space' as default space


In [25]:
client.set.default_space(space_id)

'SUCCESS'

In [26]:
client.software_specifications.list()

--------------------------  ------------------------------------  ----
NAME                        ASSET_ID                              TYPE
default_py3.6               0062b8c9-8b7d-44a0-a9b9-46c416adcbd9  base
scikit-learn_0.20-py3.6     09c5a1d0-9c1e-4473-a344-eb7b665ff687  base
ai-function_0.1-py3.6       0cdb0f1e-5376-4f4d-92dd-da3b69aa9bda  base
shiny-r3.6                  0e6e79df-875e-4f24-8ae9-62dcc2148306  base
pytorch_1.1-py3.6           10ac12d6-6b30-4ccd-8392-3e922c096a92  base
scikit-learn_0.22-py3.6     154010fa-5b3b-4ac1-82af-4d5ee5abbc85  base
default_r3.6                1b70aec3-ab34-4b87-8aa0-a4a3c8296a36  base
tensorflow_1.15-py3.6       2b73a275-7cbf-420b-a912-eae7f436e0bc  base
pytorch_1.2-py3.6           2c8ef57d-2687-4b7d-acce-01f94976dac1  base
spark-mllib_2.3             2e51f700-bca0-4b0d-88dc-5c6791338875  base
pytorch-onnx_1.1-py3.6-edt  32983cea-3f32-4400-8965-dde874a8d67e  base
spark-mllib_2.4             390d21f8-e58b-4fac-9c55-d7ceda621326  base
xgboos

## Define model properties and store

In [34]:
software_spec_uid = client.software_specifications.get_uid_by_name("scikit-learn_0.22-py3.6")

model_props = {
    client.repository.ModelMetaNames.NAME: "Titanic Survivor Prediction",
    client.repository.ModelMetaNames.TYPE: "scikit-learn_0.22",
    client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: software_spec_uid,
    client.repository.ModelMetaNames.INPUT_DATA_SCHEMA: [{'id': '1',
                                                                'type': 'ndarray',
                                                                 'fields': [{'name': 'Pclass', 'type': 'float'},
                                                                            {'name': 'Sex', 'type': 'float'},
                                                                            {'name': 'Age', 'type': 'float'},
                                                                            {'name': 'Siblings/Spouses Aboard', 'type': 'float'},
                                                                            {'name': 'Parents/Children Aboard', 'type': 'float'},
                                                                            {'name': 'Fare', 'type': 'float'}]
                                                                   }]
}
published_model = client.repository.store_model(model=model, pipeline=pipeline, meta_props=model_props, training_data=x_train, training_target=y_train)

In [59]:
#TODO: Warnungen unterdrücken, da es sonst etwas verwirrend ist...

if not client.repository.get_details()['models']['resources']:
    print("Publish model...")
    software_spec_uid = client.software_specifications.get_uid_by_name("scikit-learn_0.22-py3.6")
    model_props = {
        client.repository.ModelMetaNames.NAME: "Titanic Survivor Prediction",
        client.repository.ModelMetaNames.TYPE: "scikit-learn_0.22",
        client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: software_spec_uid,
        client.repository.ModelMetaNames.INPUT_DATA_SCHEMA: [{'id': '1',
                                                                    'type': 'ndarray',
                                                                     'fields': [{'name': 'Pclass', 'type': 'float'},
                                                                                {'name': 'Sex', 'type': 'float'},
                                                                                {'name': 'Age', 'type': 'float'},
                                                                                {'name': 'Siblings/Spouses Aboard', 'type': 'float'},
                                                                                {'name': 'Parents/Children Aboard', 'type': 'float'},
                                                                                {'name': 'Fare', 'type': 'float'}]
                                                                       }]
    }
    published_model = client.repository.store_model(model=model, pipeline=pipeline, meta_props=model_props, training_data=x_train, training_target=y_train)
else:
    print("Model found!")
    print("Using default model {}".format(client.repository.get_details()['models']['resources'][0]['metadata']['name']))
    published_model = client.repository.get_details()['models']['resources'][0]

# Optional: Delete by (gu)id
#client.repository.delete('b6591272-ee24-40a7-841b-d7e8846277d2')

Model found!
Using default model Titanic Survivor Prediction
{'metadata': {'name': 'Titanic Survivor Prediction', 'guid': '2c2e1c46-14cb-4ec0-b7b5-3402b5060cca', 'id': '2c2e1c46-14cb-4ec0-b7b5-3402b5060cca', 'modified_at': '2020-09-18T13:17:12.002Z', 'created_at': '2020-09-18T13:17:10.002Z', 'owner': '1000330999', 'href': '/v4/models/2c2e1c46-14cb-4ec0-b7b5-3402b5060cca?space_id=79201026-8364-435e-8962-05c3db970d9b', 'space_id': '79201026-8364-435e-8962-05c3db970d9b'}, 'entity': {'name': 'Titanic Survivor Prediction', 'training_data_references': [{'location': {'bucket': 'not_applicable'}, 'type': 'fs', 'connection': {'access_key_id': 'not_applicable', 'secret_access_key': 'not_applicable', 'endpoint_url': 'not_applicable'}, 'schema': {'id': '1', 'type': 'ndarray', 'fields': [{'name': 'f0', 'type': 'float'}, {'name': 'f1', 'type': 'float'}, {'name': 'f2', 'type': 'float'}, {'name': 'f3', 'type': 'float'}, {'name': 'f4', 'type': 'float'}, {'name': 'f5', 'type': 'float'}]}}], 'label_colum

In [60]:
import json
published_model_uid = client.repository.get_model_uid(published_model)
model_details = client.repository.get_details(published_model_uid)
print(json.dumps(model_details, indent=2))

{
  "metadata": {
    "name": "Titanic Survivor Prediction",
    "guid": "2c2e1c46-14cb-4ec0-b7b5-3402b5060cca",
    "id": "2c2e1c46-14cb-4ec0-b7b5-3402b5060cca",
    "modified_at": "2020-09-18T13:17:12.002Z",
    "created_at": "2020-09-18T13:17:10.002Z",
    "owner": "1000330999",
    "href": "/v4/models/2c2e1c46-14cb-4ec0-b7b5-3402b5060cca?space_id=79201026-8364-435e-8962-05c3db970d9b",
    "space_id": "79201026-8364-435e-8962-05c3db970d9b"
  },
  "entity": {
    "name": "Titanic Survivor Prediction",
    "training_data_references": [
      {
        "location": {
          "bucket": "not_applicable"
        },
        "type": "fs",
        "connection": {
          "access_key_id": "not_applicable",
          "secret_access_key": "not_applicable",
          "endpoint_url": "not_applicable"
        },
        "schema": {
          "id": "1",
          "type": "ndarray",
          "fields": [
            {
              "name": "f0",
              "type": "float"
            },
      

In [61]:
loaded_model = client.repository.load(published_model_uid)
print(loaded_model)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)


In [62]:
loaded_model.score(x_test, y_test)
#test_predictions = loaded_model.predict(x_test).transform(x_test)
#test_predictions.select('probability', 'predictedLabel').show(n=3, truncate=False)

0.7752808988764045

In [78]:
if not client.deployments.get_details()['resources']:
    print("Create deployment...")
    meta_props = {
        client.deployments.ConfigurationMetaNames.NAME: "Titanic Survivor Prediction",
        client.deployments.ConfigurationMetaNames.SPACE_UID: space_id,
        client.deployments.ConfigurationMetaNames.ONLINE: {}
    }

    created_deployment = client.deployments.create(artifact_uid=published_model_uid, meta_props=meta_props, name="Titanic Survivor Prediction")
else:
    print("Deployment found!")
    print("Using default deployment: '{}' ".format(client.deployments.get_details()['resources'][0]['entity']['name']))
    created_deployment = client.deployments.get_details()['resources'][0]

Deployment found!
Using default deployment: 'Titanic Survivor Prediction' 


In [79]:
scoring_endpoint = client.deployments.get_scoring_href(created_deployment)
deployment_id = created_deployment.get("metadata").get("id")
print(f'Scoring endpoint is available at: {scoring_endpoint}')
print(f'Deployment ID is: {deployment_id}')

Scoring endpoint is available at: https://zen-cpd-zen.apps.edb-b59f.cecc.ihost.com/v4/deployments/61d07f78-fa96-4da2-b987-bbd1240d2d8c/predictions
Deployment ID is: 61d07f78-fa96-4da2-b987-bbd1240d2d8c


In [80]:
# Prepare scoring payload.
job_payload = {
    client.deployments.ScoringMetaNames.INPUT_DATA: [{
        'values': [list(x_test[-1])]
    }]
}
print(job_payload)

{'input_data': [{'values': [[0.8305236329179975, 1.3508666715598585, -0.03340264143741528, 0.42990394821142364, 0.7641602818172867, -0.4390186346895787]]}]}


In [82]:
# Perform prediction and display the result.
job_details = client.deployments.score('61d07f78-fa96-4da2-b987-bbd1240d2d8c', job_payload)
print(job_details)

{'predictions': [{'fields': ['prediction'], 'values': [[1]]}]}


## Let's try our deployed model and see it in action!

### First let's create two passengers:
P1 which had a 3rd class ticket(3), was a male (0), 22 years old, had 1 sibling aboard, no parents or childrens aboard and paid 8.25 GBP for his ticket<br>
--> Very similar datapoint to people who actually did NOT survive (0)...let's see<br><br>
P2 which had a 1st class ticket(1), was a female (1), 38 years old, had 1 sibling aboard, no parents or childrens aboard and paid 70.5 GBP for her ticket<br>
--> Very similar datapoint to people actually DID survive (1) ... let's see

In [83]:
###Spielwiese:
#(Survieved)->Pclass	Sex	Age	Siblings/Spouses Aboard	Parents/Children Aboard	Fare

#0	3	0	22.0	1	0	7.2500
#Hat nicht überlebt
Passagier1 = [3, 0, 22., 1, 0 , 8.25]

#1	1	1	38.0	1	0	71.2833
#Hat überlebt
Passagier2 = [1, 1, 38., 1, 0 , 70.5]

In [84]:
# Komischerweise wird der Scaler im Pipeline Codesegment nochmal angepasst, daher muss der hier "reinitialisiert" werden
X_scaled = scaler.fit_transform(X)

scaled_pass = scaler.transform([Passagier1, Passagier2])
scaled_pass

array([[ 0.83052363, -0.74026551, -0.52936601,  0.42990395, -0.4749808 ,
        -0.48348745],
       [-1.56127657,  1.35086667,  0.60426454,  0.42990395, -0.4749808 ,
         0.76766898]])

Create the payload that we will send to our model