# Step 1: Data extraction, inspection & preparation
![title](https://github.com/mgiessing/CP4D/raw/master/img/DSP_Step1.png)
![title](https://github.com/mgiessing/CP4D/raw/master/img/1_Extract.png)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelBinarizer, LabelEncoder, OneHotEncoder, PowerTransformer
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import xgboost as xgb
import pickle
%matplotlib inline

In [None]:
# Titanic passenger list, served from the Stanford CS109 course archive.
TITANIC_CSV_URL = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'

df = pd.read_csv(TITANIC_CSV_URL)
df.head()

In [None]:
# Stacked passenger counts per 'Survived' value, split by sex (left panel)
# and by passenger class (right panel).
fig, axs = plt.subplots(1, 2, figsize=(15, 5))

for panel, column in zip(axs, ['Sex', 'Pclass']):
    survival_counts = df.groupby(['Survived', column]).size().unstack()
    survival_counts.plot(kind='bar', stacked=True, ax=panel, title='Survived vs ' + column)

plt.show()

In [None]:
# Left: kernel density estimate of passenger age. Right: histogram of ticket fares.
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
age_panel, fare_panel = axs

df['Age'].plot.kde(ax=age_panel, title='Age distribution')
df['Fare'].plot.hist(ax=fare_panel, title='Fare distribution')

plt.show()

In [None]:
# Display pandas' summary statistics for the loaded passenger table.
df.describe()

# Step 2: Model configuration, training & optimization
![title](https://github.com/mgiessing/CP4D/raw/master/img/DSP_Step2.png)
![title](https://github.com/mgiessing/CP4D/raw/master/img/2_Train.png)

In [None]:
# The free-text 'Name' column is not used as a feature; work on a copy without it.
df_new = df.drop('Name', axis=1)

In [None]:
# Preview the table after removing the 'Name' column.
df_new.head()

In [None]:
# Encode 'Sex' as one numeric column: female -> 1, male -> 0.
#
# pd.get_dummies() returns one column per category, so assigning its result to
# the single 'Sex' column is rejected by current pandas ("Columns must be same
# length as key"). An explicit comparison yields the intended 0/1 encoding.
# Reading from the untouched `df` (same index as `df_new`) keeps this cell
# idempotent: re-running it gives the same result instead of all zeros.
df_new = df_new.assign(Sex=df['Sex'].eq('female').astype(int))

# Alternative: sklearn's LabelEncoder. Note that it sorts the labels, so it
# would produce the opposite mapping (female -> 0, male -> 1).
#le = LabelEncoder()
#le.fit_transform(df['Sex'])
df_new.head()

In [None]:
# Split the table into the target vector and the feature matrix.
# Remaining feature columns, in order: Pclass, Sex, Age,
# Siblings/Spouses Aboard, Parents/Children Aboard, Fare.
target_column = 'Survived'
y = df_new[target_column].to_numpy()
X = df_new.drop(columns=[target_column]).to_numpy()

# Standardise the features; `scaler` keeps the statistics learned from X so the
# same transformation can be applied to new rows later.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Show the raw (unscaled) feature matrix.
print(X)

In [None]:
# Show the feature matrix after StandardScaler was applied, for comparison with X.
print(X_scaled)

In [None]:
# Hold out 30% of the (already scaled) rows for testing.
# A fixed random_state makes the split -- and everything derived from it --
# reproducible after a kernel restart; stratify=y keeps the survivor ratio
# the same in the training and the test partition.
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

# Registry filled by the model cells: display name -> [mean CV accuracy, estimator]
all_results = {}

## Decision tree classifiers & Random Forest

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Baseline: a single decision tree scored with 10-fold cross-validation.
# random_state is pinned because the tree permutes the features at every split;
# without it the reported score can differ from run to run.
clf = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(clf, X_scaled, y, cv=10)
print("CV mean: ", scores.mean())

# Store the score together with the (still unfitted) estimator for the final ranking.
all_results['Decision Tree'] = [scores.mean(), clf]

In [None]:
# Random forest with 10 trees, scored with 10-fold cross-validation.
# The forest is built from random bootstrap samples and feature subsets, so
# random_state is pinned to make the reported score reproducible.
clf = RandomForestClassifier(n_estimators=10, random_state=42)
scores = cross_val_score(clf, X_scaled, y, cv=10)
print("CV mean: ", scores.mean())

# Store the score together with the (still unfitted) estimator for the final ranking.
all_results['Random Forest'] = [scores.mean(), clf]

## Boosting classifier (XGBoost)

In [None]:
from xgboost.sklearn import XGBClassifier, DMatrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the search. A single combination keeps the run fast;
# a wider grid could look like this (each value must be a flat list of candidates):
#parameters = {'n_estimators': [x for x in range(5, 11, 5)], 'max_depth': [x for x in range(1, 4)], 'learning_rate': [round(x, 2) for x in np.arange(0.01, 0.11, 0.01)]}
parameters = {'n_estimators': [5], 'max_depth':[4], 'learning_rate': [0.01]}

# Named `xgb_clf` rather than `xgb` so the `xgboost as xgb` module imported at
# the top of the notebook is not overwritten by an estimator instance.
xgb_clf = XGBClassifier()
clf = GridSearchCV(xgb_clf, parameters, cv=10)
clf.fit(X_scaled, y)

# best_score_ is the mean cross-validated score of the best parameter combination.
print("CV mean: ", clf.best_score_)
all_results['XG Boost'] = [clf.best_score_, clf]

## Support Vector Machines (SVMs)

In [None]:
from sklearn import svm

# Score one support vector classifier per kernel type with 10-fold
# cross-validation and register each of them under the key 'SVM-<kernel>'.
KERNELS = ['linear', 'poly', 'rbf', 'sigmoid']
for kernel in KERNELS:
    svc = svm.SVC(C=1.0, kernel=kernel)
    cv_scores = cross_val_score(svc, X_scaled, y, cv=10)
    mean_accuracy = cv_scores.mean()
    print(f"Kernel: {kernel}, accuracy: {mean_accuracy}")
    all_results['SVM-' + kernel] = [mean_accuracy, svc]

## K-nearest-neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Score k-nearest-neighbours for k = 1..49 with 10-fold cross-validation.
# d maps str(k) -> [mean CV accuracy, unfitted estimator].
d = {}
for i in range(1,50):
    knn = KNeighborsClassifier(n_neighbors=i)
    cv_scores = cross_val_score(knn, X_scaled, y, cv=10)
    d[str(i)]= [cv_scores.mean(), knn]

# Pick the best k by its score only. Comparing the whole [score, estimator]
# list would fall through to comparing the estimators whenever two k values
# reach the same score, which raises a TypeError. On a tie the smallest k wins.
max_key = max(d, key=lambda k: d[k][0])
print("Best K: {} with accuracy: {}".format(max_key, d[max_key][0]))
all_results[str('KNN-'+max_key)] = [d[max_key][0], d[max_key][1]]

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, scored with 10-fold cross-validation.
lr = LogisticRegression()
cv_scores = cross_val_score(lr, X_scaled, y, cv=10)
lr_mean_accuracy = cv_scores.mean()
print(lr_mean_accuracy)
all_results['Logistic Regression'] = [lr_mean_accuracy, lr]

## Simple Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop, SGD


def create_model(input_dim=6):
    """Build and compile the feed-forward binary classifier.

    The network is a stack of Dense(relu) + Dropout blocks followed by a
    single sigmoid output unit, compiled with the Adam optimizer and
    binary cross-entropy loss, tracking accuracy.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to 6, the number of feature
        columns used in this notebook, so calling ``create_model()`` without
        arguments behaves as before.

    Returns
    -------
    A compiled ``Sequential`` model.
    """
    # (units, dropout rate) of each hidden block, in network order.
    hidden_blocks = [(512, 0.5), (128, 0.5), (256, 0.5), (64, 0.3), (32, 0.2)]

    model = Sequential()
    for position, (units, dropout_rate) in enumerate(hidden_blocks):
        if position == 0:
            # Only the first layer declares the input size.
            model.add(Dense(units, activation='relu', input_dim=input_dim))
        else:
            model.add(Dense(units, activation='relu'))
        model.add(Dropout(dropout_rate))

    # One sigmoid unit for the binary target.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Only let TensorFlow log messages of level ERROR.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Wrap the Keras model builder so it can be scored with cross_val_score like
# the scikit-learn estimators; every fold trains for 5 epochs.
est = KerasClassifier(build_fn=create_model, epochs=5)
cv_scores = cross_val_score(est, X_scaled, y, cv=10)
nn_mean_accuracy = cv_scores.mean()
print("CV mean: ", nn_mean_accuracy)
all_results['Neural Network'] = [nn_mean_accuracy, est]

## Which algorithm performed best...?

In [None]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline

# Rank the candidates by their score only. Sorting on the whole
# [score, estimator] list would fall through to comparing estimator objects
# whenever two scores are equal, which raises a TypeError.
ranked_results = sorted(all_results.items(), key=lambda item: item[1][0])

print("\nSee the full results in ascending order here:\n")
for key, value in ranked_results:
    print("%s: %s" % (key, value[0]))

# Ascending order: the last entry is the best-scoring algorithm.
clf_nm, (best_score, clf) = ranked_results[-1]

# Make pipeline object
print("\nCreating pipeline object for best algorithm: {}".format(clf_nm))
# Pipeline.fit() refits every step. Passing the shared `scaler` directly would
# overwrite the statistics it learned from the raw features with statistics of
# the already-scaled x_train. An unfitted clone gives the pipeline its own
# scaling step and leaves `scaler` usable for transforming raw rows later on.
pipe = Pipeline([('scaler', clone(scaler)), (clf_nm, clf)])
model = pipe.fit(x_train, y_train)

# --------------------

# Step 3: Model deployment
![title](https://github.com/mgiessing/CP4D/raw/master/img/DSP_Step3.png)
![title](https://github.com/mgiessing/CP4D/raw/master/img/3_Deploy.png)

## After training a model, we can make it usable on our cluster in three steps:
#### (Log in if necessary)
#### 1.) Create a space or use an existing one
#### 2.) Create a model repository or use an existing one
#### 3.) Create a model deployment or use an existing one

In [None]:
###
# On CP4D v3.0.1, watson-machine-learning-client-V4 should already be pre-installed and everything should work "without problems"!
# If there are problems nevertheless, uninstall everything "to be on the safe side" and install only V4...
# IMPORTANT: If anything was uninstalled or reinstalled, restart the Jupyter kernel!
###

#!pip uninstall watson-machine-learning-client-V4 -y
#!pip uninstall watson-machine-learning-client -y
#!pip uninstall ibm-watson-machine-learning -y
#!pip install watson-machine-learning-client-V4

In [None]:
#Option 1 - Curl:
#!curl -k -X GET https://zen-cpd-zen.apps.edb-bde1.cecc.ihost.com/v1/preauth/validateAuth -u <user>:<password>

#Option 2 - Python requests:
import getpass
import os

import requests, json
from requests.auth import HTTPBasicAuth

### Adjust the following variables ###
HOST_URL = ''      # base URL of the cluster (also used as the WML client URL)
PROJECT_ID = ''
### ------------------------------ ###

# Never keep credentials in the notebook: read them from the environment and
# fall back to an interactive prompt for the password.
CP4D_USERNAME = os.environ.get('CP4D_USERNAME', 'admin')
CP4D_PASSWORD = os.environ.get('CP4D_PASSWORD') or getpass.getpass('CP4D password: ')

# The token is issued by the validateAuth endpoint (see the curl example), not
# by the bare host URL. A HOST_URL that already contains the path is used as is.
AUTH_PATH = '/v1/preauth/validateAuth'
if HOST_URL.rstrip('/').endswith(AUTH_PATH):
    AUTH_URL = HOST_URL
else:
    AUTH_URL = HOST_URL.rstrip('/') + AUTH_PATH

s = requests.Session()
# verify=False skips TLS certificate validation (same as `curl -k`); only
# acceptable for clusters with self-signed certificates on a trusted network.
res = s.get(AUTH_URL, auth=HTTPBasicAuth(CP4D_USERNAME, CP4D_PASSWORD), verify=False)
# Fail with the HTTP error instead of an obscure JSON/KeyError further down.
res.raise_for_status()
res = res.json()
token = res['accessToken']
# Do not print the token itself: notebook outputs are saved and shared.
print("Received access token ({} characters)".format(len(token)))

In [None]:
from watson_machine_learning_client import WatsonMachineLearningAPIClient

# Connection settings for the Watson Machine Learning client, using the access
# token obtained in the authentication cell.
wml_credentials = dict(
    token=token,
    instance_id="wml_local",
    url=HOST_URL,
    version="3.0.1",
)
client = WatsonMachineLearningAPIClient(wml_credentials)

# Make PROJECT_ID the client's default project.
client.set.default_project(PROJECT_ID)

## List and/or create space

In [None]:
# List the spaces visible to the client.
client.spaces.list()

In [None]:
# Query the spaces once and reuse the answer: the original issued the same
# remote call up to four times, which is slow and could give inconsistent
# results if the list changed between calls.
existing_spaces = client.spaces.get_details()['resources']

if not existing_spaces:
    # No space yet: create one and remember its id.
    print("Create resources!")
    space_details = client.spaces.store(meta_props={client.spaces.ConfigurationMetaNames.NAME: "dev_space2"})
    space_id = client.spaces.get_uid(space_details)
else:
    # At least one space exists: show them all and use the first one.
    print("Resource vorhanden!")
    for sp in existing_spaces:
        print(sp,"\n")
    default_space = existing_spaces[0]
    print("Using '{}' as default space".format(default_space['metadata']['name']))
    space_id = default_space['metadata']['id']

In [None]:
# Make the selected (or newly created) space the client's default space.
client.set.default_space(space_id)

In [None]:
# Print the list of the corresponding algorithm IDs (software specifications)...
# client.software_specifications.list()

## List or create model repository

In [None]:
#TODO: Suppress the warnings, otherwise the output is a bit confusing...

# Query the repository once and reuse the answer instead of repeating the same
# remote call in the condition and twice more in the else-branch.
stored_models = client.repository.get_details()['models']['resources']

if not stored_models:
    # Nothing stored yet: publish the trained pipeline.
    print("Publish model...")
    software_spec_uid = client.software_specifications.get_uid_by_name("scikit-learn_0.22-py3.6")
    # Input schema: the six feature columns, in the order of the training matrix.
    input_fields = [
        {'name': 'Pclass', 'type': 'float'},
        {'name': 'Sex', 'type': 'float'},
        {'name': 'Age', 'type': 'float'},
        {'name': 'Siblings/Spouses Aboard', 'type': 'float'},
        {'name': 'Parents/Children Aboard', 'type': 'float'},
        {'name': 'Fare', 'type': 'float'},
    ]
    model_props = {
        client.repository.ModelMetaNames.NAME: "Titanic Survivor Prediction",
        client.repository.ModelMetaNames.TYPE: "scikit-learn_0.22",
        client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: software_spec_uid,
        client.repository.ModelMetaNames.INPUT_DATA_SCHEMA: [{'id': '1',
                                                              'type': 'ndarray',
                                                              'fields': input_fields}]
    }
    published_model = client.repository.store_model(model=model, pipeline=pipe, meta_props=model_props, training_data=x_train, training_target=y_train)
else:
    # Reuse the first model already present in the repository.
    print("Model found!")
    print("Using default model {}".format(stored_models[0]['metadata']['name']))
    published_model = stored_models[0]

# Optional: Delete by (gu)id
#client.repository.delete('b6591272-ee24-40a7-841b-d7e8846277d2')

In [None]:
import json

# Resolve the uid of the published model, fetch its repository entry and
# pretty-print it as indented JSON.
published_model_uid = client.repository.get_model_uid(published_model)
model_details = client.repository.get_details(published_model_uid)
model_details_json = json.dumps(model_details, indent=2)
print(model_details_json)

In [None]:
# Load the stored model back from the repository and print its representation.
loaded_model = client.repository.load(published_model_uid)
print(loaded_model)

In [None]:
# Evaluate the model loaded from the repository on the held-out test split.
# NOTE(review): presumably this is the scikit-learn pipeline's mean accuracy -- confirm the type returned by load().
loaded_model.score(x_test, y_test)
#test_predictions = loaded_model.predict(x_test).transform(x_test)
#test_predictions.select('probability', 'predictedLabel').show(n=3, truncate=False)

## List or create deployment

In [None]:
# List the existing deployments.
client.deployments.list()
# Optional: Delete deployment
# client.deployments.delete('70a57b43-3a4c-41ce-81e0-fdcf3a6116b1')

In [None]:
# Query the deployments once and reuse the answer instead of repeating the
# same remote call in the condition and twice more in the else-branch.
existing_deployments = client.deployments.get_details()['resources']

if not existing_deployments:
    # No deployment yet: create an online deployment of the published model.
    print("Create deployment...")
    meta_props = {
        client.deployments.ConfigurationMetaNames.NAME: "Titanic Survivor Prediction",
        client.deployments.ConfigurationMetaNames.SPACE_UID: space_id,
        client.deployments.ConfigurationMetaNames.ONLINE: {}
    }

    created_deployment = client.deployments.create(artifact_uid=published_model_uid, meta_props=meta_props, name="Titanic Survivor Prediction")
else:
    # Reuse the first deployment that already exists.
    print("Deployment found!")
    print("Using default deployment: '{}' ".format(existing_deployments[0]['entity']['name']))
    created_deployment = existing_deployments[0]

In [None]:
# Look up the scoring URL and the id of the deployment and report both.
scoring_endpoint = client.deployments.get_scoring_href(created_deployment)
deployment_metadata = created_deployment.get("metadata")
deployment_id = deployment_metadata.get("id")
print('Scoring endpoint is available at: {}'.format(scoring_endpoint))
print('Deployment ID is: {}'.format(deployment_id))

In [None]:
# Prepare scoring payload from the last row of the test split.
# .tolist() converts the numpy row into plain Python floats, so the payload is
# JSON-serialisable and prints readably.
sample_row = x_test[-1]
job_payload = {
    client.deployments.ScoringMetaNames.INPUT_DATA: [{
        'values': [sample_row.tolist()]
    }]
}
print(job_payload)

# Show the SAME row in its original units (the original inspected row -3 while
# sending row -1). The scaler is refit on the raw features first so that its
# statistics describe X, and the row is reshaped to 2-D because
# inverse_transform expects an array of shape (n_samples, n_features).
X_scaled = scaler.fit_transform(X)
scaler.inverse_transform(sample_row.reshape(1, -1))

In [None]:
# Perform prediction and display the result.
# Sends the payload built from the test row to the online deployment and prints the raw response.
job_details = client.deployments.score(deployment_id, job_payload)
print(job_details)

## Let's try our deployed model and see it in action!

### First let's create two passengers:
P1 had a 3rd class ticket (3), was male (0), 22 years old, had 1 sibling aboard, no parents or children aboard, and paid 8.25 GBP for his ticket<br>
--> Very similar to the data points of people who actually did NOT survive (0) ... let's see<br><br>
P2 had a 1st class ticket (1), was female (1), 38 years old, had 1 sibling aboard, no parents or children aboard, and paid 70.5 GBP for her ticket<br>
--> Very similar to the data points of people who actually DID survive (1) ... let's see

In [None]:
### Playground:
# Column layout: (Survived)->Pclass	Sex	Age	Siblings/Spouses Aboard	Parents/Children Aboard	Fare

# Reference row from the data set:
#0	3	0	22.0	1	0	7.2500
# Did not survive
Passagier1 = [3, 0, 22., 1, 0 , 8.25]

# Reference row from the data set:
#1	1	1	38.0	1	0	71.2833
# Survived
Passagier2 = [1, 1, 38., 1, 0 , 70.5]

Scale the data using the scaler that was fitted on the training data. For new data points we only need to apply the already-fitted scaler with `.transform()`.

In [None]:
# Oddly enough, the scaler gets fitted again in the pipeline code segment,
# so it has to be "re-initialised" on the raw features here.
X_scaled = scaler.fit(X).transform(X)

# Apply the scaler to both hand-made passengers.
passenger_rows = [Passagier1, Passagier2]
scaled_pass = scaler.transform(passenger_rows)
scaled_pass

Create the payload that we will send to our model

In [None]:
# Scoring payload for the two scaled passengers. The rows are converted from a
# numpy array to nested Python lists so the payload is plain JSON-serialisable
# data, in the same shape as the payload built from the test row earlier.
payload = {
    client.deployments.ScoringMetaNames.INPUT_DATA: [{
        'values': np.asarray(scaled_pass).tolist()
    }]
}

In [None]:
# Send both passengers to the deployed model and display the raw response.
job_details = client.deployments.score(deployment_id, payload)
job_details

As expected, the first passenger did not survive, while the second (better class, but also paid more) did survive.<br>
Here is the same result, prettified a little:

In [None]:
# Turn every returned prediction row into a readable sentence:
# a row containing 0 means "not survived", a row containing 1 means "survived".
prediction_rows = job_details['predictions'][0]['values']
for passenger_no, row in enumerate(prediction_rows, start=1):
    if 0 in row:
        verdict = f'Passenger #{passenger_no} would probably not survive'
    elif 1 in row:
        verdict = f'Passenger #{passenger_no} would probably survive'
    else:
        verdict = "That shouldn't have happened!"
    print(verdict)