In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

## Titanic
# Load the Titanic training CSV and give its columns more descriptive names.
data_titanic = pd.read_csv("data/titanic/train.csv")

column_titanic_dict = {"PassengerId": "PassengerId", "Survived": "Survived", "Pclass": "TicketClass", "Name": "Name", "Sex": "Sex", "Age": "Age", "SibSp": "NumberSiblingsSpouses", "Parch": "NumberParentsChildren", "Ticket": "TicketNumber", "Fare": "Fare", "Cabin": "CabinNumber", "Embarked": "Port"}
data_titanic = data_titanic.rename(columns=column_titanic_dict)

# Feature columns, target column, and the features that get ordinal-encoded.
train_X_titanic_columns = ["TicketClass", "Sex", "Age", "NumberSiblingsSpouses", "NumberParentsChildren", "Fare", "Port"]
train_y_titanic_columns = ["Survived"]
train_X_titanic_ordinal_columns = ["Sex", "Port"]

# Take an explicit copy: X_titanic is written to just below, and assigning
# through .loc on a bare column slice of data_titanic raises
# SettingWithCopyWarning and relies on pandas' view-vs-copy rules.
X_titanic = data_titanic[train_X_titanic_columns].copy()
y_titanic = data_titanic[train_y_titanic_columns]

# Replace the ordinal columns with the codes produced by OrdinalEncoder.
X_titanic.loc[:, train_X_titanic_ordinal_columns] = OrdinalEncoder().fit_transform(X_titanic[train_X_titanic_ordinal_columns])

# 60/40 train/test split. No random_state is passed, so the split (and every
# score computed from it) differs from run to run.
X_titanic_train, X_titanic_test, y_titanic_train, y_titanic_test = train_test_split(X_titanic, y_titanic, test_size=0.4)
# Squeeze the single-column target frames before they are passed to fit/metrics.
y_titanic_train = np.squeeze(y_titanic_train)
y_titanic_test = np.squeeze(y_titanic_test)

## Machine Failure
# NOTE(review): the rename mapping and the feature/target column names below
# are identical to the Titanic block (PassengerId, Survived, Pclass, ...).
# Confirm that data/machine_failure/train.csv really has this schema; if it
# does not, these lists need to be replaced with the machine-failure columns.
data_machine_failure = pd.read_csv("data/machine_failure/train.csv")

column_machine_failure_dict = {"PassengerId": "PassengerId", "Survived": "Survived", "Pclass": "TicketClass", "Name": "Name", "Sex": "Sex", "Age": "Age", "SibSp": "NumberSiblingsSpouses", "Parch": "NumberParentsChildren", "Ticket": "TicketNumber", "Fare": "Fare", "Cabin": "CabinNumber", "Embarked": "Port"}
data_machine_failure = data_machine_failure.rename(columns=column_machine_failure_dict)

# Feature columns, target column, and the features that get ordinal-encoded.
train_X_machine_failure_columns = ["TicketClass", "Sex", "Age", "NumberSiblingsSpouses", "NumberParentsChildren", "Fare", "Port"]
train_y_machine_failure_columns = ["Survived"]
train_X_machine_failure_ordinal_columns = ["Sex", "Port"]

# Take an explicit copy: the frame is written to just below, and assigning
# through .loc on a bare column slice of data_machine_failure raises
# SettingWithCopyWarning and relies on pandas' view-vs-copy rules.
X_machine_failure = data_machine_failure[train_X_machine_failure_columns].copy()
y_machine_failure = data_machine_failure[train_y_machine_failure_columns]

# Replace the ordinal columns with the codes produced by OrdinalEncoder.
X_machine_failure.loc[:, train_X_machine_failure_ordinal_columns] = OrdinalEncoder().fit_transform(X_machine_failure[train_X_machine_failure_ordinal_columns])

# 60/40 train/test split. No random_state is passed, so the split (and every
# score computed from it) differs from run to run.
X_machine_failure_train, X_machine_failure_test, y_machine_failure_train, y_machine_failure_test = train_test_split(X_machine_failure, y_machine_failure, test_size=0.4)
# Squeeze the single-column target frames before they are passed to fit/metrics.
y_machine_failure_train = np.squeeze(y_machine_failure_train)
y_machine_failure_test = np.squeeze(y_machine_failure_test)

## Synthetic
# 1000 rows of 3 standard-normal features (unseeded, so values change per run).
X_synthetic = np.random.randn(1000, 3)

# A row is labelled True when the sum of its three features is positive.
_synthetic_feature_sum = X_synthetic[:, 0] + X_synthetic[:, 1] + X_synthetic[:, 2]
y_synthetic = _synthetic_feature_sum > 0
yn_synthetic = ~y_synthetic

# Push the two classes apart: +0.1 on every feature of a True row,
# -0.1 on every feature of a False row.
X_synthetic += np.where(y_synthetic[:, np.newaxis], 0.1, -0.1)

# 60/40 train/test split of the synthetic data (no random_state is passed).
(X_synthetic_train, X_synthetic_test,
 y_synthetic_train, y_synthetic_test) = train_test_split(X_synthetic, y_synthetic, test_size=0.4)
# Squeeze both label arrays before use.
y_synthetic_train, y_synthetic_test = np.squeeze(y_synthetic_train), np.squeeze(y_synthetic_test)

In [2]:
## Titanic and Synthetic
## Standard Scaler

import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score, precision_score, accuracy_score, recall_score

# SVC search space, one sub-grid per kernel. Every kernel is tried with the
# same 15 log-spaced C values from 0.1 to 100; only "poly" also varies degree.
# The "rbf" sub-grid used to be listed twice, which refit 15 identical
# candidates on every fold without affecting which candidate scores best.
param_grid = [
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["linear"]},
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["poly"], "svm_clf__degree": [1, 2, 3, 4, 5, 6]},
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["rbf"]},
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["sigmoid"]},
]

# Grid search (cv=3) over `param_grid` for a pipeline that mean-imputes NaNs,
# standardises the features, and fits an SVC.
svm_clf = GridSearchCV(
    estimator=Pipeline(steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler()),
        ("svm_clf", SVC()),
    ]),
    param_grid=param_grid,
    cv=3,
    verbose=0,
)

## Titanic
# Run the grid search on the Titanic training split, predict the held-out
# split, then print each metric under its heading.
print("Titanic")
svm_clf.fit(X_titanic_train, y_titanic_train)
y_titanic_svm_clf_pred = svm_clf.predict(X_titanic_test)
# Score
for _heading, _metric in (
    ("Confusion Matrix", confusion_matrix),
    ("Precision Score", precision_score),
    ("Accuracy Score", accuracy_score),
    ("Recall Score", recall_score),
    ("F1 Score", f1_score),
):
    print(_heading)
    print(_metric(y_titanic_test, y_titanic_svm_clf_pred))
# Winning pipeline and the hyper-parameters that produced it.
print("Best Estimator", svm_clf.best_estimator_, sep="\n")
print("Best Parameters", svm_clf.best_params_, sep="\n")

print("")
## Synthetic
# Re-run the same grid search on the synthetic training split, predict the
# held-out split, then print each metric under its heading.
print("Synthetic")
svm_clf.fit(X_synthetic_train, y_synthetic_train)
y_synthetic_svm_clf_pred = svm_clf.predict(X_synthetic_test)
# Score
for _heading, _metric in (
    ("Confusion Matrix", confusion_matrix),
    ("Precision Score", precision_score),
    ("Accuracy Score", accuracy_score),
    ("Recall Score", recall_score),
    ("F1 Score", f1_score),
):
    print(_heading)
    print(_metric(y_synthetic_test, y_synthetic_svm_clf_pred))
# Winning pipeline and the hyper-parameters that produced it.
print("Best Estimator", svm_clf.best_estimator_, sep="\n")
print("Best Parameters", svm_clf.best_params_, sep="\n")

Titanic
Confusion Matrix
[[197  24]
 [ 42  94]]
Precision Score
0.7966101694915254
Accuracy Score
0.8151260504201681
Recall Score
0.6911764705882353
F1 Score
0.7401574803149606
Best Estimator
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('svm_clf', SVC(C=0.7196856730011519))])
Best Parameters
{'svm_clf__C': 0.7196856730011519, 'svm_clf__kernel': 'rbf'}

Synthetic
Confusion Matrix
[[212   0]
 [  0 188]]
Precision Score
1.0
Accuracy Score
1.0
Recall Score
1.0
F1 Score
1.0
Best Estimator
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('svm_clf', SVC(C=0.1, kernel='linear'))])
Best Parameters
{'svm_clf__C': 0.1, 'svm_clf__kernel': 'linear'}


In [3]:
## Titanic and Synthetic
## Min Max Scaler

import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score, precision_score, accuracy_score, recall_score

# SVC search space, one sub-grid per kernel. Every kernel is tried with the
# same 15 log-spaced C values from 0.1 to 100; only "poly" also varies degree.
# The "rbf" sub-grid used to be listed twice, which refit 15 identical
# candidates on every fold without affecting which candidate scores best.
param_grid = [
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["linear"]},
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["poly"], "svm_clf__degree": [1, 2, 3, 4, 5, 6]},
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["rbf"]},
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["sigmoid"]},
]

# Grid search (cv=3) over `param_grid` for a pipeline that mean-imputes NaNs,
# min-max scales the features, and fits an SVC.
svm_clf = GridSearchCV(
    estimator=Pipeline(steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", MinMaxScaler()),
        ("svm_clf", SVC()),
    ]),
    param_grid=param_grid,
    cv=3,
    verbose=0,
)

## Titanic
# Run the grid search on the Titanic training split, predict the held-out
# split, then print each metric under its heading.
print("Titanic")
svm_clf.fit(X_titanic_train, y_titanic_train)
y_titanic_svm_clf_pred = svm_clf.predict(X_titanic_test)
# Score
for _heading, _metric in (
    ("Confusion Matrix", confusion_matrix),
    ("Precision Score", precision_score),
    ("Accuracy Score", accuracy_score),
    ("Recall Score", recall_score),
    ("F1 Score", f1_score),
):
    print(_heading)
    print(_metric(y_titanic_test, y_titanic_svm_clf_pred))
# Winning pipeline and the hyper-parameters that produced it.
print("Best Estimator", svm_clf.best_estimator_, sep="\n")
print("Best Parameters", svm_clf.best_params_, sep="\n")

print("")
## Synthetic
# Re-run the same grid search on the synthetic training split, predict the
# held-out split, then print each metric under its heading.
print("Synthetic")
svm_clf.fit(X_synthetic_train, y_synthetic_train)
y_synthetic_svm_clf_pred = svm_clf.predict(X_synthetic_test)
# Score
for _heading, _metric in (
    ("Confusion Matrix", confusion_matrix),
    ("Precision Score", precision_score),
    ("Accuracy Score", accuracy_score),
    ("Recall Score", recall_score),
    ("F1 Score", f1_score),
):
    print(_heading)
    print(_metric(y_synthetic_test, y_synthetic_svm_clf_pred))
# Winning pipeline and the hyper-parameters that produced it.
print("Best Estimator", svm_clf.best_estimator_, sep="\n")
print("Best Parameters", svm_clf.best_params_, sep="\n")

Titanic
Confusion Matrix
[[196  24]
 [ 41  96]]
Precision Score
0.8
Accuracy Score
0.8179271708683473
Recall Score
0.7007299270072993
F1 Score
0.7470817120622568
Best Estimator
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', MinMaxScaler()),
                ('svm_clf', SVC(C=5.17947467923121, kernel='poly'))])
Best Parameters
{'svm_clf__C': 5.17947467923121, 'svm_clf__degree': 3, 'svm_clf__kernel': 'poly'}

Synthetic
Confusion Matrix
[[186   0]
 [  0 214]]
Precision Score
1.0
Accuracy Score
1.0
Recall Score
1.0
F1 Score
1.0
Best Estimator
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', MinMaxScaler()),
                ('svm_clf', SVC(C=0.1, kernel='linear'))])
Best Parameters
{'svm_clf__C': 0.1, 'svm_clf__kernel': 'linear'}


In [4]:
## Machine Failure
## Standard Scaler

import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score, precision_score, accuracy_score, recall_score

# SVC search space, one sub-grid per kernel. Every kernel is tried with the
# same 15 log-spaced C values from 0.1 to 100; only "poly" also varies degree.
# The "rbf" sub-grid used to be listed twice, which refit 15 identical
# candidates on every fold without affecting which candidate scores best.
param_grid = [
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["linear"]},
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["poly"], "svm_clf__degree": [1, 2, 3, 4, 5, 6]},
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["rbf"]},
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["sigmoid"]},
]

# Grid search (cv=3) over `param_grid` for a pipeline that mean-imputes NaNs,
# standardises the features, and fits an SVC.
svm_clf = GridSearchCV(
    estimator=Pipeline(steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler()),
        ("svm_clf", SVC()),
    ]),
    param_grid=param_grid,
    cv=3,
    verbose=0,
)

## Machine Failure
# Run the grid search on the machine-failure training split, predict the
# held-out split, then print each metric under its heading.
print("Machine Failure")
svm_clf.fit(X_machine_failure_train, y_machine_failure_train)
y_machine_failure_svm_clf_pred = svm_clf.predict(X_machine_failure_test)
# Score
for _heading, _metric in (
    ("Confusion Matrix", confusion_matrix),
    ("Precision Score", precision_score),
    ("Accuracy Score", accuracy_score),
    ("Recall Score", recall_score),
    ("F1 Score", f1_score),
):
    print(_heading)
    print(_metric(y_machine_failure_test, y_machine_failure_svm_clf_pred))
# Winning pipeline and the hyper-parameters that produced it.
print("Best Estimator", svm_clf.best_estimator_, sep="\n")
print("Best Parameters", svm_clf.best_params_, sep="\n")

Machine Failure
Confusion Matrix
[[189  26]
 [ 47  95]]
Precision Score
0.7851239669421488
Accuracy Score
0.7955182072829131
Recall Score
0.6690140845070423
F1 Score
0.7224334600760457
Best Estimator
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('svm_clf', SVC(C=0.7196856730011519))])
Best Parameters
{'svm_clf__C': 0.7196856730011519, 'svm_clf__kernel': 'rbf'}


In [6]:
## Machine Failure
## Min Max Scaler

import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score, precision_score, accuracy_score, recall_score

# SVC search space, one sub-grid per kernel. Every kernel is tried with the
# same 15 log-spaced C values from 0.1 to 100; only "poly" also varies degree.
# The "rbf" sub-grid used to be listed twice, which refit 15 identical
# candidates on every fold without affecting which candidate scores best.
param_grid = [
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["linear"]},
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["poly"], "svm_clf__degree": [1, 2, 3, 4, 5, 6]},
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["rbf"]},
  {"svm_clf__C": np.logspace(-1, 2, 15), "svm_clf__kernel": ["sigmoid"]},
]

# Grid search (cv=3) over `param_grid` for a pipeline that mean-imputes NaNs,
# min-max scales the features, and fits an SVC.
svm_clf = GridSearchCV(
    estimator=Pipeline(steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", MinMaxScaler()),
        ("svm_clf", SVC()),
    ]),
    param_grid=param_grid,
    cv=3,
    verbose=0,
)

## Machine Failure
# Run the grid search on the machine-failure training split, predict the
# held-out split, then print each metric under its heading.
print("Machine Failure")
svm_clf.fit(X_machine_failure_train, y_machine_failure_train)
y_machine_failure_svm_clf_pred = svm_clf.predict(X_machine_failure_test)
# Score
for _heading, _metric in (
    ("Confusion Matrix", confusion_matrix),
    ("Precision Score", precision_score),
    ("Accuracy Score", accuracy_score),
    ("Recall Score", recall_score),
    ("F1 Score", f1_score),
):
    print(_heading)
    print(_metric(y_machine_failure_test, y_machine_failure_svm_clf_pred))
# Winning pipeline and the hyper-parameters that produced it.
print("Best Estimator", svm_clf.best_estimator_, sep="\n")
print("Best Parameters", svm_clf.best_params_, sep="\n")

Machine Failure
Confusion Matrix
[[187  28]
 [ 50  92]]
Precision Score
0.7666666666666667
Accuracy Score
0.7815126050420168
Recall Score
0.647887323943662
F1 Score
0.7022900763358779
Best Estimator
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', MinMaxScaler()),
                ('svm_clf',
                 SVC(C=13.894954943731374, degree=6, kernel='poly'))])
Best Parameters
{'svm_clf__C': 13.894954943731374, 'svm_clf__degree': 6, 'svm_clf__kernel': 'poly'}
