In [None]:
# data processing
import numpy as np
import os
import sys
import pandas as pd
import pickle

# Machine learning library
import h2o
from h2o.automl import H2OAutoML
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, roc_curve, auc

# data visualizations
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Processing configuration
class Location:
    """Filesystem locations of the notebook's input and output data files.

    All attributes are computed once, when the class body is executed.
    Path components are combined with ``os.path.join`` so that the platform's
    own separator is used instead of a hard-coded backslash.
    """

    # Parent directory of the first entry on ``sys.path``.
    # NOTE(review): what sys.path[0] holds depends on how the kernel/script was
    # launched - confirm it resolves to the intended folder.
    script_dir = os.path.dirname(os.path.abspath(sys.path[0]))

    # Two further levels up from script_dir
    src_level_dir = os.path.dirname(script_dir)
    top_level_dir = os.path.dirname(src_level_dir)

    # Data directory of the project, located below top_level_dir
    data_dir = os.path.join(top_level_dir, "AutomatingAnalysisModelsAndMisprediction", "data")

    # Raw input CSV
    data_raw: str = os.path.join(data_dir, "raw", "customer_churn.csv")
    # Processed data in CSV form
    data_csv_process: str = os.path.join(data_dir, "processed", "customer_churn.csv")
    # Processed data as a pickle file
    data_process: str = os.path.join(data_dir, "processed", "customer_churn.pkl")
    
    
class ProcessConfig:
    """Settings consumed by the `process` flow."""

    # Name of the target column; the notebook uses it as the response variable
    label: str = "Churn"
    # presumably the fraction of rows held out for testing - it is not used
    # anywhere in this notebook, confirm against the processing code
    test_size: float = 0.2

In [None]:
# Processing functions
def getProcessedData(file_path: str):
    """Load the pickled train/test split stored at ``file_path``.

    The file is expected to contain a pickled mapping with the keys
    ``"X_train"``, ``"X_test"``, ``"y_train"`` and ``"y_test"``.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)`` with the values stored
        under those keys.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the loaded dict lacks one of the four keys.
    """
    # NOTE: pickle.load can execute arbitrary code - only open trusted files.
    with open(file_path, "rb") as handle:
        splits = pickle.load(handle)

    return tuple(splits[key] for key in ("X_train", "X_test", "y_train", "y_test"))

def getCSVData(url: str):
    """Read a CSV file with pandas.

    Args:
        url: Location of the CSV; passed unchanged to ``pandas.read_csv``.

    Returns:
        Whatever ``pandas.read_csv`` returns for ``url`` (a DataFrame).
    """
    return pd.read_csv(url)


In [None]:
# Load the processed CSV and display how many columns it has
dataset = getCSVData(Location.data_csv_process)
dataset.shape[1]

In [None]:
# Load the pickled train/test split: features and labels for both partitions
X_train, X_test, Y_train, Y_test = getProcessedData(Location.data_process)


In [None]:
# Standardize the features before model training
sc = StandardScaler()
# Fit the scaler on the training features, then transform them.
# NOTE: X_train is overwritten with its scaled version, so re-running this cell
# without first reloading the data fits the scaler on already-scaled values.
X_train = sc.fit_transform(X_train)
# Display the per-feature means learned from the training data
sc.mean_

In [None]:
# Display the per-feature scaling factors learned by the fitted scaler
sc.scale_

In [None]:
# Scale the test features with the statistics fitted on the training data.
# NOTE: X_test is overwritten, so re-running this cell scales it a second time.
X_test = sc.transform(X_test)

In [None]:
# Initialise the H2O cluster; the H2OFrame and AutoML calls below depend on it
h2o.init()

In [None]:
# Build the H2O training frame used to fit the AutoML models:
# the scaled feature columns followed by the label as the last column.
train = h2o.H2OFrame(
    np.hstack((X_train, Y_train.values.reshape(-1, 1)))
)


In [None]:
# Name the columns of the training frame. The label was appended as the LAST
# column when the frame was built, so the feature names go first and the label
# name goes last, wherever the label happens to sit in the processed CSV.
# NOTE(review): assumes the features of the pickled split are in the same order
# as the non-label columns of the processed CSV - confirm.
train.columns = [col for col in dataset.columns if col != ProcessConfig.label] + [ProcessConfig.label]

In [None]:
# Response column name, and the predictor names (every column of `train`
# except the response)
y = ProcessConfig.label
x = train.drop(y, axis=1).columns

In [None]:
# The training frame was built from a numeric array, so the response column is
# numeric and H2O would treat the task as regression. Convert it to a
# categorical column so AutoML trains classifiers.
# NOTE(review): assumes the label is a class label (the evaluation cells below
# build a confusion matrix and a ROC curve from it) - confirm.
train[y] = train[y].asfactor()

# Run H2O AutoML: train at most 10 models within a 5 minute budget and rank
# them on the leaderboard by MSE. The seed is fixed for reproducibility.
aml = H2OAutoML(max_models=10, sort_metric='mse', max_runtime_secs=5 * 60, seed=666)
aml.train(x=x, y=y, training_frame=train)

In [None]:
# Show the first rows of the AutoML leaderboard of trained models
lb = aml.leaderboard
print(lb.head())

In [None]:
# Predict on the test data with the best AutoML model.
# The leader and its predictions get their own names: the LinearSVC cells below
# assign `model` and `y_pred`, which would otherwise silently overwrite them.
aml_leader = aml.leader
X_h2o = h2o.H2OFrame(X_test)
# The test frame holds features only, so it takes the processed-CSV column
# names without the label (the same names the training frame uses).
X_h2o.columns = [col for col in dataset.columns if col != ProcessConfig.label]
# Select the "predict" column explicitly: the prediction frame may also carry
# per-class probability columns, which a plain flatten would mix in.
y_pred_aml = aml_leader.predict(X_h2o).as_data_frame()["predict"].values

In [None]:
# Train a scikit-learn linear SVM classifier on the scaled training data
model = LinearSVC(random_state=0, tol=1e-5, max_iter=10000, dual=False)
# fit() returns the fitted estimator itself, not predictions, so its result is
# not bound to a prediction-like name; predictions are computed further below.
model.fit(X_train, Y_train)

In [None]:
# fig, ax = plt.subplots(figsize=(8,5))
# sns.scatterplot(Y_pred, y[ProcessConfig.label] ,ax=ax)
# sns.lineplot(Y_pred,Y_pred,ax=ax,color='black')
# ax.set_xlabel(ProcessConfig.label)

In [None]:
# Predict on the test set and collect the positions at which the predicted
# label differs from the true label
y_pred = model.predict(X_test)
misclassified = np.asarray(Y_test != y_pred).nonzero()[0]
print("Indices of potentially misclassified instances: ", misclassified)

In [None]:
# Confusion matrix: counts of true versus predicted labels on the test set,
# drawn as a heat map through the explicit figure/axes interface
cm = confusion_matrix(Y_test, y_pred)
fig, ax = plt.subplots()
image = ax.imshow(cm, cmap=plt.cm.Blues)
ax.set_title("Confusion matrix")
fig.colorbar(image, ax=ax)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
# Tick positions 0 and 1 are labelled as the negative and positive class
ax.set_xticks([0, 1])
ax.set_xticklabels(["Negative", "Positive"])
ax.set_yticks([0, 1])
ax.set_yticklabels(["Negative", "Positive"])
fig.tight_layout()
plt.show()

In [None]:
# ROC curve.
# roc_curve sweeps a decision threshold over continuous scores; feeding it the
# hard predicted labels collapses the curve to a single operating point. Use
# the classifier's decision_function values (signed distance to the separating
# hyperplane) as scores instead.
y_score = model.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(Y_test, y_score)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc)
# Diagonal reference line: performance of a random classifier
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic (ROC)")
plt.legend(loc="lower right")
plt.show()