In [108]:
# data processing
import numpy as np
import os
import sys
import pandas as pd
import pickle

# Machine learning library
import h2o
from h2o.automl import H2OAutoML
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, roc_curve, auc

# data visualizations
import matplotlib.pyplot as plt
import seaborn as sns


In [109]:
# Processing configuration
class Location:
    """Specify the locations of inputs and outputs"""

    # Get the path of the directory containing the script file
    script_dir = os.path.dirname(os.path.abspath(sys.path[0]))

    # Navigate up to the top-level directory
    src_level_dir = os.path.dirname(script_dir)

    top_level_dir = os.path.dirname(src_level_dir)

    # Define the relative path to the data directory
    data_dir = os.path.join(top_level_dir, "AutomatingAnalysisModelsAndMisprediction\\data")

    data_raw: str = f"{data_dir}\\raw\\customer_churn.csv"
    data_csv_process: str = f"{data_dir}\\processed\\customer_churn.csv"
    data_process: str = f"{data_dir}\\processed\\customer_churn.pkl"
    
    
class ProcessConfig:
    """Specify the parameters of the `process` flow"""

    label: str = "Churn"
    test_size: float = 0.2

In [114]:
# Processing functions
def getProcessedData(file_path: str):
    # read python dict back from the file
    with open(file_path, 'rb') as f:
        split_dict = pickle.load(f)

    X_train = split_dict["X_train"]
    X_test = split_dict["X_test"]
    y_train = split_dict["y_train"]
    y_test = split_dict["y_test"]
    return X_train, X_test, y_train, y_test

def getCSVData(url: str):
    data = pd.reader(url)
    return data


In [117]:
# Get csv processed data
dataset = getCSVData(Location.data_csv_process)
dataset.columns

AttributeError: '_csv.reader' object has no attribute 'columns'

In [None]:
# get the data
X_train, X_test, Y_train, Y_test = getProcessedData(Location.data_process)


['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'PaperlessBilling',
 'MonthlyCharges',
 'TotalCharges',
 'MultipleLines_No',
 'MultipleLines_No phone service',
 'MultipleLines_Yes',
 'InternetService_DSL',
 'InternetService_Fiber optic',
 'InternetService_No',
 'OnlineSecurity_No',
 'OnlineSecurity_No internet service',
 'OnlineSecurity_Yes',
 'OnlineBackup_No',
 'OnlineBackup_No internet service',
 'OnlineBackup_Yes',
 'DeviceProtection_No',
 'DeviceProtection_No internet service',
 'DeviceProtection_Yes',
 'TechSupport_No',
 'TechSupport_No internet service',
 'TechSupport_Yes',
 'StreamingTV_No',
 'StreamingTV_No internet service',
 'StreamingTV_Yes',
 'StreamingMovies_No',
 'StreamingMovies_No internet service',
 'StreamingMovies_Yes',
 'Contract_Month-to-month',
 'Contract_One year',
 'Contract_Two year',
 'PaymentMethod_Bank transfer (automatic)',
 'PaymentMethod_Credit card (automatic)',
 'PaymentMethod_Electronic check',
 'PaymentMethod_Mail

In [None]:
# Standardize features. Require for Machine learning
sc = StandardScaler()
# Fit to data, then transform it
X_train = sc.fit_transform(X_train)
sc.mean_

array([4.32445521e-01, 1.35835351e-01, 3.71307506e-01, 2.11501211e-01,
       2.76394673e+01, 8.95520581e-01, 5.94552058e-01, 6.78258984e+01,
       2.04043235e+03, 4.20702179e-01, 8.31719128e-02, 3.84624697e-01,
       2.92493947e-01, 5.12348668e-01, 1.63075061e-01, 5.53874092e-01,
       1.63075061e-01, 2.03268765e-01, 4.62348668e-01, 1.63075061e-01,
       2.75544794e-01, 4.63801453e-01, 1.63075061e-01, 2.72397094e-01,
       5.44067797e-01, 1.63075061e-01, 2.11501211e-01, 3.94794189e-01,
       1.63075061e-01, 3.52421308e-01, 3.84261501e-01, 1.63075061e-01,
       3.54721550e-01, 6.39951574e-01, 1.50605327e-01, 1.64164649e-01,
       1.56053269e-01, 1.52300242e-01, 3.51089588e-01, 1.78329298e-01])

In [None]:
sc.scale_

array([4.95415374e-01, 3.42613643e-01, 4.83154470e-01, 4.08372928e-01,
       2.39832072e+01, 3.05881464e-01, 4.90978521e-01, 2.86407746e+01,
       2.18393349e+03, 4.93671810e-01, 2.76141894e-01, 4.86506464e-01,
       4.54907944e-01, 4.99847487e-01, 3.69434142e-01, 4.97089109e-01,
       3.69434142e-01, 4.02430832e-01, 4.98580362e-01, 3.69434142e-01,
       4.46788385e-01, 4.98687944e-01, 3.69434142e-01, 4.45193124e-01,
       4.98054243e-01, 3.69434142e-01, 4.08372928e-01, 4.88806442e-01,
       3.69434142e-01, 4.77724324e-01, 4.86420189e-01, 3.69434142e-01,
       4.78428858e-01, 4.80014122e-01, 3.57663756e-01, 3.70424914e-01,
       3.62905836e-01, 3.59311673e-01, 4.77310894e-01, 3.82789706e-01])

In [None]:
# Perform standardization by centering and scaling
X_test = sc.transform(X_test)

In [None]:
# Init an H2O cluster
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,2 hours 28 mins
H2O_cluster_timezone:,Asia/Jerusalem
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.1
H2O_cluster_version_age:,17 days
H2O_cluster_name:,H2O_from_python_Administrator_71jcrd
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.251 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [None]:
# Convert training data to H2OFrame
# the train result will be used to train a machine learning model
train = h2o.H2OFrame(np.concatenate((X_train, Y_train.values.reshape(-1, 1)), axis=1))


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


41

In [None]:
train.columns = list(dataset.columns)
train[ProcessConfig.label] = train[ProcessConfig.label].asfactor()

H2OValueError: Argument `names` (= ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'MultipleLines_No', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_DSL', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes', 'OnlineBackup_No', 'OnlineBackup_No internet service', 'OnlineBackup_Yes', 'DeviceProtection_No', 'DeviceProtection_No internet service', 'DeviceProtection_Yes', 'TechSupport_No', 'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes', 'StreamingMovies_No', 'StreamingMovies_No internet service', 'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']) does not satisfy the condition len(names) == self.ncol

In [None]:
# Specify target variable and predictor variables
x = train.columns[:-1]
y = train.columns[-1]

In [None]:
# Run H2O AutoML to automatically select, train and optimize SVM model
aml = H2OAutoML(max_models=10, sort_metric='mse', max_runtime_secs=5 * 60, seed=666)
aml.train(x=x, y=y, training_frame=train)

In [None]:
# View the leaderboard of trained models
lb = aml.leaderboard
print(lb.head())

In [None]:
# Use the best model to predict on test data
model = aml.leader
X_h2o = h2o.H2OFrame(X_test)
X_h2o.columns = list(dataset.columns)[:-1]
y_pred = model.predict(X_h2o).as_data_frame().values.flatten()

In [None]:
# Train a PySVM SVM model
model = LinearSVC(random_state=0, tol=1e-5, max_iter=10000, dual=False)
Y_pred = model.fit(X_train, Y_train)

In [None]:
# fig, ax = plt.subplots(figsize=(8,5))
# sns.scatterplot(Y_pred, y[ProcessConfig.label] ,ax=ax)
# sns.lineplot(Y_pred,Y_pred,ax=ax,color='black')
# ax.set_xlabel(ProcessConfig.label)

In [None]:
# Test the model on the test data
y_pred = model.predict(X_test)
misclassified = np.where(Y_test != y_pred)[0]
print("Indices of potentially misclassified instances: ", misclassified)

In [None]:
# Confusion matrix - summarizing the performance of a classification algorithm.
cm = confusion_matrix(Y_test, y_pred)
plt.figure()
plt.imshow(cm, cmap=plt.cm.Blues)
plt.title("Confusion matrix")
plt.colorbar()
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.xticks([0, 1], ["Negative", "Positive"])
plt.yticks([0, 1], ["Negative", "Positive"])
plt.tight_layout()
plt.show()

In [None]:
# ROC curve
fpr, tpr, thresholds = roc_curve(Y_test, y_pred)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc)
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic (ROC)")
plt.legend(loc="lower right")
plt.show()