In [None]:
import pandas as pd
import numpy as np
import json
import pickle

from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold
from sklearn.preprocessing import StandardScaler  
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier,BaggingClassifier

from sklearn.metrics import classification_report

In [None]:
# Load the raw churn dataset; the relative path means the CSV must sit in the
# notebook's working directory.
data = pd.read_csv("Churn_Modelling.csv")

In [None]:
# Features: every column from position 3 up to (but excluding) the last one.
# NOTE(review): the three skipped leading columns are presumably identifiers — confirm.
X = data.iloc[:, 3:-1]
# Target: the last column of the file.
y = data.iloc[:, -1]
# Preview the first rows only instead of rendering the whole feature frame.
X.head()

## Prepare the data for training and for Unbox

In [None]:
def data_encode_one_hot(df, encoders):
    """Encode categorical features using one-hot encoding.

    Args:
        df: DataFrame containing every column named in ``encoders``. It is
            copied first, so the caller's frame is never modified.
        encoders: Mapping of column name -> fitted encoder. Each encoder must
            expose ``transform`` and either ``get_feature_names_out`` (current
            scikit-learn) or the legacy ``get_feature_names``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index in which every encoded
        column has been dropped and its one-hot columns appended at the end.
    """
    # The encoded frames built below carry a default 0..n-1 index, so the
    # input must be re-indexed the same way; joining otherwise produces NaNs.
    df = df.copy(deep=True).reset_index(drop=True)
    for feature, enc in encoders.items():
        print(f"encoding {feature}")
        encoded = enc.transform(df[[feature]])
        # Encoders return a sparse matrix by default but may be configured to
        # return a dense array; only densify when that is actually needed.
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()
        # `get_feature_names` was removed in scikit-learn 1.2; prefer its
        # replacement and fall back for older installations.
        names_fn = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names
        enc_df = pd.DataFrame(encoded, columns=names_fn([feature]))
        df = df.join(enc_df).drop(columns=feature)
    return df

In [None]:
def create_encoder_dict(df, categorical_feature_names):
    """Build one fitted one-hot encoder per categorical feature.

    Each encoder is fitted on the single column it is keyed by and is created
    with ``handle_unknown='ignore'``. The predict function needs this mapping
    to encode raw inputs.

    Args:
        df: DataFrame holding the categorical columns.
        categorical_feature_names: Iterable of column names to fit encoders for.

    Returns:
        Dict mapping each feature name to its fitted ``OneHotEncoder``.
    """
    from sklearn.preprocessing import OneHotEncoder

    def _fitted_encoder(column):
        # Fit on a one-column DataFrame (double brackets keep it 2-D).
        encoder = OneHotEncoder(handle_unknown='ignore')
        encoder.fit(df[[column]])
        return encoder

    return {column: _fitted_encoder(column) for column in categorical_feature_names}

In [None]:
# Fit one encoder per categorical column.
# NOTE(review): the encoders are fitted on the full feature frame, i.e. before
# any train/validation split — confirm this is intended.
encoders = create_encoder_dict(X, ['Geography', 'Gender'])

# Encode the whole feature frame and preview the first rows only, rather than
# rendering the entire encoded frame.
X_enc_one_hot = data_encode_one_hot(X, encoders)
X_enc_one_hot.head()

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# One-hot encode both splits (training first, then validation) with the same encoders.
x_train_one_hot, x_val_one_hot = (
    data_encode_one_hot(split, encoders) for split in (x_train, x_val)
)

## Train a model using the one-hot encoded inputs

In [None]:
# Fit a logistic regression on the one-hot encoded training split.
# NOTE(review): the features are passed unscaled — confirm the solver converges.
sklearn_model = LogisticRegression(random_state=1300).fit(x_train_one_hot, y_train)
# Show the fitted estimator as the cell output.
sklearn_model

In [None]:
# Score the model on the held-out split and print per-class precision/recall/F1.
val_predictions = sklearn_model.predict(x_val_one_hot)
print(classification_report(y_val, val_predictions))

In [None]:
# Display names for the two classes.
# NOTE(review): presumably index 0 = "Retained" and 1 = "Exited" — confirm against the label column.
class_names = ["Retained", "Exited"]
# Raw (pre-encoding) feature column names, in the order they appear in X.
feature_names = list(X.columns)

# UNBOX

In [None]:
import os

import unboxapi
from unboxapi.tasks import TaskType
from unboxapi.models import ModelType

# Prefer a key supplied through the UNBOX_API_KEY environment variable so the
# real secret never has to be saved inside the notebook. The placeholder is
# kept as the fallback, so the cell behaves as before when the variable is unset.
client = unboxapi.UnboxClient(os.environ.get("UNBOX_API_KEY", "YOUR_API_KEY_HERE"))

## Create predict function

In [None]:
def predict_proba(model, input_features: np.ndarray, col_names, one_hot_encoder, encoders):
    """Return class probabilities for raw (not yet one-hot encoded) rows.

    Args:
        model: Object exposing ``predict_proba``.
        input_features: 2-D array of raw feature rows.
        col_names: Column names for ``input_features``, in column order.
        one_hot_encoder: Callable ``(df, encoders) -> DataFrame`` that performs
            the one-hot encoding.
        encoders: Per-feature encoders handed through to ``one_hot_encoder``.

    Returns:
        Whatever ``model.predict_proba`` returns for the encoded rows.
    """
    # Re-attach column names so the encoder can look features up by name.
    raw_frame = pd.DataFrame(data=input_features, columns=col_names)
    model_inputs = one_hot_encoder(raw_frame, encoders).to_numpy()
    return model.predict_proba(model_inputs)

In [None]:
# Smoke-test the predict function on the first three raw validation rows.
sample_rows = x_val.iloc[:3][feature_names].to_numpy()
predict_proba(sklearn_model, sample_rows, feature_names, data_encode_one_hot, encoders)

In [None]:
# Attach the ground-truth labels to the raw (not one-hot encoded) splits for Unbox.
# `assign` builds new frames instead of writing into the frames returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
x_val = x_val.assign(churn=y_val.values)
x_train = x_train.assign(churn=y_train.values)

In [None]:
# First run: create the project. On subsequent runs, comment out the
# create_project call and uncomment the load_project call below instead.
project = client.create_project(
    name="Banking Churn Project",
    description="Project for predicting Banking Churn"
)
# project = client.load_project(
#     name="Banking Churn Project"
# )

# Upload the validation split; its 'churn' column holds the ground-truth labels.
dataset = project.add_dataframe(
    df=x_val,
    class_names=class_names,
    label_column_name='churn',
    name="Churn Validation",
    description='this is my churn dataset',
    task_type=TaskType.TabularClassification,
    feature_names=feature_names,
    categorical_feature_names=["Gender", "Geography"],
)

In [None]:
# First 3000 rows of the labelled training split, sent along with the model.
train_sample = x_train.iloc[:3000]

# Keyword arguments whose names match predict_proba's extra parameters.
# NOTE(review): presumably Unbox forwards these to `function` — confirm against the Unbox docs.
predict_kwargs = {
    "col_names": feature_names,
    "one_hot_encoder": data_encode_one_hot,
    "encoders": encoders,
}

# Register the trained classifier together with its predict function.
model = project.add_model(
    name='Churn Classifier',
    description='this is my churn classification model',
    task_type=TaskType.TabularClassification,
    model_type=ModelType.sklearn,
    function=predict_proba,
    model=sklearn_model,
    class_names=class_names,
    feature_names=feature_names,
    categorical_feature_names=["Gender", "Geography"],
    train_sample_df=train_sample,
    train_sample_label_column_name='churn',
    **predict_kwargs
)
model.to_dict()