In [1]:
import pandas as pd
import numpy as np
import json
import pickle

from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold
from sklearn.preprocessing import StandardScaler  
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier,BaggingClassifier

from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv("Churn_Modelling.csv")

In [3]:
X = data.iloc[:, 3:-1]
y = data.iloc[:, -1]
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.00,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.80,3,1,0,113931.57
3,699,France,Female,39,1,0.00,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77
9997,709,France,Female,36,7,0.00,1,0,1,42085.58
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52


## Prepare the data for training and for Unbox

In [4]:
def data_encode_one_hot(df, encoders):
    """ Encodes categorical features using one-hot encoding. """
    df = df.copy(True)
    df.reset_index(drop=True, inplace=True) # Causes NaNs otherwise
    for feature, enc in encoders.items():
        print(f"Encoding {feature}")
        print(f"Sample raw values: {df[feature].iloc[:10]}")
        enc_column = enc.transform(df[feature].to_numpy().reshape(-1, 1)).toarray()
        print(f"Sample encoded features: {enc_column[:3, :]}")
        print(f"Joining the dfs")
        enc_df = pd.DataFrame(enc_column, columns=enc.get_feature_names_out())
        df = df.join(enc_df)
        df = df.drop(columns=feature)
        print(f"Successfully encoded {feature}")
        print("-------------------------------------------")
    return df

In [5]:
def create_encoder_dict(df, categorical_feature_names):
    """ Creates encoders for each of the categorical features. 
        The predict function will need these encoders. 
    """
    from sklearn.preprocessing import OneHotEncoder
    encoders = {}
    for feature in categorical_feature_names:
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(df[feature].to_numpy().reshape(-1, 1))
        encoders[feature] = enc
    return encoders

In [6]:
encoders = create_encoder_dict(X, ['Geography', 'Gender'])

In [7]:
data_encode_one_hot(X, encoders)

Encoding Geography
Sample raw values: 0     France
1      Spain
2     France
3     France
4      Spain
5      Spain
6     France
7    Germany
8     France
9     France
Name: Geography, dtype: object
Sample encoded features: [[1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]
Joining the dfs
Successfully encoded Geography
-------------------------------------------
Encoding Gender
Sample raw values: 0    Female
1    Female
2    Female
3    Female
4    Female
5      Male
6      Male
7    Female
8      Male
9      Male
Name: Gender, dtype: object
Sample encoded features: [[1. 0.]
 [1. 0.]
 [1. 0.]]
Joining the dfs
Successfully encoded Gender
-------------------------------------------


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,x0_France,x0_Germany,x0_Spain,x0_Female,x0_Male
0,619,42,2,0.00,1,1,1,101348.88,1.0,0.0,0.0,1.0,0.0
1,608,41,1,83807.86,1,0,1,112542.58,0.0,0.0,1.0,1.0,0.0
2,502,42,8,159660.80,3,1,0,113931.57,1.0,0.0,0.0,1.0,0.0
3,699,39,1,0.00,2,0,0,93826.63,1.0,0.0,0.0,1.0,0.0
4,850,43,2,125510.82,1,1,1,79084.10,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,39,5,0.00,2,1,0,96270.64,1.0,0.0,0.0,0.0,1.0
9996,516,35,10,57369.61,1,1,1,101699.77,1.0,0.0,0.0,0.0,1.0
9997,709,36,7,0.00,1,0,1,42085.58,1.0,0.0,0.0,1.0,0.0
9998,772,42,3,75075.31,2,1,0,92888.52,0.0,1.0,0.0,0.0,1.0


In [8]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)
x_train_one_hot = data_encode_one_hot(x_train, encoders)
x_val_one_hot = data_encode_one_hot(x_val, encoders)

Encoding Geography
Sample raw values: 0      Spain
1    Germany
2     France
3      Spain
4      Spain
5    Germany
6     France
7      Spain
8      Spain
9     France
Name: Geography, dtype: object
Sample encoded features: [[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]
Joining the dfs
Successfully encoded Geography
-------------------------------------------
Encoding Gender
Sample raw values: 0    Female
1      Male
2    Female
3      Male
4    Female
5    Female
6      Male
7      Male
8      Male
9    Female
Name: Gender, dtype: object
Sample encoded features: [[1. 0.]
 [0. 1.]
 [1. 0.]]
Joining the dfs
Successfully encoded Gender
-------------------------------------------
Encoding Geography
Sample raw values: 0    Germany
1     France
2      Spain
3     France
4    Germany
5      Spain
6      Spain
7      Spain
8     France
9     France
Name: Geography, dtype: object
Sample encoded features: [[0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]
Joining the dfs
Successfully encoded Geography
----------------

## Train a model using the one hot inputs

In [9]:
sklearn_model = LogisticRegression(random_state=1300)
sklearn_model.fit(x_train_one_hot, y_train)

In [10]:
print(classification_report(y_val, sklearn_model.predict(x_val_one_hot)))

              precision    recall  f1-score   support

           0       0.80      0.97      0.88      1595
           1       0.37      0.06      0.11       405

    accuracy                           0.79      2000
   macro avg       0.59      0.52      0.49      2000
weighted avg       0.72      0.79      0.72      2000



In [11]:
class_names = ["Retained", "Exited"]
feature_names = X.columns.values.tolist()

# UNBOX

In [12]:
import unboxapi
from unboxapi.tasks import TaskType
from unboxapi.models import ModelType
client = unboxapi.UnboxClient("8c14712a-2901-4e6d-a2c4-452ff3659726")

## Create predict function

In [13]:
def predict_proba(model, input_features: np.ndarray, col_names, one_hot_encoder, encoders):
    """Convert the raw input_features into one-hot encoded features
    using our one hot encoder and each feature's encoder. """
    df = pd.DataFrame(input_features, columns=col_names)
    encoded_df = one_hot_encoder(df, encoders)
    return model.predict_proba(encoded_df.to_numpy())

In [14]:
predict_proba(sklearn_model, x_val[:3][feature_names].to_numpy(), feature_names, data_encode_one_hot, encoders)

Encoding Geography
Sample raw values: 0    Germany
1     France
2      Spain
Name: Geography, dtype: object
Sample encoded features: [[0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]
Joining the dfs
Successfully encoded Geography
-------------------------------------------
Encoding Gender
Sample raw values: 0    Female
1    Female
2    Female
Name: Gender, dtype: object
Sample encoded features: [[1. 0.]
 [1. 0.]
 [1. 0.]]
Joining the dfs
Successfully encoded Gender
-------------------------------------------




array([[0.78264769, 0.21735231],
       [0.66502929, 0.33497071],
       [0.81455616, 0.18544384]])

In [15]:
# Add the ground truths to the ordinal dataset for Unbox
x_val['churn'] = y_val.values
x_train['churn'] = y_train.values

In [16]:
categorical_feature_names = ["Gender", "Geography"]

In [17]:
from unboxapi.tasks import TaskType

dataset = client.add_dataframe(
    df=x_val,
    class_names=class_names,
    label_column_name=1,
    name="Churn Validation",
    description='this is my churn dataset',
    task_type=TaskType.TabularClassification,
    feature_names=feature_names,
    categorical_feature_names=categorical_feature_names,
)
dataset.to_dict()

UnboxValidationError: <Response> There are issues with the data being passed as argument. 
- `label_column_name` not a valid string. 
Make sure to respect the datatypes and constraints specified above.

In [20]:
model = client.add_model(
    function=predict_proba, 
    model=sklearn_model,
    model_type=ModelType.sklearn,
    task_type=TaskType.TabularClassification,
    class_names=class_names,
    name='Churn Classifier 1',
    description='this is my churn classification model',
    feature_names=feature_names,
    train_sample_df=x_train[:3000],
    train_sample_label_column_name='churn',
    categorical_feature_names=categorical_feature_names,
    col_names=feature_names,
    one_hot_encoder=data_encode_one_hot,
    encoders=encoders,
    requirements_txt_file='requirements_test.txt',
    project_id=1
)


UnboxResourceError: <Response> There is an issue with the speficied `function`. 
Your function's additional args ('col_names', 'one_hot_encoder', 'encoders') do not match the kwargs you specifed ('col_names', 'one_hot_encoder', 'encoders', 'project_id'). 
Make sure to include all of the required kwargs to run inference with your `function`.

In [None]:
model

In [None]:
feature_names


In [None]:
categorical_features=[feature_names.index(f) for f in categorical_map]

In [None]:
categorical_features

In [None]:
categorical_map

In [None]:
[feature_names.index(f) for f in categorical_feature_names]

In [None]:
categorical_feature_names = ["Gender", "Geography"]

In [None]:
feature_to_ordinal_map = {
            feature: {
                index: value for index, value in enumerate(list(df[feature].unique()))
            }
            for feature in categorical_feature_names
        }

In [None]:
categorical_feature_names = ["Gender", "Geography"]
categorical_features_map = {
            feature: list(X[feature].astype(str).unique())
            for feature in categorical_feature_names
        }

In [None]:
categorical_features_map

In [None]:
category_names = {
            feature_names.index(k): v for k, v in categorical_features_map.items()
        }

In [None]:
category_names

In [None]:
# categorical -> ordinal
feature_to_ordinal_map = {
            feature: {value: index for index, value in enumerate(feature_values)}
            for feature, feature_values in self.categorical_features_map.items()
        }

# ordinal -> categorical
feature_to_ordinal_map = {
            feature: {index: value for index, value in enumerate(feature_values)}
            for feature, feature_values in self.categorical_features_map.items()
        }
