In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier,VotingClassifier,BaggingClassifier

from sklearn.metrics import classification_report

In [None]:
# Load the transactions CSV (path is relative to the working directory) into a DataFrame.
data = pd.read_csv("fraudTrain.csv")

In [None]:
# Relevant columns
feature_names = ['amt', 'cc_num', 'merchant', 'category', 'state', 'job']
label = ['is_fraud']

# Outputs
class_names = ["normal", "fraudulent"]

# Number of leading rows of the raw data used throughout this notebook.
N_ROWS = 50000

# Keep the first N_ROWS rows, restricted to the feature and label columns.
clean_raw_data = data.iloc[:N_ROWS][feature_names + label]

In [None]:
# Separate the features from the label column. `columns=` replaces the
# positional axis argument (`drop('is_fraud', 1)`), which pandas 2.x rejects.
X = clean_raw_data.drop(columns='is_fraud')
y = clean_raw_data['is_fraud']

## Prepare the data for training and for Unbox

In [None]:
# Feature columns that get one-hot encoded before training.
categorical_features = [
    'cc_num',
    'merchant',
    'category',
    'state',
    'job',
]

In [None]:
# Map each categorical feature to the list of its distinct values in X,
# with every value converted to a string.
categorical_map = dict(
    (name, list(X[name].unique().astype(str)))
    for name in categorical_features
)

In [None]:
def data_encode_one_hot(df, encoders):
    """Encode categorical features using one-hot encoding.

    Args:
        df: DataFrame containing every column named in ``encoders``.
            It is not modified.
        encoders: Mapping of column name -> fitted encoder exposing
            ``transform`` and ``get_feature_names_out`` (or the legacy
            ``get_feature_names`` on older scikit-learn releases).

    Returns:
        A new DataFrame with a fresh 0..n-1 index. The columns that are not
        encoded come first (in their original order), followed by the one-hot
        columns produced by each encoder, in the mapping's iteration order.
    """
    # The encoded frames below are built with a default RangeIndex and concat
    # aligns on index, so the input must share that index (NaNs otherwise).
    df = df.reset_index(drop=True)
    enc_dfs = []
    for feature, enc in encoders.items():
        encoded = enc.transform(df[[feature]])
        # Encoders may return a sparse matrix or an already-dense array.
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()
        # `get_feature_names` was removed in scikit-learn 1.2; prefer its
        # replacement and only fall back for old installations.
        if hasattr(enc, "get_feature_names_out"):
            columns = enc.get_feature_names_out([feature])
        else:
            columns = enc.get_feature_names([feature])
        enc_dfs.append(pd.DataFrame(encoded, columns=columns))
    combined = pd.concat([df] + enc_dfs, axis=1)
    # The raw categorical columns are now redundant with their encodings.
    return combined.drop(columns=list(encoders.keys()))

In [None]:
def create_encoder_dict(df, categorical_feature_names):
    """Build one fitted one-hot encoder per categorical feature.

    Each encoder is fitted on the single-column frame ``df[[feature]]`` and is
    configured with ``handle_unknown='error'``. The returned dict maps feature
    name -> fitted encoder; the predict function needs these encoders.
    """
    from sklearn.preprocessing import OneHotEncoder

    # `fit` returns the encoder itself, so it can be stored directly.
    return {
        name: OneHotEncoder(handle_unknown='error').fit(df[[name]])
        for name in categorical_feature_names
    }

In [None]:
# Fit the encoders on the full, un-split X: they are created with
# handle_unknown='error', so every category that can show up in either split
# has to be known to them.
encoders = create_encoder_dict(X, categorical_features)

In [None]:
# Hold out 20% of the rows for validation (seeded split), then one-hot encode
# both splits with the same fitted encoders.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
x_train_one_hot, x_val_one_hot = (
    data_encode_one_hot(split, encoders) for split in (x_train, x_val)
)

## Train a model using the one hot inputs

In [None]:
# Gradient-boosted classifier with a pinned random_state, fitted on the
# one-hot encoded training split.
sklearn_model = GradientBoostingClassifier(random_state=1300)
sklearn_model.fit(x_train_one_hot, y_train)

In [None]:
# Score the model's predictions on the validation split against the true labels.
print(
    classification_report(
        y_true=y_val,
        y_pred=sklearn_model.predict(x_val_one_hot),
    )
)

# UNBOX

In [None]:
import os

import unboxapi
from unboxapi.tasks import TaskType
from unboxapi.models import ModelType

# Prefer a key supplied through the UNBOX_API_KEY environment variable so the
# secret never has to be saved inside the notebook; without it, the original
# placeholder is used and must be replaced by hand.
client = unboxapi.UnboxClient(os.environ.get("UNBOX_API_KEY", "YOUR_API_KEY_HERE"))

## Create predict function

In [None]:
def predict_proba(model, input_features: np.ndarray, col_names, one_hot_encoder, encoders):
    """Return ``model.predict_proba`` for raw (un-encoded) input rows.

    The raw ``input_features`` array is labelled with ``col_names``, passed
    through ``one_hot_encoder(frame, encoders)``, and the encoded result is
    handed to the model as a NumPy array.
    """
    raw_frame = pd.DataFrame(data=input_features, columns=col_names)
    model_inputs = one_hot_encoder(raw_frame, encoders).to_numpy()
    return model.predict_proba(model_inputs)

In [None]:
# Smoke-test the predict function on the first 10 validation rows (raw features only).
predict_proba(
    model=sklearn_model,
    input_features=x_val[feature_names].head(10).to_numpy(),
    col_names=feature_names,
    one_hot_encoder=data_encode_one_hot,
    encoders=encoders,
)

In [None]:
# Add the ground truths to the raw (un-encoded) splits for Unbox.
# `assign` builds new frames instead of writing a column into the
# train_test_split outputs in place, which avoids SettingWithCopyWarning.
x_val = x_val.assign(is_fraud=y_val.values)
x_train = x_train.assign(is_fraud=y_train.values)

In [None]:
from unboxapi.tasks import TaskType

# Upload 1,000 validation rows. The sample is seeded so that re-running the
# notebook uploads the same rows every time.
dataset = client.add_dataframe(
    df=x_val.sample(1000, random_state=0),
    class_names=class_names,
    label_column_name='is_fraud',
    name="Fraud detection",
    description='this is my fraud dataset',
    task_type=TaskType.TabularClassification,
    feature_names=feature_names,
    categorical_features_map=categorical_map,
)
dataset.to_dict()

In [None]:
# Register the trained classifier together with its predict function.
model = client.add_model(
    function=predict_proba, 
    model=sklearn_model,
    model_type=ModelType.sklearn,
    task_type=TaskType.TabularClassification,
    class_names=class_names,
    name='Fraud detection',
    description='this is my fraud classification model',
    feature_names=feature_names,
    train_sample_df=x_train,
    train_sample_label_column_name='is_fraud',
    categorical_features_map=categorical_map,
    # The three keywords below match predict_proba's extra parameters;
    # presumably Unbox forwards them to `function` at prediction time —
    # TODO confirm against the unboxapi documentation.
    col_names=feature_names,
    one_hot_encoder=data_encode_one_hot,
    encoders=encoders,
)
model.to_dict()