# Welcome to the Unbox baseline tutorial!

In [46]:
import unboxapi
from unboxapi.tasks import TaskType
from unboxapi.models import ModelType

In [47]:
import pandas as pd
import numpy as np
import json
import pickle

from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold
from sklearn.preprocessing import StandardScaler  
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier,BaggingClassifier

from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

In [48]:
client = unboxapi.UnboxClient("867d5a09-b923-4d99-8287-1ad97bb42105")

## 1. Loading the dataset

First, we load the dataset and separate the features (which will be the model's inputs) from the labels (which will be the model's trained output)

In [49]:
df = pd.read_csv("quick2.csv")

In [50]:
categorical_features = [
    'region_emisor'
]

In [51]:
feature_names = [x for x in df.columns.values.tolist() if x != 'y']

In [52]:
def get_categorical_map(df):
    categorical_map = {}
    for col in categorical_features:
        uniques = sorted([str(c) for c in df.where(pd.notnull(df), None)[col].unique()])
        if 'None' in uniques:
            none_index = uniques.index('None')
            uniques[none_index] = None
        categorical_map[col] = list(uniques)
    return categorical_map

In [53]:
categorical_map = get_categorical_map(df)

## 2. Building Model

In [54]:
def data_encode_one_hot(df, encoders):
    """ Encodes categorical features using one-hot encoding. """
    df = df.copy(True)
    df.reset_index(drop=True, inplace=True) # Causes NaNs otherwise
    for feature, enc in encoders.items():
        enc_df = pd.DataFrame(enc.transform(df[[feature]]).toarray(), columns=enc.get_feature_names([feature]))
        df = df.join(enc_df)
        df = df.drop(columns=feature)
    return df

In [55]:
def create_encoder_dict(df, categorical_feature_names):
    """ Creates encoders for each of the categorical features. 
        The predict function will need these encoders. 
    """
    from sklearn.preprocessing import OneHotEncoder
    encoders = {}
    for feature in categorical_feature_names:
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(df[[feature]])
        encoders[feature] = enc
    return encoders

In [56]:
features_to_use = [
 'score',
 'amount',
 'liquidBPC',
 'ranking',
 'region_emisor',
 'ventas_promedio_12',
 'ventas_promedio_6',
 'ventas_promedio_3',
 'total_amount_60',
 'total_amount_30',
 'total_amount_90',
 'total_amount_15'
                     ]

In [57]:
X = df[features_to_use]
y = df['y']

In [58]:
encoders = create_encoder_dict(X, categorical_features)

X_enc_one_hot = data_encode_one_hot(X, encoders)
X_enc_one_hot

Unnamed: 0,score,amount,liquidBPC,ranking,ventas_promedio_12,ventas_promedio_6,ventas_promedio_3,total_amount_60,total_amount_30,total_amount_90,...,region_emisor_REGION DE TARAPACA,region_emisor_REGION DE ÑUBLE,region_emisor_REGION DEL BIO BIO,region_emisor_REGION DEL LIBERTADOR GENERAL BERNARDO O'HIGGINS,region_emisor_REGION DEL MAULE,region_emisor_REGION LOS LAGOS,region_emisor_REGION METROPOLITANA,region_emisor_REGION VALPARAISO,region_emisor_Sin Información,region_emisor_nan
0,417.0,25983840.0,0.0,,4.583395e+08,3.814219e+08,3.834096e+08,5.297243e+08,3.055204e+08,8.729310e+08,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,581.0,1419518.0,0.0,24519.0,6.740295e+08,6.464448e+08,7.091587e+08,2.708540e+08,7.767852e+07,6.288013e+08,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,761.0,9210600.0,0.0,517.0,4.192225e+08,4.192225e+08,4.192225e+08,1.598318e+08,-4.445598e+07,1.411578e+08,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,437.0,61807.0,0.0,41602.0,4.686007e+09,4.686007e+09,4.686007e+09,6.454345e+09,3.636652e+09,9.278563e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,711.0,393176.0,1.0,89.0,1.327916e+07,1.273301e+07,1.483080e+07,2.254847e+07,1.146816e+07,3.432226e+07,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,198.0,308000.0,0.0,,1.303656e+07,1.239564e+07,1.302847e+07,3.063703e+06,-2.421776e+06,7.677494e+06,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
99996,290.0,41978.0,0.0,,3.090435e+09,3.090435e+09,3.038506e+09,5.827221e+09,2.935532e+09,8.306038e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
99997,724.0,3498600.0,1.0,78.0,5.394490e+07,6.303408e+07,6.702118e+07,2.705995e+07,-1.089905e+07,2.011945e+07,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,497.0,387357.0,0.0,8557.0,3.897325e+08,4.761054e+08,5.217737e+08,3.790930e+07,-4.679972e+07,-6.260750e+07,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [59]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)
x_train_one_hot = data_encode_one_hot(x_train, encoders)
x_val_one_hot = data_encode_one_hot(x_val, encoders)

In [60]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(x_train_one_hot)

In [61]:
x_train_imp = imp.transform(x_train_one_hot)
x_val_imp = imp.transform(x_val_one_hot)

### Train

In [62]:
sklearn_model = LogisticRegression(random_state=1300)
sklearn_model.fit(x_train_imp, y_train)

LogisticRegression(random_state=1300)

In [63]:
print(classification_report(y_val, sklearn_model.predict(x_val_imp)))

              precision    recall  f1-score   support

           0       0.63      0.61      0.62     10117
           1       0.61      0.63      0.62      9883

    accuracy                           0.62     20000
   macro avg       0.62      0.62      0.62     20000
weighted avg       0.62      0.62      0.62     20000



In [64]:
class_names = ["Yes", "No"]
feature_names = X.columns.values.tolist()

### Predict

In [65]:
def predict_proba(model, input_features: np.ndarray, col_names, one_hot_encoder, encoders, imp):
    """Convert the raw input_features into one-hot encoded features
    using our one hot encoder and each feature's encoder. """
    df = pd.DataFrame(input_features, columns=col_names)
    encoded_df = one_hot_encoder(df, encoders)
    df_imp = imp.transform(encoded_df)
    return model.predict_proba(df_imp)

In [66]:
examples_val = x_val[:10].where(pd.notnull(x_val), None)[feature_names].to_numpy()

In [67]:
examples_train = x_train[:10].where(pd.notnull(x_val), None)[feature_names].to_numpy()

In [68]:
predict_proba(sklearn_model, examples_val, feature_names, data_encode_one_hot, encoders, imp)

array([[0.49677758, 0.50322242],
       [0.88889159, 0.11110841],
       [0.49303997, 0.50696003],
       [0.2060205 , 0.7939795 ],
       [0.49888171, 0.50111829],
       [0.36366563, 0.63633437],
       [0.50217093, 0.49782907],
       [0.46580328, 0.53419672],
       [0.5265824 , 0.4734176 ],
       [0.49793375, 0.50206625]])

In [69]:
predict_proba(sklearn_model, examples_train, feature_names, data_encode_one_hot, encoders, imp)

array([[0.4868948, 0.5131052],
       [0.4868948, 0.5131052],
       [0.4868948, 0.5131052],
       [0.4868948, 0.5131052],
       [0.4868948, 0.5131052],
       [0.4868948, 0.5131052],
       [0.4868948, 0.5131052],
       [0.4868948, 0.5131052],
       [0.4868948, 0.5131052],
       [0.4868948, 0.5131052]])

In [70]:
# Add the ground truths to the ordinal dataset for Unbox
x_val['y'] = y_val.values
x_train['y'] = y_train.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_val['y'] = y_val.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train['y'] = y_train.values


## 3. Add to Unbox

In [72]:
# sample = x_val.where(pd.notnull(x_val), None)

In [44]:
dataset_metadata = client.add_dataframe(
    df=x_val[:10],
    class_names=['Yes', 'No'],
    label_column_name='y',
    name="Xepelin Data Mini",
    description='this is xepelin',
    task_type=TaskType.TabularClassification,
    feature_names=feature_names,
    categorical_features_map=categorical_map,
)
dataset_metadata.to_dict()

{'categoricalFeaturesMap': {},
 'classNameCounts': None,
 'classNames': ['Yes', 'No'],
 'columnNames': None,
 'columnSettings': None,
 'dateCreated': '2022-05-16T00:21:59.459923Z',
 'description': 'this is xepelin',
 'featureNames': ['score',
  'amount',
  'liquidBPC',
  'ranking',
  'region_emisor',
  'ventas_promedio_12',
  'ventas_promedio_6',
  'ventas_promedio_3',
  'total_amount_60',
  'total_amount_30',
  'total_amount_90',
  'total_amount_15'],
 'id': 11,
 'inProgressNotifications': ['dataset_11_save_dataset_rows'],
 'labelColumnIndex': None,
 'language': 'en',
 'modelCount': 0,
 'name': 'Xepelin Data Mini',
 'projects': [],
 'rowCount': 0,
 'tagCount': 0,
 'taskType': 'tabular-classification',
 'version': 0}

In [74]:
model_metadata = client.add_model(
    function=predict_proba, 
    model=sklearn_model,
    model_type=ModelType.sklearn,
    task_type=TaskType.TabularClassification,
    class_names=class_names,
    name='Xep Classifier Fix',
    description='this is my churn classification model',
    feature_names=feature_names,
    train_sample_df=x_train.sample(5000),
    train_sample_label_column_name='y',
    categorical_features_map=categorical_map,
    col_names=feature_names,
    one_hot_encoder=data_encode_one_hot,
    encoders=encoders,
    imp=imp
)
model_metadata.to_dict()

Bundling model and artifacts...
Uploading model to Unbox...


{'categoricalFeaturesMap': {},
 'classNames': ['Yes', 'No'],
 'datasetCount': 0,
 'dateCreated': '2022-05-16T00:27:09.562349Z',
 'description': 'this is my churn classification model',
 'featureNames': ['score',
  'amount',
  'liquidBPC',
  'ranking',
  'region_emisor',
  'ventas_promedio_12',
  'ventas_promedio_6',
  'ventas_promedio_3',
  'total_amount_60',
  'total_amount_30',
  'total_amount_90',
  'total_amount_15'],
 'featureSettings': None,
 'id': '9c91b49d-67ca-4a80-8d9e-bd717ccfd258',
 'inProgressNotifications': ['model_9c91b49d-67ca-4a80-8d9e-bd717ccfd258_create_endpoint'],
 'inferenceRunCount': 0,
 'modelApi': None,
 'name': 'Xep Classifier Fix',
 'projects': [],
 'runReportCount': 0,
 'taskType': 'tabular-classification',
 'type': 'sklearn'}