<a href="https://colab.research.google.com/github/arnu123/TurboMLExploration/blob/main/quickstart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TurboML Exploration

In [None]:
!pip install -q turboml-installer
import turboml_installer ; turboml_installer.install_on_colab()

In [None]:
import turboml as tb

# Authenticate this session against the TurboML backend. Both values are
# redacted placeholders here and must be replaced with real credentials.
tb.init(backend_url="********", api_key="********")

In [None]:
!pip install onnx==1.14.1 scikit-learn skl2onnx

In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.helpers.onnx_helper import select_model_inputs_outputs
import matplotlib.pyplot as plt

### Dealing with the data

In [5]:
# Load bank.csv (expected in the working directory) into a DataFrame.
bank_df = pd.read_csv("bank.csv")

In [None]:
# Preview the first rows to sanity-check the load.
bank_df.head()

In [None]:
# Show the dtype of every column.
bank_df.dtypes

In [None]:
# Print every column name followed by the distinct values it contains.
for column_name, column_values in bank_df.items():
    print(column_name, column_values.unique())

#### The dataset has no pre-defined primary key, so we use the DataFrame's built-in index as the primary key.

In [9]:
# Copy the DataFrame's index into an explicit column; it is used as the key
# field when the frames are uploaded as datasets later on.
bank_df['primary_key'] = bank_df.index

In [None]:
# Inspect the distinct values present in the pdays column.
bank_df['pdays'].unique()

##### A `pdays` value of -1 marks a client who has never been contacted before. We drop these rows from the dataset.


In [11]:
# Keep only the rows whose pdays value is not the -1 sentinel.
bank_df = bank_df.loc[bank_df['pdays'].ne(-1)]

In [12]:
# Split the target off into its own frame, keyed by primary_key. The explicit
# copy() makes labels_df an independent frame, so editing its 'deposit'
# column afterwards does not trigger pandas' SettingWithCopyWarning.
labels_df = bank_df[['primary_key', 'deposit']].copy()
# The feature frame keeps every column except the target.
bank_df = bank_df.drop(columns='deposit')

In [13]:
# Encode the target as integers: 'yes' -> 1, any other value -> 0.
# The comparison is vectorised, and assign() returns a new frame instead of
# writing in place into a frame that was sliced out of bank_df (which is what
# triggers SettingWithCopyWarning).
# NOTE: re-running this cell maps every value to 0, because the column no
# longer contains 'yes' after the first run.
labels_df = labels_df.assign(
    deposit=labels_df['deposit'].eq('yes').astype('int64')
)

In [None]:
# Re-attach the encoded labels to the features on primary_key; how="right"
# keeps every row of labels_df.
joined_df = bank_df.merge(labels_df, how="right", on="primary_key")
joined_df

In [15]:
# Features are every column except the target and the synthetic row id.
# primary_key is only a copy of the DataFrame index, not a client attribute,
# so leaving it in would let the classifier key on row order.
X = joined_df.drop(columns=["deposit", "primary_key"])
y = joined_df["deposit"]

In [16]:
# One-hot encode the categorical columns of X; numeric columns pass through
# unchanged.
X_encoded = pd.get_dummies(X)

In [17]:
# Replace the '.' and '-' characters in the three job dummy-column names
# that contain them.
X_encoded.rename(
    columns={
        'job_admin.': 'job_admin',
        'job_blue-collar': 'job_blue_collar',
        'job_self-employed': 'job_self_employed',
    },
    inplace=True,
)

In [18]:
# Attempt the same kind of column-name clean-up on bank_df.
# NOTE(review): bank_df is never one-hot encoded in this notebook, so these
# labels presumably match no column and rename() leaves the frame unchanged;
# confirm and consider removing this cell.
bank_df.rename(
    columns={
        'job_admin.': 'job_admin',
        'job_blue-collar': 'job_bluecollar',
    },
    inplace=True,
)


In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.25, random_state=42
)

# Seed the forest as well: without it every run grows different trees, so the
# reported accuracy and the exported model are not reproducible even though
# the split is.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [27]:
# Describe the model input as a float tensor with a dynamic first dimension
# and one column per training feature.
feature_count = X_train.shape[1]
initial_type = [("float_input", FloatTensorType([None, feature_count]))]

# Convert the fitted forest to ONNX with the zipmap option disabled for this
# classifier type.
conversion_options = {type(clf): {"zipmap": False}}
onx = convert_sklearn(clf, initial_types=initial_type, options=conversion_options)

# Keep only the "probabilities" output of the converted graph.
onx = select_model_inputs_outputs(onx, outputs=["probabilities"])

# TurboML Data Ingestion and Model Building

In [None]:
# Register the feature frame and the label frame as TurboML online datasets,
# both keyed on primary_key.
# load_if_exists=True presumably reuses a dataset already registered under
# the same id instead of failing — confirm against the TurboML docs.
bank = tb.OnlineDataset.from_pd(
    df=bank_df, id="details7", key_field="primary_key", load_if_exists=True
)
labels = tb.OnlineDataset.from_pd(
    df=labels_df, id="details_labels5", key_field="primary_key", load_if_exists=True
)

In [None]:
# Display the local feature view of the bank dataset.
bank.feature_engineering.get_local_features()

In [31]:
# Columns of the bank dataset passed to the models as numerical inputs.
numerical_fields = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous',
]

# Model inputs come from the bank dataset; the label comes from the labels
# dataset's "deposit" field.
label = labels.get_model_labels(label_field="deposit")
features = bank.get_model_inputs(numerical_fields=numerical_fields)

In [32]:
# Upload the serialized ONNX graph under a save name, then build a TurboML
# ONNX model object that refers to it by that same name.
onnx_model_name = "randomforest"
tb.set_onnx_model(onnx_model_name, onx.SerializeToString())
onnx_model = tb.ONNX(model_save_name=onnx_model_name)

In [34]:
# Deploy the ONNX model under the name "onnx_model1", with `features` as its
# input and `label` as its labels.
# NOTE(review): the ONNX graph was converted with an input width of
# X_train.shape[1] (the one-hot encoded training frame), whereas `features`
# is built from only the seven numerical_fields — confirm that the deployed
# inputs match what the model expects.
deployed_model = onnx_model.deploy("onnx_model1", input=features, labels=label)

In [35]:
# Register the WindowedAUC metric on the deployed ONNX model.
deployed_model.add_metric("WindowedAUC")

### Checking TurboML inbuilt algorithms

In [None]:
# List the algorithms TurboML reports for have_labels=False (presumably the
# ones that do not need labels — confirm against the TurboML docs).
tb.ml_algorithms(have_labels=False)

Let's use the RandomCutForest (RCF) algorithm.

In [38]:
# Configure a RandomCutForest model with 50 trees.
model = tb.RCF(number_of_trees=50)

In [40]:
# Deploy the RCF model as "explore_unsup1", using the same inputs and labels
# as the ONNX deployment.
deployed_model_rcf = model.deploy(name="explore_unsup1", input=features, labels=label)

In [None]:
# Fetch the outputs produced so far by the RCF deployment.
outputs = deployed_model_rcf.get_outputs()

In [None]:
# Display the last entry of the fetched outputs.
# NOTE(review): raises IndexError if `outputs` is empty — presumably possible
# right after deployment; confirm and re-fetch if needed.
sample_output = outputs[-1]
sample_output

In [None]:
import matplotlib.pyplot as plt

# Collect the score of every output record, then plot the scores in order.
anomaly_scores = [entry["record"].score for entry in outputs]
plt.plot(anomaly_scores)

In [None]:
# Fetch and display the endpoints exposed by the RCF deployment.
model_endpoints = deployed_model_rcf.get_endpoints()
model_endpoints

In [None]:
# Use the last row of bank_df, converted to a plain dict, as a sample request
# payload.
model_query_datapoint = bank_df.iloc[-1].to_dict()
model_query_datapoint

In [47]:
import requests

# POST the sample record to the first model endpoint. requests applies no
# timeout by default, so one is set explicitly to keep the cell from hanging
# on an unresponsive endpoint. HTTP error statuses are raised right away
# instead of surfacing later as a confusing JSON decoding failure.
resp = requests.post(
    model_endpoints[0],
    json=model_query_datapoint,
    headers=tb.common.api.headers,
    timeout=30,
)
resp.raise_for_status()

In [None]:
# Decode and display the JSON body of the endpoint's response.
resp.json()

### Batch Inference on our Model

In [None]:
# Run the RCF deployment over the whole of bank_df in one call and display
# the result. This rebinds `outputs`, replacing the value fetched earlier
# with get_outputs().
outputs = deployed_model_rcf.get_inference(bank_df)
outputs

In [50]:
# Register the WindowedAUC metric on the RCF deployment.
deployed_model_rcf.add_metric("WindowedAUC")

In [None]:
# Fetch the WindowedAUC evaluations of the RCF deployment with a window size
# of 200, then display the last entry.
model_auc_scores = deployed_model_rcf.get_evaluation("WindowedAUC", window_size=200)
model_auc_scores[-1]

### Let's try out Supervised Learning

In [None]:
# List the algorithms TurboML reports for have_labels=True (presumably the
# ones that require labels — confirm against the TurboML docs).
tb.ml_algorithms(have_labels=True)

In [68]:
# Configure a Hoeffding tree classifier for two classes. This rebinds
# `model`, which previously held the RCF configuration.
model = tb.HoeffdingTreeClassifier(n_classes=2)

In [70]:
# Deploy the classifier under the name "explore", with `features` as its
# input and `label` as its labels.
deployed_model_htc = model.deploy("explore", input=features, labels=label)

We can now inspect the outputs.

In [None]:
# Fetch the outputs produced so far by the classifier deployment (rebinds
# `outputs`).
outputs = deployed_model_htc.get_outputs()

In [None]:
# Show how many outputs were fetched.
len(outputs)

In [None]:
# Display the last entry of the fetched outputs.
# NOTE(review): raises IndexError if `outputs` is empty — check the length
# shown by the previous cell first.
sample_output = outputs[-1]
sample_output

In [None]:
# Fetch and display the endpoints exposed by the classifier deployment
# (rebinds `model_endpoints`).
model_endpoints = deployed_model_htc.get_endpoints()
model_endpoints

In [None]:
# POST the sample record to the classifier's first endpoint. requests applies
# no timeout by default, so one is set explicitly to keep the cell from
# hanging on an unresponsive endpoint. HTTP error statuses are raised before
# the body is decoded, so a failed call is reported as such rather than as a
# JSON decoding error.
resp = requests.post(
    model_endpoints[0],
    json=model_query_datapoint,
    headers=tb.common.api.headers,
    timeout=30,
)
resp.raise_for_status()
resp.json()

In [61]:
# Register the WindowedAUC metric on the classifier deployment.
deployed_model_htc.add_metric("WindowedAUC")

In [None]:
# Fetch the WindowedAUC evaluations of the classifier deployment and display
# the last entry.
model_auc_scores = deployed_model_htc.get_evaluation("WindowedAUC")
model_auc_scores[-1]