### Performance testing results (Multiple External Prediction Models)
- TREX released External Prediction Insights (1 model only) as GA in 7.3.
- TREX is working on a mini project to support more than one external model (the upper limit is still undecided: 25, 50, 100, or more).
- The following code is meant to be used as notes/reference material for the performance testing work.

Using the following mbtest-dataset:
```
https://s3.amazonaws.com/datarobot_public_datasets/phone_gender_wide_test_20.csv
```

Dataset properties:
- RangeIndex: `14929 entries`, `0 to 14928`
- Columns: `4124 entries`, `device_id to hier_3_944`
- dtypes: `float64(2547)`, `int64(1572)`, `object(5)`

In [52]:
# Imports
import copy
import json
import os
import time

import datarobot as dr
import pandas as pd
import requests

In [4]:
# NOTE: downloading and parsing this CSV can take a couple of minutes.
mbtest_dataset_url = "https://s3.amazonaws.com/datarobot_public_datasets/phone_gender_wide_test_20.csv"
df = pd.read_csv(mbtest_dataset_url)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [15]:
# Dataset changes for easier preprocessing.
# `sample` and `rename` each return a new DataFrame, so `df` stays untouched
# without deep-copying the full frame first.
df2 = (
    df
    # Just take a couple thousand rows sample (seeded so re-runs pick the same rows)
    .sample(n=2000, random_state=1234)
    # Rename columns (pick a column with 2 levels to be used as UserCV partition column)
    .rename(columns={'hier_1_1': 'partition_column'})
)

In [13]:
# Using datarobot to create External Predictions configured project
# First, setting up advanced options and partitioning settings
# advanced_options = dr.AdvancedOptions(
#     external_predictions=#asdf,
# )

# project.set_target(
#     target='gender',
#     metric='LogLoss',
#     mode=dr.AUTOPILOT_MODE.QUICK,
#     partition=#asdf,
#     advanced_options=#asdf
# )

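# NOTE: `external_predictions` is not available in the public API client (`datarobot` package) yet,
# so the project is configured through the REST API with `requests` instead.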

In [23]:
# Write the sampled dataset to the current user's Downloads folder.
# The home directory is resolved at runtime instead of hardcoding one user's absolute path.
df2.to_csv(
    os.path.join(os.path.expanduser("~"), "Downloads", "phone_gender_wide_ext_preds_perf.csv"),
    index=False,
    header=True,
)

In [39]:
# Keep only the columns of df2 whose name matches the pattern below.
# NOTE(review): DataFrame.filter applies the regex with re.search, so this keeps any
# column whose name contains "X", not only names that start with it — confirm intent.
external_pred_name_pattern = 'X.*'
df3 = df2.filter(regex=external_pred_name_pattern, axis='columns')

In [40]:
# Print a concise summary of df2: index, column count, dtypes and memory usage
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 7635 to 238
Columns: 4124 entries, device_id to hier_3_944
dtypes: float64(2547), int64(1572), object(5)
memory usage: 62.9+ MB


In [41]:
# Print a concise summary of df3 (the regex-filtered columns of df2):
# index, column count, dtypes and memory usage
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 7635 to 238
Columns: 2189 entries, X.1017948566093528960 to X80s
dtypes: float64(2111), int64(78)
memory usage: 33.4 MB


In [76]:
# All df3 column names, with every '.' replaced by '_'
external_pred_columns_full = [column_name.replace('.', '_') for column_name in df3.columns]

# Three candidate list sizes, each taken from the front of the full list.
# NOTE(review): the last size is 101 rather than 100 — presumably to go one past a
# 100-model limit; confirm this is intentional.
current_max_list, bigger_max_list, ultra_max_list = (
    external_pred_columns_full[:size] for size in (25, 50, 101)
)

print(f"Public preview max allowed as of 7.3: {len(current_max_list)}")

# TODO, the following is the important part
print(f"Bigger max: {len(bigger_max_list)}")

print(f"Ultra max: {len(ultra_max_list)}")

Public preview max allowed as of 7.3: 25
Bigger max: 50
Ultra max: 101


In [85]:
# Credentials (LOCAL)
# The API key is read from the environment so that no secret is stored in the notebook.
# Export DATAROBOT_API_TOKEN before starting the kernel; a missing variable raises KeyError.
# NOTE(review): a key was previously hardcoded in this cell — rotate it if it is still valid.
API_KEY = os.environ["DATAROBOT_API_TOKEN"]
api_endpoint = "http://localhost/api/v2"
headers = {"Authorization": f"Bearer {API_KEY}", "content-type": "application/json"}

# phone_gender_wide_ext_preds_perf.csv dataset ID in AI Catalog
dataset_id = "61a7d5e791d21db1186ed0ef"

# Create a project from the AI Catalog dataset
project_endpoint = f"{api_endpoint}/projects/"
project_payload = {"datasetId": dataset_id}

project = requests.post(
    url=project_endpoint, data=json.dumps(project_payload), headers=headers
)
# Fail fast on a rejected request instead of carrying on with project_id = None
project.raise_for_status()
project_id = project.json().get("pid")

# Wait for EDA1 to complete (so that we have target name for Autopilot)
# Poll the URL from the `Location` response header once per second, at most 40 times.
for _ in range(40):
    time.sleep(1)
    project_status = requests.get(url=project.headers.get("Location"), headers=headers)
    if "projectName" in project_status.json():
        print("EDA1 is done.")
        break
else:
    # The loop ran out without a break: do not start Autopilot on an unfinished project
    raise TimeoutError("EDA1 did not complete within 40 seconds")

# AUTOPILOT START (QUICK)
autopilot_endpoint = f"{api_endpoint}/projects/{project_id}/aim/"
autopilot_payload = {
    "target": "gender",
    "mode": "quick",
    # User-defined TVH partitioning driven by the renamed partition column
    "cvMethod": "user",
    "validationType": "TVH",
    "userPartitionCol": "partition_column",
    # presumably the two values found in partition_column — TODO confirm
    "trainingLevel": 0,
    "validationLevel": 1,
    # Column names to register as external prediction models
    "externalPredictions": ultra_max_list,
}
print(json.dumps(autopilot_payload))
start_autopilot = requests.patch(
    url=autopilot_endpoint, data=json.dumps(autopilot_payload), headers=headers
)
# Show the response object and body so the outcome of the request is visible in the output
print(start_autopilot)
print(start_autopilot.text)

EDA1 is done.
{"target": "gender", "mode": "quick", "cvMethod": "user", "validationType": "TVH", "userPartitionCol": "partition_column", "trainingLevel": 0, "validationLevel": 1, "externalPredictions": ["X_1017948566093528960", "X_1020265322706106880", "X_102121250986068720", "X_1023242980262397056", "X_1025376319773210240", "X_1030677981150664064", "X_1043843386262755968", "X_1047273806018410496", "X_1049178431836388480", "X_1051230690008472576", "X_1051986413094956928", "X_1052652594976102656", "X_1056519382337269888", "X_1060787891591033088", "X_1065003475710886784", "X_1066222546835696768", "X_1068679832545653120", "X_1073344577746533120", "X_1101403352972671616", "X_112564492675318256", "X_1137288509384140416", "X_1167860292985101568", "X_1169833384053497856", "X_1186012730215562496", "X_1200607960388315136", "X_12138416858907556", "X_1221284816487722752", "X_1229355550180989696", "X_1233535283618068480", "X_1233889048809966080", "X_1234297064722937600", "X_1235212919783589888", "

In [83]:
# for feature in ultra_max_list:
#     print(f"- {feature}")