In [None]:
import os
import sys
import numpy as np
import pandas as pd
import datarobot as dr
import re
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
from mpl_toolkits.mplot3d import Axes3D
# import seaborn as sns

# Specify your working directory: the relative paths used later in the notebook
# (the input CSV and the temporary scoring files) are resolved against it.
os.chdir("/Users/..")

# Seed NumPy's global random generator so the row sampling below is repeatable.
np.random.seed(12345)

# Placeholders: ids of the DataRobot project and model the plot is built for.
PROJECT_ID = 'project_id' #required
MODEL_ID = 'model_id' #required

# Placeholders for the DataRobot credentials; avoid saving real values in a shared notebook.
USERNAME = 'your_username' #required
API_TOKEN = 'your_api_token' #required

# Host of the prediction server; '/api' is appended to it when the batch_scoring command is built.
PREDICTION_SERVER_URL = 'https://app.datarobot.com' #required
PREDICTION_SERVER_KEY = '' #optional datarobot_key for dedicated servers, not needed for shared prediction server

In [None]:
# Specify the features for plotting the PDP, the target and the input data file
nSample = 1000  # number of rows sampled from the input data
feature1 = "PrimaryDiagnosisType"  # first feature of the two-way partial dependence
feature2 = 'DisablingEventType'  # second feature of the two-way partial dependence
target = "claimduration"  # column that receives the predictions and is averaged for the plot
inputFile = "trainingdata.csv"  # CSV file read into rawData

In [None]:
# Set up the DataRobot client with the API token, then fetch the project and the
# model by the ids configured at the top of the notebook.
# NOTE(review): the API endpoint is hard-coded here and is independent of
# PREDICTION_SERVER_URL — confirm it matches your installation.
dr.Client(endpoint='https://app.datarobot.com/api/v2', token=API_TOKEN)
project = dr.Project.get(PROJECT_ID)
model = dr.Model.get(project=PROJECT_ID,
                     model_id=MODEL_ID)

In [None]:
# slightly reduces memory consumption; columns whose values do not fit into 32 bits are left untouched
def dataset_reduce_memory(data):
    '''
    Downcast 64-bit numeric columns to 32 bits where this cannot overflow.

    float64 columns become float32 (precision is reduced) and int64 columns
    become int32. A column is only converted when all of its values fit into
    the 32-bit range; otherwise it keeps its 64-bit dtype instead of being
    silently wrapped around (integers) or turned into inf (floats).

    Attributes
    ----------
    data : pd.DataFrame object
        dataset to shrink; it is modified in place

    Returns
    -------
    data : pd.DataFrame object
        the same DataFrame object, with the eligible columns downcast
    '''
    int32_range = np.iinfo(np.int32)
    float32_max = np.finfo(np.float32).max

    for c in data.select_dtypes(include=['float64']).columns:
        values = data[c].values
        # NaN and +/-inf survive the cast unchanged, so only finite values decide.
        finite = values[np.isfinite(values)]
        if finite.size == 0 or np.abs(finite).max() <= float32_max:
            data[c] = data[c].astype(np.float32)

    for c in data.select_dtypes(include=['int64']).columns:
        column = data[c]
        if column.empty or (column.min() >= int32_range.min and column.max() <= int32_range.max):
            data[c] = column.astype(np.int32)

    return data

In [None]:
# Read in the training data from inputFile, then downcast its numeric columns
# with dataset_reduce_memory to lower memory consumption.
rawData = pd.read_csv(inputFile, encoding='utf-8', engine='c')
rawData = dataset_reduce_memory(rawData)

In [None]:
# create an artificial dataset: a random sample of rows that will be expanded over the feature grid
np.random.seed(12345)
# sample() raises when asked for more rows than the data holds, so cap the request at the row count
small = rawData.sample(min(nSample, len(rawData))).reset_index()
# reset_index() keeps the original row labels in an 'index' column; it is dropped later, from the expanded frame
small.shape

In [None]:
def define_class_types(data, feature1, feature2, n=50):
    '''
    Determine the data types of the two chosen features and generate the values each of them takes on the grid.

    A numeric feature gets n evenly spaced values between its 1st and 99th percentile
    (NaNs are ignored), which prunes outliers. A categorical (object) feature gets its
    unique values, of which at most 50 are allowed.

    Attributes
    ----------
    data : pd.DataFrame object
        input dataset; both the data types and the values are taken from it
    feature1 : str
        name of the feature1
    feature2 : str
        name of the feature2
    n : int
        number of values of numeric features to be drawn from np.linspace

    Returns
    -------
    v1 : np.array or list
        generated values of first feature (a list when exactly one of the features is categorical and it is this one)
    v2 : np.array or list
        generated values of second feature (same convention as v1)
    dtype1 : str
        datatype of the first feature, 'numeric' or 'object'
    dtype2 : str
        datatype of the second feature, 'numeric' or 'object'

    Raises
    ------
    AssertionError
        if a categorical feature has more than 50 levels, or a feature is neither numeric nor object
    '''
    max_levels = 50

    def _feature_kind(feature):
        # Drop the bit width ('int32' -> 'int') so every integer/float size is recognised.
        kind = re.sub(r'\d+', '', str(data[feature].dtype))
        # Change it if other data types are in your dataset
        return 'numeric' if kind in ['int', 'float', 'double'] else kind

    def _numeric_grid(feature):
        # n evenly spaced points between the 1st and the 99th percentile
        values = data[feature].values
        return np.linspace(np.nanpercentile(values, 1), np.nanpercentile(values, 99), n, endpoint=True)

    def _levels(feature, label):
        # unique values of a categorical feature
        levels = list(set(data[feature].values))
        assert len(levels) <= max_levels, "Too many levels in categorical " + label
        return levels

    dtype1 = _feature_kind(feature1)
    dtype2 = _feature_kind(feature2)

    print(dtype1, dtype2)
    if dtype1 == 'object' and dtype2 == 'numeric':
        v1 = _levels(feature1, 'feature1')
        v2 = _numeric_grid(feature2)

    elif dtype1 == 'numeric' and dtype2 == 'object':
        v2 = _levels(feature2, 'feature2')
        v1 = _numeric_grid(feature1)

    elif dtype1 == 'numeric' and dtype2 == 'numeric':
        v1 = _numeric_grid(feature1)
        v2 = _numeric_grid(feature2)

    else: #'object' & 'object'
        assert dtype1 == dtype2 == 'object', "Check data types"
        v1 = np.array(_levels(feature1, 'feature1'))
        v2 = np.array(_levels(feature2, 'feature2'))

    return v1, v2, dtype1, dtype2

In [None]:
# Create a grid of possible feature values: one row per (feature1 value, feature2 value)
# pair, with the feature1 value in column 0 and the feature2 value in column 1.
# NOTE(review): both grids are stacked into a single array, so when one feature is
# categorical and the other numeric they share one dtype — confirm the numeric
# values are still handled as numbers downstream.
v1, v2, dtype1, dtype2 = define_class_types(rawData, feature1, feature2)
newValues = np.array(np.meshgrid(v1, v2)).reshape(2, len(v1)*len(v2)).T #equivalent to expand.grid in R

In [None]:
# Build the frame to score: every sampled row is repeated once per grid point and
# the two features of interest are overwritten with the grid values.
n_rows = small.shape[0]
row_labels = np.repeat(np.arange(0, n_rows), len(newValues))
artificial = small.loc[row_labels,]
# Each row occupies one consecutive run of len(newValues) copies, so tiling the
# whole grid n_rows times lines every grid point up with every sampled row.
for column, grid_column in ((feature1, 0), (feature2, 1)):
    artificial[column] = list(newValues[:, grid_column]) * n_rows
# The 'index' column comes from the reset_index() applied to the sample.
artificial.drop(['index'], axis=1,inplace=True)
artificial.shape

In [None]:
# Old version of prediction function
# # NB! THIS TAKES TIME.
# # function to upload dataset, predict and retrieve predictions from app.datarobot.com 
# def predict_artificial(artificial):
#     dataset = project.upload_dataset(artificial)
#     print('Dataset is uploaded')
#     predict_job = model.request_predictions(dataset.id)
#     predictions = predict_job.get_result_when_complete()
#     print('Predictions are ready and retrieved') 
#     #TODO found out if time to predict N rows could be retrieved (as Model Info shows) to precalculate waiting time  
#     assert artificial.shape[0] == predictions.shape[0], 'Something is wrong here'
#     artificial[target] = predictions['prediction'].values
#     # dataset.delete()#optional, cleaning
#     return artificial

# # artificial = predict_artificial(artificial)

In [None]:
def predict_batch_scoring(dataset, shared_server=True, filepath_in='temp_scoring.csv', filepath_out='temp_preds.csv', n_samples=10000, n_concurrent=5):
    '''
    A function for prediction in batches using the batch_scoring command line tool of DataRobot

    The dataset is written to filepath_in, scored by an external batch_scoring
    process, and the predictions read back from filepath_out are stored in the
    target column of the dataset.

    Attributes
    ----------
    dataset : pd.DataFrame object
        artificially created dataset for Partial Dependence Plot; it is modified in place
    shared_server : bool
        Set to True when you want use a shared server, otherwise PREDICTION_SERVER_KEY must be specified and dedicated prediction servers are used
    filepath_in : str
        path to temporary input file
    filepath_out : str
        path to temporary output file; an existing file is removed first
    n_samples : int
        number of observation in dataset to predict in one batch
    n_concurrent : int
        number of concurrent prediction processes

    Returns
    -------
    dataset : pd.DataFrame object
        dataset with target column replaced by predictions

    Raises
    ------
    RuntimeError
        if the batch_scoring process exits with a non-zero status
    AssertionError
        if the number of predictions differs from the number of input rows
    '''
    import subprocess

    #remove output file if already exists
    if os.path.isfile(filepath_out):
        os.remove(filepath_out)

    dataset.to_csv(filepath_in, index=False, encoding='utf-8')
    print('Input dataset is stored in temp file\nBatch scoring is started')

    # The command is an argument list (no shell), so user names, keys and paths
    # containing spaces or shell metacharacters are passed through verbatim.
    # NOTE(review): the API token is still a command line argument and may be
    # visible to other local users while the process runs.
    cmd = [
        'batch_scoring',
        '--host={0}/api'.format(PREDICTION_SERVER_URL),
        '--user={0}'.format(USERNAME),
        '--out={0}'.format(filepath_out),
        str(PROJECT_ID),
        str(MODEL_ID),
        str(filepath_in),
    ]
    # modify --api_version if you need
    if shared_server:
        cmd += [
            '--api_token={0}'.format(API_TOKEN),
            '--datarobot_key={0}'.format(PREDICTION_SERVER_KEY),
            '--api_version=predApi/v1.0',
            '--timeout=600',
        ]
    else:
        cmd += [
            '--datarobot_key={0}'.format(PREDICTION_SERVER_KEY),
            '--timeout=600',
            '--verbose',
        ]
    cmd += [
        '--n_samples={0}'.format(n_samples),
        '--n_concurrent={0}'.format(n_concurrent),
        '--n_retry=5',
        '--no',
    ]

    exit_code = subprocess.call(cmd)
    # Fail here with the real cause instead of a confusing missing-file error below.
    if exit_code != 0:
        raise RuntimeError('batch_scoring failed with exit code {0}'.format(exit_code))
    print('Batch scoring is finished')

    preds = pd.read_csv(filepath_out, encoding='utf-8', engine='c')
    # Put the predictions back into input row order before they are attached positionally.
    preds = preds.sort_values('row_id')
    assert dataset.shape[0] == preds.shape[0], 'Input and output dataset mismatch in rows number'
    dataset[target] = preds[target].values

    return dataset

# NB! THIS TAKES TIME: the artificial dataset is written to disk and scored with the batch_scoring command.
artificial = predict_batch_scoring(artificial)

In [None]:
# compute an average prediction per feature1 and feature2 value pair
plot_data = artificial.groupby([feature1, feature2]).agg({target: "mean"}).reset_index()
# Restore a numeric dtype for every numeric feature — not only feature1, because the
# object/numeric case is plotted with feature2 on the numeric axis.
for _feature, _dtype in ((feature1, dtype1), (feature2, dtype2)):
    if _dtype == 'numeric':
        plot_data[_feature] = plot_data[_feature].astype(np.float32)
# Re-sort after the cast so numeric features are in numeric order, whatever order
# the grouping produced before the conversion.
plot_data = plot_data.sort_values([feature1, feature2]).reset_index(drop=True)

# plot_data_pivot = plot_data.pivot(index=feature1, columns=feature2, values=target)

In [None]:
def plot_pd_num_obj(plot_data, feature1=feature1, feature2=feature2):
    '''
    plot a numeric-object or object-numeric line chart

    One line is drawn per value of feature2, with feature1 on the x axis and
    the averaged target on the y axis.

    Attributes
    ----------
    plot_data : pd.DataFrame object
        averaged predictions with the columns feature1, feature2 and target
    feature1 : str
        name of the feature shown on the x axis
    feature2 : str
        name of the feature that defines the lines

    Returns
    -------
    Line chart shown inline
    '''
    # The seaborn style sheets were renamed in newer matplotlib releases; use
    # whichever name this installation provides.
    for style_name in ('seaborn-v0_8-notebook', 'seaborn-notebook'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    palette = plt.get_cmap('Set1')

    fig, ax = plt.subplots(figsize=(16,16))
    for num, (label, df) in enumerate(plot_data.groupby(feature2), start=1):
        # Draw each line from left to right regardless of the incoming row order.
        df = df.sort_values(feature1)
        # Cycle through the palette instead of running past its last colour.
        ax.plot(df[feature1], df[target], marker='', color=palette(num % palette.N), linewidth=1, alpha=0.9, label=label)

    # Only rotate the tick labels: replacing their text with the raw data values
    # would attach unrelated values to the automatically placed ticks.
    plt.xticks(rotation=45)
    # Add legend
    plt.legend(loc=2, ncol=1, title = feature2, fontsize = 10)#'best'

    # Add titles
    plt.title(("Partial Dependence Plot Of {} \nOn {} And {}").format(target, feature1, feature2), loc='center', fontsize=16, fontweight=0, color='darkblue')
    plt.xlabel(feature1)
    plt.ylabel(target)

    plt.show()
    
    #Some simple and usefull stuff:
    #plot_data.set_index(feature1).sort_index().groupby(feature2)[target].plot(style='--o', legend=True)
    #sns.pointplot(x=feature1, y=target, hue=feature2, data=plot_data)
    
    # print('Convenience plot with ``partial_dependence_plots``')
    # XX, YY = np.meshgrid(np.array(plot_data_pivot.index), np.array(plot_data_pivot.columns))
    # Z = np.array(plot_data_pivot.T)
    # plt.contour(XX, YY, Z, colors='black');

# plot_pd_num_obj(plot_data)

In [None]:
def plot_pd_obj(plot_data):
    '''
    plot a heatmap of object features

    The averaged target is pivoted so that the values of feature1 form the rows
    (y axis) and the values of feature2 form the columns (x axis).

    Attributes
    ----------
    plot_data : pd.DataFrame object
        averaged predictions with the columns feature1, feature2 and target

    Returns
    -------
    Heatmap shown inline
    '''
    plot_data = plot_data.pivot(index=feature1, columns=feature2, values=target)

    # A single figure: an additional plt.figure() would only leave an empty one behind.
    fig, ax = plt.subplots(1,1, figsize=(16,16))
    heatmap = ax.imshow(plot_data, cmap='BuPu')

    ax.set_xticks(np.arange(len(plot_data.columns)))
    ax.set_yticks(np.arange(len(plot_data.index)))

    ax.set_xticklabels(plot_data.columns, rotation=45)
    ax.set_yticklabels(plot_data.index)

    ax.set_title(("Partial Dependence Of {} \nOn Categorical Variables {} And {}").format(target, feature1, feature2))
    # The pivot columns (feature2) run along x and the pivot index (feature1) along y.
    ax.set_xlabel(feature2)
    ax.set_ylabel(feature1)

    plt.colorbar(heatmap)
    plt.show()
    
# plot_pd_obj(plot_data)

In [None]:
# plot a 3d scatterplot
def plot_pd_numerics(plot_data):
    '''
    plot a 3d surface of numeric features

    feature1 is shown on the x axis, feature2 on the y axis and the averaged
    target on the z axis.

    Attributes
    ----------
    plot_data : pd.DataFrame object
        averaged predictions with the columns feature1, feature2 and target

    Returns
    -------
    3d surface plot shown inline
    '''
    plot_data_pivot = plot_data.pivot(index=feature1, columns=feature2, values=target)

    fig = plt.figure(figsize=(16,16))

    XX, YY = np.meshgrid(np.array(plot_data_pivot.index), np.array(plot_data_pivot.columns))
    # meshgrid returns arrays of shape (len(columns), len(index)), so the pivot,
    # which has feature1 in its rows, must be transposed to line up with XX and YY.
    Z = np.array(plot_data_pivot.T)
    # Create the 3d axes through the figure so that they are attached to it.
    ax = fig.add_subplot(111, projection='3d')
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                           cmap=plt.cm.BuPu, edgecolor='k')
    ax.set_xlabel(feature1)
    ax.set_ylabel(feature2)
    ax.set_zlabel('Partial dependence')
    #  pretty init view
    ax.view_init(elev=22, azim=122)
    fig.colorbar(surf, ax=ax)
    plt.suptitle('Partial Dependence Plot Of {} \nOn average {} And {}'.format(target, feature1, feature2))
    plt.subplots_adjust(top=0.9)

    plt.show()

# plot_pd_numerics(plot_data)

In [None]:
def plot_partial_dependence(plot_data, dtype1, dtype2):
    '''
    Pick the plotting routine that fits the pair of feature data types

    Attributes
    ----------
    plot_data : pd.DataFrame object
        dataset for Partial Dependence Plot with target column
    dtype1 : str
        datatype of the first feature
    dtype2 : str
        datatype of the second feature

    Returns
    -------
    Partial dependence plot shown inline; nothing is drawn for any other pair of data types
    '''
    kinds = (dtype1, dtype2)

    if kinds == ('numeric', 'object'):
        plot_pd_num_obj(plot_data)
    elif kinds == ('object', 'numeric'):
        # Same line chart with the features swapped, so the numeric one is passed first.
        plot_pd_num_obj(plot_data, feature2, feature1)
    elif kinds == ('numeric', 'numeric'):
        plot_pd_numerics(plot_data)
    elif kinds == ('object', 'object'):
        plot_pd_obj(plot_data)

# Draw the partial dependence plot that matches the detected data types of the two features.
plot_partial_dependence(plot_data, dtype1, dtype2)