In [1]:
import csv
import pandas as pd
import numpy as np
import datarobot as dr

In [2]:
# Connect to DataRobot using the client settings stored in a local YAML config file.
# NOTE(review): config_path is an absolute path on the original author's machine;
# point it at your own drconfig.yaml before running this cell.
dr.Client(config_path='/Users/vinay.wunnava/Documents/DRU R Class/drconfig.yaml')

<datarobot.rest.RESTClientObject at 0x120d0eeb8>

In [3]:
# Project ID (pid) and model ID (mid) of the model to convert into a scorecard.
# An existing Lending Club model is used as the example.
# Requirements stated by the original author:
#   - the model must have a rating table (GA2M)
#   - its blueprint must not contain any text-based features
pid = '5e13fa5e98cb9d14b39e9028'
mid = '5f1a8f98ecf25d105eaff6a8'
# Alternative model ID kept for reference:
# mid = '5e13fbcc243e8e7eea79d0bc'

In [15]:
def download_rating_table(pid, mid):
    """Download the rating table of model ``mid`` in project ``pid`` to a local CSV.

    The table is looked up among all rating tables of the project instead of
    going through ``dr.RatingTableModel.get``; per the original author's note
    that call does not work with frozen models.

    Parameters
    ----------
    pid : str
        DataRobot project ID.
    mid : str
        ID of the model whose rating table should be downloaded.

    Returns
    -------
    str
        Path of the written file: ``./my_rating_table_<mid>.csv``.

    Raises
    ------
    IndexError
        If none of the project's rating tables belongs to ``mid``.
    """
    project = dr.Project.get(pid)
    rating_table = next(
        (rt for rt in project.get_rating_tables() if rt.model_id == mid),
        None,
    )
    if rating_table is None:
        # IndexError is what the former ``[...][0]`` lookup raised, so existing
        # handlers keep working; the message now says what is actually missing.
        raise IndexError(
            'No rating table found for model {!r} in project {!r}'.format(mid, pid)
        )

    # Build the path once so the returned value always matches the written file.
    filepath = './my_rating_table_' + mid + '.csv'
    rating_table.download(filepath)
    return filepath

def csv_after_emptylines(filepath, bl_group_n=1, dtype=str):
    """Read the table that follows a block of blank lines in a CSV file.

    Rows are skipped until ``bl_group_n`` distinct runs of blank lines (each
    run may be any number of lines long) have been passed. The first non-blank
    row after the last of those runs becomes the header and every later
    non-blank row becomes a data row.

    Use this instead of ``pd.read_csv(filepath, skiprows=...)`` when the number
    of filled rows preceding the blank lines is variable or unknown.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    bl_group_n : int, default 1
        Number of distinct blank-line runs to pass before the header row.
    dtype : type, default str
        dtype handed to the ``pd.DataFrame`` constructor.

    Returns
    -------
    pd.DataFrame
        The rows after the header. If fewer than ``bl_group_n`` blank-line
        runs are followed by a non-blank row, the frame has no data rows.
    """
    header_row = None
    data_rows = []
    groups_passed = 0
    in_blank_run = False

    with open(filepath, newline='') as f:
        for row in csv.reader(f):
            if groups_passed >= bl_group_n:
                # Blank records (e.g. a trailing empty line) carry no data;
                # they would otherwise reach pandas as empty records.
                if row:
                    data_rows.append(row)
                continue

            if not row:
                in_blank_run = True
            elif in_blank_run:
                # First filled row after a blank run: it closes the run and is
                # the header candidate (the last one found wins).
                groups_passed += 1
                in_blank_run = False
                header_row = row
            # Filled rows that are not preceded by a blank run are ignored.

    return pd.DataFrame(data=data_rows, columns=header_row, dtype=dtype)

def csv_until_emptyline(filepath, dtype=str):
    """Read the leading rows of a CSV file, stopping at the first blank line.

    The very first row of the file is treated as a header line and is not
    included in the result; the returned frame keeps pandas' default integer
    column labels.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    dtype : type, default str
        dtype handed to the ``pd.DataFrame`` constructor (previously accepted
        but never applied).

    Returns
    -------
    pd.DataFrame
        One row per line between the first line and the first blank line.
        Empty if the file is empty, starts with a blank line, or has only
        one line before the first blank line.
    """
    leading_rows = []
    with open(filepath, newline='') as f:
        for row in csv.reader(f):
            if not row:
                break
            leading_rows.append(row)

    # Drop the first row: it is the header/title line, not data.
    return pd.DataFrame(data=leading_rows[1:], dtype=dtype)

def extract_intercept(filepath):
    """Extract the model intercept from the header section of a rating-table CSV.

    The header section is read with ``csv_until_emptyline`` (the lines before
    the first blank line). Each line is split at its FIRST colon into a name
    and a value, so lines whose value itself contains colons (timestamps,
    URLs, ...) no longer break the parsing. Surrounding whitespace is ignored.

    Parameters
    ----------
    filepath : str
        Path of the rating-table CSV file.

    Returns
    -------
    numeric
        Value of the ``Intercept`` line, converted with ``pd.to_numeric``.

    Raises
    ------
    IndexError
        If the header section is empty or contains no ``Intercept`` line
        (same exception type the previous positional lookups raised).
    """
    df = csv_until_emptyline(filepath)
    if df.shape[1] == 0:
        raise IndexError(
            'No header lines found before the first blank line in {!r}'.format(filepath)
        )

    # partition() always yields three columns (name, separator, value), even
    # when a line has no colon or more than one.
    parts = df.iloc[:, 0].str.partition(':')
    names = parts[0].str.strip()
    values = parts[2].str.strip()

    matches = values[names == 'Intercept']
    if matches.empty:
        raise IndexError("No 'Intercept' line found in {!r}".format(filepath))
    return pd.to_numeric(matches.iloc[0])

def invert_coefficients(intercept, rating_table):
    """Flip the sign of the intercept and of every coefficient.

    The model describes the log-odds of the risky outcome, so riskier profiles
    get higher values. Negating intercept and coefficients turns this into the
    log-odds of the non-risky outcome, ``-log(p / (1 - p)) = log((1 - p) / p)``,
    so that high-risk profiles end up with low scores.

    Parameters
    ----------
    intercept : float
        Model intercept.
    rating_table : pd.DataFrame
        Rating table with a ``'Coefficient'`` column convertible to float
        (strings are accepted). The input frame is left untouched.

    Returns
    -------
    tuple
        ``(negated_intercept, new_rating_table)`` where the new table's
        ``'Coefficient'`` column is float-typed and sign-flipped.
    """
    # assign() builds a new frame and replaces the whole column, so the result
    # is float-typed regardless of the input column's dtype and the caller's
    # frame is not modified.
    negated = rating_table.assign(
        Coefficient=-rating_table['Coefficient'].astype(float)
    )
    return -intercept, negated

def convert_rating_table_to_scores(intercept, rating_table, min_score=300, max_score=850):
    """Rescale rating-table coefficients to scorecard points.

    For every feature the smallest coefficient is subtracted from all of that
    feature's coefficients, so the worst bin of each feature contributes zero
    points. The shifted coefficients are then scaled linearly so that a profile
    hitting the best bin of every feature adds up to ``max_score - min_score``
    points on top of the baseline.

    The intercept (plus the per-feature minima) is a constant shared by every
    profile, so it cancels out of the min-max rescale: the baseline score is
    ``min_score`` by construction.

    Parameters
    ----------
    intercept : float
        Model intercept. Kept for interface compatibility; it does not
        influence the result (see above).
    rating_table : pd.DataFrame
        Must contain ``'Feature Name'`` and a numeric-convertible
        ``'Coefficient'`` column. The input frame is not modified.
    min_score, max_score : number
        Score assigned to the worst / best possible profile.

    Returns
    -------
    tuple
        ``(baseline_score, scorecard)``. ``scorecard`` is a copy of
        ``rating_table`` without the ``'Coefficient'`` column and with a new
        ``'Variable Score'`` column. If the coefficients have no spread at all
        (or the table is empty) every variable score is 0.
    """
    coefficients = rating_table['Coefficient'].astype(float)
    per_feature = coefficients.groupby(rating_table['Feature Name'], sort=False)

    feature_min = per_feature.min()
    for feat, min_feat_coef in feature_min.items():
        print('Minimum coefficient for feature ' + feat + ' ' + str(min_feat_coef))

    # Coefficient relative to the worst bin of the same feature (always >= 0).
    rel_coefficient = coefficients - per_feature.transform('min')

    # Distance, in coefficient units, between the best and worst possible profile.
    spread = (per_feature.max() - feature_min).sum()

    # Guard the degenerate case instead of dividing by zero and emitting NaN/inf.
    points_per_unit = (max_score - min_score) / spread if spread > 0 else 0.0

    scorecard = rating_table.drop(columns=['Coefficient', 'Rel_Coefficient'], errors='ignore')
    scorecard['Variable Score'] = rel_coefficient * points_per_unit

    baseline_score = float(min_score)
    return baseline_score, scorecard

def get_scorecard(pid,mid, min_score=300, max_score=850):
    """Build a scorecard for the rating table of model ``mid`` in project ``pid``.

    Steps: download the rating-table CSV, read its coefficient table and its
    intercept, flip their signs, then rescale the coefficients to the
    ``min_score``..``max_score`` range.

    Returns the pair ``(intercept_score, scorecard)`` produced by
    ``convert_rating_table_to_scores``.
    """
    csv_path = download_rating_table(pid, mid)

    # Both readers parse the same downloaded file: table first, then intercept.
    table_from_csv = csv_after_emptylines(csv_path)
    intercept_from_csv = extract_intercept(csv_path)

    flipped_intercept, flipped_table = invert_coefficients(intercept_from_csv, table_from_csv)

    base_points, card = convert_rating_table_to_scores(
        flipped_intercept, flipped_table, min_score, max_score
    )
    return base_points, card

In [16]:
# Build the scorecard for the example model on a 300-850 score range.
intercept_score, scorecard = get_scorecard(pid,mid,min_score=300,max_score=850)

Minimum coefficient for feature annual_inc -0.7971001550635846
Minimum coefficient for feature dti -0.18100027283235265
Minimum coefficient for feature inq_last_6mths -0.3096886255271891
Minimum coefficient for feature revol_util_percent -0.6561420159264604
Minimum coefficient for feature ( inq_last_6mths & revol_util_percent ) -0.2953854956969044


In [17]:
# Baseline score of the scorecard (first value returned by get_scorecard).
intercept_score

300.0

In [18]:
# Scorecard table (second value returned by get_scorecard); points are in the 'Variable Score' column.
scorecard

Unnamed: 0,Feature Name,Feature Strength,Type,Transform1,Value1,Transform2,Value2,Weight,Variable Score
0,annual_inc,0.300775009108807,NUM,Binning,"(-inf, 13700.0]",,,57.0,0.000000
1,annual_inc,0.300775009108807,NUM,Binning,"(13700.0, 20200.0]",,,159.0,47.420770
2,annual_inc,0.300775009108807,NUM,Binning,"(20200.0, 21576.0]",,,32.0,49.394504
3,annual_inc,0.300775009108807,NUM,Binning,"(21576.0, 22489.0]",,,33.0,47.922292
4,annual_inc,0.300775009108807,NUM,Binning,"(22489.0, 24584.0]",,,125.0,30.520414
...,...,...,...,...,...,...,...,...,...
346,( inq_last_6mths & revol_util_percent ),0.033830128907363956,2W-INT,Binning,"(6, inf)",Binning,"(89, 91]",1.0,45.210374
347,( inq_last_6mths & revol_util_percent ),0.033830128907363956,2W-INT,Binning,"(6, inf)",Binning,"(91, 92]",0.0,43.279859
348,( inq_last_6mths & revol_util_percent ),0.033830128907363956,2W-INT,Binning,"(6, inf)",Binning,"(92, 95]",2.0,30.759709
349,( inq_last_6mths & revol_util_percent ),0.033830128907363956,2W-INT,Binning,"(6, inf)",Binning,"(95, inf)",0.0,31.991202
