In [1]:
# from:  https://docs.google.com/document/d/1m-2j_1ntvmuSwaOvjgX3qXmYDbQWN5uEkPh9UQKKTJM/edit

In [2]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import os
import requests
import pickle

In [3]:
# 
# Get a Lending Club project so we can loop through its features by type
#

import datarobot as dr

# Connection settings. Credentials come from the environment so that no
# secret is stored in the notebook itself.
USERNAME = os.environ['DATAROBOT_USERNAME']
API_TOKEN = os.environ['DATAROBOT_API_TOKEN']
ENDPOINT = 'https://app.datarobot.com/api/v2'
DEPLOYMENT_ID = '5c19273c06eeed008a2ac7f9'
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout, requests.get can block forever

# Configure the datarobot client with the same token/endpoint used for the
# raw REST call below.
dr.Client(token=API_TOKEN, endpoint=ENDPOINT)

# Look up the deployment to discover which project and model it points at.
headers = {'Content-Type': 'application/json', 'Authorization': 'token %s' % API_TOKEN}
health_response = requests.get('%s/modelDeployments/%s/' % (ENDPOINT, DEPLOYMENT_ID),
                               headers=headers,
                               timeout=REQUEST_TIMEOUT_SECONDS)
if health_response.status_code == 200:
    deployment_data = health_response.json()
    PROJECT_ID = deployment_data['project']['id']
    MODEL_ID = deployment_data['model']['id']
else:
    # Manual fallback: fill these in when the deployment lookup is unavailable.
    PROJECT_ID = ''  # your project id
    MODEL_ID = ''  # your model id

# Fail here, with an actionable message, rather than inside dr.Project.get
# with an empty project id.
if not PROJECT_ID:
    raise RuntimeError(
        'Deployment lookup for %s returned HTTP %s and no fallback PROJECT_ID is set; '
        'fill in PROJECT_ID / MODEL_ID in the else-branch above.'
        % (DEPLOYMENT_ID, health_response.status_code))

project = dr.Project.get(project_id=PROJECT_ID)

# Column names bucketed by the feature type DataRobot reports for them.
numeric_features = []
categorical_features = []
text_features = []

# feats1 = project.get_features()
# Only the first feature list returned by the project is used.
fl = project.get_featurelists()
flr = fl[0]
feats = sorted(flr.features)
for feat in feats:
    # One API call per feature name to fetch its type.
    f = dr.Feature.get(PROJECT_ID, feat)
    if f.feature_type == 'Numeric':
        # 'is_bad' is the prediction target, so it must not be used as a predictor.
        if f.name != 'is_bad':
            numeric_features.append(f.name)
    elif f.feature_type == 'Categorical':
        categorical_features.append(f.name)
    elif f.feature_type == 'Text':
        text_features.append(f.name)
    # Features of any other type are ignored.

print('numeric_features: %s\n' % numeric_features)
print('categorical_features: %s\n' % categorical_features)
print('text_features: %s\n' % text_features)

numeric_features: ['annual_inc', 'delinq_2yrs', 'dti', 'funded_amnt', 'inq_last_6mths', 'installment', 'loan_amnt', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc']

categorical_features: ['addr_state', 'emp_length', 'grade', 'home_ownership', 'initial_list_status', 'policy_code', 'purpose', 'pymnt_plan', 'sub_grade', 'term', 'verification_status', 'zip_code']

text_features: ['desc', 'emp_title', 'title', 'url']



In [4]:
PATH = '.'
FILEPATH = 'DR_Demo_10K_Lending_Club_Loans_train.csv'
TARGET = 'is_bad'
SEED = 42  # pins the train/test split and SGD shuffling so re-runs give the same result


# Numeric columns: median-impute (adding missing-indicator columns), expand
# with polynomial terms, then standardize.
# NOTE(review): degree=0 makes PolynomialFeatures emit only the constant bias
# column, i.e. the numeric inputs are effectively switched off for that grid
# candidate -- confirm this on/off toggle is intended.
numeric_transformer = Pipeline(steps=[
         ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
         ('poly', PolynomialFeatures(degree=0, interaction_only=False)),
         ('scaler', StandardScaler())
            ])


# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
            ])


def make_tfidf_pipeline():
    """Return a fresh, unshared TF-IDF pipeline for a single text column.

    Each text column gets its own instance so that setting a parameter on
    one column's vectorizer can never alias the vectorizers of the others.
    """
    return Pipeline([('tfidf', TfidfVectorizer(max_features=300000))])


# Columns are addressed by position because the preceding imputer returns a
# plain array. Only positions 0-2 are vectorized; any further text column is
# dropped by ColumnTransformer's default remainder='drop'.
text_split = ColumnTransformer(
    transformers=[
        ('text_1', make_tfidf_pipeline(), 0),
        ('text_2', make_tfidf_pipeline(), 1),
        ('text_3', make_tfidf_pipeline(), 2)
    ])

# Text columns: replace missing documents with the literal 'missing' so the
# vectorizers never receive NaN, then vectorize each column separately.
text_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('text', text_split)
             ])


# Route each group of columns (by name) to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('text', text_transformer, text_features)
    ])


# L2-regularized logistic regression trained with SGD.
# NOTE(review): loss='log' is the spelling used by older scikit-learn releases;
# newer releases name it 'log_loss'. Adjust if the environment is upgraded.
full_pipe =  Pipeline([
                ('preprocess', preprocessor),
                ('clf', SGDClassifier(loss='log', penalty='l2', max_iter=1000, tol=0.0001,
                                      random_state=SEED))
            ])


# Search space: polynomial degree, n-gram range per text column, and the
# regularization strength alpha in {1e-1, ..., 1e-6}.
hyperparameters = {
                'preprocess__num__poly__degree':[0,1],
                'preprocess__text__text__text_1__tfidf__ngram_range':((1,1),(1,2)),
                'preprocess__text__text__text_2__tfidf__ngram_range':((1,1),(1,2),(1,3)),
                'preprocess__text__text__text_3__tfidf__ngram_range':((1,1),(1,2),(1,3)),
                'clf__alpha':10.0**-np.arange(1,7)
                   }


# NOTE(review): `iid` only exists in older scikit-learn releases; drop the
# argument if the environment is upgraded.
grid = GridSearchCV(full_pipe, 
                    param_grid=hyperparameters, 
                    cv=2, scoring='neg_log_loss', 
                    error_score='raise', 
                    verbose=False, 
                    iid=False)

# Load the training data and hold out a stratified 20% test split.
df = pd.read_csv(FILEPATH, encoding='latin-1')
X = df.drop(TARGET, axis=1)
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y,
                                                    random_state=SEED)

In [5]:
# print(X_train.columns)

# (Explicitly)
# text_features = ['purpose','title','desc']   
# numeric_features = list(X_train.select_dtypes(include='number').columns.values)
# categorical_features = list(set(X_train.columns) - set(numeric_features + text_features))

import time

# Run the grid search and report the wall-clock time in minutes.
t1 = time.time()
grid.fit(X_train, y_train)
print('- Time: %0.2f minutes' % ((time.time()-t1)/60))

# Persist the best pipeline (preprocessing + classifier) found by the search.
os.chdir(PATH)
clf = grid.best_estimator_
# Use a context manager so the file is flushed and closed even if pickling fails.
with open('custom_model.pickle', 'wb') as model_file:
    pickle.dump(clf, model_file)

- Time: 5.438 minutes
