In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
from sklearn.preprocessing import StandardScaler
import time
import datarobot as dr
import os
from pprint import pprint

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Dict of source data files and relevant text columns
sources = {
    '10kDiabetes': {
        'target': 'readmitted',
        'text_columns': ['diag_1_desc', 'diag_2_desc', 'diag_3_desc'],
        'kmeans_columns' : ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications']
    },
    '10K_Lending_Club_Loans': {
        'target': 'is_bad',
        'text_columns': ['emp_title', 'desc', 'purpose', 'title'],
        'kmeans_columns' : ['loan_amnt', 'funded_amnt', 'annual_inc', 'dti', 'open_acc', 'revol_bal']
    }
}

# Designate the source you want to use here:
source_used = '10kDiabetes'

# Read the data file
file_path = 'data/'
file_name = source_used
file_extn = '.csv'
df = pd.read_csv(file_path + file_name + file_extn, encoding = "ISO-8859-1")

# Source target
target = sources.get(source_used).get('target')

# Source text columns
text_columns = sources.get(source_used).get('text_columns')
kmeans_columns = sources.get(source_used).get('kmeans_columns')

In [None]:
# Standardize the input features
scaler = StandardScaler()
df_standardized = pd.DataFrame(scaler.fit_transform( df[kmeans_columns] ), columns=kmeans_columns)
    
# create new plot and data
X = df_standardized  # Should we use standardized inputs data to Kmeans?  Yes.
 
# k means determine k
distortions = []
kmax = 20
K = range(1,kmax)
for k in K:
    t1 = time.time()
    kmeanModel = KMeans(n_clusters=k).fit(X)
    kmeanModel.fit(X)
    distortions.append(sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
    print("Processed kmeans for k=%d of %d - Time: %0.3fs." % (k, kmax, (time.time() - t1)), end='\r', flush=True)
 
# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Centroid Distance Distortion')
plt.title('The elbow for optimal k')
plt.show()

Processed kmeans for k=18 of 20 - Time: 2.440s.

In [None]:
optimal_k = 7

kmeans = KMeans(n_clusters=optimal_k).fit(X)
cluster_dist = kmeans.transform(X)

# Save the cluster label to the dataframe
df['cluster_number'] = kmeans.labels_

# Save the confidence of the cluster, using closeness of distance to the centroid 
# relative to the other observation distances to the centroid
cl_df = pd.DataFrame(cluster_dist)
d = pd.DataFrame()
for i in range(optimal_k):
    d[i] = pd.qcut(cl_df[i], 10, labels=range(1,11))
df['cluster_confidence'] = [d.loc[i][label] for i, label in enumerate(kmeans.labels_)]

In [None]:
df.head()

In [None]:
print(cl_df.head())
cl_df[5].hist(bins=100)

In [None]:
fig, axs = plt.subplots(1,2, figsize=(15,3))
df.cluster_number.hist(bins=optimal_k, ax=axs[0])
df.cluster_confidence.hist(bins=10, ax=axs[1])
print('kmeans cluster value counts:\n',df.cluster_number.value_counts())

In [None]:
# Write the dataframe to csv
file_with_topics = file_path + file_name + '_with_kmeans_clusters' + file_extn
df.to_csv(file_with_topics)

# Create a new DataRobot project with the new dataset with cluster features

In [148]:
dr.Client(token=os.environ['DATAROBOT_API_TOKEN'], endpoint=os.environ['DATAROBOT_ENDPOINT'])

<datarobot.rest.RESTClientObject at 0x1a0d3dda58>

In [149]:
# # Create a project: and upload the data, set the target, give the project a name
project = dr.Project.create(sourcedata=file_with_topics, 
                            project_name="{} with kmeans clustering".format(source_used))

print('project id:  ', project.id)
print('project name:', project.project_name)
print('project.get_status():\n- ', project.get_status())

project id:   5b0498db6cd83a5985c870c5
project name: 10kDiabetes with kmeans clustering
project.get_status():
-  {'autopilot_done': False, 'stage_description': 'Ready to set target', 'stage': 'aim'}


In [150]:
# Output the identified project features# Outpu 
features = project.get_features()
sorted([f.name for f in features])

['A1Cresult',
 'Unnamed: 0',
 'acarbose',
 'acetohexamide',
 'admission_source_id',
 'admission_type_id',
 'age',
 'change',
 'chlorpropamide',
 'citoglipton',
 'cluster_confidence',
 'cluster_number',
 'diabetesMed',
 'diag_1',
 'diag_1_desc',
 'diag_2',
 'diag_2_desc',
 'diag_3',
 'diag_3_desc',
 'discharge_disposition_id',
 'examide',
 'gender',
 'glimepiride',
 'glimepiride_pioglitazone',
 'glipizide',
 'glipizide_metformin',
 'glyburide',
 'glyburide_metformin',
 'insulin',
 'max_glu_serum',
 'medical_specialty',
 'metformin',
 'metformin_pioglitazone',
 'metformin_rosiglitazone',
 'miglitol',
 'nateglinide',
 'num_lab_procedures',
 'num_medications',
 'num_procedures',
 'number_diagnoses',
 'number_emergency',
 'number_inpatient',
 'number_outpatient',
 'payer_code',
 'pioglitazone',
 'race',
 'readmitted',
 'repaglinide',
 'rosiglitazone',
 'rowID',
 'time_in_hospital',
 'tolazamide',
 'tolbutamide',
 'troglitazone',
 'weight']

In [151]:
# Convert top topic to a categorical variable
new_feature = project.create_type_transform_feature(name='cluster_number_cat', 
                                      parent_name='cluster_number', 
                                      variable_type='categoricalInt')
new_feature

Feature(cluster_number_cat)

In [152]:
# By calling set_target, we run datarobot
target=target
project.set_target(target=target, 
                   mode='auto',
                   worker_count=20)

Project(10kDiabetes with kmeans clustering)

In [153]:
# Get the models, which are already ordered by rank from the leaderboard
models = project.get_models()

# Get the best performing model (excluding the blenders, which are typically the top 4 models)
best_model = models[4]
print('Best model from the leaderboard:\n\'%s\'' % best_model.model_type)
print()

# Get the blueprint
blueprint_id = best_model.blueprint_id
blueprint = dr.models.Blueprint.get(project.id, blueprint_id)
print('Best model blueprint preprocessing steps:')
pprint(blueprint.processes)
print()

# Get the model scoring metrics
print('Best model metrics:')
pprint(best_model.metrics)

Best model from the leaderboard:
'Regularized Logistic Regression (L2)'

Best model blueprint preprocessing steps:
['One-Hot Encoding',
 'Matrix of word-grams occurrences',
 'Missing Values Imputed',
 'Smooth Ridit Transform',
 'Standardize',
 'Regularized Logistic Regression (L2)']

Best model metrics:
{'AUC': {'backtesting': None,
         'backtestingScores': None,
         'crossValidation': None,
         'holdout': None,
         'validation': 0.69358},
 'FVE Binomial': {'backtesting': None,
                  'backtestingScores': None,
                  'crossValidation': None,
                  'holdout': None,
                  'validation': 0.08293},
 'Gini Norm': {'backtesting': None,
               'backtestingScores': None,
               'crossValidation': None,
               'holdout': None,
               'validation': 0.38716},
 'Kolmogorov-Smirnov': {'backtesting': None,
                        'backtestingScores': None,
                        'crossValidation': None