# Breast cancer diagnosis classification with scikit-learn (run model explainer locally)

Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

In [1]:
import numpy as np
import pandas as pd
from azureml.contrib.explain.model.tabular_explainer import TabularExplainer
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

  from numpy.core.umath_tests import inner1d


In [2]:
# Seed NumPy's global random number generator so code drawing from np.random is repeatable across runs.
np.random.seed(0)

## Load the breast cancer diagnosis data

In [4]:
# Load the bundled dataset object and keep its class names as a plain Python list
# (printed further down as ['malignant', 'benign']).
breast_cancer_data = load_breast_cancer()
classes = breast_cancer_data.target_names.tolist()

In [5]:
# Split data into train and test: hold out 20% of the samples for testing.
# random_state pins the shuffle so the same split is produced on every run.
# (train_test_split is imported with the other dependencies in the first cell.)
x_train, x_test, y_train, y_test = train_test_split(
    breast_cancer_data.data,
    breast_cancer_data.target,
    test_size=0.2,
    random_state=0,
)

## Looking through the data

In [6]:
#type(classes)
print(classes)
type(breast_cancer_data.data)
len(breast_cancer_data.data)
# 0 means malignant, 1 means benign

['malignant', 'benign']


569

In [7]:
# Report the number of rows in the train and test splits, one per line.
print(len(x_train), len(x_test), sep='\n')

455
114


In [6]:
# Show the feature names; they are reused as the DataFrame columns and as the explainer's feature labels.
breast_cancer_data.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [7]:
# Wrap the raw feature matrix in a DataFrame labelled with the feature names for easier inspection.
df_data = pd.DataFrame(
    breast_cancer_data.data,
    columns=breast_cancer_data.feature_names,
)

In [8]:
# Print the frame's (rows, columns) and preview its first rows; head() defaults to 5 rows.
print(df_data.shape)
df_data.head()

(569, 30)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Train an SVM classification model

In [8]:
# probability=True enables the predict_proba call used for scoring later in the notebook.
# random_state pins the estimator's internal randomness so the results are repeatable.
clf = svm.SVC(gamma=0.001, C=100., probability=True, random_state=1)
# fit() returns the fitted estimator itself, so `model` and `clf` refer to the same object.
model = clf.fit(x_train, y_train)

## Model Prediction for scoring

In [9]:
# Note: in a real deployment the data to score should be read from the SQL DB, or be passed in
# through the web service endpoint for real-time prediction; here the full dataset is scored.
class_probabilities = model.predict_proba(breast_cancer_data.data)
df_score = pd.DataFrame(class_probabilities)

In [10]:
# predict_proba orders its columns by class label (0, 1), which is the same order as `classes`
# (0 means malignant, 1 means benign), so reuse that list instead of retyping the names.
df_score.columns = classes

In [11]:
# Peek at the first five rows of predicted class probabilities.
df_score.head(5)

Unnamed: 0,malignant,benign
0,0.956997,0.043003
1,0.939592,0.060408
2,0.956991,0.043009
3,0.957003,0.042997
4,0.956958,0.043042


## Explain predictions on your local machine

In [12]:
# Build the explainer from the fitted model and the training split; the feature and class
# names are passed so explanations can be reported by name rather than by index.
# NOTE(review): x_train presumably serves as the explainer's initialization/background data —
# confirm against the TabularExplainer documentation.
tabular_explainer = TabularExplainer(model, x_train, features=breast_cancer_data.feature_names, classes=classes)

In [13]:
# Explain the model globally, using the test split as the evaluation examples.
# The evaluation set must be a representative sample of the original data.
# x_train could be passed instead: more examples may give more accurate explanations,
# but they take longer to compute.
global_explanation = tabular_explainer.explain_global(x_test)

  "l1_reg=\"auto\" is deprecated and in the next version (v0.29) the behavior will change from a " \


In [14]:
# Ranked global feature importance (SHAP-based): the importance values and the feature names,
# each returned already sorted by the explanation object (the printed values are in descending order).

sorted_global_importance_values = global_explanation.get_ranked_global_values()
sorted_global_importance_names = global_explanation.get_ranked_global_names()

print('sorted global importance values: {}'.format(sorted_global_importance_values))
print('sorted global importance names: {}'.format(sorted_global_importance_names))

sorted global importance values: [0.10875811915882352, 0.09203181071474759, 0.07866940303093073, 0.0656819829365329, 0.04061036625801283, 0.012476449610209533, 0.0018493468332682768, 0.001596457915688524, 0.0013247221957655188, 0.0011143976880106615, 0.0010703399048983225, 0.0009717442193310397, 0.000941304502499635, 0.0008830518309048856, 0.0008195917694136031, 0.0007803679235024125, 0.0007662750645640624, 0.0007504720334971652, 0.0007471899279297904, 0.0007123224555021126, 0.0006594468078831501, 0.000627458271743918, 0.0005783500635133474, 0.0005610056424596978, 0.0005371266305942543, 0.0005293339985769795, 0.0005136546090399484, 0.0004954132843831048, 0.00041870083922159644, 0.00040211948435181255]
sorted global importance names: ['worst area', 'worst perimeter', 'mean area', 'mean perimeter', 'area error', 'worst texture', 'mean texture', 'worst radius', 'mean radius', 'worst concavity', 'texture error', 'worst compactness', 'smoothness error', 'mean fractal dimension', 'mean compa

## Explain overall model predictions as a collection of local (instance-level) explanations
### Use this in a batch pipeline to export the local explanations in bulk

In [15]:
# Unsorted local (per-instance) SHAP values for every feature, kept in the original feature order.
# They cover the evaluation rows passed to explain_global (x_test), not the training data.
# NOTE: this prints the complete nested list, so the output is very long.
print('unsorted local importance values: {}'.format(global_explanation.local_importance_values))

unsorted local importance values: [[[0.0, 0.0, -0.013832050005734571, 0.059900884916938665, 0.0, 0.010305792768700299, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.06142941511104568, -0.007075120691136617, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03349103468491672, 0.10033757192985843, -0.09418849906520636, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, -0.04973943327246713, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.020021316705363675, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.038045771050367036, -0.08282167327692234, -0.21667075679338757, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, -0.026950839927089687, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.02756076596889917, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.019661596107197865, -0.07297571173936512, -0.18362810165612886, 0.0, -0.0063034188888589915, 0.0, 0.0, 0.0, 0.0], [0.005144157658907786, 0.0038887974549992183, 0.043455778442571674, 0.16332825011818766, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.02361372639909029, 0.0, 0.0, 0.0, 0.0, 0.0,




In [16]:
# Check the dimensions of the local importance values: the outer list has 2 entries
# (presumably one per class — confirm), and each entry has one row per evaluation example,
# which is why its length matches len(x_test).
print(len(global_explanation.local_importance_values))
print(len(global_explanation.local_importance_values[0]))
print(len(x_test))
# Feature importance values of the first evaluation row within entry 0.
global_explanation.local_importance_values[0][0]

# global_explanation.local_importance_values[0] should be stored back into the SQL DB's
# local-explanation tables; each row is the list of feature importance weights for one data point.

2
114
114
