In [6]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import FastICA, PCA
from sklearn.cluster import KMeans, Birch
from scipy.optimize import minimize_scalar
from sklearn.manifold import TSNE
from sklearn.datasets import load_breast_cancer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from IPython.display import display, Image
from plotly import express as px
import os

def get_acc(y_true:int, y_pred:int)->str:
    '''
    param y_true: int 0 or 1 (1 indicates true)
    param y_pred: int 0 or 1 (1 indicates predicted true value)
    returns str
    Classifiers whetheer a value is a true positive, false positive, true negative or false negative  
    '''
    if all((y_true==1, y_pred==1)):
        return 'TP'
    elif all((y_true == 1, y_pred == 0)):
        return 'FN'
    elif all((y_true == 0, y_pred == 1)):
        return 'FP'
    elif all((y_true == 0, y_pred == 0)):
        return 'TN'
    else:
        raise ValueError(F'unknown input y_true: {y_true}  y_pred: {y_pred}')
# symbol dictionaries for plotting
symbol_map = {'TP':'square', 'FN':'square-x', 'TN': 'circle', 'FP': 'circle-x'}
source_dict = { 'Test Data (Altered)':'orange', 'Test Data':'lightblue',  'Training Data':'blue'}

In [7]:
## Loads wisconson breast cancer data 
bc = load_breast_cancer()
feature_names = bc.feature_names
# creates the data frame of numberic features
X = pd.DataFrame(bc.data, columns=feature_names)
# creates the array of binary targets (source of true to train models)
y = bc.target

X.shape, y.shape

((569, 30), (569,))

In [8]:
# sets the random state for the splitting
random_state =1231

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# train scaler (transforms to mean 0, stdev 1)
scaler = StandardScaler()

# scales all the features
X_scaled =  scaler .fit_transform(X)

#scales the features for the training set
X_train_scaled = scaler.transform(X_train)

# scales the features for tht test set 
X_test_scaled = scaler.transform(X_test)

# train Imputer (missing values imputed with means
imputer = SimpleImputer(strategy='mean').fit(X)

## Build RF Estimator
estimator = RandomForestClassifier(n_estimators=5, max_depth=4, min_samples_split=10,random_state=random_state)

# pipes imputer to random forest model, so that missing values are imputed and then classified, fits the model
pipeline = Pipeline(steps=[('imp', imputer), ('estimator', estimator)]).fit(X_train,y_train)

# build cluster model
cluster_model = KMeans(3, random_state=random_state, n_init='auto')
cluster_model = cluster_model.fit(X_train_scaled)

## Back predict training data 
p_train = pipeline.predict(X_train)

# back predicts test data 
p_test = pipeline.predict(X_test)

# creats areas of probablity outputs from the model
p_prob_train = pipeline.predict_proba(X_train)[:, 1]
p_prob_test = pipeline.predict_proba(X_test)[:, 1]


print(F'n obs: {y.shape[0]}')
print(F'n positive lables: {y.sum()}')
print(F'label mean: {np.round(y.mean(),3)}')
print(F'n training obs: {y_train.shape[0]}')
print(F'n test obs: {y_test.shape[0]}')
print(F'n test: {y_train.shape[0]}')
print(F'train roc auc : {np.round(roc_auc_score(y_train, p_prob_train) , 3)}')
print(F'test roc auc : {np.round(roc_auc_score(y_test, p_prob_test), 3)}')

n obs: 569
n positive lables: 357
label mean: 0.627
n training obs: 284
n test obs: 285
n test: 284
train roc auc : 0.995
test roc auc : 0.99


In [9]:
# plot feature importance 
importance = pd.Series(pipeline.steps[-1][1].feature_importances_, feature_names).sort_values()
fig = px.bar(importance,width=600, height=400,  orientation='h', 
             title='Feature Importance for Random Forest Classifier')
fig.update_xaxes(title='information gain')
fig.update_yaxes(title='features')
fig.update_layout(showlegend=False)
fig.show()

In [10]:

## calcuate train silhouette scores
train_clusters = cluster_model.predict(X_train_scaled)
train_sil_scores = silhouette_samples(X_train_scaled, train_clusters)
print('train sil score',  train_sil_scores.mean())


## calcuate test silhouette scores
test_clusters = cluster_model.predict(X_test_scaled)
test_sil_scores = silhouette_samples(X_test_scaled, test_clusters)
print('test sil score',  test_sil_scores.mean())

## calcuate test (altered) silhouette scores
np.random.seed(random_state)
X_test_altered = X_test.copy()

## Randomly makes 30% of the data null
mask = np.random.choice([np.nan, 1], size=X_test.shape, p=[.3, .7])
X_test_altered = pd.DataFrame(mask *X_test.values, columns=X_test.columns)
X_test_altered_imputed = pd.DataFrame(pipeline.steps[0][1].transform(X_test_altered), columns=X_test.columns)
X_test_altered_imputed_scaled = scaler.transform(X_test_altered_imputed)
test_altered_clusters =  cluster_model.predict(X_test_altered_imputed_scaled)
test_altered_sil_scores = silhouette_samples(X_test_altered_imputed_scaled, test_altered_clusters )

print('test (altered) sil score',  test_altered_sil_scores .mean())


train sil score 0.30253355159138906
test sil score 0.31225034884838787
test (altered) sil score 0.2594701887217008


In [11]:

train_sil_df = pd.DataFrame({'Silhouette Score': train_sil_scores, 'Cluster': train_clusters.astype(str)})\
.sort_values(['Cluster','Silhouette Score' ])\
.reset_index()
train_sil_df.loc[:, 'Source'] = 'Training Data'



test_sil_df = pd.DataFrame({'Silhouette Score': test_sil_scores, 'Cluster': test_clusters.astype(str)})\
.sort_values(['Cluster','Silhouette Score' ])\
.reset_index()

test_sil_df.loc[:, 'Source'] = 'Test Data'



test_altered_sil_df = pd.DataFrame({'Silhouette Score': test_altered_sil_scores, 'Cluster': test_altered_clusters.astype(str)})\
.sort_values(['Cluster','Silhouette Score' ])\
.reset_index()
test_altered_sil_df.loc[:, 'Source'] = 'Test Data (Altered)'


to_plot = pd.concat([train_sil_df, test_sil_df, test_altered_sil_df], axis=0)
to_plot.loc[:, 'Cluster'] =  to_plot.loc[:, 'Cluster'].astype(str)
to_plot = to_plot.sort_values(by=['Silhouette Score', 'Source', 'Cluster'])
to_plot.index = np.arange(to_plot.shape[0])



fig = px.box(to_plot ,
             x = 'Source',
             y ='Silhouette Score', 
             color='Source', 
             facet_col='Cluster', 
              points="all",
             title="Silhouette Score Analysis of <br> Training, Test and Test (Altered) with 30% Missing values)", 
             width=800, height=600, color_discrete_map=source_dict)
fig.show()

In [12]:

label_col = 'y'
train_df = X_train.copy()
train_df.loc[:, 'Silhouette Score'] = train_sil_scores
train_df.loc[:, 'Source'] ='Training Data'
train_df.loc[:, 'Cluster'] =  train_clusters.astype(str)
train_df.loc[:, label_col] = y_train


test_df = X_test.copy()
test_df.loc[:, 'Silhouette Score'] = test_sil_scores
test_df.loc[:, 'Source'] ='Test Data'
test_df.loc[:, 'Cluster'] = test_clusters.astype(str)
test_df.loc[:, label_col] = y_test


test_altered_df = X_test_altered.copy()
test_altered_df.loc[:, 'Silhouette Score'] = test_altered_sil_scores
test_altered_df.loc[:, 'Source'] =  'Test Data (Altered)'
test_altered_df.loc[:, 'Cluster'] = test_altered_clusters.astype(str)
test_altered_df.loc[:, label_col] = y_test

to_plot = pd.concat([train_df, test_df, test_altered_df]).sort_values(by=['Cluster', 'Silhouette Score']).reset_index()

s_score = np.round(to_plot.loc[:, 'Silhouette Score'].mean(), 3)
title = F"Silhouette Analysis Showing Show Poorer Cluister in Altered Data"


fig = px.bar(to_plot, x ='Silhouette Score', facet_col='Source', color='Cluster', title=title, width=1000, height=600)

fig.update_traces(width=3)
fig.show()

In [13]:
px.histogram(to_plot,  x ='Silhouette Score', facet_col='Cluster', color='Source', title=title, width=1000, height=400, color_discrete_map=source_dict)

In [14]:
to_plot.groupby(['Source', 'Cluster']).agg({'Silhouette Score':'mean'}).sort_values(by = 'Silhouette Score', ascending=False).round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Silhouette Score
Source,Cluster,Unnamed: 2_level_1
Test Data,1,0.414
Training Data,1,0.412
Test Data (Altered),1,0.369
Test Data,2,0.212
Training Data,2,0.194
Test Data (Altered),2,0.139
Training Data,0,0.092
Test Data,0,0.056
Test Data (Altered),0,-0.042


In [15]:
## Decomposition (column compression and decompression)
decomp_model = PCA(10).fit(X_train_scaled)
train_comps = decomp_model.transform(X_train_scaled)
test_comps = decomp_model.transform(X_test_scaled)
test_altered_comps = decomp_model.transform(X_test_altered_imputed_scaled)

train_comps_inv  = decomp_model.inverse_transform(train_comps)
test_comps_inv  = decomp_model.inverse_transform(test_comps)
test_altered_comps_inv  = decomp_model.inverse_transform(test_altered_comps)

train_error = X_train_scaled.flatten() - train_comps_inv.flatten()
test_error = X_test_scaled.flatten() -  test_comps_inv.flatten()
test_altered_error = X_test_altered_imputed_scaled.flatten() - test_altered_comps_inv.flatten()



In [16]:

def mae_all(arr1, arr2, names=X_test.columns):
    s = []
    p = []
    for i in np.arange(arr1.shape[1]):
        st = mean_absolute_error(arr1[:, i], arr2[:, i])
        pval = np.nan
        s.append(st)
        p.append(pval)
    return pd.DataFrame({'stat': s, 'pval': p}, index=names)

results_list = []
for c in np.unique(train_clusters):    

    train_anova_df = mae_all(X_train_scaled[train_clusters==c,:], train_comps_inv[train_clusters==c,:], names=X_test.columns)
    train_anova_df.loc[:, 'Source'] = 'Training Data'
    test_anova_df = mae_all(X_test_scaled[test_clusters==c,:], test_comps_inv[test_clusters==c,:],names=X_test.columns)
    test_anova_df.loc[:, 'Source'] = 'Test Data'
    test_altered_anova_df = mae_all(X_test_altered_imputed_scaled[test_clusters==c,:],
                                            test_altered_comps_inv[test_clusters==c,:],names=X_test.columns) 
    test_altered_anova_df.loc[:, 'Source'] = 'Test Data (Altered)'
    combined_anova_df = pd.concat([train_anova_df, test_anova_df, test_altered_anova_df], axis=0)
    combined_anova_df.loc[:, "Cluster"] = c
    results_list.append(combined_anova_df)
    
to_plot = pd.concat(results_list, axis=0)
to_plot.index.name = "Feature"



var_explained = decomp_model.explained_variance_ratio_.sum()
print(F"Percentage of Variance Explained by PCA {np.round(var_explained,3) *100 }%")

fig = px.bar(to_plot.reset_index(), y='Feature', x='stat', color='Source', facet_col='Cluster', barmode="group",
            width=1000, height=600, 
             title='Error between Features and <br> PCA Inverse Representation of Features', color_discrete_map=source_dict)
fig.update_layout(yaxis={'categoryorder':'array', 'categoryarray':importance.index[::-1], })

fig.update_xaxes(title_text='Mean Squared Error')
fig.update_yaxes(title_text='Feature ranked by Model Importance')
fig.show()


Percentage of Variance Explained by PCA 95.6%


In [17]:
to_plot.rename({'stat' :'mae'}, axis=1).drop('pval', axis=1)

Unnamed: 0_level_0,mae,Source,Cluster
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mean radius,0.078165,Training Data,0
mean texture,0.183360,Training Data,0
mean perimeter,0.074717,Training Data,0
mean area,0.083970,Training Data,0
mean smoothness,0.282954,Training Data,0
...,...,...,...
worst compactness,0.349618,Test Data (Altered),2
worst concavity,0.383976,Test Data (Altered),2
worst concave points,0.468526,Test Data (Altered),2
worst symmetry,0.328667,Test Data (Altered),2


In [18]:
train_df = X_train.copy()
train_df.loc[:, 'Source'] = 'Training Data'
train_df.loc[:, 'y'] = y_train
train_df.loc[:, 'pred'] = p_prob_train
train_df.loc[:, 'pred_label']= p_train
train_df.loc[:, 'Cluster'] = train_clusters


test_df = X_test.copy()
test_df.loc[:, 'Source'] = 'Test Data'
test_df.loc[:, 'y'] = y_test
test_df.loc[:, 'pred'] = p_prob_test
test_df.loc[:, 'pred_label']= p_test
test_df.loc[:, 'Cluster'] = test_clusters


test_altered_df = X_test.copy()
test_altered_df.loc[:, 'Source'] = 'Test Data (Altered)'
test_altered_df.loc[:, 'y'] = y_test
test_altered_df.loc[:, 'pred'] = estimator.predict_proba(X_test_altered_imputed.values)[:, 1]
test_altered_df.loc[:, 'pred_label']= estimator.predict(X_test_altered_imputed.values)
test_altered_df.loc[:, 'Cluster'] = test_clusters


to_plot = pd.concat([train_df, test_df, test_altered_df], axis=0)
to_plot.loc[:, 'Source_Cluster'] =to_plot.Source.astype(str) + ' Cluster: '+ to_plot.Cluster.astype(str)

In [19]:
def update_performance_table(df, label_col, prediction_col, group, threshold):
    results = []
    y_true = df.loc[:, label_col]
    y_pred = df.loc[:, prediction_col]
    y_pred_label = pd.Series(y_pred> threshold).replace({True:1, False:0})
    groups =  np.unique(df.loc[:, group])
    groups_evaluated = []
    for g in groups:
        index = df.loc[:, group] == g
        y_true_temp = y_true.loc[index]
        y_pred_temp = y_pred.loc[index]
        y_pred_label_temp = y_pred_label.loc[index]
        try:
            tn, fp, fn, tp = confusion_matrix(y_true_temp, y_pred_label_temp).ravel()

            d = {'auc':  roc_auc_score(y_true_temp, y_pred_temp),
                 'f1' : f1_score(y_true_temp,y_pred_label_temp),
                 'precision' : precision_score(y_true_temp,y_pred_label_temp),
                 'recall': recall_score(y_true_temp, y_pred_label_temp),
                 'accuracy' : accuracy_score(y_true_temp, y_pred_label_temp),
                 'balanced_acc': balanced_accuracy_score(y_true_temp, y_pred_label_temp),
                 'TP': tp,
                 'FP': fp,
                 'TN': tn,
                 'FN': fn,
                 'weight': y_true_temp.shape[0],

                 }
            groups_evaluated.append(g)
            results.append(d)
        except:
            pass

    macro_avg = pd.DataFrame(results, index=groups_evaluated).mean(axis=0)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred_label).ravel()
    d = {'auc': roc_auc_score(y_true, y_pred),
         'f1': f1_score(y_true, y_pred_label),
         'precision': precision_score(y_true, y_pred_label),
         'recall': recall_score(y_true, y_pred_label),
         'accuracy': accuracy_score(y_true, y_pred_label),
         'balanced_acc': balanced_accuracy_score(y_true, y_pred_label),
         'TP': tp,
         'FP': fp,
         'TN': tn,
         'FN': fn,
         'weight': y_true.shape[0],

         }
    output = pd.DataFrame([d,  macro_avg.to_dict()], index=['MicroAverage', 'MacroAverage'
                           ]).round(3).reset_index()

    return output

update_performance_table(test_altered_df, 'y', 'pred' , 'Cluster', threshold=y.mean())

Unnamed: 0,index,auc,f1,precision,recall,accuracy,balanced_acc,TP,FP,TN,FN,weight
0,MicroAverage,0.987,0.96,0.967,0.952,0.947,0.945,178.0,6.0,92.0,9.0,285.0
1,MacroAverage,0.96,0.903,0.954,0.863,0.916,0.834,89.0,3.0,19.5,4.5,116.0


In [20]:
px.box(to_plot, y= 'pred', x='Cluster', color='Source', facet_col='y', color_discrete_map=source_dict,
      title = 'Prediction From Altered (Imputed) Test Data <br> Show Clear Bias in Cluster 0 and 1')

In [21]:
import plotly.graph_objs as go
label_col = 'y'
prediction_col = 'pred'
group =  'Source_Cluster'


def update_roc_auc(df, label_col, prediction_col, group):
    fig = go.Figure()
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )
    y_true = df.loc[:, label_col]
    y_pred = df.loc[:, prediction_col]
    groups =  np.unique(df.loc[:, group])
    for g in groups:
        index = df.loc[:, group] == g
        y_true_temp = y_true.loc[index]
        y_pred_temp = y_pred.loc[index]
        try:
            fpr, tpr, thresholds = roc_curve(y_true_temp, y_pred_temp)
            auc_score = roc_auc_score(y_true_temp, y_pred_temp)
            name = f"{g} (AUC={auc_score:.2f})"
            fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))
        except:
            pass
    fig.update_layout(
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
        width=700, height=500
    )
    return fig

fig = update_roc_auc(to_plot, label_col, prediction_col, group)
fig.show()


No positive samples in y_true, true positive value should be meaningless


No positive samples in y_true, true positive value should be meaningless


No positive samples in y_true, true positive value should be meaningless



In [22]:

p_prob_test_altered= estimator.predict_proba(X_test_altered_imputed.values)[:, 1]
p_test_altered = estimator.predict(X_test_altered_imputed.values)

def get_classifcation_metrics(y_true, y_pred_label, y_pred_prob):
    d = {}
    try:
        d['ROC AUC'] = roc_auc_score(y_true,y_pred_prob )
    except ValueError:
       d['ROC AUC'] = np.nan 
    
    d['F1'] = f1_score(y_true, y_pred_label)
    d['Precision (PPV)'] = precision_score(y_true, y_pred_label)
    d['Recall (TPR)'] = recall_score(y_true, y_pred_label)
    d['Accuracy'] = accuracy_score(y_true, y_pred_label)
    d['Precision'] = precision_score(y_true, y_pred_label)
    d['Precision'] = precision_score(y_true, y_pred_label)
    d['Precision'] = precision_score(y_true, y_pred_label)

    tp = np.sum(list(map(all,zip(y_true==1, y_pred_label==1))))
    fp = np.sum(list(map(all,zip(y_true==0, y_pred_label==1))))
    tn = np.sum(list(map(all,zip(y_true==0, y_pred_label==0))))
    fn =  np.sum(list(map(all,zip(y_true==1, y_pred_label==0))))  
    
    d['TP'] = tp
    d['FP'] =  fp
    d['TN'] =  tn
    d['FN'] = fn
    try:
        d['Specificity (TNR)'] = tn/(tn + fp)
    except ZeroDivisionError:
       d['Specificity (TNR)'] = np.nan 
    
    d['NPV'] = tn/(tp + fn)
    return d

def generate_performance_data():
    for c in np.unique(train_clusters):
        train_metrics = get_classifcation_metrics(y_train[train_clusters == c], 
                                                  p_train[train_clusters == c],  
                                                  p_prob_train[train_clusters == c])   
        train_metrics['Source'] =  'Training Data'
        train_metrics['Cluster'] = c

        test_metrics = get_classifcation_metrics(y_test[test_clusters == c],
                                                 p_test[test_clusters == c], 
                                                 p_prob_test[test_clusters == c])   
        test_metrics['Source'] = 'Test Data'
        test_metrics['Cluster'] = c

        test_altered_metrics = get_classifcation_metrics(y_test[test_altered_clusters== c], 
                                                         p_test_altered[test_altered_clusters== c],  
                                                         p_prob_test_altered[test_altered_clusters== c])   
        test_altered_metrics['Source'] = 'Test Data (Altered)'
        test_altered_metrics['Cluster'] = c
        yield pd.DataFrame([train_metrics, test_metrics, test_altered_metrics])
performance_df = pd.concat(list(generate_performance_data()), axis=0)
performance_df.Cluster = performance_df.Cluster.astype(str)



F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 due to no true samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


divide by zero encountered in scalar divide


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill

In [23]:
performance_df.set_index(['Source', "Cluster"]).round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,ROC AUC,F1,Precision (PPV),Recall (TPR),Accuracy,Precision,TP,FP,TN,FN,Specificity (TNR),NPV
Source,Cluster,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Training Data,0,1.0,0.947,1.0,0.9,0.962,1.0,18,0,33,2,1.0,1.65
Test Data,0,0.97,0.842,0.889,0.8,0.882,0.889,16,2,29,4,0.935,1.45
Test Data (Altered),0,0.979,0.821,0.727,0.941,0.865,0.727,16,6,29,1,0.829,1.706
Training Data,1,0.97,0.97,0.961,0.98,0.946,0.961,147,6,10,3,0.625,0.067
Test Data,1,0.961,0.974,0.954,0.994,0.95,0.954,166,8,6,1,0.429,0.036
Test Data (Altered),1,0.959,0.977,0.966,0.988,0.958,0.966,168,6,14,2,0.7,0.082
Training Data,2,,0.0,0.0,0.0,1.0,0.0,0,0,65,0,1.0,inf
Test Data,2,,0.0,0.0,0.0,1.0,0.0,0,0,53,0,1.0,inf
Test Data (Altered),2,,0.0,0.0,0.0,0.977,0.0,0,1,42,0,0.977,inf


In [24]:
yy = y_train[train_clusters == 1]
xx = p_train[train_clusters == 1]

tp = np.sum(list(map(all,zip(yy==1, xx==1))))
#     fp = np.sum(list(map(all,zip(y_true==0, y_pred_label==1))))
#     tn = np.sum(list(map(all,zip(y_true==0, y_pred_label==0))))
#     fn =  np.sum(list(map(all,zip(y_true==1, y_pred_label==0))))  
tp

147

In [25]:
px.bar(performance_df, x='Cluster', y='F1', color='Source', barmode="group", width=600, height=500,
      title='F1 Score by Cluster', color_discrete_map=source_dict)

In [26]:
np.sum(list(map(all,zip(y_test==1, p_test==1))))

np.sum(list(map(all,zip(y_test==1, p_test==1))))

182

In [27]:
tn, fp, fn, tp = confusion_matrix(y_test, p_test).ravel()
tp

182

In [28]:
from sklearn.decomposition import FastICA
def get_decision_boundary(X, y, model, alpha=.05, n = 100):
    model = Pipeline(steps= [('scale', StandardScaler()), ('decomp',FastICA(2) )])
    X_comps = model.fit_transform(X)
    comp_0_grid, comp_1_grid = np.meshgrid(np.linspace( X_comps[:, 0].min(),  X_comps[:, 0].max()), 
                                                       np.linspace( X_comps[:, 1].min(),  X_comps[:, 1].max()))
    grid_comps =  np.vstack([comp_0_grid.ravel(), comp_1_grid .ravel()]).T

    grid_features =  model.inverse_transform(grid_comps)
    grid_pred = estimator.predict_proba(grid_features)[:, 1]
    grid_pred_labels = estimator.predict(grid_features)                                
    col_stats = []
    
    results = pd.DataFrame({'comp0': grid_comps[:, 0],
                            'comp1': grid_comps[:, 1],
                            'pred_prob' : grid_pred,
                            'pred_label':  grid_pred_labels})
    return results

to_plot = get_decision_boundary(X, y, estimator, alpha=.05, n = 100)

In [29]:
px.scatter(to_plot, x='comp0', y='comp1',color='pred_prob', symbol='pred_label', width=800, height=600)