In [1]:
run load_data.py

### Evaluating Model Pipelines

We will evaluate a total of 24 model pipelines:

1. the original data
1. the original data with outliers removed
1. the original data transformed by a PCA with 2 components 
1. the original data with outliers removed transformed by a PCA with 2 components 
1. the original data transformed by a PCA with 3 components 
1. the original data with outliers removed transformed by a PCA with 3 components  
1. scaled data
1. scaled data with outliers removed
1. scaled data transformed by a PCA with 2 components 
1. scaled data with outliers removed transformed by a PCA with 2 components
1. scaled data transformed by a PCA with 3 components  
1. scaled data with outliers removed transformed by a PCA with 3 components  
1. log transformed, scaled data
1. log transformed, scaled data with outliers removed
1. log transformed, scaled data transformed by a PCA with 2 components
1. log transformed, scaled data with outliers removed transformed by a PCA with 2 components 
1. log transformed, scaled data transformed by a PCA with 3 components  
1. log transformed, scaled data with outliers removed transformed by a PCA with 3 components  
1. box-cox transformed, scaled data
1. box-cox transformed, scaled data with outliers removed
1. box-cox transformed, scaled data transformed by a PCA with 2 components 
1. box-cox transformed, scaled data with outliers removed transformed by a PCA with 2 components 
1. box-cox transformed, scaled data transformed by a PCA with 3 components  
1. box-cox transformed, scaled data with outliers removed transformed by a PCA with 3 components  

### Experiment Design

We will pass each of these transformed data sets to a Gaussian Mixture Model and then assess the model using the Bayesian Information Criterion (BIC); a lower BIC indicates a better model.

In [2]:
from sklearn.mixture import GaussianMixture

In [3]:
# Lookup tables of (label, dataset) pairs, one list per preprocessing family.
# The scoring cells below iterate over each list and unpack every entry as
# `name, data`. None of the dataset variables are defined in this notebook;
# presumably they are created by load_data.py (TODO confirm).

# Untransformed features.
original_data = [
    ('original', customer_features),
    ('original - no outliers', customer_features_outliers_removed),
    ('original - pca, 2 components', customer_features_pca_2),
    ('original - pca, 3 components', customer_features_pca_3),
    ('original - no outliers, pca, 2 components', customer_features_outliers_removed_pca_2),
    ('original - no outliers, pca, 3 components', customer_features_outliers_removed_pca_3)
]

# Scaled features.
scaled_data = [
    ('scaled', customer_sc),
    ('scaled - no outliers', customer_sc_outliers_removed),
    ('scaled - pca, 2 components', customer_sc_pca_2),
    ('scaled - pca, 3 components', customer_sc_pca_3),
    ('scaled - no outliers, pca, 2 components', customer_sc_outliers_removed_pca_2),
    ('scaled - no outliers, pca, 3 components', customer_sc_outliers_removed_pca_3),
]

# Log-transformed, then scaled features.
log_transformed_data = [
    ('log transformed, scaled', customer_log_sc),
    ('log transformed, scaled - no outliers', customer_log_sc_outliers_removed),
    ('log transformed, scaled - pca, 2 components', customer_log_sc_pca_2),
    ('log transformed, scaled - pca, 3 components', customer_log_sc_pca_3),
    ('log transformed, scaled - no outliers, pca, 2 components', customer_log_sc_outliers_removed_pca_2),
    ('log transformed, scaled - no outliers, pca, 3 components', customer_log_sc_outliers_removed_pca_3),
]

# Box-Cox-transformed, then scaled features.
box_cox_transformed_data = [
    ('box-cox transformed, scaled', customer_box_cox_sc),
    ('box-cox transformed, scaled - no outliers', customer_box_cox_sc_outliers_removed),
    ('box-cox transformed, scaled - pca, 2 components', customer_box_cox_sc_pca_2),
    ('box-cox transformed, scaled - pca, 3 components', customer_box_cox_sc_pca_3),
    ('box-cox transformed, scaled - no outliers, pca, 2 components', customer_box_cox_sc_outliers_removed_pca_2),
    ('box-cox transformed, scaled - no outliers, pca, 3 components', customer_box_cox_sc_outliers_removed_pca_3),
]

In [4]:
def fit_and_score(data, n_components=2, random_state=0):
    """Fit a Gaussian Mixture Model to ``data`` and return its BIC on that data.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed unchanged to ``GaussianMixture.fit`` and
        ``GaussianMixture.bic``.
    n_components : int, default 2
        Number of mixture components.
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``GaussianMixture``. A fixed default keeps the
        reported scores reproducible across kernel restarts; pass ``None``
        for unseeded initialisation.

    Returns
    -------
    float
        BIC of the fitted model evaluated on ``data`` (lower is better).
    """
    model = GaussianMixture(n_components=n_components, random_state=random_state)
    model.fit(data)
    return model.bic(data)

In [5]:
n = 2

# One record per pipeline: {'name': label, 'n': component count, 'bic': score}.
results_2_clusters = []

dataset_groups = [
    original_data,
    scaled_data,
    log_transformed_data,
    box_cox_transformed_data,
]

for group_index, group in enumerate(dataset_groups):
    if group_index > 0:
        print()  # blank line between preprocessing families
    for name, data in group:
        # Fit once per pipeline and reuse the score for both the record and
        # the printed table row.
        bic = fit_and_score(data, n)
        results_2_clusters.append({
            'name' : name,
            'n' : n,
            'bic' : bic
        })
        print("{:60} {} clusters      {}".format(name, n, bic))

original                                                     2 clusters      50650.96907649888
original - no outliers                                       2 clusters      44268.63898204862
original - pca, 2 components                                 2 clusters      18695.3062938465
original - pca, 3 components                                 2 clusters      27114.512185112213
original - no outliers, pca, 2 components                    2 clusters      16482.71332841138
original - no outliers, pca, 3 components                    2 clusters      23814.617362885685

scaled                                                       2 clusters      4533.578646524035
scaled - no outliers                                         2 clusters      2448.4887225648854
scaled - pca, 2 components                                   2 clusters      2586.7916217412167
scaled - pca, 3 components                                   2 clusters      3415.453290941513
scaled - no outliers, pca, 2 components       

In [6]:
n = 3

dataset_groups = [
    original_data,
    scaled_data,
    log_transformed_data,
    box_cox_transformed_data,
]

# Print one BIC row per pipeline, with a blank line between preprocessing
# families.
for group_index, group in enumerate(dataset_groups):
    if group_index > 0:
        print()
    for name, data in group:
        print("{:60} {} clusters      {}".format(name, n, fit_and_score(data, n)))

original                                                     3 clusters      50086.67703662636
original - no outliers                                       3 clusters      44127.15891131583
original - pca, 2 components                                 3 clusters      18634.06545070052
original - pca, 3 components                                 3 clusters      27038.82202863803
original - no outliers, pca, 2 components                    3 clusters      16310.814980246992
original - no outliers, pca, 3 components                    3 clusters      23710.092131264664

scaled                                                       3 clusters      3936.9918697426965
scaled - no outliers                                         3 clusters      2255.2184929602445
scaled - pca, 2 components                                   3 clusters      2474.5597943329612
scaled - pca, 3 components                                   3 clusters      3359.8095107302556
scaled - no outliers, pca, 2 components    

### One More Thing ... What About Those Labels?

In [7]:
# Channel labels cast to int and shifted down by one. fit_and_score_predictions
# compares these against GaussianMixture cluster ids and against their
# `(labels == 0)` flip, so this assumes Channel only takes the values 1 and 2,
# giving 0/1 labels -- TODO confirm against the raw data.
channel = customers.Channel.astype(int) - 1
# region = customers.Region

In [8]:
from sklearn.metrics import accuracy_score

def fit_and_score_predictions(data, labels, n_components=2, random_state=0):
    """Fit a Gaussian Mixture Model and score its cluster ids against ``labels``.

    Cluster ids are arbitrary, so component 0 may correspond to either label.
    The predictions are therefore scored against ``labels`` and against the
    flipped labels, and the better of the two accuracies is returned.

    The flip is ``(labels == 0)``, so the result is only meaningful when
    ``labels`` holds 0/1 values and ``n_components`` is 2; with more
    components, predicted ids of 2 or higher can never match a label.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed unchanged to ``GaussianMixture.fit`` and
        ``GaussianMixture.predict``.
    labels : array-like supporting ``== 0`` and ``.astype``
        Reference labels, one per sample in ``data``.
    n_components : int, default 2
        Number of mixture components.
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``GaussianMixture``. A fixed default keeps the
        reported accuracies reproducible across kernel restarts; pass
        ``None`` for unseeded initialisation.

    Returns
    -------
    float
        The larger of the accuracy against ``labels`` and the accuracy
        against the flipped labels.
    """
    model = GaussianMixture(n_components=n_components, random_state=random_state)
    model.fit(data)
    predictions = model.predict(data)
    flipped_labels = (labels == 0).astype(int)
    return max(
        accuracy_score(labels, predictions),
        accuracy_score(flipped_labels, predictions),
    )

In [9]:
# Lookup tables of (label, dataset, channel labels) triples, one list per
# preprocessing family. The accuracy cell below unpacks every entry as
# `name, data, label`. Datasets without outlier removal are paired with
# `channel`; outlier-removed datasets are paired with a family-specific
# `channel_*_outliers_removed` vector -- presumably the channel labels with the
# same rows dropped, created by load_data.py (TODO confirm).

# Untransformed features.
original_data_with_labels = [
    ('original', customer_features, channel),
    ('original - no outliers', customer_features_outliers_removed, channel_original_outliers_removed),
    ('original - pca, 2 components', customer_features_pca_2, channel),
    ('original - pca, 3 components', customer_features_pca_3, channel),
    ('original - no outliers, pca, 2 components', customer_features_outliers_removed_pca_2, channel_original_outliers_removed),
    ('original - no outliers, pca, 3 components', customer_features_outliers_removed_pca_3, channel_original_outliers_removed)
]

# Scaled features.
scaled_data_with_labels = [
    ('scaled', customer_sc, channel),
    ('scaled - no outliers', customer_sc_outliers_removed, channel_scaled_outliers_removed),
    ('scaled - pca, 2 components', customer_sc_pca_2, channel),
    ('scaled - pca, 3 components', customer_sc_pca_3, channel),
    ('scaled - no outliers, pca, 2 components', customer_sc_outliers_removed_pca_2, channel_scaled_outliers_removed),
    ('scaled - no outliers, pca, 3 components', customer_sc_outliers_removed_pca_3, channel_scaled_outliers_removed),
]

# Log-transformed, then scaled features.
log_transformed_data_with_labels = [
    ('log transformed, scaled', customer_log_sc, channel),
    ('log transformed, scaled - no outliers', customer_log_sc_outliers_removed, channel_log_outliers_removed),
    ('log transformed, scaled - pca, 2 components', customer_log_sc_pca_2, channel),
    ('log transformed, scaled - pca, 3 components', customer_log_sc_pca_3, channel),
    ('log transformed, scaled - no outliers, pca, 2 components', customer_log_sc_outliers_removed_pca_2, channel_log_outliers_removed),
    ('log transformed, scaled - no outliers, pca, 3 components', customer_log_sc_outliers_removed_pca_3, channel_log_outliers_removed),
]

# Box-Cox-transformed, then scaled features.
box_cox_transformed_data_with_labels = [
    ('box-cox transformed, scaled', customer_box_cox_sc, channel),
    ('box-cox transformed, scaled - no outliers', customer_box_cox_sc_outliers_removed, channel_box_cox_outliers_removed),
    ('box-cox transformed, scaled - pca, 2 components', customer_box_cox_sc_pca_2, channel),
    ('box-cox transformed, scaled - pca, 3 components', customer_box_cox_sc_pca_3, channel),
    ('box-cox transformed, scaled - no outliers, pca, 2 components', customer_box_cox_sc_outliers_removed_pca_2, channel_box_cox_outliers_removed),
    ('box-cox transformed, scaled - no outliers, pca, 3 components', customer_box_cox_sc_outliers_removed_pca_3, channel_box_cox_outliers_removed),
]

In [10]:
n = 2

labelled_groups = (
    original_data_with_labels,
    scaled_data_with_labels,
    log_transformed_data_with_labels,
    box_cox_transformed_data_with_labels,
)
row_template = "{:60} {} clusters      {}"

# Print one accuracy row per pipeline, family by family, in the order listed
# above.
for position, group in enumerate(labelled_groups):
    # A blank line separates consecutive families; none precedes the first.
    if position:
        print()
    for name, data, label in group:
        print(row_template.format(name, n, fit_and_score_predictions(data, label, n)))

original                                                     2 clusters      0.6863636363636364
original - no outliers                                       2 clusters      0.8170426065162907
original - pca, 2 components                                 2 clusters      0.865909090909091
original - pca, 3 components                                 2 clusters      0.6636363636363637
original - no outliers, pca, 2 components                    2 clusters      0.5964912280701754
original - no outliers, pca, 3 components                    2 clusters      0.5513784461152882

scaled                                                       2 clusters      0.7431818181818182
scaled - no outliers                                         2 clusters      0.8170426065162907
scaled - pca, 2 components                                   2 clusters      0.7
scaled - pca, 3 components                                   2 clusters      0.6568181818181819
scaled - no outliers, pca, 2 components                

![](complex_pipe_1.png)

![](complex_pipe_2.png)

![](complex_pipe_3.png)

![](complex_pipe_4.png)

![](complex_pipe_5.png)
