In [5]:
import os
import numpy as np
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from copy import deepcopy
from sklearn.datasets import load_iris, load_diabetes
from pypadre.binding.metrics import sklearn_metrics
from pypadre.examples.base_example import example_app

In [6]:
# Instantiate the pypadre example application used by every cell below
app = example_app()

# Accumulators filled by the result-collection cells (one entry per run)
predictions, metrics, names, grid_parameters = [], [], [], []

# Placeholders; the analysis cells further down overwrite these
idx = 0
matrix = []
missclassification_matrix = []

# Column names for several candidate datasets. Only `columns_wine` is passed
# to the dataset registration in this cell.
cols = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
    'class',
]

columns = [
    "Age",
    "Sex",
    "Body mass index",
    "Average blood pressure",
    "S1",
    "S2",
    "S3",
    "S4",
    "S5",
    "S6",
    "progression",
]

columns_banknotes = ["variance", "skewness", "kurtosis", "entropy", "class"]

columns_abalone = [
    "Sex",
    "Length",
    "Diameter",
    "Height",
    "Whole weight",
    "Shucked weight",
    "Viscera weight",
    "Shell weight",
    "Rings",
]

columns_wine = [
    "Fixed acidity.",
    "Volatile acidity.",
    "Citric acid.",
    "Residual sugar.",
    "Chlorides.",
    "Free sulfur dioxide.",
    "Total sulfur dioxide.",
    "Density.",
    "pH.",
    "Sulphates.",
    "Alcohol.",
    "Quality",
]

# Register the wine-quality table as the pypadre dataset "wine"
@app.dataset(name="wine", columns=columns_wine, target_features='Quality')
def dataset():
    """Load the first 12 columns of ``winequality-white.csv``.

    The file is read as semicolon-separated numeric text from a path
    relative to the current working directory.
    """
    wine_table = np.loadtxt('winequality-white.csv', delimiter=";", usecols=range(12))
    return wine_table

In [7]:
# Create the reference
from pypadre.core.model.code.code_mixin import PythonPackage, PythonFile, GitIdentifier, Function
from pathlib import Path

# Describe where the experiment code lives so the experiments can reference it
git_repo = str(Path(''))
path = os.path.abspath('')
# NOTE(review): str(Path('')) is '.', so this slice drops the first two
# characters of the absolute path — confirm this is the intended package name.
package_name = path[len(git_repo) + 1:]
reference = PythonFile(
    path=str(Path('')),
    package=package_name,
    variable="function_name",
    repository_identifier=GitIdentifier(path=git_repo),
)



In [None]:
# Hyper-parameter grid for the 'SVC' pipeline step
@app.parameter_map()
def parameters():
    """Return the grid-search values for the step named 'SVC'."""
    svc_grid = {'kernel': ['rbf'], 'C': [0.1, 0.5, 1.5], 'tol': [0.1, 0.3]}
    return {'SKLearnEstimator': {'parameters': {'SVC': svc_grid}}}


# Register the experiment on the dataset and reference defined above
@app.experiment(dataset=dataset, reference=reference,
                experiment_name="Iris SVC - Grid Search with Static Seed", seed=1, project_name="Examples",
                parameters=parameters)
def experiment():
    """Build a one-step pipeline holding an SVC with probability estimates."""
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    return Pipeline([('SVC', SVC(probability=True, C=1.0))])

Following metrics would be available for Evaluation: Confusion Matrix, Classification Metrics
Calculating d3dec3e0-087d-4cbb-8e2b-439f491a3d88 done.
Following metrics would be available for Evaluation: Confusion Matrix, Classification Metrics
Calculating 96637eb0-8e88-4a30-90fb-9950ddeacfb7 done.
Following metrics would be available for Evaluation: Confusion Matrix, Classification Metrics
Calculating ccc02c8e-05e2-44af-b7f5-896155194ba9 done.
Following metrics would be available for Evaluation: Confusion Matrix, Classification Metrics
Calculating fcc3b58d-6a89-462f-b856-d311c0261827 done.
Following metrics would be available for Evaluation: Confusion Matrix, Classification Metrics
Calculating 0e196eda-292e-43ee-9f9a-40bc0d8dfc1e done.
Following metrics would be available for Evaluation: Confusion Matrix, Classification Metrics
Calculating 2b0940bf-3a22-4999-8310-ce3c0debf928 done.
Following metrics would be available for Evaluation: Confusion Matrix, Classification Metrics
Calculating 

In [None]:
# Keep only the pipeline outputs whose great-grandparent is the experiment
# defined in the previous cell
results = []
for output in app._pipeline_output_app.list():
    if output.parent.parent.parent == experiment:
        results.append(output)

# One entry per matching output in each of the shared accumulators
for result in results:
    predictions.append(result.results.get('predictions'))
    metrics.append(result.metrics)
    grid_parameters.append(result.parameter_selection)
    names.append(experiment.name)

In [None]:
# Hyper-parameter grid for the 'k-nn classifier' pipeline step
@app.parameter_map()
def parameters():
    """Return the grid-search values for the step named 'k-nn classifier'."""
    knn_grid = {
        'n_neighbors': [1, 3, 5, 7, 9, 11],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    }
    return {'SKLearnEstimator': {'parameters': {'k-nn classifier': knn_grid}}}


# Register the experiment on the dataset and reference defined above
@app.experiment(dataset=dataset, reference=reference,
                experiment_name="Iris KNN - Grid Search with Static Seed", seed=1, project_name="Examples",
                parameters=parameters)
def experiment():
    """Build a one-step pipeline holding a default KNeighborsClassifier."""
    from sklearn.pipeline import Pipeline
    from sklearn.neighbors import KNeighborsClassifier
    return Pipeline([('k-nn classifier', KNeighborsClassifier())])

In [None]:
# Keep only the pipeline outputs whose great-grandparent is the experiment
# defined in the previous cell
results = []
for output in app._pipeline_output_app.list():
    if output.parent.parent.parent == experiment:
        results.append(output)

# One entry per matching output in each of the shared accumulators
for result in results:
    predictions.append(result.results.get('predictions'))
    metrics.append(result.metrics)
    grid_parameters.append(result.parameter_selection)
    names.append(experiment.name)

In [None]:
# Hyper-parameter grid for the 'gaussian process classification' pipeline step
@app.parameter_map()
def parameters():
    """Return the grid-search values for the Gaussian process step."""
    return {'SKLearnEstimator': {'parameters': {'gaussian process classification': {'max_iter_predict': [50, 100, 110, 125, 130],
                                                                                    'random_state': [0]}}}}

# Create the experiment
@app.experiment(dataset=dataset, reference=reference,
                experiment_name="Iris GPC - Grid Search with Static Seed", seed=1, project_name="Examples", 
                parameters=parameters)
def experiment():
    """Build a one-step pipeline holding a default GaussianProcessClassifier."""
    from sklearn.pipeline import Pipeline
    # Import from the public package: the private `sklearn.gaussian_process.gpc`
    # module path is deprecated and absent from newer scikit-learn releases.
    from sklearn.gaussian_process import GaussianProcessClassifier
    estimators = [('gaussian process classification', GaussianProcessClassifier())]
    return Pipeline(estimators)

In [None]:
# Keep only the pipeline outputs whose great-grandparent is the experiment
# defined in the previous cell
results = []
for output in app._pipeline_output_app.list():
    if output.parent.parent.parent == experiment:
        results.append(output)

# One entry per matching output in each of the shared accumulators
for result in results:
    predictions.append(result.results.get('predictions'))
    metrics.append(result.metrics)
    grid_parameters.append(result.parameter_selection)
    names.append(experiment.name)

In [None]:
# Hyper-parameter grid for the 'decision tree classifier' pipeline step
@app.parameter_map()
def parameters():
    """Return the grid-search values for the decision tree step."""
    # NOTE(review): 'max_depth_tree' is not a scikit-learn argument name;
    # presumably pypadre maps it to `max_depth` — confirm.
    return {'SKLearnEstimator': {'parameters': {'decision tree classifier': {'max_depth_tree': [5, 10, 15],
                                                                                    'random_state': [0]}}}}

# Create the experiment
@app.experiment(dataset=dataset, reference=reference,
                experiment_name="Iris Decision Tree - Grid Search with Static Seed", seed=1, project_name="Examples", 
                parameters=parameters)
def experiment():
    """Build a one-step pipeline holding a default DecisionTreeClassifier."""
    from sklearn.pipeline import Pipeline
    # Import from the public package: the private `sklearn.tree.tree` module
    # path is deprecated and absent from newer scikit-learn releases.
    from sklearn.tree import DecisionTreeClassifier
    estimators = [('decision tree classifier', DecisionTreeClassifier())]
    return Pipeline(estimators)

In [None]:
# Keep only the pipeline outputs whose great-grandparent is the experiment
# defined in the previous cell
results = []
for output in app._pipeline_output_app.list():
    if output.parent.parent.parent == experiment:
        results.append(output)

# One entry per matching output in each of the shared accumulators
for result in results:
    predictions.append(result.results.get('predictions'))
    metrics.append(result.metrics)
    grid_parameters.append(result.parameter_selection)
    names.append(experiment.name)

In [None]:
# Hyper-parameter grid for the 'random forest classifier' pipeline step
@app.parameter_map()
def parameters():
    """Return the grid-search values for the random forest step."""
    # NOTE(review): 'num_estimators' and 'max_depth_tree' are not scikit-learn
    # argument names; presumably pypadre maps them — confirm.
    return {'SKLearnEstimator': {'parameters': {'random forest classifier': {'num_estimators': [3, 5, 9, 11],
                                                                             'max_depth_tree': [2, 8, 14],
                                                                             'random_state': [0]}}}}

# Create the experiment
@app.experiment(dataset=dataset, reference=reference,
                experiment_name="Iris Random Forest - Grid Search with Static Seed", seed=1, project_name="Examples",
                allow_metrics = True, 
                parameters=parameters)
def experiment():
    """Build a one-step pipeline holding a default RandomForestClassifier."""
    from sklearn.pipeline import Pipeline
    # Import from the public package: the private `sklearn.ensemble.forest`
    # module path is deprecated and absent from newer scikit-learn releases.
    from sklearn.ensemble import RandomForestClassifier
    estimators = [('random forest classifier', RandomForestClassifier())]
    return Pipeline(estimators)

In [None]:
# Keep only the pipeline outputs whose great-grandparent is the experiment
# defined in the previous cell
results = []
for output in app._pipeline_output_app.list():
    if output.parent.parent.parent == experiment:
        results.append(output)

# One entry per matching output in each of the shared accumulators
for result in results:
    predictions.append(result.results.get('predictions'))
    grid_parameters.append(result.parameter_selection)
    metrics.append(result.metrics)
    names.append(experiment.name)


In [None]:
# Hyper-parameter grid for the 'multi-layer perceptron classifier' pipeline step
@app.parameter_map()
def parameters():
    """Return the grid-search values for the multi-layer perceptron step."""
    return {'SKLearnEstimator': {'parameters': {'multi-layer perceptron classifier': {'activation': ['relu', 'tanh', 'identity'],
                                                                             'solver': ['lbfgs', 'sgd', 'adam'],
                                                                             'batch_size': [2, 15, 20],
                                                                             'learning_rate_init': [0.001, 0.01]}}}}

# Create the experiment
@app.experiment(dataset=dataset, reference=reference, allow_metrics=True,
                experiment_name="Iris MLP - Grid Search with Static Seed", seed=1, project_name="Examples", 
                parameters=parameters)
def experiment():
    """Build a one-step pipeline holding a default MLPClassifier."""
    from sklearn.pipeline import Pipeline
    # Import from the public package: the private
    # `sklearn.neural_network.multilayer_perceptron` module path is deprecated
    # and absent from newer scikit-learn releases.
    from sklearn.neural_network import MLPClassifier
    estimators = [('multi-layer perceptron classifier', MLPClassifier())]
    return Pipeline(estimators)

In [None]:
# Keep only the pipeline outputs whose great-grandparent is the experiment
# defined in the previous cell
results = []
for output in app._pipeline_output_app.list():
    if output.parent.parent.parent == experiment:
        results.append(output)

# One entry per matching output in each of the shared accumulators
for result in results:
    predictions.append(result.results.get('predictions'))
    metrics.append(result.metrics)
    grid_parameters.append(result.parameter_selection)
    names.append(experiment.name)

In [None]:
# Hyper-parameter grid for the 'AdaBoost classifier' pipeline step
@app.parameter_map()
def parameters():
    """Return the grid-search values for the AdaBoost step (fixed random state only)."""
    return {'SKLearnEstimator': {'parameters': {'AdaBoost classifier': {'random_state': [0]}}}}

# Create the experiment
@app.experiment(dataset=dataset, reference=reference,
                experiment_name="Iris Adaboost - Grid Search with Static Seed", seed=1, project_name="Examples", 
                parameters=parameters)
def experiment():
    """Build a one-step pipeline holding a default AdaBoostClassifier."""
    from sklearn.pipeline import Pipeline
    # Import from the public package: the private
    # `sklearn.ensemble.weight_boosting` module path is deprecated and absent
    # from newer scikit-learn releases.
    from sklearn.ensemble import AdaBoostClassifier
    estimators = [('AdaBoost classifier', AdaBoostClassifier())]
    return Pipeline(estimators)

In [None]:
# Keep only the pipeline outputs whose great-grandparent is the experiment
# defined in the previous cell
results = []
for output in app._pipeline_output_app.list():
    if output.parent.parent.parent == experiment:
        results.append(output)

# One entry per matching output in each of the shared accumulators
for result in results:
    predictions.append(result.results.get('predictions'))
    metrics.append(result.metrics)
    grid_parameters.append(result.parameter_selection)
    names.append(experiment.name)

In [None]:
# Display the metrics accumulated by the result-collection cells
metrics

In [None]:
# Get the indices of the runs whose f1 score lies within [f1_min, f1_max]
f1_min = 0.00
f1_max = 1.0
idx_list = []   # positions in `metrics` / `predictions` / `names` that pass the filter
f1_score = []   # f1 score of each selected run, aligned with idx_list
for idx, metric in enumerate(metrics):
    # Reset for every run: otherwise a run without any metric entries would
    # silently reuse the previous run's score (or fail on the very first run).
    curr_value = None
    for key in metric:
        entries = metric.get(key)
        if len(entries) > 0:
            # NOTE(review): the f1 score is read from element 1 of the entry
            # collection — confirm that this layout is guaranteed by pypadre.
            curr_value = entries[1].get('f1_score')

    # Runs that expose no f1 score cannot be ranked and are left out
    if curr_value is None:
        continue

    if f1_min <= curr_value <= f1_max:
        idx_list.append(idx)
        f1_score.append(curr_value)


print(idx_list)
print(len(idx_list))
print(len(metrics))
print(f1_score)


In [None]:
# Pairwise comparison of the selected runs.
#   matrix[i][j]                    – share of run i's samples on which both runs predict the same label
#   missclassification_matrix[i][j] – share of run i's errors that the other run repeats with the same wrong label
modified_predictions = [predictions[idx] for idx in idx_list]
# Names must be looked up through idx_list: position k in modified_predictions
# corresponds to names[idx_list[k]], not names[k].
selected_names = [names[idx] for idx in idx_list]
n_selected = len(modified_predictions)
matrix = np.zeros([n_selected, n_selected])
missclassification_matrix = np.zeros([n_selected, n_selected])

for idx in range(n_selected):
    source_run = modified_predictions[idx]
    total_predictions = len(source_run)
    for inner_idx in range(n_selected):
        target_run = modified_predictions[inner_idx]

        total_missclassifications = 0   # errors of the source run
        identical_predictions = 0       # samples on which both runs agree
        missclassified = 0              # samples on which both runs agree on a wrong label
        missed = 0                      # samples on which both runs are wrong (any label)

        for key in source_run.keys():
            source_entry = source_run.get(key)
            source_prediction = source_entry.get('predicted')
            truth = source_entry.get('truth')
            source_failed = source_prediction != truth
            if source_failed:
                total_missclassifications += 1

            target_entry = target_run.get(key)
            if target_entry is None:
                # The target run has no prediction for this sample; skip the
                # pairwise counters instead of comparing against a leftover
                # value from the previous sample.
                continue
            target_prediction = target_entry.get('predicted')

            if source_prediction == target_prediction:
                identical_predictions += 1
                if source_failed:
                    missclassified += 1

            if source_failed and truth != target_prediction:
                missed += 1

        if total_missclassifications == 0:
            # A source run without errors is reported with a ratio of 1
            # rather than dividing by zero.
            total_missclassifications = 1
            missclassified = 1

        missclassification_ratio = missclassified / total_missclassifications
        # A run without any prediction has nothing to agree on
        agreement_ratio = identical_predictions / total_predictions if total_predictions else 0.0

        print(selected_names[idx] + "/" + selected_names[inner_idx])
        print("Both failed in the same way:" + str(missclassified))
        if missed != missclassified:
            print("Both failed: " + str(missed))
        print(total_missclassifications)
        print(missclassification_ratio)
        print(identical_predictions)
        print(total_predictions)
        print(agreement_ratio)
        print("-----------------------------------------------------------------------------")

        # Both mirrored cells are written on every pass, so for each pair the
        # value computed last (the pass with the larger source index) is kept.
        missclassification_matrix[idx][inner_idx] = missclassification_ratio
        missclassification_matrix[inner_idx][idx] = missclassification_ratio
        matrix[idx][inner_idx] = agreement_ratio
        matrix[inner_idx][idx] = agreement_ratio
        

In [None]:
# Display the pairwise agreement matrix computed in the previous cell
matrix

In [None]:
# Display the pairwise shared-error matrix
missclassification_matrix

In [None]:
# Build a short label per selected run: experiment name without the common
# prefix/suffix, followed by the selected parameters.
modified_name = []
for idx in idx_list:
    name = names[idx]
    # Reset per run so a run without a parameter selection does not inherit
    # the previous run's parameters (or fail on the very first run).
    params = ""
    estimator_name = ""
    for key in grid_parameters[idx]:
        selection = grid_parameters[idx].get(key)
        if selection:
            params = str(selection)
            # Text between the opening "{'" and the first '.', including the dot.
            # Presumably this is the "<estimator>." prefix of the parameter keys — TODO confirm.
            estimator_name = params[2:params.find('.') + 1]

    short_name = name.replace('Iris ', "").replace(" - Grid Search with Static Seed", "")
    modified_name.append(short_name + params.replace(estimator_name, ""))

print(modified_name)

In [None]:
# Heatmap of the shared-error matrix, labelled with the short run names
fig4, ax = plt.subplots(figsize=(10, 10))
fig4.subplots_adjust(top=.94)
fig4.suptitle('Missclassification similarity', fontsize=14, fontweight='bold')

sns.heatmap(
    missclassification_matrix,
    fmt='.2f',
    ax=ax,
    xticklabels=modified_name,
    yticklabels=modified_name,
)
fig4.savefig("missclassification.png")

In [None]:
# Heatmap of the pairwise agreement matrix, labelled with the short run names
fig5, ax = plt.subplots(figsize=(10, 10))
fig5.suptitle('Classification similarity', fontsize=14, fontweight='bold')
sns.heatmap(
    matrix,
    ax=ax,
    xticklabels=modified_name,
    yticklabels=modified_name,
)
fig5.savefig("classification_similarity.png")

In [None]:
# One variable per selected run; the variables are split into 5 clusters
n_clusters = 5
n_samples = 1000
n_variables = len(modified_name)

# Every cluster has the same fixed size (integer division, so up to
# n_clusters - 1 variables are not assigned to any cluster)
cluster_size = n_variables // n_clusters

# Random cluster assignment and latent data. The code that consumed them is
# disabled (see below); they are still generated here.
belongs_to_cluster = np.repeat(range(n_clusters), cluster_size)
np.random.shuffle(belongs_to_cluster)
latent = np.random.randn(n_clusters, n_samples)

# Disabled synthetic-data generation:
#     for i in range(n_variables):
#         variables.append(
#             np.random.randn(n_samples) + latent[belongs_to_cluster[i], :]
#         )

# Use the agreement matrix, scaled to rounded percentages, as the observations
variables = np.round(matrix * 100)

# Covariance between the rows of `variables`
C = np.cov(variables)

def score(C):
    '''
    Rate an ordered covariance matrix against the fixed cluster layout.

    Entries whose row and column fall in the same cluster are added to the
    result; entries that link a cluster with the variables outside of it are
    subtracted (once per direction). Reads the notebook-level values
    n_clusters, cluster_size and n_variables.
    '''
    total = 0
    for cluster in range(n_clusters):
        first_member = cluster * cluster_size
        members = np.arange(cluster_size) + first_member
        others = np.setdiff1d(range(n_variables), members)

        # Reward: covariance inside the cluster
        total += np.sum(C[members, :][:, members])

        # Penalty: covariance between the cluster and everything else
        total -= np.sum(C[members, :][:, others])
        total -= np.sum(C[others, :][:, members])

    return total


# Remember the starting point: covariance matrix, its score and the identity ordering
initial_C = C
initial_score = score(C)
initial_ordering = np.arange(n_variables)

# Plot the covariance matrix before any reordering and save it to disk
plt.figure()
plt.imshow(C, interpolation='nearest')
plt.title('Initial C')
plt.savefig("Before_Clustering.png")
print('Initial ordering:', initial_ordering)
print('Initial covariance matrix score:', initial_score)

# Pretty dumb greedy optimization algorithm that continuously
# swaps rows to improve the score
def swap_rows(C, var1, var2):
    '''
    Return a copy of C in which variables var1 and var2 have traded places,
    i.e. both their rows and their columns are exchanged. C is left untouched.
    '''
    pair = [var1, var2]
    flipped = [var2, var1]

    swapped = C.copy()
    # The right-hand sides are materialised before assignment, so the
    # exchange happens in one step per axis.
    swapped[pair, :] = swapped[flipped, :]
    swapped[:, pair] = swapped[:, flipped]
    return swapped

# Greedy search for a variable ordering with a higher score.
# Start from the unmodified covariance matrix and the identity ordering.
current_C = C
current_ordering = initial_ordering
current_score = initial_score

# Upper bound on the number of sweeps; the loop stops earlier once a sweep
# brings no improvement.
max_iter = 1000
for i in range(max_iter):
    # Find the best row swap to make
    best_C = current_C
    best_ordering = current_ordering
    best_score = current_score
    # Try every ordered pair of distinct positions. Each candidate is built
    # from best_C / best_ordering, which are replaced as soon as a candidate
    # scores higher, so accepted swaps accumulate within a single sweep.
    for row1 in range(n_variables):
        for row2 in range(n_variables):
            if row1 == row2:
                continue
            # Apply the same exchange to the ordering and to the matrix
            option_ordering = best_ordering.copy()
            option_ordering[row1] = best_ordering[row2]
            option_ordering[row2] = best_ordering[row1]
            option_C = swap_rows(best_C, row1, row2)
            option_score = score(option_C)

            if option_score > best_score:
                best_C = option_C
                best_ordering = option_ordering
                best_score = option_score

    if best_score > current_score:
        # Perform the best row swap
        current_C = best_C
        current_ordering = best_ordering
        current_score = best_score
    else:
        # No row swap found that improves the solution, we're done
        break

# Output the result: plot the reordered covariance matrix, save it to disk
# and print the final ordering and score
plt.figure()
plt.imshow(current_C, interpolation='nearest')
plt.title('Best C')
plt.savefig("clustered.png")
print('Best ordering:', current_ordering)
print('Best score:', current_score)
# Table header only; the per-cluster rows are printed by the next cell
print('Cluster     [variables assigned to this cluster]')
print('------------------------------------------------')
    

In [None]:
# List the estimators found in each cluster and blank out consecutive runs of
# the same estimator in a copy of the reordered covariance matrix.
clustered_estimators = []
new_matrix = deepcopy(current_C)
print(new_matrix.shape)
# Positions (row/column indices of the reordered matrix) that were blanked out
rows_to_delete = []
for cluster in range(n_clusters):
    alternate_estimator_names = []
    last_name = ""
    cluster_start = cluster * cluster_size
    cluster_stop = cluster_start + cluster_size
    print('Cluster %02d  %s' % (cluster + 1, current_ordering[cluster_start:cluster_stop]))
    for position in range(cluster_start, cluster_stop):
        # current_ordering maps a position in the reordered matrix to the
        # original run index, which is what modified_name is indexed by.
        i = current_ordering[position]
        # Estimator label = everything before the parameter dict; a label
        # without '{' is kept whole instead of losing its last character.
        temp_name = modified_name[i].split('{', 1)[0]

        if temp_name != last_name:
            alternate_estimator_names.append(temp_name)
            last_name = temp_name
        else:
            # new_matrix is in reordered coordinates, so it has to be
            # addressed by position, not by the original run index.
            rows_to_delete.append(position)
            new_matrix[:, position] = 0
            new_matrix[position, :] = 0
    print(alternate_estimator_names)
    print(len(rows_to_delete))
    clustered_estimators.append(list(alternate_estimator_names))

print(clustered_estimators)
print(rows_to_delete)

In [None]:
# Drop the blanked-out runs from both axes using the same index list, so the
# result stays square and a row that happens to be all zero for another
# reason is not removed by accident.
t = np.delete(new_matrix, rows_to_delete, axis=0)
t = np.delete(t, rows_to_delete, axis=1)
t.shape

In [None]:
# Heatmap of the pruned, reordered matrix `t`
fig4, ax = plt.subplots(figsize=(10, 10))
fig4.subplots_adjust(top=.94)
fig4.suptitle('Classification similarity', fontsize=14, fontweight='bold')

sns.heatmap(t, ax=ax, fmt='.2f')

In [None]:
# Display `name`, the loop variable left over from the cell that builds modified_name
name