In [1]:
import os
import time
import warnings
import numpy as np
import random as rnd
import pandas as pd
from collections import defaultdict

# Genetic algorithm library
from deap import base, creator, tools, algorithms

from sklearn.utils import shuffle
# Estimator helper functions
from sklearn.base import clone
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/base.py][30]
from sklearn.base import is_classifier
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/base.py][535]
from sklearn.model_selection._validation import _fit_and_score
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_validation.py][346]
from sklearn.model_selection._search import BaseSearchCV
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_search.py][386]
from sklearn.model_selection._search import check_cv
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_split.py][1866]
from sklearn.model_selection._search import _check_param_grid
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_search.py][343]
from sklearn.metrics.scorer import check_scoring
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/metrics/scorer.py][250]
from sklearn.utils.validation import _num_samples
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/validation.py][105]
from sklearn.utils.validation import indexable
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/validation.py][208]
from multiprocessing import Pool, Manager

# Model selection utilities for estimators
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Metrics for estimators
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# Estimators
from sklearn.ensemble import ExtraTreesClassifier#
from sklearn.ensemble import RandomForestClassifier#
from sklearn.ensemble import AdaBoostClassifier#
from sklearn.ensemble import GradientBoostingClassifier#
from sklearn.tree import DecisionTreeClassifier#
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


"""
PYMACH
"""
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import FunctionTransformer

#Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

#Ensembles algorithms
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

# Notebook-wide evaluation settings.
test_size = 0.2
num_folds = 10
seed = 7
num_jobs=8

# Column names applied to every CSV (the files are read without a header row).
names_ = ['Be01', 'Be02', 'Be03', 'Be04', 'Be05', 'Sector']

# One DataFrame per file, kept in Tx1..Tx6 order so that list position k
# corresponds to file "Tx<k+1>".
frecuencias = [
    pd.read_csv(csv_path, names=names_)
    for csv_path in (
        'Filtrado/LocalizationNew_Tx1.csv',
        'Filtrado/LocalizationNew_Tx2.csv',
        'Filtrado/LocalizationNew_Tx3.csv',
        'Filtrado/LocalizationNew_Tx4.csv',
        'Filtrado/LocalizationNew_Tx5.csv',
        'Filtrado/LocalizationNew_Tx6.csv',
    )
]


# The problem to optimize
def getAccuracy(frecuencias, individual, estimator, score_cache):
    """Return the mean 10-fold cross-validated accuracy for ``individual``.

    Parameters
    ----------
    frecuencias : list
        Forwarded unchanged to ``_createDataset``.
    individual : sequence
        Candidate solution; converted with ``np.int32`` to build the cache
        key and forwarded to ``_createDataset``.
    estimator : object
        Estimator handed to ``cross_val_score``.
    score_cache : mapping
        Mutable mapping from cache key to score. It is read with ``in`` /
        ``[]`` and updated in place when a new score is computed.

    Returns
    -------
    float
        The cached score when available, otherwise the freshly computed mean
        of the cross-validation scores.

    Notes
    -----
    The cache key is derived from ``individual`` only, so a single cache must
    not be shared between different estimators.
    """
    paramkey = str(np.int32(individual) + 1)

    # Look in the cache before doing any work: building the dataset is
    # expensive and pointless when the score is already known.
    if paramkey in score_cache:
        return score_cache[paramkey]

    X, y = _createDataset(frecuencias, individual)
    kfold = KFold(n_splits=10, shuffle=False)
    cv_results = cross_val_score(estimator, X, y, cv=kfold, scoring="accuracy")
    score = np.mean(cv_results)
    score_cache[paramkey] = score
    return score


def _createDataset(frecuencias, values, seed = 7):
    # crear dataset
    names_ = frecuencias[0].columns.values
    # reestructuracion
    salida_final = pd.DataFrame(columns=names_)
    for sec in range(1,16):
        dataset = pd.DataFrame(columns=names_)
        corte = min([frecuencias[i][frecuencias[i]['Sector']==sec].shape[0] for i in values])
        tx = 0
        dataset[names_[tx]] = dataset[names_[tx]].append(frecuencias[int(values[tx])][frecuencias[int(values[tx])]['Sector']==sec][:corte][names_[tx]])
        for tx in range(1,5):
            dataset[names_[tx]] = frecuencias[int(values[tx])][frecuencias[int(values[tx])]['Sector']==sec][:corte][names_[tx]]
        dataset[names_[tx+1]] = frecuencias[int(values[tx])][frecuencias[int(values[tx])]['Sector']==sec][:corte][names_[tx+1]]
        # join parts
        salida_final = salida_final.append(dataset)
    # shuffle dataset
    salida_final = shuffle(salida_final, random_state=seed).reset_index(drop=True)
    # dataframe to X,y 
    y = salida_final[names_[5]]
    del salida_final[names_[5]]
    X = salida_final
    return X,y

In [None]:

class Evaluate():
    """ A class for resampling and evaluation """



    def __init__(self, definer, preparer, selector):
        """Store the collaborating stages and reset all evaluation state.

        Parameters
        ----------
        definer : object
            Read later by ``split_data`` through its ``X`` and ``y`` attributes.
        preparer, selector : object
            Used as the ``'preparer'`` and ``'selector'`` steps of every
            pipeline assembled by ``build_pipelines``.
        """
        # Collaborators supplied by the caller.
        self.definer, self.preparer, self.selector = definer, preparer, selector

        # Resampling settings.
        self.test_size = 0.2
        self.num_folds = 10
        self.seed = 7

        # Train/test partitions, filled in by split_data().
        self.X_train = self.X_test = None
        self.y_train = self.y_test = None

        # Pipelines, filled in by build_pipelines() / set_best_pipelines().
        self.pipelines = None
        self.best_pipelines = None

        # Reporting artefacts, filled in by the evaluation and plotting methods.
        self.report = None
        self.raw_report = None
        self.plot_html = None

    def pipeline(self):
        """Run the whole evaluation workflow and return ``self``.

        Steps, in order: build the pipelines, split the data with
        ``self.test_size`` / ``self.seed``, cross-validate every pipeline and
        select the best ones.
        """
        self.build_pipelines()

        holdout_fraction, split_seed = self.test_size, self.seed
        self.split_data(holdout_fraction, split_seed)

        self.evaluate_pipelines()
        self.set_best_pipelines()
        return self

    def set_models(self):
        """Return the candidate classifiers as a list of ``(name, estimator)`` tuples.

        The list holds seven stand-alone classifiers, four tree ensembles and
        one voting classifier, in that order. Estimators that accept a
        ``random_state`` here are seeded with 1; ``MLPClassifier`` and
        ``LogisticRegression`` are created with their defaults.
        """
        random_seed = 1

        # Stand-alone classifiers.
        # Note: LDA may warn that the variables are collinear.
        single_models = [
            ('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()),
            ('SVC', SVC(random_state=random_seed)),
            ('GaussianNB', GaussianNB()),
            ('MLPClassifier', MLPClassifier()),
            ('KNeighborsClassifier', KNeighborsClassifier()),
            ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=random_seed)),
            ('LogisticRegression', LogisticRegression()),
        ]

        # Bagging and boosting ensembles.
        ensemble_models = [
            ('ExtraTreesClassifier', ExtraTreesClassifier(random_state=random_seed)),
            ('AdaBoostClassifier',
             AdaBoostClassifier(DecisionTreeClassifier(random_state=random_seed),
                                random_state=random_seed)),
            ('RandomForestClassifier', RandomForestClassifier(random_state=random_seed)),
            ('GradientBoostingClassifier',
             GradientBoostingClassifier(random_state=random_seed)),
        ]

        # Voting committee made of two of the ensembles above (fresh instances).
        committee_members = [
            ("Voting_GradientBoostingClassifier",
             GradientBoostingClassifier(random_state=random_seed)),
            ("Voting_ExtraTreesClassifier",
             ExtraTreesClassifier(random_state=random_seed)),
        ]
        committee = VotingClassifier(committee_members)

        return single_models + ensemble_models + [('VotingClassifier', committee)]

    def split_data(self, test_size=0.20, seed=7):
        """Split ``self.definer.X`` / ``self.definer.y`` into train and test parts.

        Parameters
        ----------
        test_size : float, default 0.20
            Forwarded to ``train_test_split`` as ``test_size``.
        seed : int, default 7
            Forwarded to ``train_test_split`` as ``random_state``.

        The four partitions are stored in ``self.X_train``, ``self.X_test``,
        ``self.y_train`` and ``self.y_test``; nothing is returned.
        """
        features, target = self.definer.X, self.definer.y
        partitions = train_test_split(
            features, target, test_size=test_size, random_state=seed)

        self.X_train, self.X_test, self.y_train, self.y_test = partitions


    def build_pipelines(self):
        """Wrap every candidate model in a preparer -> selector -> model pipeline.

        Returns
        -------
        list
            ``(model_name, Pipeline)`` tuples, one per entry of
            ``set_models()``; the same list is stored in ``self.pipelines``.
            Every pipeline receives the same ``self.preparer`` and
            ``self.selector`` objects.
        """
        self.pipelines = [
            (
                candidate[0],
                Pipeline([
                    ('preparer', self.preparer),
                    ('selector', self.selector),
                    candidate,
                ]),
            )
            for candidate in self.set_models()
        ]
        return self.pipelines

    def evaluate_pipelines(self, ax=None):
        """Cross-validate every pipeline on the training split and build the reports.

        Each entry of ``self.pipelines`` is scored with ``cross_val_score``
        (accuracy, ``self.num_folds`` folds) on ``self.X_train`` /
        ``self.y_train``. Progress and the unrounded mean score are printed
        per model.

        Results
        -------
        ``self.raw_report``
            List of dicts with keys ``'name'``, ``'values'`` (per-fold
            scores), ``'mean'`` and ``'std'`` (both rounded to 3 decimals),
            sorted by descending mean.
        ``self.report``
            Set through ``sort_report`` from a DataFrame with columns
            ``Model``, ``Mean`` and ``STD``.

        Parameters
        ----------
        ax : object, optional
            Unused; kept so existing callers keep working.
        """
        scoring = 'accuracy'

        # No random_state here: it only has a meaning together with
        # shuffle=True, and recent scikit-learn releases raise ValueError when
        # it is set while shuffling is off. The folds are the same contiguous
        # blocks as before. The splitter is stateless, so one instance serves
        # every model.
        kfold = KFold(n_splits=self.num_folds)

        results = []
        report_rows = []
        for name, model in self.pipelines:
            print("Modeling...", name)

            cv_results = cross_val_score(model, self.X_train, self.y_train,
                                         cv=kfold, scoring=scoring)
            mean = cv_results.mean()
            std = cv_results.std()
            mean_rounded, std_rounded = round(mean, 3), round(std, 3)

            results.append({'name': name, 'values': cv_results,
                            'mean': mean_rounded, 'std': std_rounded})
            report_rows.append([name, mean_rounded, std_rounded])

            print("Score ", mean)
            print("---------------------")

        self.raw_report = sorted(results, key=lambda k: k['mean'], reverse=True)

        df_report = pd.DataFrame(report_rows, columns=["Model", "Mean", "STD"])
        self.sort_report(df_report)


    def sort_report(self, report):
        """Store ``report`` in ``self.report``, ordered by descending ``Mean``.

        Parameters
        ----------
        report : pandas.DataFrame
            Frame with a ``Mean`` column. It is not modified: the sorted
            result is a new frame, so the caller's object keeps its row order.
        """
        self.report = report.sort_values(by='Mean', ascending=False)

    def set_best_pipelines(self):
        """Keep the pipelines named in the first two rows of ``self.report``.

        The selection is stored in ``self.best_pipelines`` and follows the
        order of ``self.pipelines``, not the order of the report.
        """
        top_names = list(self.report.Model)[0:2]
        self.best_pipelines = [
            entry for entry in self.pipelines if entry[0] in top_names
        ]

    def plot_to_html(self, fig):
        """Render ``fig`` through ``_plot_html`` and return its first result.

        ``_plot_html`` is expected to return four values; only the first one
        (presumably an HTML ``<div>`` snippet, going by its original name
        ``plotly_html_div``) is returned.
        """
        # NOTE(review): `_plot_html` is not imported in the visible part of
        # this notebook -- confirm where it comes from before calling this.
        render_options = dict(
            figure_or_data=fig,
            config="",
            validate=True,
            default_width='90%',
            default_height="100%",
            global_requirejs=False,
        )
        html_div, _div_id, _width, _height = _plot_html(**render_options)
        return html_div

    def plot_models(self):
        """Draw one box plot per model from ``self.raw_report`` and render it to HTML.

        The figure holds a box trace per entry of ``self.raw_report``, a
        marker trace with the mean scores, and two arrow annotations pointing
        at the first ("Best model") and last ("Worst model") entries. The
        rendered HTML is stored in ``self.plot_html`` and returned.
        """
        # NOTE(review): `go` is not imported in the visible part of this
        # notebook -- presumably plotly's graph objects; confirm.
        results = self.raw_report

        # One evenly spaced hue per model.
        palette = ['hsl(' + str(hue) + ',50%,50%)'
                   for hue in np.linspace(0, 270, len(results))]

        traces = [
            go.Box(
                y=entry['values'],
                name=entry['name'],
                marker=dict(color=colour),
                boxmean='sd',
            )
            for entry, colour in zip(results, palette)
        ]
        traces.append(go.Scatter(
            x=[entry['name'] for entry in results],
            y=[entry['mean'] for entry in results],
            name='score',
            mode='markers',
            text=['Explanation'] * len(results),
        ))

        def arrow_note(entry, label):
            # Arrow annotation anchored at the model's mean score.
            return dict(
                x=entry['name'],
                y=entry['mean'],
                xref='x',
                yref='y',
                text=label,
                showarrow=True,
                arrowhead=7,
                ax=0,
                ay=-40,
            )

        layout = go.Layout(
            title='Hover over the bars to see the details',
            annotations=[
                arrow_note(results[0], 'Best model'),
                arrow_note(results[-1], 'Worst model'),
            ],
        )

        figure = go.Figure(data=traces, layout=layout)
        self.plot_html = self.plot_to_html(figure)
        return self.plot_html

    def save_plot(self, path):
        """Write ``self.plot_html`` to ``path`` as a text file.

        Parameters
        ----------
        path : str or path-like
            Destination file; it is created or overwritten.

        Raises
        ------
        TypeError
            If ``self.plot_html`` is not a string (for example still ``None``
            because ``plot_models`` has not been called). The check happens
            before the file is opened so an existing file is not truncated.
        """
        if not isinstance(self.plot_html, str):
            raise TypeError(
                "plot_html must be a str (call plot_models() first), got "
                + type(self.plot_html).__name__)

        with open(path, "w") as plot:
            plot.write(self.plot_html)

    def save_report(self, path):
        """Write ``self.report`` to ``path`` as CSV, leaving out the index column."""
        report_frame = self.report
        report_frame.to_csv(path, index=False)

    class CustomFeature(TransformerMixin):
        """Pass-through transformer: fitting does nothing and transform returns its input."""

        def fit(self, X, y=None, **fit_params):
            """Nothing is learned from ``X`` / ``y``; return the transformer itself."""
            return self

        def transform(self, X, **transform_params):
            """Return ``X`` unchanged; extra keyword arguments are ignored."""
            return X