In [14]:
import os
import time
import warnings
import numpy as np
import random as rnd
import pandas as pd
from collections import defaultdict

# Librería Genética
from deap import base, creator, tools, algorithms

from sklearn.utils import shuffle
# Subfunciones de estimadores
from sklearn.base import clone
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/base.py][30]
from sklearn.base import is_classifier
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/base.py][535]
from sklearn.model_selection._validation import _fit_and_score
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_validation.py][346]
from sklearn.model_selection._search import BaseSearchCV
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_search.py][386]
from sklearn.model_selection._search import check_cv
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_split.py][1866]
from sklearn.model_selection._search import _check_param_grid
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_search.py][343]
from sklearn.metrics.scorer import check_scoring
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/metrics/scorer.py][250]
from sklearn.utils.validation import _num_samples
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/validation.py][105]
from sklearn.utils.validation import indexable
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/validation.py][208]
from multiprocessing import Pool, Manager

# Selección para estimadores
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Metricas para estimadores
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# Estimadores
from sklearn.ensemble import ExtraTreesClassifier#
from sklearn.ensemble import RandomForestClassifier#
from sklearn.ensemble import AdaBoostClassifier#
from sklearn.ensemble import GradientBoostingClassifier#
from sklearn.tree import DecisionTreeClassifier#
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


"""
PYMACH
"""
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import FunctionTransformer

#Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

#Ensembles algorithms
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier



# The problem to optimize
def evaluate( frecuencias, individual, estimator, score_cache={}, error_cache={}, n_splits = 10, shuffle = False, scorer = "accuracy"):
	X,y = _createDataset(frecuencias, individual)
	score = 0
	paramkey = str(np.int32(individual)+1)
	if paramkey in score_cache:
		score = score_cache[paramkey]
		error = error_cache[paramkey]
	else:
		kfold = KFold(n_splits=n_splits, shuffle=shuffle)
		cv_results = cross_val_score(estimator, X, y, cv=kfold, scoring=scorer)
		score = cv_results.mean()
		error = cv_results.std()
		score_cache[paramkey] = score
		error_cache[paramkey] = error
	return score, error


def _createDataset(frecuencias, values, seed = 7):
    # crear dataset
    names_ = frecuencias[0].columns.values
    # reestructuracion
    salida_final = pd.DataFrame(columns=names_)
    for sec in range(1,16):
        dataset = pd.DataFrame(columns=names_)
        corte = min([frecuencias[i][frecuencias[i]['Sector']==sec].shape[0] for i in values])
        tx = 0
        dataset[names_[tx]] = dataset[names_[tx]].append(frecuencias[int(values[tx])][frecuencias[int(values[tx])]['Sector']==sec][:corte][names_[tx]])
        for tx in range(1,5):
            dataset[names_[tx]] = frecuencias[int(values[tx])][frecuencias[int(values[tx])]['Sector']==sec][:corte][names_[tx]]
        dataset[names_[tx+1]] = frecuencias[int(values[tx])][frecuencias[int(values[tx])]['Sector']==sec][:corte][names_[tx+1]]
        # join parts
        salida_final = salida_final.append(dataset)
    # shuffle dataset
    salida_final = shuffle(salida_final, random_state=seed).reset_index(drop=True)
    salida_final = salida_final.apply(pd.to_numeric)
    # dataframe to X,y 
    X = salida_final[names_[:-1]]
    y = salida_final[names_[-1]]
    return X,y

def set_models():
    rs = 1
    models = []
    # LDA : Warning(Variables are collinear)
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('SVC', SVC(random_state=rs)))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('MLPClassifier', MLPClassifier()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier(random_state=rs)))
    models.append(('LogisticRegression', LogisticRegression()))
    # Bagging and Boosting
    # models.append(('ExtraTreesClassifier', ExtraTreesClassifier(n_estimators=150)))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier(random_state=rs)))
    models.append(('AdaBoostClassifier', AdaBoostClassifier(DecisionTreeClassifier(random_state=rs),
                                                            random_state=rs)))
    # models.append(('AdaBoostClassifier', AdaBoostClassifier(DecisionTreeClassifier())))
    models.append(('RandomForestClassifier', RandomForestClassifier(random_state=rs)))
    models.append(('GradientBoostingClassifier',
                   GradientBoostingClassifier(random_state=rs)))
    # models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    # Voting
    estimators = []
    estimators.append(("Voting_GradientBoostingClassifier", GradientBoostingClassifier(random_state=rs)))
    estimators.append(("Voting_ExtraTreesClassifier", ExtraTreesClassifier(random_state=rs)))
    voting = VotingClassifier(estimators)
    models.append(('VotingClassifier', voting))
    return models

In [15]:
test_size = 0.2
num_folds = 10
seed = 7
frecuencias = []
names_ = ['Be01', 'Be02', 'Be03', 'Be04', 'Be05', 'Sector']
frecuencias.append(pd.read_csv('Filtrado/LocalizationNew_Tx1.csv', names=names_))
frecuencias.append(pd.read_csv('Filtrado/LocalizationNew_Tx2.csv', names=names_))
frecuencias.append(pd.read_csv('Filtrado/LocalizationNew_Tx3.csv', names=names_))
frecuencias.append(pd.read_csv('Filtrado/LocalizationNew_Tx4.csv', names=names_))
frecuencias.append(pd.read_csv('Filtrado/LocalizationNew_Tx5.csv', names=names_))
frecuencias.append(pd.read_csv('Filtrado/LocalizationNew_Tx6.csv', names=names_))
num_jobs=8
estimadores = set_models()
salida = {}

for name, model in estimadores:
    print("Modeling...", name, "\n")
    splits = 10
    simetricas = [[i]*5 for i in range(6)]
    for individual in simetricas:
        acc, err = evaluate( frecuencias, individual, model, n_splits = splits, shuffle = False, scorer = "accuracy")
        salida[str(name)+"-"+str(individual)] = str(acc) +"\t"+ str(err)
        print(str(name) + " " + str(individual) + "\t" + str(acc) + "\t" + str(err))


Modeling... LinearDiscriminantAnalysis 

LinearDiscriminantAnalysis [0, 0, 0, 0, 0]	0.638416766467	0.0187798305257




LinearDiscriminantAnalysis [1, 1, 1, 1, 1]	0.520026421949	0.0105850241566
LinearDiscriminantAnalysis [2, 2, 2, 2, 2]	0.586515323426	0.0181231830843




LinearDiscriminantAnalysis [3, 3, 3, 3, 3]	0.637316773242	0.0172482980064
LinearDiscriminantAnalysis [4, 4, 4, 4, 4]	0.568553958315	0.0196238575959




LinearDiscriminantAnalysis [5, 5, 5, 5, 5]	0.676727161251	0.0181642713004
Modeling... SVC 

SVC [0, 0, 0, 0, 0]	0.638416766467	0.0187798305257
SVC [1, 1, 1, 1, 1]	0.520026421949	0.0105850241566
SVC [2, 2, 2, 2, 2]	0.586515323426	0.0181231830843
SVC [3, 3, 3, 3, 3]	0.637316773242	0.0172482980064
SVC [4, 4, 4, 4, 4]	0.568553958315	0.0196238575959
SVC [5, 5, 5, 5, 5]	0.676727161251	0.0181642713004
Modeling... GaussianNB 

GaussianNB [0, 0, 0, 0, 0]	0.638416766467	0.0187798305257
GaussianNB [1, 1, 1, 1, 1]	0.520026421949	0.0105850241566
GaussianNB [2, 2, 2, 2, 2]	0.586515323426	0.0181231830843
GaussianNB [3, 3, 3, 3, 3]	0.637316773242	0.0172482980064
GaussianNB [4, 4, 4, 4, 4]	0.568553958315	0.0196238575959
GaussianNB [5, 5, 5, 5, 5]	0.676727161251	0.0181642713004
Modeling... MLPClassifier 

MLPClassifier [0, 0, 0, 0, 0]	0.638416766467	0.0187798305257
MLPClassifier [1, 1, 1, 1, 1]	0.520026421949	0.0105850241566
MLPClassifier [2, 2, 2, 2, 2]	0.586515323426	0.0181231830843
MLPClassifier [3, 

In [5]:
"""
    def build_pipelines(self):
        pipelines = []
        models = self.set_models()

        for m in models:
            pipelines.append((m[0],
                Pipeline([
                    ('preparer', self.preparer),
                    m,
                ])
            ))

        self.pipelines = pipelines

        return pipelines
"""


"\n    def build_pipelines(self):\n        pipelines = []\n        models = self.set_models()\n\n        for m in models:\n            pipelines.append((m[0],\n                Pipeline([\n                    ('preparer', self.preparer),\n                    m,\n                ])\n            ))\n\n        self.pipelines = pipelines\n\n        return pipelines\n"