In [14]:
import os
import time
import warnings
import numpy as np
import random as rnd
import pandas as pd
from collections import defaultdict

# Librería Genética
from deap import base, creator, tools, algorithms

from sklearn.utils import shuffle
# Subfunciones de estimadores
from sklearn.base import clone
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/base.py][30]
from sklearn.base import is_classifier
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/base.py][535]
from sklearn.model_selection._validation import _fit_and_score
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_validation.py][346]
from sklearn.model_selection._search import BaseSearchCV
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_search.py][386]
from sklearn.model_selection._search import check_cv
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_split.py][1866]
from sklearn.model_selection._search import _check_param_grid
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_search.py][343]
from sklearn.metrics.scorer import check_scoring
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/metrics/scorer.py][250]
from sklearn.utils.validation import _num_samples
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/validation.py][105]
from sklearn.utils.validation import indexable
# [https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/validation.py][208]
from multiprocessing import Pool, Manager, cpu_count

# Selección para estimadores
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Metricas para estimadores
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# Estimadores
from sklearn.ensemble import ExtraTreesClassifier#
from sklearn.ensemble import RandomForestClassifier#
from sklearn.ensemble import AdaBoostClassifier#
from sklearn.ensemble import GradientBoostingClassifier#
from sklearn.tree import DecisionTreeClassifier#
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

"""
PYMACH
"""
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import FunctionTransformer

#Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

#Ensembles algorithms
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
warnings.filterwarnings("ignore")

# find distance error al 0.2%
def distance_error(estimator, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 7)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    # coord pred
    x1 = np.int32((y_pred + 2) % 3)
    y1 = np.int32((y_pred - 1) / 3)
    # coord real
    x2 = np.int32((y_test + 2) % 3)
    y2 = np.int32((y_test - 1) / 3)
    # pasar variacion a distancias metros
    vx = np.abs(x1 - x2)
    vy = np.abs(x1 - x2)
    vx = vx*0.5 + (vx-1)*(vx>0)
    vy = vy*0.5 + (vy-1)*(vy>0)
    # pitagoras
    err_distance = np.mean(np.sqrt(vx*vx + vy*vy))
    return err_distance

# The problem to optimize
def evaluate( frecuencias, individual, estimator, score_cache={}, error_cache={}, 
             n_splits = 10, shuffle = False, scorer = "accuracy"):
    X, y = _createDataset(frecuencias, individual)
    metric_err = distance_error(estimator, X, y)
    score = 0
    paramkey = str(np.int32(individual)+1)
    if paramkey in score_cache:
        score = score_cache[paramkey]
        error = error_cache[paramkey]
    else:
        kfold = KFold(n_splits=n_splits, shuffle=shuffle)
        cv_results = cross_val_score(estimator, X, y, cv=kfold, scoring=scorer)
        score = cv_results.mean()
        error = cv_results.std()
        #score_cache[paramkey] = score
        #error_cache[paramkey] = error
    return score, error, metric_err


def _createDataset(frecuencias, values, seed = 7):
    # crear dataset
    names_ = frecuencias[0].columns.values
    # reestructuracion
    salida_final = pd.DataFrame(columns=names_)
    for sec in range(1,16):
        dataset = pd.DataFrame(columns=names_)
        corte = min([frecuencias[i][frecuencias[i]['Sector']==sec].shape[0] for i in values])
        tx = 0
        dataset[names_[tx]] = dataset[names_[tx]].append(frecuencias[int(values[tx])][frecuencias[int(values[tx])]['Sector']==sec][:corte][names_[tx]])
        for tx in range(1,5):
            dataset[names_[tx]] = frecuencias[int(values[tx])][frecuencias[int(values[tx])]['Sector']==sec][:corte][names_[tx]]
        dataset[names_[tx+1]] = frecuencias[int(values[tx])][frecuencias[int(values[tx])]['Sector']==sec][:corte][names_[tx+1]]
        # join parts
        salida_final = salida_final.append(dataset)
    # shuffle dataset
    salida_final = shuffle(salida_final, random_state=seed).reset_index(drop=True)
    salida_final = salida_final.apply(pd.to_numeric)
    # dataframe to X,y 
    X = salida_final[names_[:-1]]
    y = salida_final[names_[-1]]
    return X,y

def set_models():
    rs = 1
    models = []
    # LDA : Warning(Variables are collinear)
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('SVC', SVC(random_state=rs)))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('MLPClassifier', MLPClassifier()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier(random_state=rs)))
    models.append(('LogisticRegression', LogisticRegression()))
    # Bagging and Boosting
    # models.append(('ExtraTreesClassifier', ExtraTreesClassifier(n_estimators=150)))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier(random_state=rs)))
    models.append(('AdaBoostClassifier', AdaBoostClassifier(DecisionTreeClassifier(random_state=rs),
                                                            random_state=rs)))
    # models.append(('AdaBoostClassifier', AdaBoostClassifier(DecisionTreeClassifier())))
    models.append(('RandomForestClassifier', RandomForestClassifier(random_state=rs)))
    models.append(('GradientBoostingClassifier',
                   GradientBoostingClassifier(random_state=rs)))
    # models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    # Voting
    estimators = []
    estimators.append(("Voting_GradientBoostingClassifier", GradientBoostingClassifier(random_state=rs)))
    estimators.append(("Voting_ExtraTreesClassifier", ExtraTreesClassifier(random_state=rs)))
    voting = VotingClassifier(estimators)
    models.append(('VotingClassifier', voting))
    return models

In [15]:
test_size = 0.2
num_folds = 10
seed = 7
frecuencias = []
names_ = ['Be01', 'Be02', 'Be03', 'Be04', 'Be05', 'Sector']
frecuencias.append(pd.read_csv('sinFiltro/Tx_0x01'))#, names=names_))
frecuencias.append(pd.read_csv('sinFiltro/Tx_0x02'))#, names=names_))
frecuencias.append(pd.read_csv('sinFiltro/Tx_0x03'))#, names=names_))
frecuencias.append(pd.read_csv('sinFiltro/Tx_0x04'))#, names=names_))
frecuencias.append(pd.read_csv('sinFiltro/Tx_0x05'))#, names=names_))
frecuencias.append(pd.read_csv('sinFiltro/Tx_0x06'))#, names=names_))
num_jobs=4
estimadores = set_models()
salida = {}

In [16]:
"""
    def build_pipelines(self):
        pipelines = []
        models = self.set_models()

        for m in models:
            pipelines.append((m[0],
                Pipeline([
                    ('preparer', self.preparer),
                    m,
                ])
            ))

        self.pipelines = pipelines

        return pipelines
"""

for name, model in estimadores:
    print("\nModeling...", name)
    splits = 10
    simetricas = [[i]*5 for i in range(6)]
    for individual in simetricas:
        acc, desv, err = evaluate(frecuencias, individual, model)
        salida[str(name)+"-"+str(individual)] = str(acc) + "-"+ str(desv) + "-" + str(err)
        print(name," ", individual, "\t", acc, "\t", desv, "\t", err)



Modeling... LinearDiscriminantAnalysis
LinearDiscriminantAnalysis   [0, 0, 0, 0, 0] 	 0.638416766467 	 0.0187798305257 	 0.32423777479
LinearDiscriminantAnalysis   [1, 1, 1, 1, 1] 	 0.519638313341 	 0.0214421123982 	 0.443793684573
LinearDiscriminantAnalysis   [2, 2, 2, 2, 2] 	 0.578655533782 	 0.0199728943849 	 0.359756081656
LinearDiscriminantAnalysis   [3, 3, 3, 3, 3] 	 0.625618358477 	 0.0210842743655 	 0.358028749968
LinearDiscriminantAnalysis   [4, 4, 4, 4, 4] 	 0.543107562491 	 0.0243615675336 	 0.50400164191
LinearDiscriminantAnalysis   [5, 5, 5, 5, 5] 	 0.666508125923 	 0.0209476336107 	 0.426789450073

Modeling... SVC
SVC   [0, 0, 0, 0, 0] 	 0.722175648703 	 0.0168578958543 	 0.305871364889
SVC   [1, 1, 1, 1, 1] 	 0.687189749182 	 0.0209334522397 	 0.253211571168
SVC   [2, 2, 2, 2, 2] 	 0.748960126097 	 0.023039128306 	 0.240810358918
SVC   [3, 3, 3, 3, 3] 	 0.740944395143 	 0.0111034605247 	 0.289177067282
SVC   [4, 4, 4, 4, 4] 	 0.671918069228 	 0.0250041277738 	 0.2911173

In [18]:
df = pd.DataFrame.from_dict(salida, orient='index')
df.reset_index(level=0, inplace=True)
#display(df.sort_values([0],ascending=False).head(30))
# Guarda el dataframe en un archivo csv
#df.sort_values([0],ascending=False).to_csv('eas.csv', sep=',', index=False) 
#df = pd.DataFrame(df.index.str.split('-',1).tolist(), columns = ['index'])
s = df['index'].apply(lambda x: x.split('-'))
df['Modelo'] = s.apply(lambda x: x[0])
df['Configuracion'] = s.apply(lambda x: x[1])
t = df[0].apply(lambda x: x.split('-'))
df['Precision'] = t.apply(lambda x: x[0])
df['desvPrecision'] = t.apply(lambda x: x[1])
df['errorMetrico'] = t.apply(lambda x: x[2])
df = df.drop(['index', 0], axis=1)
df = df.sort_values(['Precision'],ascending=False)
df.to_csv('resultados_simetricos_sinFiltro.csv', sep=',', index=False) 
display(df)

Unnamed: 0,Modelo,Configuracion,Precision,desvPrecision,errorMetrico
65,GradientBoostingClassifier,"[5, 5, 5, 5, 5]",0.860644391408,0.0171459146416,0.173409520148
71,VotingClassifier,"[5, 5, 5, 5, 5]",0.846824639164,0.0189944969668,0.204555890272
47,ExtraTreesClassifier,"[5, 5, 5, 5, 5]",0.842061597909,0.0165258632528,0.209606652995
53,AdaBoostClassifier,"[5, 5, 5, 5, 5]",0.836588816911,0.021882318952,0.234860466608
59,RandomForestClassifier,"[5, 5, 5, 5, 5]",0.836121718377,0.022600658083,0.226442528737
63,GradientBoostingClassifier,"[3, 3, 3, 3, 3]",0.822946200347,0.0189784485903,0.167309588927
62,GradientBoostingClassifier,"[2, 2, 2, 2, 2]",0.819985089887,0.0249329851551,0.141567301909
60,GradientBoostingClassifier,"[0, 0, 0, 0, 0]",0.816516167665,0.0182266763725,0.1582336853
29,KNeighborsClassifier,"[5, 5, 5, 5, 5]",0.810627912263,0.0178548783715,0.227284322524
69,VotingClassifier,"[3, 3, 3, 3, 3]",0.800936355155,0.0197620375111,0.170063656235
