In [1]:
import sys
sys.path.append("./lib/")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from lib.Methods import GeneralMethods
from lib.edasSearch import EdasHyperparameterSearch
from lib.Hiperparametros import HyperparameterSwitcher
from lib.ImportacionModelos import getClassifierNames
from lib.ImportacionModelos import getClassifierModels
from lib.ImportacionModelos import getRegressorNames
from lib.ImportacionModelos import getRegressorModels
from lib.graphicGenerator import GraphicBuilder
from sklearn.model_selection import train_test_split

__Distribución de los datos: 80% para entrenamiento y 20% para validación__

In [11]:
from sklearn.model_selection import KFold
# Split configuration reused by the cells below.
seed = 9      # random_state for the train/test split and the models
xSize = 1055  # number of leading columns of the CSV treated as features
kf = KFold(n_splits=10)

df = pd.read_csv("data/filtred.csv")

# The first `xSize` columns form X; every remaining column forms Y.
featureColumns = df.columns[:xSize]
targetColumns = df.columns[xSize:]
X = df[featureColumns]
Y = df[targetColumns]

# Hold out 20% of the rows as the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed
)

# Features and targets are re-joined column-wise before being handed to GraphicBuilder.
trainFrame = pd.concat([X_train, y_train], axis=1)
testFrame = pd.concat([X_test, y_test], axis=1)
gbTrain = GraphicBuilder(trainFrame)
gbTest = GraphicBuilder(testFrame)

In [13]:
# Preview the target columns only; the full frame is not needed to check the split.
Y.head(10)

Unnamed: 0,BUILDINGID,FLOOR,LATITUDE,LONGITUDE
0,1,2,4.864921e+06,-7541.264300
1,1,2,4.864934e+06,-7536.621200
2,1,2,4.864950e+06,-7519.152400
3,1,2,4.864934e+06,-7524.570400
4,1,2,4.864939e+06,-7533.896200
5,1,2,4.864950e+06,-7519.152400
6,1,2,4.864929e+06,-7527.451100
7,1,2,4.864888e+06,-7559.497300
8,1,2,4.864949e+06,-7510.437173
9,1,2,4.864959e+06,-7528.816402


## Algorithm Modeling

__RandomizedSearch: Probando con un modelo de Clasificación, los demás se probarán en Servidor__

In [6]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter search, run here on a single classifier only.
estimadorDictionary = getClassifierModels(includeEnsambled=True)
hypSwitcher = HyperparameterSwitcher()
process = 'randomized'  # names the result/ sub-folder the CSV is written to
n_iteraciones = 2       # passed to RandomizedSearchCV as n_iter
idModeloPrueba = 7      # index into getClassifierNames() of the model under test

result = {}
modelName = getClassifierNames(includeEnsambled=True)[idModeloPrueba]

estimador = estimadorDictionary[modelName]
parametros = hypSwitcher.getHyperparameters(modelName)(isDummy=False)
# random_state=seed makes the sampled candidates repeatable across runs;
# without it every execution may draw a different set of settings.
random_search = RandomizedSearchCV(estimador, param_distributions=parametros,
                                   n_iter=n_iteraciones, cv=kf, scoring="accuracy",
                                   return_train_score=False, n_jobs=-1,
                                   random_state=seed)
random_search.fit(X_train, y_train.FLOOR)
result[modelName] = random_search.cv_results_

# One row per candidate: cross-validation statistics, then the candidate's parameters.
cvResults = result[modelName]
df1 = pd.DataFrame({
    'Accuracy': cvResults['mean_test_score'],
    'stdAccuracy': cvResults['std_test_score'],
    'FitTime': cvResults['mean_fit_time'],
    'stdFitTime': cvResults['std_fit_time'],
    'ScoreTime': cvResults['mean_score_time'],
    'stdScoreTime': cvResults['std_score_time'],
})
df2 = pd.DataFrame(cvResults['params'])
# Best accuracy first; ties are broken by the shorter fit time.
dff = pd.concat([df1, df2], axis=1).sort_values(['Accuracy', 'FitTime'], ascending=[False, True])
dff.to_csv("result/" + process + "/" + modelName + ".csv", index=False)

__ExhaustiveSearch: Probando con un modelo de Clasificación, los demás se probarán en Servidor__

In [10]:
from sklearn.model_selection import GridSearchCV

# Exhaustive (grid) search over the same classifier selected by `idModeloPrueba`.
# `estimadorDictionary`, `hypSwitcher` and `idModeloPrueba` come from an earlier cell.
process = 'exhaustive'
n_iteraciones = 2  # not consumed by GridSearchCV in this cell; kept as in the original

result = {}
modelName = getClassifierNames(includeEnsambled=True)[idModeloPrueba]

estimador = estimadorDictionary[modelName]
parametros = hypSwitcher.getHyperparameters(modelName)(isDummy=False)
grid_search = GridSearchCV(
    estimador,
    param_grid=parametros,
    cv=kf,
    scoring="accuracy",
    return_train_score=False,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train.FLOOR)
result[modelName] = grid_search.cv_results_

# Output column label -> key in cv_results_ (order defines the CSV column order).
statColumns = {
    'Accuracy': 'mean_test_score',
    'stdAccuracy': 'std_test_score',
    'FitTime': 'mean_fit_time',
    'stdFitTime': 'std_fit_time',
    'ScoreTime': 'mean_score_time',
    'stdScoreTime': 'std_score_time',
}
cvResults = result[modelName]
df1 = pd.DataFrame({label: cvResults[key] for label, key in statColumns.items()})
df2 = pd.DataFrame(cvResults['params'])

# Highest accuracy first, then the fastest fit among equals.
combined = pd.concat([df1, df2], axis=1)
dff = combined.sort_values(['Accuracy', 'FitTime'], ascending=[False, True])
dff.to_csv("result/" + process + "/" + modelName + ".csv", index=False)

__EdasSearch: Probando con un modelo de Clasificación, los demás se probarán en Servidor__

In [27]:
# EDAS hyperparameter search on a single classifier.
# `hypSwitcher` is reused from an earlier cell; it is intentionally not rebuilt here.
idModeloPrueba = 7
process = 'edas'
estimadorDictionary = getClassifierModels(includeEnsambled=True)
modelName = getClassifierNames(includeEnsambled=True)[idModeloPrueba]
estimador = estimadorDictionary[modelName]
parametros = hypSwitcher.getHyperparameters(modelName)(isDummy=False)

# TODO (author's note): manage dropping duplicates in sample generation.
gm = GeneralMethods(estimador, X_train, y_train.FLOOR, seed=seed)

# Author's constraint: sample_size * select_ratio must be >= 1.
test = EdasHyperparameterSearch(
    gm,
    parametros,
    estimador,
    iterations=2,
    sample_size=2,
    select_ratio=0.5,
    debug=True,
)
test.run()

# Collected records, best accuracy first, with a fresh 0..n-1 index.
edasFrame = pd.DataFrame(list(test.resultados))
dff = edasFrame.sort_values(['Accuracy'], ascending=False).reset_index(drop=True)
dff.to_csv("result/" + process + "/" + modelName + ".csv", index=False)

indice	Accuracy class_weight criterion
0	0.962077         None   entropy
1	0.955255         None      gini
2	0.962077         None   entropy


__EasSearch: Probando con un modelo de Clasificación, los demás se probarán en Servidor__

In [7]:
from lib.easSearch import GeneticSearchCV
# Genetic search (GeneticSearchCV) on a single classifier.
# The duplicated `idModeloPrueba = 7` and the `result = {}` that was immediately
# overwritten by `gs2.fit(...)` have been removed; nothing else changes.
idModeloPrueba = 7
estimadorDictionary = getClassifierModels(includeEnsambled=True)
hypSwitcher = HyperparameterSwitcher()
process = 'eas'  # names the result/ sub-folder the CSV is written to

modelName = getClassifierNames(includeEnsambled=True)[idModeloPrueba]
estimador = estimadorDictionary[modelName]
parametros = hypSwitcher.getHyperparameters(modelName)(isDummy=False)

gs2 = GeneticSearchCV(estimador, parametros, cv=kf, n_jobs=4, verbose=1, scoring='accuracy',
                      refit=False, generations_number=2, population_size=3)
result = gs2.fit(X_train, y_train.FLOOR)

# Evaluated configurations cached by the search, best accuracy first.
dff = pd.DataFrame(list(gs2.result_cache)).sort_values(['Accuracy'], ascending=False).reset_index(drop=True)
dff.to_csv("result/" + process + "/" + modelName + ".csv", index=False)

Tipos: [1, 1], rangos: [1, 1]
--- Evolve in 4 possible combinations ---
gen	nevals	avg     	min     	max     	std       
0  	3     	0.959967	0.958082	0.962077	0.00163875
1  	2     	0.960746	0.958082	0.962077	0.0018833 
2  	2     	0.962077	0.962077	0.962077	1.11022e-16
Best individual is: {'criterion': 'entropy', 'class_weight': None}
with fitness: 0.9620774431468961


In [33]:
# Raw per-configuration records gathered by the EDAS search object `test`.
edasRecords = list(test.resultados)
edasRecords

[{'criterion': 'entropy',
  'class_weight': None,
  'Accuracy': 0.9620774431468961,
  'stdAccuracy': 0.005414661214317645,
  'Runtime': 2.373400092124939},
 {'criterion': 'gini',
  'class_weight': 'balanced',
  'Accuracy': 0.9597418561770128,
  'stdAccuracy': 0.0066779850547021204,
  'Runtime': 2.697361779212952},
 {'criterion': 'gini',
  'class_weight': None,
  'Accuracy': 0.9552550706822374,
  'stdAccuracy': 0.0030706812530539177,
  'Runtime': 2.5444828271865845},
 {'criterion': 'gini',
  'class_weight': None,
  'Accuracy': 0.9552550706822374,
  'stdAccuracy': 0.0030706812530539177,
  'Runtime': 2.5315757513046266}]

In [None]:
# Call GraphicBuilder.graphicMap3D on the test partition using the three position columns.
mapColumns = ["LATITUDE", "LONGITUDE", "FLOOR"]
gbTest.graphicMap3D(columns=mapColumns, filename="buildingsMap3dTest")

## Model Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn import metrics as scoreMetrics
import geopy.distance
from functools import reduce
#from sklearn.metrics import roc_auc_score # binary
#from sklearn.metrics import auc # binary


# 10-fold splitter plus one random forest per task; both forests share the same
# parallelism and seed settings.
kf = KFold(n_splits=10)
_forestOptions = dict(n_jobs=-1, random_state=seed)
modelClassifier = RandomForestClassifier(**_forestOptions)
modelRegressor = RandomForestRegressor(**_forestOptions)

def accert(y_true, y_pred):
    """Average, over classes, of confusion-matrix diagonal / column total.

    Returns the mean of ``cm[i, i] / cm[:, i].sum()`` across all classes.

    NOTE(review): in scikit-learn's confusion matrix the columns index the
    predicted labels, so this is per-class precision averaged over classes.
    If per-class recall (balanced accuracy) was intended, the row totals
    (axis=1) would be the denominator - confirm. A class that is never
    predicted has a zero column total and yields NaN.
    """
    matrix = confusion_matrix(y_true, y_pred)
    hits_per_class = matrix.diagonal()
    column_totals = matrix.sum(0)
    per_class_ratio = hits_per_class / column_totals
    return per_class_ratio.mean()

_meanLat = 39.9926853
_meanLon = -0.0673033
_minLongitude = -7705
_maxLongitude = -7290
_minLatitude = 4864735
_maxLatitude = 4865023
_maxLatitudeGPS = 39.993720
_maxLongitudeGPS = -0.069254
_minLatitudeGPS = 39.991626
_minLongitudeGPS = -0.065425

def longitudeToGPS(x):
    """Linearly rescale ``x`` from the dataset longitude range to the GPS range.

    Maps ``_minLongitude`` to ``_minLongitudeGPS`` and ``_maxLongitude`` to
    ``_maxLongitudeGPS``. Works element-wise on anything that supports
    arithmetic with scalars.
    """
    gps_span = _maxLongitudeGPS - _minLongitudeGPS
    offset = x - _minLongitude
    source_span = _maxLongitude - _minLongitude
    return gps_span * offset / source_span + _minLongitudeGPS

def latitudeToGPS(x):
    """Linearly rescale ``x`` from the dataset latitude range to the GPS range.

    Maps ``_minLatitude`` to ``_minLatitudeGPS`` and ``_maxLatitude`` to
    ``_maxLatitudeGPS``. Works element-wise on anything that supports
    arithmetic with scalars.
    """
    gps_span = _maxLatitudeGPS - _minLatitudeGPS
    offset = x - _minLatitude
    source_span = _maxLatitude - _minLatitude
    return gps_span * offset / source_span + _minLatitudeGPS

def latitudeListDistance(y_true, y_pred):
    """Per-sample distance in metres between true and predicted latitudes.

    Both inputs are converted with ``latitudeToGPS``; each pair is then
    measured between two points that share the longitude ``_meanLon``.

    Fix: geopy points are ``(latitude, longitude)``. The previous code passed
    ``(_meanLon, latitude)``, so the latitude error was measured as a
    longitude offset at a latitude of ``_meanLon`` degrees.

    Returns:
        list of float, one distance per (y_true, y_pred) pair.
    """
    # `vincenty` was removed in geopy 2.0; use `geodesic` when it exists and
    # fall back to `vincenty` only on older installations.
    measure = getattr(geopy.distance, "geodesic", None) or geopy.distance.vincenty
    return [
        measure((lat_true, _meanLon), (lat_pred, _meanLon)).m
        for lat_true, lat_pred in zip(latitudeToGPS(y_true), latitudeToGPS(y_pred))
    ]

def longitudeListDistance(y_true, y_pred):
    """Per-sample distance in metres between true and predicted longitudes.

    Both inputs are converted with ``longitudeToGPS``; each pair is then
    measured between two points that share the latitude ``_meanLat``.

    Fix: geopy points are ``(latitude, longitude)``. The previous code passed
    ``(longitude, _meanLat)``, so the longitude error was measured as a
    north-south arc near the equator instead of an east-west arc at
    ``_meanLat``, which overstates the distance.

    Returns:
        list of float, one distance per (y_true, y_pred) pair.
    """
    # `vincenty` was removed in geopy 2.0; use `geodesic` when it exists and
    # fall back to `vincenty` only on older installations.
    measure = getattr(geopy.distance, "geodesic", None) or geopy.distance.vincenty
    return [
        measure((_meanLat, lon_true), (_meanLat, lon_pred)).m
        for lon_true, lon_pred in zip(longitudeToGPS(y_true), longitudeToGPS(y_pred))
    ]
    
def distance2d(y_true, y_pred):
    """Mean per-sample distance between true and predicted coordinates.

    The axis is inferred from the sign of the true values: if any of them is
    positive the inputs are handled as latitudes (latitudeListDistance),
    otherwise as longitudes (longitudeListDistance).

    Returns the arithmetic mean of the per-sample distances.
    """
    any_positive = (y_true > 0).sum() > 0
    per_axis_distance = latitudeListDistance if any_positive else longitudeListDistance
    distances = per_axis_distance(y_true, y_pred)
    return reduce(lambda acc, d: acc + d, distances) / len(distances)

def mse(y_true, y_pred):
    """Mean squared error, delegated to ``scoreMetrics.mean_squared_error``."""
    squared_error = scoreMetrics.mean_squared_error(y_true, y_pred)
    return squared_error
    
def mae(y_true, y_pred):
    """Mean absolute error, delegated to ``scoreMetrics.mean_absolute_error``."""
    absolute_error = scoreMetrics.mean_absolute_error(y_true, y_pred)
    return absolute_error
    
# Scorers for the classification runs: the custom `accert` ratio plus plain accuracy.
# Other built-in scorer names (average_precision_weighted, precision, recall,
# balanced_accuracy, roc_auc) are left disabled.
scoring_acc = dict(
    nbaccuracy=make_scorer(accert),
    accuracy='accuracy',
)

# Scorers for the regression runs. `mae` and `mse` wrap the local helpers in place
# of the built-in 'mean_absolute_error' / 'mean_squared_error' scorer names.
# NOTE(review): make_scorer defaults to greater_is_better=True, so mae, mse and
# distance are reported as raw (positive) errors. If this dict is ever used for
# model selection those scorers would be maximised - confirm intended use.
scoring_reg = dict(
    mae=make_scorer(mae),
    mse=make_scorer(mse),
    distance=make_scorer(distance2d),
    r2='r2',
)

## TODO: Add and modify some metrics
## http://scikit-learn.org/stable/modules/model_evaluation.html
## https://www.icmla-conference.org/icmla10/CFP_Tutorial_files/jose.pdf