In [15]:
import pandas as pd
import numpy as np
from functools import partial
from scipy.optimize import fmin
from sklearn import metrics

import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn import ensemble
from sklearn import linear_model
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier




In [95]:
class OptimizeACC:
    """Class for optimizing Accuracy: computes the optimal coefficients/weights for
    ensemble model predictions"""

    def __init__(self):
        self.coef_ = 1 #initialize coefficients/weights

    def _acc(self, coef, X,y):
        """Calculates and return accuracy obtained
        from the weighted sum of predicions"""
        #Create predictions multiplying X tiems coefficients
        x_coef = X * coef #pondero cada prediccion por su coef

        #Create predictions by taking the sume of rows
        predictions = np.sum(x_coef, axis = 1)  #suma ponderada da prediccion final
        #Calculate accuracy score
        acc_score = metrics.accuracy_score(y, predictions.round())

        return -1 * acc_score #minimizaremos -acc para maximizar acc

    def fit(self, X,y): #usando loss partial obtenemos los coef que minimizan -acc
        loss_partial = partial(self._acc, X = X, y = y)
        initial_coef = np.random.dirichlet(np.ones(X.shape[1]), size = 1)
        self.coef_ = fmin(loss_partial, initial_coef, disp = True)
        return self.coef_

    def predict(self, X):
        x_coef = X * self.coef_
        predictions = np.sum(x_coef, axis =1)
        return predictions
        
    def fold_predict(self, x_train, y_train,x_val, y_val, models):
        models_pred = [] #saves the predictions of each model
        #Fit all models in fold1
        for model in models:
            models[model].fit(x_train,y_train)
        #Take a prediction of probability on fold 2 using the fitted models
            models_pred.append(models[model].predict_proba(x_val)[:,1])

        models_pred = np.array(models_pred).T

        #As reference, let´s take the average
        avg_pred = np.mean(models_pred, axis = 1)

        #Stack all predicions in a 2D array
        y_val_pred= np.column_stack((models_pred, avg_pred))

        #Aproximates the values of the prediction with threshold of 0.5
        y_val_pred = y_val_pred.round()

        #Hacemos un reporte de las accuracies
        acc_fold = []
        for i in range(y_val_pred.shape[1]):
            acc = metrics.accuracy_score(y_val, y_val_pred[:,i])
            acc_fold.append(acc)

        print(f"Fold: LR ACC = {acc_fold[0]}")
        print(f"Fold: RF ACC = {acc_fold[1]}")
        print(f"Fold: XGB ACC = {acc_fold[2]}")
        print(f"Fold: Average Pred ACC = {acc_fold[3]}")

        return y_val_pred #returns the array of predictions for all algorithms plus the avg prediction
    
    def opt_coef(self, y1_pred, y2_pred): ##Now we find the optimal weights/coefficients to adjust using the optimizer
        print("COEFFICIENTS  AND ACCURACY FOLD 1:")
        
        #Calculo los coefficientes para ajustarme a y2 usando y2_pred
        coef_1= self.fit(y2_pred[:, :-1], y2) #me da los coeficientes que mejor maximizan la accuracy de la predicción de y2 al valor real y2

        #Uso estos coeficientes pára predecir el valor de y1
        y1_opt_predict = self.predict(y1_pred[:,:-1])

        #XCalculo el accuracy de esa predicción

        acc_1 = metrics.accuracy_score(y1, y1_opt_predict.round())
        coef_1 = np.array(coef_1)

        #Reporte
        print(f"Optimized ACC, Fold 1 = {acc_1}")
        print(f"Coefficients = {coef_1}")

        print("")

        #Ahora en Fold_2
        print("COEFFICIENTS  AND ACCURACY FOLD 2:")
        #Calculo los coefficientes para ajustarme a y1 usando y1_pred
        coef_2 = self.fit(y1_pred[:, :-1], y1) #me da los coeficientes que mejor maximizan la accuracy de la predicción de y2 al valor real y2

        #Uso estos coeficientes pára predecir el valor de y1
        y2_opt_predict = self.predict(y2_pred[:,:-1])

        #Calculo el accuracy de esa predicción

        acc_2 = metrics.accuracy_score(y2, y2_opt_predict.round())
        coef_2 = np.array(coef_2)

        #Reporte
        print(f"Optimized ACC, Fold 2 = {acc_2}")
        print(f"Coefficients = {coef_2}")

        #Averaging the coefficients
        avg_coef = (coef_1 + coef_2) / 2
        print("")
        print("Average of coefficients:", avg_coef)

        return avg_coef
    
    def run(self, X, y, models):# recibe X,y, diccionario de modelos ya instanciados
        #Split into two folds for training/validation 
        x1, x2,y1, y2 = model_selection.train_test_split(X,y, test_size = 0.5, stratify = y, random_state = 42)
        y2_p = self.fold_predict(x_train = x1, y_train = y1, x_val = x2, y_val = y2 , models = models)
        y1_p = self.fold_predict(x_train = x2, y_train = y2, x_val = x1, y_val = y1 , models = models)
        coef = self.opt_coef(y1_p,y2_p)

In [101]:
##Testing zone

# from sklearn.datasets import load_iris
# iris = load_iris()
# y = iris.target
# X = iris.data
X, y = make_classification(n_samples= 10000, n_features= 25)


#Models
# logistic regression, random forest and xgboost
logreg = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier()
xgbc = xgb.XGBClassifier()

models = {"logreg":logreg, "rf":rf, "xgbc":xgbc}

#Split into two folds for training/validation 
x1, x2,y1, y2 = model_selection.train_test_split(X,y, test_size = 0.5, stratify = y, random_state = 42)


ot = OptimizeACC()
# ot.run(X,y, models)
# y2_p = ot.fold_predict(x_train = x1, y_train = y1, x_val = x2, y_val = y2 , models = models)
# y1_p = ot.fold_predict(x_train = x2, y_train = y2, x_val = x1, y_val = y1 , models = models)
# coef = ot.opt_coef(y1_p,y2_p)

print("")
print("WITH RUN METHOD:")
ot.run(X,y, models)


WITH RUN METHOD:
Fold: LR ACC = 0.8528
Fold: RF ACC = 0.8614
Fold: XGB ACC = 0.8508
Fold: Average Pred ACC = 0.8598
Fold: LR ACC = 0.847
Fold: RF ACC = 0.8594
Fold: XGB ACC = 0.8496
Fold: Average Pred ACC = 0.8564
COEFFICIENTS  AND ACCURACY FOLD 1:
Optimization terminated successfully.
         Current function value: -0.859600
         Iterations: 9
         Function evaluations: 44
Optimized ACC, Fold 1 = 0.8574
Coefficients = [0.2037062  0.40574453 0.39054926]

COEFFICIENTS  AND ACCURACY FOLD 2:
Optimization terminated successfully.
         Current function value: -0.857400
         Iterations: 9
         Function evaluations: 44
Optimized ACC, Fold 2 = 0.8596
Coefficients = [0.34378226 0.35208292 0.30413482]

Average of coefficients: [0.27374423 0.37891373 0.34734204]


# NEXT:Convertir ahora a un módulo importable desde cualquier otro código:
