In [46]:
import pandas as pd
import numpy as np
from functools import partial
from scipy.optimize import fmin
from sklearn import metrics

import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn import ensemble
from sklearn import linear_model
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier



class OptimizeACC:
    """Find blending weights for model predictions that maximise accuracy.

    Every column of ``X`` holds the predictions of one model. The blended
    prediction is the row-wise weighted sum of those columns, and ``fit``
    searches (with ``scipy.optimize.fmin``) for the weights whose rounded
    blend gives the highest accuracy against ``y``.

    Parameters
    ----------
    random_state : int or None, default None
        Seed for the random starting weights drawn in ``fit``. With ``None``
        the weights are drawn from NumPy's global random state, exactly as
        before this parameter existed, so results then vary between runs.

    Attributes
    ----------
    coef_ : int or numpy.ndarray
        ``0`` until ``fit`` is called, afterwards a 1-D array holding one
        weight per column of ``X``.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state
        self.coef_ = 0  # placeholder until fit() stores the optimised weights

    def _acc(self, coef, X, y):
        """Return the negative accuracy of the blend weighted by ``coef``.

        :param coef: one weight per column of ``X``
        :param X: 2-D array of model predictions (rows = samples)
        :param y: true labels, one per row of ``X``
        :return: ``-accuracy``; negated because ``fmin`` minimises
        """
        # Weight every model's predictions by its coefficient ...
        x_coef = X * coef
        # ... and sum across models to obtain the blended prediction.
        predictions = np.sum(x_coef, axis=1)
        # The blend is rounded to the nearest integer to obtain class labels.
        acc_score = metrics.accuracy_score(y, predictions.round())
        return -1 * acc_score

    def fit(self, X, y):
        """Search for the weights that maximise accuracy on ``(X, y)``.

        :param X: 2-D array of model predictions, one column per model
        :param y: true labels, one per row of ``X``
        :return: ``self``, with the optimised weights stored in ``coef_``
        """
        loss_partial = partial(self._acc, X=X, y=y)
        # Use a private generator only when a seed was requested; otherwise
        # keep drawing from the global state so existing callers that rely on
        # np.random.seed(...) behave as before.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        # Dirichlet sample: non-negative starting weights that sum to 1.
        # ravel() hands fmin the 1-D start vector it works with internally.
        initial_coef = rng.dirichlet(np.ones(X.shape[1]), size=1).ravel()
        self.coef_ = fmin(loss_partial, initial_coef, disp=True)
        return self

    def predict(self, X):
        """Return the weighted sum of the columns of ``X`` using ``coef_``.

        The result is not rounded; callers round it to obtain class labels.
        """
        x_coef = X * self.coef_
        predictions = np.sum(x_coef, axis=1)
        return predictions

In [47]:
# Restore `mods`, `train_mae` and `y` from IPython's `%store` persistence.
# They must have been saved with `%store` in an earlier session/notebook;
# the cells below depend on them.
%store -r mods
%store -r train_mae
%store -r y

In [48]:
# Feature matrix for the blending experiment, taken as a copy of the restored
# `train_mae` (presumably a pandas DataFrame - TODO confirm).
X = train_mae.copy()

In [49]:
# Display the restored `mods` dictionary (GridSearchCV objects keyed by name,
# as the output below shows).
mods

{'gsRFC': GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
              estimator=RandomForestClassifier(), n_jobs=1,
              param_grid={'bootstrap': [False], 'criterion': ['gini'],
                          'max_depth': [None], 'max_features': [1, 3, 10],
                          'min_samples_leaf': [1, 3, 10],
                          'min_samples_split': [2, 3, 10],
                          'n_estimators': [100, 300]},
              scoring='accuracy', verbose=1),
 'gsGBC': GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
              estimator=GradientBoostingClassifier(), n_jobs=1,
              param_grid={'learning_rate': [0.1, 0.05, 0.01, 1],
                          'loss': ['deviance'], 'max_depth': [4, 8],
                          'max_features': [0.3, 0.1],
                          'min_samples_leaf': [100, 150],
                          'n_estimators': [100, 200, 300, 500, 1000]},
              s

In [50]:
#df = pd.read_csv("../input/train1/train_1.csv")
#y = df.pop("Survived")
#X = df

# X, y = make_classification(n_samples= 1000, n_features= 25)

In [51]:
#Make a binary dataset with 10k examples and 25 features

# X, y = make_classification(n_samples= 10000, n_features= 25)



#Models instatiation
# logreg = linear_model.LogisticRegression()
# rf = ensemble.RandomForestClassifier(bootstrap=False, max_features=3, min_samples_leaf=10,
# #                        min_samples_split=10)
# # xgbc = xgb.XGBClassifier()

# ##en este punto le puedo conectar los modelos ya entrenados con gridsearch

# models = {"logreg":logreg, "rf":rf, "xgbc":xgbc}

# Base learners for the blend, created with fixed hyper-parameters.
xtrees = ensemble.ExtraTreesClassifier(
    max_features=1,
    min_samples_split=10,
    n_estimators=300,
)
rf = ensemble.RandomForestClassifier(
    bootstrap=False,
    max_features=3,
    min_samples_leaf=3,
    min_samples_split=3,
)
gbc = ensemble.GradientBoostingClassifier(
    learning_rate=1,
    max_depth=4,
    max_features=0.1,
    min_samples_leaf=150,
    n_estimators=1000,
)

# The estimators already tuned with grid search could be plugged in here instead.
# Key order matters: fold_predict iterates this dict, so it fixes the column
# order of the prediction matrix.
models = dict(xtrees=xtrees, rf=rf, gbc=gbc)

# Stratified 50/50 split into the two folds used for training/validation.
x1, x2, y1, y2 = model_selection.train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=111,
)
    

def fold_predict(x_train, y_train, x_val, y_val, models, fold_label="Fold-2"):
    """Fit every model on one fold and score its predictions on the other.

    Parameters
    ----------
    x_train, y_train
        Features and labels of the fold the models are fitted on.
    x_val, y_val
        Features and labels of the fold the models are evaluated on.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and
        ``predict_proba``. The estimators are fitted in place. Iteration
        order of the dict defines the column order of the result.
    fold_label : str, default "Fold-2"
        Prefix used in the printed accuracy report. The default keeps the
        prefix this function has always printed; pass e.g. ``"Fold-1"`` when
        validating on the other fold.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_val_samples, n_models + 1)``: the rounded
        positive-class probability of every model, followed by one last
        column with the rounded average of those probabilities.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if not models:
        raise ValueError("models must contain at least one estimator")

    # Fit each model on the training fold and keep the probability it assigns
    # to class 1 for every validation row.
    models_pred = []
    for estimator in models.values():
        estimator.fit(x_train, y_train)
        models_pred.append(estimator.predict_proba(x_val)[:, 1])

    # One row per validation sample, one column per model.
    models_pred = np.array(models_pred).T

    # Plain average of the models, used as the reference blend.
    avg_pred = np.mean(models_pred, axis=1)
    print("models_pred shape", models_pred.shape)
    print("Average shape", avg_pred.shape)

    # Append the average as the last column and round everything to hard
    # labels (np.round rounds half to even, so exactly 0.5 becomes 0).
    y_val_pred = np.column_stack((models_pred, avg_pred)).round()

    # Accuracy report: one line per model, labelled with the model's own name
    # from the dict (not a hard-coded list), then the average blend.
    column_names = list(models) + ["Average Pred"]
    for column, name in enumerate(column_names):
        acc = metrics.accuracy_score(y_val, y_val_pred[:, column])
        print(f"{fold_label}: {name} ACC = {acc}")

    return y_val_pred

In [52]:
# Fit the models on fold 1 (x1, y1) and collect their rounded predictions for fold 2.
y2_pred = fold_predict(x_train = x1, y_train = y1, x_val = x2, y_val = y2 , models = models)

models_pred shape (446, 3)
Average shape (446,)
Fold-2: LR ACC = 0.8139013452914798
Fold-2: RF ACC = 0.804932735426009
Fold-2: XGB ACC = 0.7869955156950673
Fold-2: Average Pred ACC = 0.8004484304932735


In [53]:
# Swap the folds: fit on fold 2 (x2, y2) and predict fold 1.
# NOTE: the printed report is still prefixed "Fold-2" here, although the scores are for fold 1.
y1_pred = fold_predict(x_train = x2, y_train = y2, x_val = x1, y_val = y1 , models = models)

models_pred shape (445, 3)
Average shape (445,)
Fold-2: LR ACC = 0.8449438202247191
Fold-2: RF ACC = 0.8269662921348314
Fold-2: XGB ACC = 0.802247191011236
Fold-2: Average Pred ACC = 0.8314606741573034


In [54]:
def opt_coef(y1_pred, y2_pred):
    """Cross-fit the blending weights on two folds and return their average.

    Both arguments are prediction matrices whose last column (the plain
    average of the models) is dropped before optimising. The true labels are
    read from the module-level names ``y1`` and ``y2``.

    Weights optimised on fold 2 are scored on fold 1 (reported as "Fold 1"),
    then weights optimised on fold 1 are scored on fold 2 (reported as
    "Fold 2"). Progress is printed along the way.

    :param y1_pred: prediction matrix for fold 1 (models..., average)
    :param y2_pred: prediction matrix for fold 2 (models..., average)
    :return: element-wise mean of the two optimised weight vectors
    """

    def _fit_then_score(fit_pred, fit_true, eval_pred, eval_true):
        # Optimise the weights on one fold, then measure the accuracy of the
        # rounded weighted blend on the other fold.
        optimizer = OptimizeACC()
        optimizer.fit(fit_pred[:, :-1], fit_true)
        blended = optimizer.predict(eval_pred[:, :-1])
        return optimizer, metrics.accuracy_score(eval_true, blended.round())

    # Weights learned on fold 2, evaluated on fold 1.
    print("COEFFICIENTS  AND ACCURACY FOLD 1:")
    opt, acc_1 = _fit_then_score(y2_pred, y2, y1_pred, y1)
    coef_1 = np.array(opt.coef_)
    print(f"Optimized ACC, Fold 1 = {acc_1}")
    print(f"Coefficients = {opt.coef_}")

    print("")

    # Weights learned on fold 1, evaluated on fold 2.
    print("COEFFICIENTS  AND ACCURACY FOLD 2:")
    opt, acc_2 = _fit_then_score(y1_pred, y1, y2_pred, y2)
    coef_2 = np.array(opt.coef_)
    print(f"Optimized ACC, Fold 2 = {acc_2}")
    print(f"Coefficients = {opt.coef_}")

    # Final weights: mean of the two per-fold solutions.
    avg_coef = (coef_1 + coef_2) / 2
    print("")
    print("Average of coefficients:", avg_coef)

    return avg_coef
#
# TODO: final_predict = apply the weights here. Take the algorithms from the dictionary and weight each one's test-set prediction by its coefficient: model1 * avg_coef[0] + ...

In [55]:
# Optimise the blending weights on each fold's predictions and display their average.
opt_coef(y1_pred, y2_pred)

COEFFICIENTS  AND ACCURACY FOLD 1:
Optimization terminated successfully.
         Current function value: -0.807175
         Iterations: 9
         Function evaluations: 44
Optimized ACC, Fold 1 = 0.8359550561797753
Coefficients = [0.39178346 0.24905981 0.35915673]

COEFFICIENTS  AND ACCURACY FOLD 2:
Optimization terminated successfully.
         Current function value: -0.802247
         Iterations: 10
         Function evaluations: 49
Optimized ACC, Fold 2 = 0.7869955156950673
Coefficients = [0.14776237 0.14819389 0.70404375]

Average of coefficients: [0.26977291 0.19862685 0.53160024]


array([0.26977291, 0.19862685, 0.53160024])

In [10]:
####### INTEGRATED CLASS

In [11]:
import pandas as pd
import numpy as np
from functools import partial
from scipy.optimize import fmin
from sklearn import metrics

import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn import ensemble
from sklearn import linear_model
from sklearn import model_selection



class OptimizeACC:
    """Find blending weights for model predictions that maximise accuracy.

    Every column of ``X`` holds the predictions of one model. The blended
    prediction is the row-wise weighted sum of those columns, and ``fit``
    searches (with ``scipy.optimize.fmin``) for the weights whose rounded
    blend gives the highest accuracy against ``y``.

    Parameters
    ----------
    random_state : int or None, default None
        Seed for the random starting weights drawn in ``fit``. With ``None``
        the weights are drawn from NumPy's global random state, exactly as
        before this parameter existed, so results then vary between runs.

    Attributes
    ----------
    coef_ : int or numpy.ndarray
        ``0`` until ``fit`` is called, afterwards a 1-D array holding one
        weight per column of ``X``.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state
        self.coef_ = 0  # placeholder until fit() stores the optimised weights

    def _acc(self, coef, X, y):
        """Return the negative accuracy of the blend weighted by ``coef``.

        :param coef: one weight per column of ``X``
        :param X: 2-D array of model predictions (rows = samples)
        :param y: true labels, one per row of ``X``
        :return: ``-accuracy``; negated because ``fmin`` minimises
        """
        # Weight every model's predictions by its coefficient ...
        x_coef = X * coef
        # ... and sum across models to obtain the blended prediction.
        predictions = np.sum(x_coef, axis=1)
        # The blend is rounded to the nearest integer to obtain class labels.
        acc_score = metrics.accuracy_score(y, predictions.round())
        return -1 * acc_score

    def fit(self, X, y):
        """Search for the weights that maximise accuracy on ``(X, y)``.

        :param X: 2-D array of model predictions, one column per model
        :param y: true labels, one per row of ``X``
        :return: ``self``, with the optimised weights stored in ``coef_``
        """
        loss_partial = partial(self._acc, X=X, y=y)
        # Use a private generator only when a seed was requested; otherwise
        # keep drawing from the global state so existing callers that rely on
        # np.random.seed(...) behave as before.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        # Dirichlet sample: non-negative starting weights that sum to 1.
        # ravel() hands fmin the 1-D start vector it works with internally.
        initial_coef = rng.dirichlet(np.ones(X.shape[1]), size=1).ravel()
        self.coef_ = fmin(loss_partial, initial_coef, disp=True)
        return self

    def predict(self, X):
        """Return the weighted sum of the columns of ``X`` using ``coef_``.

        The result is not rounded; callers round it to obtain class labels.
        """
        x_coef = X * self.coef_
        predictions = np.sum(x_coef, axis=1)
        return predictions

    def run(self):
        """Placeholder with no behaviour yet; returns ``None``."""
        pass

In [25]:
# make a binary classification dataset with 10k samples
# and 25 features
# Synthetic binary-classification problem: 10,000 samples with 25 features.
# NOTE: this rebinds the names X and y.
X, y = make_classification(n_features=25, n_samples=10000)

# Halve the data into two class-stratified folds (enough for this example).
xfold1, xfold2, yfold1, yfold2 = model_selection.train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
)

In [44]:
### Code from Abhishek's book


class OptimizeAUC:
    """Find blending weights for model predictions that maximise ROC AUC.

    Every column of ``X`` holds the predictions of one model. The blended
    prediction is the row-wise weighted sum of those columns, and ``fit``
    searches (with ``scipy.optimize.fmin``) for the weights whose blend gives
    the highest AUC against ``y``.

    Parameters
    ----------
    random_state : int or None, default None
        Seed for the random starting weights drawn in ``fit``. With ``None``
        the weights are drawn from NumPy's global random state, so results
        then vary between runs.

    Attributes
    ----------
    coef_ : int or numpy.ndarray
        ``0`` until ``fit`` is called, afterwards a 1-D array holding one
        weight per column of ``X``.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state
        # Neutral placeholder, as in OptimizeACC. Drawing a random scalar here
        # would give an unfitted object a meaningless weight and advance the
        # global random state merely by constructing the object.
        self.coef_ = 0

    def _auc(self, coef, X, y):
        """
        This function calculates and returns the negative AUC.
        :param coef: coef list, of the same length as number of models
        :param X: predictions, in this case a 2d array
        :param y: targets, in our case binary 1d array
        :return: ``-AUC``; negated because ``fmin`` minimises
        """
        # multiply coefficients with every column of the array
        # with predictions.
        # this means: element 1 of coef is multiplied by column 1
        # of the prediction array, element 2 of coef is multiplied
        # by column 2 of the prediction array and so on!
        x_coef = X * coef
        # create predictions by taking row wise sum
        predictions = np.sum(x_coef, axis=1)
        # calculate auc score
        auc_score = metrics.roc_auc_score(y, predictions)
        # return negative auc
        return -1.0 * auc_score

    def fit(self, X, y):
        """Search for the weights that maximise AUC on ``(X, y)``.

        :param X: 2-D array of model predictions, one column per model
        :param y: binary targets, one per row of ``X``
        :return: ``self``, with the optimised weights stored in ``coef_``
        """
        loss_partial = partial(self._auc, X=X, y=y)
        # Use a private generator only when a seed was requested; otherwise
        # keep drawing from the global state as before.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        # dirichlet distribution: starting coefficients that sum to 1.
        # any distribution could be used to initialize the coefficients.
        # ravel() hands fmin the 1-D start vector it works with internally.
        initial_coef = rng.dirichlet(np.ones(X.shape[1]), size=1).ravel()
        # use scipy fmin to minimize the loss function, in our case -auc
        self.coef_ = fmin(loss_partial, initial_coef, disp=True)
        return self

    def predict(self, X):
        """Return the weighted sum of the columns of ``X`` using ``coef_``."""
        # this is similar to _auc function
        x_coef = X * self.coef_
        predictions = np.sum(x_coef, axis=1)
        return predictions

import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection

# Two-fold blending experiment: fit three models on one fold, predict the
# other, report per-model AUC, then learn blending weights with OptimizeAUC
# on each fold and average them.
#
# fit models on fold 1 and make predictions on fold 2
# we have 3 models:
# logistic regression, random forest and xgboost
# NOTE: `rf` rebinds the name used for the tuned random forest earlier in this file.
logreg = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier()
xgbc = xgb.XGBClassifier()
# fit all models on fold 1 data
logreg.fit(xfold1, yfold1)
rf.fit(xfold1, yfold1)
xgbc.fit(xfold1, yfold1)
# predict all models on fold 2
# take probability for class 1
pred_logreg = logreg.predict_proba(xfold2)[:, 1]
pred_rf = rf.predict_proba(xfold2)[:, 1]
pred_xgbc = xgbc.predict_proba(xfold2)[:, 1]

# unweighted mean of the three models, used as the baseline blend
avg_pred = (pred_logreg + pred_rf + pred_xgbc) / 3
# a 2d array of all predictions (columns: LR, RF, XGB, average)
fold2_preds = np.column_stack((
pred_logreg,
pred_rf,
pred_xgbc,
avg_pred
))
# calculate and store individual AUC values
aucs_fold2 = []
for i in range(fold2_preds.shape[1]):
    auc = metrics.roc_auc_score(yfold2, fold2_preds[:, i])
    aucs_fold2.append(auc)
# indices follow the column order of fold2_preds
print(f"Fold-2: LR AUC = {aucs_fold2[0]}")
print(f"Fold-2: RF AUC = {aucs_fold2[1]}")
print(f"Fold-2: XGB AUC = {aucs_fold2[2]}")
print(f"Fold-2: Average Pred AUC = {aucs_fold2[3]}")
# now we repeat the same for the other fold
# this is not the ideal way, if you ever have to repeat code,
# create a function!
# fit models on fold 2 and make predictions on fold 1
# (fresh estimators; the pred_* and avg_pred names are overwritten below)
logreg = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier()
xgbc = xgb.XGBClassifier()
logreg.fit(xfold2, yfold2)
rf.fit(xfold2, yfold2)
xgbc.fit(xfold2, yfold2)
pred_logreg = logreg.predict_proba(xfold1)[:, 1]
pred_rf = rf.predict_proba(xfold1)[:, 1]
pred_xgbc = xgbc.predict_proba(xfold1)[:, 1]
avg_pred = (pred_logreg + pred_rf + pred_xgbc) / 3
fold1_preds = np.column_stack((
pred_logreg,
pred_rf,
pred_xgbc,
avg_pred
))
aucs_fold1 = []
for i in range(fold1_preds.shape[1]):
    auc = metrics.roc_auc_score(yfold1, fold1_preds[:, i])
    aucs_fold1.append(auc)
print(f"Fold-1: LR AUC = {aucs_fold1[0]}")
print(f"Fold-1: RF AUC = {aucs_fold1[1]}")
print(f"Fold-1: XGB AUC = {aucs_fold1[2]}")
print(f"Fold-1: Average prediction AUC = {aucs_fold1[3]}")
# find optimal weights using the optimizer
# weights are learned on fold 1 and evaluated on fold 2
opt = OptimizeAUC()
# dont forget to remove the average column
opt.fit(fold1_preds[:, :-1], yfold1)
opt_preds_fold2 = opt.predict(fold2_preds[:, :-1])
auc = metrics.roc_auc_score(yfold2, opt_preds_fold2)
coef_1 = np.array(opt.coef_)
print(f"Optimized AUC, Fold 2 = {auc}")
print(f"Coefficients = {opt.coef_}")
# and the reverse: weights learned on fold 2, evaluated on fold 1
opt = OptimizeAUC()
opt.fit(fold2_preds[:, :-1], yfold2)
opt_preds_fold1 = opt.predict(fold1_preds[:, :-1])
auc = metrics.roc_auc_score(yfold1, opt_preds_fold1)
coef_2 = np.array(opt.coef_)
print(f"Optimized AUC, Fold 1 = {auc}")
print(f"Coefficients = {opt.coef_}")


# final weights: element-wise mean of the two per-fold solutions
# NOTE(review): AUC depends only on the ranking of the scores, so each fold's
# coefficients are determined only up to a positive scale factor; averaging
# the raw vectors lets the larger-scaled fold dominate. Consider normalising
# each vector before averaging - confirm whether that matters here.
avg_coef = (coef_1 + coef_2) / 2
print("")
print("Average of coefficients:", avg_coef)

Fold-2: LR AUC = 0.964611994337919
Fold-2: RF AUC = 0.9923053587688575
Fold-2: XGB AUC = 0.9912477585996413
Fold-2: Average Pred AUC = 0.9917532786805245
Fold-1: LR AUC = 0.961189575161328
Fold-1: RF AUC = 0.9939656761380328
Fold-1: XGB AUC = 0.9932695956925413
Fold-1: Average prediction AUC = 0.9926220752781282
Optimization terminated successfully.
         Current function value: -0.994231
         Iterations: 41
         Function evaluations: 93
Optimized AUC, Fold 2 = 0.9921812787490046
Coefficients = [-0.01191033  0.70127859  0.57698455]
Optimization terminated successfully.
         Current function value: -0.992574
         Iterations: 42
         Function evaluations: 80
Optimized AUC, Fold 1 = 0.9940579161970663
Coefficients = [-0.00420418  0.04571499  0.1488837 ]

Average of coefficients: [-0.00805725  0.37349679  0.36293412]


In [43]:
# Scratch check: draw a single sample from the standard normal distribution.
np.random.randn()

0.6667582589343684