Comecei a brincar com o hyperopt, comparei com o sklearn random search

# Definições iniciais

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, RandomizedSearchCV, train_test_split, StratifiedKFold,cross_val_score
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import joblib
import os

import matplotlib.pyplot as plt
%matplotlib inline


from tempfile import mkdtemp
from shutil import rmtree
from joblib import Memory


## Configurando random pra deixar reprodutível

In [2]:
import random
# Seed both the stdlib and the NumPy global RNGs so repeated runs draw the same numbers.
random.seed(42)
np.random.seed(42)
# Seed value passed as `random_state` to the splitters, estimators and searches below.
random_global = 42

# Custom transformers

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

class VIFFilterTransformer(BaseEstimator, TransformerMixin):
    """Iteratively drop the column with the largest variance inflation factor.

    On every iteration the VIF of each remaining column is taken from the
    diagonal of the pseudo-inverse of the correlation matrix. The column
    with the largest value is dropped, and the loop stops once no VIF
    exceeds ``max_vif``, a single column is left, or ``max_iter`` drops
    have been made.

    Parameters
    ----------
    max_vif : float
        Largest VIF a column may have and still be kept.
    max_iter : int or float
        Maximum number of columns to drop (unbounded by default).
    verbose : bool
        Print a log of the dropped columns while fitting.
    """

    def __init__(self, max_vif=3.0, max_iter = np.inf, verbose = True):
        self.max_vif = max_vif
        self.verbose = verbose
        self.max_iter = max_iter

    def fit(self, X, y=None):
        """Decide which columns of the DataFrame ``X`` to drop; returns ``self``."""
        assert isinstance(X, pd.DataFrame), "X should be a Pandas Dataframe."

        candidates = list(X.columns.values)
        self._dropped_vars = []
        self._vif_dropped_vars = []
        steps = 0

        if self.verbose:
            print(f"\nLog for VIFFilterTransformer {self.get_params()}")

        while len(candidates) > 1 and steps < self.max_iter:
            # VIFs of the columns still in play, in the same order as `candidates`.
            kept_values = X.drop(self._dropped_vars, axis=1).values
            inverse_corr = np.linalg.pinv(np.corrcoef(kept_values, rowvar=False))
            vifs = np.diag(inverse_corr).tolist()
            worst_vif = max(vifs)

            if worst_vif <= self.max_vif:
                break

            worst_var = candidates[vifs.index(worst_vif)]

            if self.verbose:
                print('Iteration %d, Dropped var: %s, vif = %f\n' % (steps, worst_var, worst_vif))

            candidates.remove(worst_var)
            self._dropped_vars.append(worst_var)
            self._vif_dropped_vars.append(worst_vif)
            steps += 1

        if self.verbose and steps == 0:
            print(f"There are no variables with VIF above {self.max_vif}")

        return self

    def transform(self, X, y=None):
        """Return a copy of the DataFrame ``X`` without the columns dropped in ``fit``."""
        assert hasattr(self, '_dropped_vars'), "This vif instance is not fitted yet. Call 'fit' with appropriate arguments before using this method."
        assert isinstance(X, pd.DataFrame), "X should be a Pandas Dataframe."

        duplicate = X.copy()
        return duplicate.drop(self._dropped_vars, axis=1)

# Extracted from https://github.com/jem1031/pandas-pipelines-custom-transformers/blob/master/code/custom_transformers.py
class DFImputer(BaseEstimator, TransformerMixin):
    """Imputer that takes and returns pandas DataFrames.

    Wraps ``SimpleImputer`` (the replacement for the ``Imputer`` class that
    was removed from ``sklearn.preprocessing``) and restores the index and
    column labels that the plain imputer discards.

    Parameters
    ----------
    strategy : str
        Imputation strategy forwarded to ``SimpleImputer``.

    Attributes
    ----------
    imp : SimpleImputer or None
        The fitted underlying imputer (``None`` before ``fit``).
    statistics_ : pandas.Series or None
        Fill value per column, indexed by column name (``None`` before ``fit``).
    """

    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.imp = None
        self.statistics_ = None

    def fit(self, X, y=None):
        """Learn the per-column fill values from the DataFrame ``X``; returns ``self``."""
        self.imp = SimpleImputer(strategy=self.strategy)
        self.imp.fit(X)
        self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values filled, as a DataFrame with the same labels."""
        # assumes X is a DataFrame
        Ximp = self.imp.transform(X)
        Xfilled = pd.DataFrame(Ximp, index=X.index, columns=X.columns)
        return Xfilled
		
# This class doesn't accept null values
class DFVarianceThreshold_rascunho(BaseEstimator, TransformerMixin):
    """VarianceThreshold wrapper that keeps pandas labels (draft version).

    Parameters
    ----------
    threshold : float
        Variance threshold forwarded to ``VarianceThreshold``.
    """

    def __init__(self, threshold=0.0):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Fit the underlying selector on ``X`` and record per-column variances."""
        self.selector = VarianceThreshold(threshold=self.threshold)
        self.selector.fit(X)
        variances = self.selector.variances_
        self.variances_ = pd.Series(variances, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return the selected columns of the DataFrame ``X`` as a DataFrame."""
        kept_names = X.columns[self.selector.get_support()].tolist()
        reduced = self.selector.transform(X)
        return pd.DataFrame(reduced, index = X.index, columns=kept_names)

class DFVarianceThreshold(BaseEstimator, TransformerMixin):
    """Drop DataFrame columns whose variance is not above a threshold.

    Unlike the sklearn selector, the variances come from ``DataFrame.var``
    and the output stays a labelled DataFrame.

    Parameters
    ----------
    threshold : float
        Columns are kept only when their variance is strictly greater.
    verbose : bool
        Print which columns were removed while fitting.
    """

    def __init__(self, threshold=0.0, verbose=True):
        self.threshold = threshold
        self.verbose = verbose

    def fit(self, X, y=None):
        """Compute column variances and remember which columns to keep."""
        self.variances_ = X.var()
        above = self.variances_ > self.threshold
        self.selected_cols_ = self.variances_[above].index.tolist()

        if not self.verbose:
            return self

        print(f"\nLog for DFVarianceThreshold {self.get_params()}")
        removed = set(X.columns) - set(self.selected_cols_)
        if removed:
            print(f"The following variables with variance less or equal the {self.threshold} threshold were removed:")
            print(self.variances_[self.variances_ <= self.threshold])
        else:
            print(f"There are no vars with variance lower than the threshold of {self.threshold}")

        return self

    def transform(self, X, y=None):
        """Return only the columns selected in ``fit``, as a DataFrame."""
        # assumes X is a DataFrame
        subset = X[self.selected_cols_]
        return pd.DataFrame(subset, index = X.index, columns = self.selected_cols_)
		
class PipelineCheckpoint(BaseEstimator, TransformerMixin):
    '''Checkpoint step: only reports the type and dimensions of the data at a given point.

    Parameters
    ----------
    print_on_fit : bool
        Print ``X.info()`` when ``fit`` is called.
    print_on_transform : bool
        Print ``X.info()`` when ``transform`` is called.
    '''

    def __init__(self, print_on_fit=True, print_on_transform=True):
        self.print_on_fit = print_on_fit
        self.print_on_transform = print_on_transform

    def fit(self, X, y=None):
        """Optionally log the DataFrame summary; learns nothing and returns ``self``."""
        if self.print_on_fit:
            print("\nLog for PipelineCheckpoint.fit")
            # DataFrame.info() writes its report itself and returns None, so it
            # must not be wrapped in print() (that printed a stray "None").
            X.info()
        return self

    def transform(self, X, y=None):
        """Optionally log the DataFrame summary and pass the data through."""
        if self.print_on_transform:
            print("\nLog for PipelineCheckpoint.transform")
            X.info()
        return pd.DataFrame(X, index=X.index, columns=X.columns)

class DFCorrelationFilter(BaseEstimator, TransformerMixin):
    '''Removes pairwise correlated features. By default removes perfectly correlated features, i.e. with absolute correlation equal to 1.0

        Idea extracted from https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/

        Parameters
        ----------
        threshold : float
            A column is dropped when its absolute correlation with any earlier
            column is greater than or equal to this value.
        verbose : bool
            Print the list of dropped columns while fitting.
    '''

    def __init__(self, threshold=1.0, verbose=True):
        self.threshold = threshold
        self.verbose = verbose

    def fit(self, X, y=None):
        """Find the columns of the DataFrame ``X`` to drop; returns ``self``."""
        # Absolute pairwise correlations
        corr_matrix = X.corr().abs()

        # Keep only the strict upper triangle so each pair is looked at once and
        # only the later column of a correlated pair gets dropped.
        # The builtin `bool` is used because the `np.bool` alias no longer exists.
        upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        upper = corr_matrix.where(upper_mask)

        # Columns correlated at or above the threshold with an earlier column
        self.to_drop_ = [column for column in upper.columns if (upper[column] >= self.threshold).any()]

        if self.verbose:
            print(f"\nLog for DFCorrelationFilter {self.get_params()}")
            print(f"The following variables have correlation greater or equal than then {self.threshold} threshold and will be dropped")
            print(self.to_drop_)

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the columns selected in ``fit``."""
        return X.copy().drop(self.to_drop_, axis=1)

		
		
		
	




# Carrega dados


In [15]:
# Peek at the raw values as an object-dtype array (the columns mix numbers and strings).
df_train_val.values.astype('O')

array([[1, 0, 3, ..., 7.25, nan, 'S'],
       [2, 1, 1, ..., 71.2833, 'C85', 'C'],
       [3, 1, 3, ..., 7.925, nan, 'S'],
       ...,
       [889, 0, 3, ..., 23.45, nan, 'S'],
       [890, 1, 1, ..., 30.0, 'C148', 'C'],
       [891, 0, 3, ..., 7.75, nan, 'Q']], dtype=object)

In [19]:
from category_encoders.hashing import HashingEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.binary import BinaryEncoder

In [30]:
# Quick look at the columns produced by the category_encoders OneHotEncoder
# (imported just above) when applied to the target cast to object.
OneHotEncoder().fit_transform(df_train_val[['survived']].astype('O'))

Unnamed: 0,survived_1,survived_2
0,1,0
1,0,1
2,0,1
3,0,1
4,1,0
5,1,0
6,1,0
7,1,0
8,0,1
9,0,1


In [24]:
# Show the embarkation port next to the target for visual inspection.
df_train_val[['embarked', 'survived']]

Unnamed: 0,embarked,survived
0,S,0
1,C,1
2,S,1
3,S,1
4,S,0
5,Q,0
6,S,0
7,S,0
8,S,1
9,C,1


In [4]:
# Load the labelled data (later split into train/validation) and the unlabelled test set.
df_train_val = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")
# Lower-case the column names so the rest of the notebook can use a single spelling.
df_train_val.columns = [x.lower() for x in df_train_val.columns]
df_test.columns = [x.lower() for x in df_test.columns]

print(f'Print shape df_train_val: {df_train_val.shape}')
print(f'Print shape df_test: {df_test.shape}')


Print shape df_train_val: (891, 12)
Print shape df_test: (418, 11)


# Criação de algumas features novas

In [5]:
class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects a fixed set of DataFrame columns.

    Inherits from ``BaseEstimator`` so that ``get_params``/``set_params`` exist
    and the step can be cloned inside cross-validation and parameter searches.

    Parameters
    ----------
    cols : list or str
        Column label(s) used to index ``X`` in ``transform``.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        # stateless transformer
        return self

    def transform(self, X):
        """Return ``X[self.cols]``."""
        # assumes X is a DataFrame
        Xcols = X[self.cols]
        return Xcols
    
class CreateFamilySize(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds up the ``parch`` and ``sibsp`` columns."""

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a one-column DataFrame with the row-wise sum of ``parch`` and ``sibsp``.

        Missing values are skipped by the sum, and any remaining NaN becomes 0.
        """
        relatives = X[['parch', 'sibsp']]
        totals = relatives.sum(axis=1, skipna=True)
        return pd.DataFrame(totals.fillna(0))
    
# CreateFamilySize().fit_transform(df_train[['parch', 'sibsp']])

# Isso aqui faria sentido se fosse uma variável que fosse criada usando dados do treino, mas aqui não faz sentido

## Tamanho de família

In [6]:
# Add `family_size` (the passenger plus parents/children plus siblings/spouses)
# in place to both datasets, then show how it is distributed.
full_data = [df_train_val, df_test]
for dataset in full_data:
    dataset['family_size'] = 1 + dataset['parch'] + dataset['sibsp']
    if 'survived' in dataset.columns:
        # Labelled data: group size and survival rate per family size.
        display(dataset.fillna(-999).groupby('family_size')['survived'].agg(['size', 'mean']).reset_index())
    else:
        # Unlabelled data: only the group sizes.
        display(dataset.fillna(-999).groupby('family_size').size())

Unnamed: 0,family_size,size,mean
0,1,537,0.303538
1,2,161,0.552795
2,3,102,0.578431
3,4,29,0.724138
4,5,15,0.2
5,6,22,0.136364
6,7,12,0.333333
7,8,6,0.0
8,11,7,0.0


family_size
1     253
2      74
3      57
4      14
5       7
6       3
7       4
8       2
11      4
dtype: int64

Aparentemente a taxa de sobrevivência cai para famílias com mais de 4 pessoas; talvez seja bom agrupar esses tamanhos em uma única categoria depois.

## Extrai o título

In [7]:
# Add `title` (the first capitalised word followed by a dot in the name, e.g.
# "Mr", "Miss") in place to both datasets, then show how it is distributed.
full_data = [df_train_val, df_test]
for dataset in full_data:
    # Raw string avoids the invalid "\." escape; str.extract returns the first
    # match and yields NaN (instead of raising IndexError) when a name has no title.
    dataset['title'] = dataset['name'].str.extract(r'([A-Z][a-z]+)\.', expand=False)
    if 'survived' in dataset.columns:
        # Labelled data: group size and survival rate per title, best first.
        display(dataset.fillna(-999).groupby('title')['survived'].agg(['size', 'mean']).reset_index().sort_values(by='mean', ascending=False))
    else:
        # Unlabelled data: only the group sizes.
        display(dataset.fillna(-999).groupby('title').size())
        

Unnamed: 0,title,size,mean
16,Sir,1,1.0
2,Countess,1,1.0
14,Ms,1,1.0
11,Mme,1,1.0
6,Lady,1,1.0
10,Mlle,2,1.0
13,Mrs,125,0.792
9,Miss,182,0.697802
8,Master,40,0.575
1,Col,2,0.5


title
Col         2
Dona        1
Dr          1
Master     21
Miss       78
Mr        240
Mrs        72
Ms          1
Rev         2
dtype: int64

## Acha aspas ou parênteses no nome

In [8]:
# Add 0/1 flags telling whether the name contains a double quote (`name_aspas`)
# or an opening parenthesis (`name_parenteses`), then show their distributions.
full_data = [df_train_val, df_test]
for dataset in full_data:
    # Literal (non-regex) substring tests: no escaping needed, which removes the
    # invalid "\(" escape sequence of the previous pattern.
    dataset['name_aspas'] = dataset['name'].str.contains('"', regex=False).astype('int')
    dataset['name_parenteses'] = dataset['name'].str.contains('(', regex=False).astype('int')
    if 'survived' in dataset.columns:
        # Labelled data: group size and survival rate per flag value.
        display(dataset.fillna(-999).groupby('name_aspas')['survived'].agg(['size', 'mean']).reset_index().sort_values(by='mean', ascending=False))
        display(dataset.fillna(-999).groupby('name_parenteses')['survived'].agg(['size', 'mean']).reset_index().sort_values(by='mean', ascending=False))
    else:
        # Unlabelled data: only the group sizes.
        display(dataset.fillna(-999).groupby('name_aspas').size())
        display(dataset.fillna(-999).groupby('name_parenteses').size())
        

Unnamed: 0,name_aspas,size,mean
1,1,53,0.716981
0,0,838,0.362768


Unnamed: 0,name_parenteses,size,mean
1,1,143,0.769231
0,0,748,0.31016


name_aspas
0    396
1     22
dtype: int64

name_parenteses
0    340
1     78
dtype: int64

# Separa as amostras para treino e validação

In [13]:
# Hold out 100 rows, stratified on the target, as a validation set; the rest is
# used for training and cross-validation.
df_train, df_val = train_test_split(df_train_val, test_size = 100, shuffle=True, stratify=df_train_val['survived'], random_state=random_global)
print(f'Print shape df_train: {df_train.shape}')
print(f'Print shape df_val: {df_val.shape}')

Print shape df_train: (791, 16)
Print shape df_val: (100, 16)


# Definição do pipeline básico

In [9]:
# Columns routed to the numeric branch of the preprocessing step.
num_cols = ['age', 'sibsp', 'parch', 'fare', 'family_size']
# Columns routed to the categorical branch; the two 0/1 name flags are
# treated as categories too.
cat_cols = ['sex', 'embarked', 'title', 'name_aspas', 'name_parenteses']

In [10]:
# Bind scikit-learn's encoder explicitly: the name `OneHotEncoder` is also
# imported from category_encoders elsewhere in this notebook, and whichever
# import ran last would otherwise be picked up here.
# NOTE(review): `sparse=` matches the scikit-learn version this notebook was
# written for; newer releases rename it to `sparse_output` -- confirm.
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder

# Categorical branch: fill missing values with a sentinel category, then one-hot
# encode, ignoring categories unseen during fit.
cat_pipeline = make_pipeline(SimpleImputer(strategy='constant', fill_value='CAT_MISSING'), SklearnOneHotEncoder(handle_unknown='ignore', sparse=False))
# Numeric branch: fill missing values with the column median.
num_pipeline = make_pipeline(SimpleImputer(strategy='median'))
# Full model: column-wise preprocessing followed by a random forest ('clf').
full_pipe = Pipeline([('preprocessing', 
                        make_column_transformer(
                            (cat_pipeline, cat_cols)
                            ,(num_pipeline, num_cols)))
                    ,('clf', RandomForestClassifier(random_state=random_global, n_estimators=100))])

In [11]:
# Shared 5-fold stratified splitter, seeded so every experiment sees the same folds.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_global)

In [14]:
# Baseline: cross-validated ROC-AUC of the untuned pipeline on the training split.
# The whole frame is passed as X; the column transformer picks the feature columns.
crossval = cross_validate(full_pipe, df_train, df_train['survived'], return_train_score=True, cv=kf, scoring='roc_auc')

print(f"score médio nas partições de treino : {crossval['train_score'].mean()}")
print(f"score médio nas partições de validação : {crossval['test_score'].mean()}")
      
# On the held-out split (df_val)
full_pipe.fit(df_train, df_train['survived'])
print(f"score no teste: {roc_auc_score(df_val['survived'], full_pipe.predict_proba(df_val)[:,1])}")

score médio nas partições de treino : 0.9982494643370107
score médio nas partições de validação : 0.8527953182286943
score no teste: 0.8367996604414261


Esse é o score antigo sem criar nenhuma feature

score médio nas partições de treino : 0.9979837284291866

score médio nas partições de validação : 0.8600773224618562

score no teste: 0.8247028862478777

## Como usar opção memory no Pipeline?

In [95]:
# Teste do memory
class sleep_on_fit(BaseEstimator, TransformerMixin):
    """Dummy transformer whose ``fit`` takes two seconds.

    Used to check whether a Pipeline's ``memory`` cache avoids refitting a step.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Sleep for two seconds to simulate an expensive fit; returns ``self``."""
        # Imported here because the notebook's import cell does not bring in `time`.
        import time

        time.sleep(2)
        return self

    def transform(self, X, y=None):
        """Pass ``X`` through unchanged so later pipeline steps still receive data."""
        return X

In [96]:
# Build a pipeline whose fitted transformers would be cached by joblib in a
# temporary directory created under the current folder.
cachedir = mkdtemp(dir='./')
memory = Memory(location=cachedir, verbose=10)
pipe_teste = make_pipeline(sleep_on_fit(), memory=memory)
# The cache directory is removed right away; the pipeline is never fitted in this cell.
rmtree(cachedir)

# Testando diferentes estratégias para tunar os hiperparâmetros

## Random grid search

In [168]:
# Discrete search space for the random search. Keys use the `clf__` prefix so
# they address the 'clf' (random forest) step of `full_pipe`.
# Note that min_weight_fraction_leaf only takes the values 0.1 .. 0.5 here (0 is
# never sampled). The commented-out entries are hyperparameters left at their defaults.
# NOTE(review): the 'auto' option of max_features may be rejected by newer
# scikit-learn releases -- confirm against the installed version.
random_gs_param_grid = {'clf__bootstrap': [False, True]
,'clf__class_weight': ['balanced', 'balanced_subsample', None]
,'clf__criterion': ['gini', 'entropy']
,'clf__max_depth': list(range(1,11))
,'clf__max_features': ['auto','sqrt', None]
,'clf__max_leaf_nodes': [x for x in range(2,2**10 +1)]
# ,'clf__min_impurity_decrease': [None]
# ,'clf__min_impurity_split': [None]
# ,'clf__min_samples_leaf': [None]
,'clf__min_samples_split': [2 ** x for x in range(1,11)]
,'clf__min_weight_fraction_leaf': [x * 0.1 for x in range(1,6)]
,'clf__n_estimators': [x * 100 for x in range(1,11)]
,'clf__n_jobs': [-1]
# ,'clf__oob_score': [False]
# ,'clf__random_state': [None]
# ,'clf__warm_start': [False]
}


In [169]:
# Random search: 100 sampled configurations, each scored by ROC-AUC on the shared folds.
random_gs = RandomizedSearchCV(param_distributions=random_gs_param_grid, estimator=full_pipe, n_iter=100, cv = kf, random_state=1, scoring='roc_auc')
random_gs.fit(df_train, df_train['survived'])



RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
                   error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('preprocessing',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('pipeline-1',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('simpleimputer',
                                                                                                SimpleImputer(

In [220]:
# Put the best parameters found by the random search into the pipeline
full_pipe.set_params(**random_gs.best_params_)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='CAT_MISSING',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                    

In [221]:
# Cross-validation on the training split
crossval = cross_validate(full_pipe, df_train, df_train['survived'], return_train_score=True, cv=kf, scoring='roc_auc')

print(f"score médio nas partições de treino : {crossval['train_score'].mean()}")
print(f"score médio nas partições de validação : {crossval['test_score'].mean()}")
      
# Refit on the whole training split, then score it and the held-out df_val.
full_pipe.fit(df_train, df_train['survived'])

print('AUC treino: %f' % roc_auc_score(df_train['survived'], full_pipe.predict_proba(df_train)[:,1]))
print('AUC validação: %f' % roc_auc_score(df_val['survived'], full_pipe.predict_proba(df_val)[:,1]))

score médio nas partições de treino : 0.85921131797813
score médio nas partições de validação : 0.8461187354779668
AUC treino: 0.854030
AUC validação: 0.806452


## Random Grid search na mão pra usar tqdm progress bar

In [302]:
def GridSearch_tqdm(estimator, params_grid, n_iter):
    """Hand-rolled random search with a tqdm progress bar.

    Samples ``n_iter`` configurations from ``params_grid``, applies each one to
    a fresh clone of ``estimator`` and scores it by mean cross-validated
    ROC-AUC on ``df_train`` using the shared ``kf`` folds.

    Parameters
    ----------
    estimator : estimator or None
        Model to tune. ``None`` falls back to the notebook's ``full_pipe``.
    params_grid : dict
        Parameter name -> list of candidate values, as taken by ``ParameterSampler``.
    n_iter : int
        Number of configurations to sample.

    Returns
    -------
    dict
        ``{'best_params': ..., 'best_score': ...}`` for the best configuration.
    """
    from sklearn.base import clone

    base_estimator = full_pipe if estimator is None else estimator
    best_score = 0
    best_params = {}
    for param in tqdm(ParameterSampler(params_grid, n_iter=n_iter)):
        # The sampled parameters must actually be applied; a clone is used so
        # the caller's estimator is left untouched.
        candidate = clone(base_estimator).set_params(**param)

        # Only the validation scores are used, so train scores are not computed.
        crossval = cross_validate(candidate, df_train, df_train['survived'], cv=kf, scoring='roc_auc')

        score_iter = crossval['test_score'].mean()
        if score_iter > best_score:
            best_score = score_iter
            best_params = param
            print(best_score, best_params)

    print(best_score, best_params)

    return {'best_params':best_params, 'best_score':best_score}


best_params = GridSearch_tqdm(estimator=full_pipe, params_grid=random_gs_param_grid, n_iter=100)

NameError: name 'random_gs_param_grid' is not defined

## Random grid search com oob score

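Pra esse aqui não vai ter cross validation, vou usar o OOB score da Random Forest (que, no RandomForestClassifier, é acurácia e não AUC, então não é diretamente comparável com os scores acima). Necessariamente vamos precisar de bootstrap na random forest.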

Nesse aqui o preprocessing foi feito em tudo, ao contrário dos outros com cross-validation, em que o pipeline garante que as transformações são fitadas nas partições de treino e aplicadas nas de validação.

In [182]:
# Search space for the OOB-based search. The keys are plain RandomForestClassifier
# arguments (no `clf__` prefix) because the forest is built directly, outside the
# pipeline. bootstrap and oob_score are fixed to True so an OOB score is available.
grid_forest_oob = {'bootstrap': [True]
,'class_weight': ['balanced', 'balanced_subsample', None]
,'criterion': ['gini', 'entropy']
,'max_depth': list(range(1,11))
,'max_features': ['auto','sqrt', None]
,'max_leaf_nodes': [x for x in range(2,2**10 +1)]
# ,'min_impurity_decrease': [None]
# ,'min_impurity_split': [None]
# ,'min_samples_leaf': [None]
,'min_samples_split': [2 ** x for x in range(1,11)]
,'min_weight_fraction_leaf': [x * 0.1 for x in range(1,6)]
,'n_estimators': [x * 100 for x in range(1,11)]
,'n_jobs': [-1]
,'oob_score': [True]
# ,'random_state': [None]
# ,'warm_start': [False]
}

def GridSearch_oob(params_grid, n_iter):
    """Random search over random-forest settings, ranked by out-of-bag score.

    The preprocessing is fitted once on the whole of ``df_train``; each sampled
    configuration then trains a single forest and is scored by its
    ``oob_score_`` (no cross-validation).

    Parameters
    ----------
    params_grid : dict
        RandomForestClassifier argument name -> list of candidate values.
    n_iter : int
        Number of configurations to sample.

    Returns
    -------
    dict
        ``{'best_params': ..., 'best_score': ...}`` where ``best_params`` is the
        sampled configuration with the highest OOB score.
    """
    preprocessing = make_column_transformer((cat_pipeline, cat_cols), (num_pipeline, num_cols))
    X = preprocessing.fit_transform(df_train)
    y = df_train['survived']

    best_score = 0
    best_params = {}
    for param in tqdm(ParameterSampler(params_grid, n_iter=n_iter, random_state=random_global)):
        # Seed the forest so the ranking is reproducible (a grid entry may still
        # override it); OOB scoring needs bootstrap sampling, so both are forced.
        forest_params = {'random_state': random_global, **param, 'bootstrap': True, 'oob_score': True}
        clf = RandomForestClassifier(**forest_params)
        clf.fit(X, y)

        score_iter = clf.oob_score_
        if score_iter > best_score:
            best_score = score_iter
            best_params = param

    return {'best_params':best_params, 'best_score':best_score}
GridSearch_oob(grid_forest_oob, n_iter = 2)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.17it/s]


{'best_params': {'oob_score': True,
  'n_jobs': -1,
  'n_estimators': 700,
  'min_weight_fraction_leaf': 0.4,
  'min_samples_split': 2,
  'max_leaf_nodes': 982,
  'max_features': None,
  'max_depth': 7,
  'criterion': 'entropy',
  'class_weight': 'balanced_subsample',
  'bootstrap': True},
 'best_score': 0.7850821744627055}

In [203]:
# Full OOB-based search over 1000 sampled configurations.
best_params_oob = GridSearch_oob(grid_forest_oob, n_iter = 1000)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [10:29<00:00,  2.05it/s]


In [223]:
# Apply the best OOB configuration directly to the forest step (these keys have no `clf__` prefix).
full_pipe.named_steps['clf'].set_params(**best_params_oob['best_params'])

RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
                       criterion='entropy', max_depth=9, max_features='sqrt',
                       max_leaf_nodes=42, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=16, min_weight_fraction_leaf=0.1,
                       n_estimators=900, n_jobs=-1, oob_score=True,
                       random_state=42, verbose=0, warm_start=False)

In [224]:
# Cross-validation on the training split
crossval = cross_validate(full_pipe, df_train, df_train['survived'], return_train_score=True, cv=kf, scoring='roc_auc')

print(f"score médio nas partições de treino : {crossval['train_score'].mean()}")
print(f"score médio nas partições de validação : {crossval['test_score'].mean()}")
      
# Refit on the whole training split, then score it and the held-out df_val.
full_pipe.fit(df_train, df_train['survived'])

print('AUC treino: %f' % roc_auc_score(df_train['survived'], full_pipe.predict_proba(df_train)[:,1]))
print('AUC validação: %f' % roc_auc_score(df_val['survived'], full_pipe.predict_proba(df_val)[:,1]))

score médio nas partições de treino : 0.8551928346406388
score médio nas partições de validação : 0.8436748087776602
AUC treino: 0.853328
AUC validação: 0.826825


In [7]:
# TODO: remover combinações ilegais para os parâmetros. Ver o que vem primeiro, max_leaf_nodes or max_depth
# Não consegui fazer de um jeito eficiente e sem travar. Vou fazer isso depois

0.7914032869785083 {'n_jobs': -1, 'n_estimators': 800, 'min_weight_fraction_leaf': 0.2, 'min_samples_split': 256, 'max_leaf_nodes': 423, 'max_features': None, 'max_depth': 3, 'criterion': 'gini', 'class_weight': None}

## Testando com Hyperopt TPE

In [15]:
from hyperopt import fmin, hp, tpe, rand, Trials, space_eval, STATUS_OK, anneal
from hyperopt.pyll import scope as ho_scope
from hyperopt.pyll.stochastic import sample as ho_sample
from functools import partial

In [16]:
# Hyperparameter space for hyperopt, keyed by pipeline parameter name (`clf__` prefix).
# Integer-valued parameters are drawn from a continuous uniform and cast with
# scope.int; min_weight_fraction_leaf and n_estimators are then scaled by 0.1 and 100.
# NOTE(review): if scope.int truncates, the upper bounds (e.g. max_depth 11) would
# practically never be reached -- confirm.
params_grid_ho = {
    'clf__bootstrap': hp.choice('clf__bootstrap', [False, True]),
    'clf__class_weight': hp.choice('clf__class_weight',['balanced', 'balanced_subsample', None]),
    'clf__criterion': hp.choice('clf__criterion', ['gini', 'entropy']),
    'clf__max_depth': ho_scope.int(hp.uniform('clf__max_depth', low=1, high=11)),
    'clf__max_features': hp.choice('clf__max_features', ['auto','sqrt', None]),
    'clf__max_leaf_nodes': ho_scope.int(hp.uniform('clf__max_leaf_nodes', low=2, high=1024)),
    'clf__min_samples_split': ho_scope.int(hp.uniform('clf__min_samples_split', low=2, high=1024)),
    'clf__min_weight_fraction_leaf': 0.1 * ho_scope.int(hp.uniform('clf__min_weight_fraction_leaf', low=0, high=5)),
    'clf__n_estimators': 100 * ho_scope.int(hp.uniform('clf__n_estimators', low=1, high=10)),
    'clf__oob_score': False
    }

# Function to be minimised
def f_to_min1(hps, model, X, y, cv):
    """
    Target function for optimization

    Parameters:
    ----------------
    hps : sample point from search space (dict of parameter name -> value)
    model : estimator to tune; its parameters are updated IN PLACE with `hps`
    X : feature matrix
    y : target array
    cv : cross-validation splitter (or number of folds) passed to cross_validate

    Returns:
    ----------------
    : dict with 'loss' (negative mean cross-val ROC-AUC score) and 'status'
    """
    model.set_params(**hps)

    # Only the validation scores feed the loss, so train scores are not computed.
    crossval = cross_validate(model, X, y, return_train_score=False, cv=cv, scoring='roc_auc')

    return {'loss': -crossval['test_score'].mean(), 'status': STATUS_OK}


In [18]:
# TPE search: 500 evaluations of the cross-validated objective; every trial is
# recorded in `trials_clf1`.
# NOTE(review): newer hyperopt releases may expect a numpy Generator for `rstate`
# instead of a RandomState -- confirm against the installed version.
trials_clf1 = Trials()


best_clf1=fmin(partial(f_to_min1,model=full_pipe, X=df_train, y=df_train['survived'], cv=kf)
     ,params_grid_ho, algo=tpe.suggest, max_evals=500, trials=trials_clf1, rstate=np.random.RandomState(random_global))


100%|█████████████████████████████████████████████████| 500/500 [24:35<00:00,  2.16s/it, best loss: -0.872310551058348]


In [None]:
# Nota: usar return_estimator=True no cross_validate pra pegar todos os modelos

In [44]:
# Map fmin's result back through the search space to get actual parameter values.
space_eval(params_grid_ho, best_clf1)

{'clf__bootstrap': True,
 'clf__class_weight': 'balanced',
 'clf__criterion': 'entropy',
 'clf__max_depth': 7,
 'clf__max_features': None,
 'clf__max_leaf_nodes': 751,
 'clf__min_samples_split': 2,
 'clf__min_weight_fraction_leaf': 0.0,
 'clf__n_estimators': 400,
 'clf__oob_score': False}

In [45]:
# Load the best TPE configuration into the pipeline.
full_pipe.set_params(**space_eval(params_grid_ho, best_clf1))

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='CAT_MISSING',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                    

In [26]:
# List the available columns, including the engineered features.
df_train_val.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked', 'family_size', 'title',
       'name_aspas', 'name_parenteses'],
      dtype='object')

In [24]:
# Cross-validation on the training split
crossval = cross_validate(full_pipe, df_train, df_train['survived'], return_train_score=True, cv=kf, scoring='roc_auc')

print(f"score médio nas partições de treino : {crossval['train_score'].mean()}")
print(f"score médio nas partições de validação : {crossval['test_score'].mean()}")
      
# Refit on the whole training split, then score it and the held-out df_val.
full_pipe.fit(df_train, df_train['survived'])

print('AUC treino: %f' % roc_auc_score(df_train['survived'], full_pipe.predict_proba(df_train)[:,1]))
print('AUC validação: %f' % roc_auc_score(df_val['survived'], full_pipe.predict_proba(df_val)[:,1]))

score médio nas partições de treino : 0.968869405619136
score médio nas partições de validação : 0.872310551058348
AUC treino: 0.964147
AUC validação: 0.832767


## Testando com Hyperopt Random

In [241]:
# Same objective and search space, but with hyperopt's random sampler and 100 evaluations.
trials_ho_rand = Trials()

best_ho_rand=fmin(partial(f_to_min1,model=full_pipe, X=df_train, y=df_train['survived'], cv=kf)
     ,params_grid_ho, algo=rand.suggest, max_evals=100, trials=trials_ho_rand, rstate=np.random.RandomState(random_global))


100%|████████████████████████████████████████████████| 100/100 [03:47<00:00,  2.98s/it, best loss: -0.8626739511082444]


In [242]:
# Load the best configuration found by the random sampler into the pipeline.
full_pipe.set_params(**space_eval(params_grid_ho, best_ho_rand))

# Cross-validation on the training split
crossval = cross_validate(full_pipe, df_train, df_train['survived'], return_train_score=True, cv=kf, scoring='roc_auc')

print(f"score médio nas partições de treino : {crossval['train_score'].mean()}")
print(f"score médio nas partições de validação : {crossval['test_score'].mean()}")
      
# Refit on the whole training split, then score it and the held-out df_val.
full_pipe.fit(df_train, df_train['survived'])

print('AUC treino: %f' % roc_auc_score(df_train['survived'], full_pipe.predict_proba(df_train)[:,1]))
print('AUC validação: %f' % roc_auc_score(df_val['survived'], full_pipe.predict_proba(df_val)[:,1]))

score médio nas partições de treino : 0.9082833625451698
score médio nas partições de validação : 0.8626739511082444
AUC treino: 0.909279
AUC validação: 0.842530


## Testando com Simulated annealing

In [243]:
# Same objective and search space, but with hyperopt's annealing sampler and 100 evaluations.
trials_ho_anneal = Trials()

best_ho_anneal=fmin(partial(f_to_min1,model=full_pipe, X=df_train, y=df_train['survived'], cv=kf)
     ,params_grid_ho, algo=anneal.suggest, max_evals=100, trials=trials_ho_anneal, rstate=np.random.RandomState(random_global))


100%|████████████████████████████████████████████████| 100/100 [04:12<00:00,  3.09s/it, best loss: -0.8650579444216422]


In [244]:
# Load the best configuration found by the annealing sampler into the pipeline.
full_pipe.set_params(**space_eval(params_grid_ho, best_ho_anneal))

# Cross-validation on the training split
crossval = cross_validate(full_pipe, df_train, df_train['survived'], return_train_score=True, cv=kf, scoring='roc_auc')

print(f"score médio nas partições de treino : {crossval['train_score'].mean()}")
print(f"score médio nas partições de validação : {crossval['test_score'].mean()}")
      
# Refit on the whole training split, then score it and the held-out df_val.
full_pipe.fit(df_train, df_train['survived'])

print('AUC treino: %f' % roc_auc_score(df_train['survived'], full_pipe.predict_proba(df_train)[:,1]))
print('AUC validação: %f' % roc_auc_score(df_val['survived'], full_pipe.predict_proba(df_val)[:,1]))

score médio nas partições de treino : 0.9216896370892614
score médio nas partições de validação : 0.8650579444216422
AUC treino: 0.919047
AUC validação: 0.843379


## Testando com skopt (abortado, vou usar tudo no hyperopt porque achei mais fácil)

In [250]:
# Redefinition of the hyperopt search space: same entries as the earlier
# `params_grid_ho` except that `clf__max_features` is no longer searched.
# Running this cell overwrites the earlier definition.
params_grid_ho = {
    'clf__bootstrap': hp.choice('clf__bootstrap', [False, True]),
    'clf__class_weight': hp.choice('clf__class_weight',['balanced', 'balanced_subsample', None]),
    'clf__criterion': hp.choice('clf__criterion', ['gini', 'entropy']),
    'clf__max_depth': ho_scope.int(hp.uniform('clf__max_depth', low=1, high=11)),
    'clf__max_leaf_nodes': ho_scope.int(hp.uniform('clf__max_leaf_nodes', low=2, high=1024)),
    'clf__min_samples_split': ho_scope.int(hp.uniform('clf__min_samples_split', low=2, high=1024)),
    'clf__min_weight_fraction_leaf': 0.1 * ho_scope.int(hp.uniform('clf__min_weight_fraction_leaf', low=0, high=5)),
    'clf__n_estimators': 100 * ho_scope.int(hp.uniform('clf__n_estimators', low=1, high=10)),
    'clf__oob_score': False
    }

In [None]:
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import BayesSearchCV

# scikit-optimize counterpart of the hyperopt search space. Every dimension is
# named after the pipeline parameter it tunes ('clf__<param>').
skopt_space = [
    Categorical((False, True), name='clf__bootstrap'),
    Categorical(('balanced', 'balanced_subsample', None), name='clf__class_weight'),
    Integer(1, 11, name='clf__max_depth'),
    Categorical(['auto', 'sqrt', None], name='clf__max_features'),
    # scikit-learn rejects max_leaf_nodes=1, so the range starts at 2
    # (the same lower bound used in params_grid_ho).
    Integer(2, 1024, name='clf__max_leaf_nodes'),
    Integer(2, 1024, name='clf__min_samples_split'),
    Real(0.1, 0.5, name='clf__min_weight_fraction_leaf'),
    Integer(100, 1000, name='clf__n_estimators'),
]

# BayesSearchCV requires `search_spaces` as a {parameter name: dimension}
# mapping; without it the constructor raises TypeError. The remaining settings
# mirror the hyperopt runs (same folds, metric, budget and seed) so the
# optimisers are compared under the same conditions.
opt = BayesSearchCV(
    full_pipe,
    search_spaces={dim.name: dim for dim in skopt_space},
    n_iter=100,
    scoring='roc_auc',
    cv=kf,
    random_state=random_global,
)

# Modelo final pra mandar pro Kaggle

## Função pra submeter os resultados e salvar os arquivos necessários pra replicar

In [95]:
class SaveModel(object):
    """Persist a fitted model with its datasets and submit predictions to Kaggle.

    Every artefact is written under ``folder_to_save`` so that a submission can
    be reproduced later.

    Parameters
    ----------
    folder_to_save : str
        Directory that receives every artefact; created on demand.
    data_train, data_val, data_test : optional
        Datasets dumped with joblib; each one is skipped when ``None``.
        ``data_test`` is also what ``commit_kaggle`` predicts on and must
        expose a ``'passengerid'`` column.
    model : optional
        Object dumped with joblib; ``commit_kaggle`` calls its ``predict``.
    str_readme : str, optional
        Text written to ``README.txt`` and used as the submission message.
    submission_file : str
        Name of the CSV file created inside ``folder_to_save``.
    """

    def __init__(self, folder_to_save, data_train=None, data_val=None, data_test=None, model=None, str_readme=None, submission_file='submission.csv'):
        self.folder_to_save = folder_to_save
        self.data_train = data_train
        self.data_val = data_val
        self.data_test = data_test
        self.model = model
        self.str_readme = str_readme
        self.submission_file = submission_file

    def save_model(self):
        """Write the datasets, the model and the README into ``folder_to_save``.

        Artefacts that were not provided (``None``) are skipped.
        """
        # exist_ok makes reruns harmless without hiding real failures
        # (e.g. permission errors), unlike a bare ``except: pass``.
        os.makedirs(self.folder_to_save, exist_ok=True)

        # File name -> object. The validation file receives ``data_val``
        # (it previously received ``data_test`` by mistake).
        artefacts = (
            ('train_data', self.data_train),
            ('validation_data', self.data_val),
            ('test_data', self.data_test),
            ('model', self.model),
        )
        for file_name, artefact in artefacts:
            if artefact is not None:
                joblib.dump(artefact, os.path.join(self.folder_to_save, file_name))

        # README file (its text is also the submission message).
        if self.str_readme is not None:
            readme_path = os.path.join(self.folder_to_save, 'README.txt')
            with open(readme_path, 'w', encoding='utf-8') as text_file:
                text_file.write(self.str_readme)

    def commit_kaggle(self):
        """Predict on ``data_test``, write the submission CSV and submit it.

        The submission is sent with the ``kaggle`` command line tool. A failed
        or impossible submission is reported with a printed message rather
        than an exception; the equivalent shell command is always printed so
        it can be rerun by hand.
        """
        import shutil
        import subprocess

        predictions = self.model.predict(self.data_test)
        submission = pd.DataFrame({'PassengerId': self.data_test['passengerid'], 'Survived': predictions})

        # The folder may not exist yet when save_model() was not called first.
        os.makedirs(self.folder_to_save, exist_ok=True)
        submission_path = self.folder_to_save + '/' + self.submission_file
        submission.to_csv(submission_path, index=False)

        # Pass the arguments as a list (no shell) so quotes or other special
        # characters in the README message cannot break or alter the command.
        kaggle_cli = shutil.which('kaggle')
        submitted = False
        if kaggle_cli is not None:
            completed = subprocess.run([
                kaggle_cli, 'competitions', 'submit',
                '-c', 'titanic',
                '-f', submission_path,
                '-m', str(self.str_readme),
            ])
            submitted = completed.returncode == 0
        if not submitted:
            print('Erro submetendo o arquivo no kaggle!')

        print(f"kaggle competitions submit -c titanic -f {submission_path} -m \"{self.str_readme}\"")
    
        

## Treinando modelo só na base de treino

In [48]:
# Configure the pipeline with the hyperparameters stored in `best_clf1` and
# fit it on the training split only (set_params returns the pipeline itself,
# so the calls can be chained and the fitted pipeline is still displayed).
(
    full_pipe
    .set_params(**space_eval(params_grid_ho, best_clf1))
    .fit(df_train, df_train['survived'])
)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='CAT_MISSING',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                    

In [74]:
# Version 1: bundle the pipeline with its datasets and README text, then write
# everything to disk and submit the predictions.
modelo1 = SaveModel(
    folder_to_save='./versions_submissions/versao1',
    submission_file='submission_versao1.csv',
    str_readme='Random forest, fitado no train data. One hot encoder pra binários, mediana pra missing numérico',
    model=full_pipe,
    data_train=df_train,
    data_val=df_val,
    data_test=df_test,
)

modelo1.save_model()
modelo1.commit_kaggle()

kaggle competitions submit -c titanic -f ./versions_submissions/versao1/submission_versao1.csv -m "Random forest, fitado no train data. One hot encoder pra binários, mediana pra missing numérico"


In [70]:
# Manual submission of the versao1 CSV through the Kaggle CLI (IPython shell escape).
!kaggle competitions submit -c titanic -f ./versions_submissions/versao1/submission_versao1.csv -m "Random forest, fitado no train data. One hot encoder pra binários, mediana pra missing numérico"

Successfully submitted to Titanic: Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:05<00:00, 623B/s]


## Treinando na base toda (treino + validação)

In [97]:
# Same hyperparameters (`best_clf1`), now fitted on the combined
# train + validation frame (set_params returns the pipeline, so the calls chain
# and the fitted pipeline is still displayed).
(
    full_pipe
    .set_params(**space_eval(params_grid_ho, best_clf1))
    .fit(df_train_val, df_train_val['survived'])
)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='CAT_MISSING',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                    

In [98]:
# Version 2: bundle the pipeline with its datasets and README text, then write
# everything to disk and submit the predictions.
modelo2 = SaveModel(
    folder_to_save='./versions_submissions/versao2',
    submission_file='submission_versao2.csv',
    str_readme='Random forest, fitado no train+val data. One hot encoder pra binários, mediana pra missing numérico',
    model=full_pipe,
    data_train=df_train,
    data_val=df_val,
    data_test=df_test,
)

modelo2.save_model()
modelo2.commit_kaggle()

kaggle competitions submit -c titanic -f ./versions_submissions/versao2/submission_versao2.csv -m "Random forest, fitado no train+val data. One hot encoder pra binários, mediana pra missing numérico"
