In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb

# The 22 product indicator column names. Position in this list is what maps a
# prediction column index back to a product name when the submission is built.
target_cols = [
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1',
]

# Read the prepared training features, training labels and test features from
# disk, then print each array's shape as a quick sanity check.
trainX = np.load("trainX.dat")
trainY = np.load("trainY.dat")
testX = np.load("testX.npy")

for shape_label, loaded in (("train X is: ", trainX),
                            ("train Y is: ", trainY),
                            ("testX is: ", testX)):
    print(shape_label, loaded.shape)

# Booster settings for a 22-class 'multi:softprob' model evaluated with
# multi-class log loss.
# NOTE(review): the grid-search cell further down records different best values
# (max_depth 6, subsample 0.8, colsample_bytree 1) than the ones used here --
# confirm whether this cell was meant to pick those up.
params = dict(
    seed=125,
    colsample_bytree=0.7,
    silent=1,
    subsample=0.7,
    eta=0.05,
    objective='multi:softprob',
    max_depth=5,
    min_child_weight=1,
    eval_metric='mlogloss',
    num_class=22,
)
num_rounds = 1000  # number of boosting rounds to train

# Wrap the arrays in XGBoost's DMatrix container; only the training matrix
# carries labels.
dtrain = xgb.DMatrix(data=trainX, label=trainY)
dtest = xgb.DMatrix(data=testX)

model = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_rounds)

# The raw training arrays are no longer needed once the DMatrix and the model
# exist; drop them to free memory before scoring the (much larger) test set.
del trainX, trainY

preds = model.predict(dtest)
del testX

# Persist the untouched class probabilities. `preds` itself is NOT modified
# below, so the saved file and the in-memory array stay identical and this
# post-processing can be re-run safely.
np.save("preds", preds)

# Products whose predicted probability is below this cutoff are never recommended.
MIN_PROBA = 0.045
# Upper bound on products listed per customer.
# NOTE(review): 7 matches the width of the ranked array shown in the later
# inspection cell and presumably the competition's top-7 ranking metric -- confirm.
MAX_RECOMMENDATIONS = 7

target_cols = np.array(target_cols)
test_ids = np.load("test_ids.npy")

# Rank the classes of every row from most to least probable and keep the best
# MAX_RECOMMENDATIONS. Sorting the negated scores with a stable sort breaks ties
# in favour of the lower column index, so the output is deterministic.
# (Previously products were written in fixed column order, which threw away the
# model's ranking and put no limit on how many were listed.)
ranked_idx = np.argsort(-preds, axis=1, kind="mergesort")[:, :MAX_RECOMMENDATIONS]
row_selector = np.arange(preds.shape[0])[:, None]
ranked_proba = preds[row_selector, ranked_idx]
confident = ranked_proba >= MIN_PROBA

# One space-separated string of product names per test row, best first.
final_preds = [" ".join(target_cols[idx[mask]])
               for idx, mask in zip(ranked_idx, confident)]

final_df = pd.DataFrame({'ncodpers': test_ids, 'added_products': final_preds})
final_df.to_csv("final_df.csv", index=False)


train X is:  (78713, 41)
train Y is:  (78713,)
testX is:  (929615, 41)


In [16]:
# Quick look at the prediction array: its shape, its Python type, and row 2.
# NOTE(review): the saved output below shows a (929615, 7) integer array, which
# is not what the training cell above produces -- presumably `preds` came from
# a cell that is no longer in the notebook; re-run to confirm.
for summary in (preds.shape, type(preds), preds[2:3, :]):
    print(summary)

(929615, 7)
<class 'numpy.ndarray'>
[[ 0 21  2 20 19 16  9]]


### Hyperparameter tuning with cross-validated grid search

In [10]:
import pandas as pd
import numpy as np
import xgboost as xgb

from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold

# Random state used when shuffling the cross-validation folds.
seed = 135

# Read the prepared training features, training labels and test features from
# disk, then print each array's shape as a quick sanity check.
trainX = np.load("trainX.dat")
trainY = np.load("trainY.dat")
testX = np.load("testX.npy")

for shape_label, loaded in (("train X is: ", trainX),
                            ("train Y is: ", trainY),
                            ("testX is: ", testX)):
    print(shape_label, loaded.shape)

# Tuning log -- grids previously run through this cell, with the best score and
# best parameters each run printed (oldest first):
#
#   1. learning_rate [0.01, 0.03] x max_depth [8, 10, 12]
#        best score -1.7230578250995143  -> max_depth 8, learning_rate 0.03
#   2. learning_rate [0.03, 0.05] x max_depth [5, 8]
#        best score -1.6337699265994767  -> max_depth 5, learning_rate 0.05
#   3. learning_rate [0.05, 0.7] x max_depth [3, 5]
#        best score -1.6337699265994767  -> max_depth 5, learning_rate 0.05
#        NOTE(review): 0.7 may have been intended as 0.07 -- confirm.
#   4. min_child_weight [1, 10, 50]
#        best score -1.6337699265994767  -> min_child_weight 1
#   5. colsample_bytree [0.5, 0.7, 0.9]
#        best score -1.6292829095958359  -> colsample_bytree 0.9
#   6. subsample [0.5, 0.7, 0.9]
#        best score -1.6292448011262861  -> subsample 0.9
#
# Current grid: a joint search around the best values found above.
params_grid = dict(
    learning_rate=[0.04, 0.05],
    max_depth=[5, 6],
    subsample=[0.8, 0.9, 1],
    colsample_bytree=[0.8, 0.9, 1],
)

# Result recorded for the current grid:
#   best score -1.6251510798293574
#   -> colsample_bytree 1, max_depth 6, subsample 0.8, learning_rate 0.05

# Estimator settings held constant for every grid candidate.
# Settings that used to be pinned here and are currently disabled:
#   learning_rate=0.05, max_depth=5, colsample_bytree=0.9, subsample=0.7,
#   eval_metric='mlogloss', num_class=22
# (the first four are now supplied by params_grid instead).
params_fixed = dict(
    seed=125,
    silent=1,
    objective='multi:softprob',
    min_child_weight=1,
)

# Stratified 5-fold split, shuffled with a fixed random state so the folds are
# the same on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Exhaustive search over params_grid. Each candidate is scored with
# 'neg_log_loss', i.e. the negated multi-class log loss, so values are <= 0 and
# closer to zero is better.
bst_grid = GridSearchCV(estimator=XGBClassifier(**params_fixed),
                        param_grid=params_grid,
                        cv=cv,
                        scoring='neg_log_loss')
bst_grid.fit(trainX, trainY)

# Per-candidate results, best candidate first, kept for inspection in a later
# cell. This replaces the old `grid_scores_` access, which displayed nothing
# from the middle of a cell and no longer exists in current scikit-learn.
cv_results = pd.DataFrame(bst_grid.cv_results_).sort_values("rank_test_score")

# The reported number is the scorer's value (negative log loss), not an accuracy.
print("Best neg_log_loss obtained: {0}".format(bst_grid.best_score_))
print("Parameters")
for key, value in bst_grid.best_params_.items():
    print("\t{}: {}".format(key, value))

train X is:  (78713, 41)
train Y is:  (78713,)
testX is:  (929615, 41)
Best accuracy obtained: -1.6251510798293574
Parameters
	colsample_bytree: 1
	max_depth: 6
	subsample: 0.8
	learning_rate: 0.05




In [13]:
# Display the search object's repr (estimator settings, parameter grid, CV and
# scoring configuration).
# NOTE(review): the saved output lists a different param_grid and scoring than
# the search cell currently defines, so it presumably predates later edits --
# re-run to refresh.
bst_grid

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=135, shuffle=True),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=125, silent=1, subsample=0.7),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [8, 10, 12], 'learning_rate': [0.01, 0.03]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='log_loss', verbose=0)