In [1]:
import numpy as np 
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

def reduce_dimen(dataset,column,toreplace):
    for index,i in dataset[column].duplicated(keep=False).iteritems():
        if i==False:
            dataset.set_value(index,column,toreplace)
    return dataset
    
def act_data_treatment(dsname):
    dataset = dsname
    
    for col in list(dataset.columns):
        if col not in ['people_id', 'activity_id', 'date', 'char_38', 'outcome']:
            if dataset[col].dtype == 'object':
                dataset[col].fillna('type 0', inplace=True)
                dataset[col] = dataset[col].apply(lambda x: x.split(' ')[1]).astype(np.int32)
            elif dataset[col].dtype == 'bool':
                dataset[col] = dataset[col].astype(np.int8)
    
    dataset['year'] = dataset['date'].dt.year
    dataset['month'] = dataset['date'].dt.month
    dataset['day'] = dataset['date'].dt.day
    dataset['isweekend'] = (dataset['date'].dt.weekday >= 5).astype(int)
    dataset = dataset.drop('date', axis = 1)
    
    return dataset

act_train_data = pd.read_csv("input/act_train.csv",dtype={'people_id': np.str, 'activity_id': np.str, 'outcome': np.int8}, parse_dates=['date'])
act_test_data  = pd.read_csv("input/act_test.csv", dtype={'people_id': np.str, 'activity_id': np.str}, parse_dates=['date'])
people_data    = pd.read_csv("input/people.csv", dtype={'people_id': np.str, 'activity_id': np.str, 'char_38': np.int32}, parse_dates=['date'])

act_train_data=act_train_data.drop('char_10',axis=1)
act_test_data=act_test_data.drop('char_10',axis=1)

print("Train data shape: " + format(act_train_data.shape))
print("Test data shape: " + format(act_test_data.shape))
print("People data shape: " + format(people_data.shape))

act_train_data  = act_data_treatment(act_train_data)
act_test_data   = act_data_treatment(act_test_data)
people_data = act_data_treatment(people_data)

train = act_train_data.merge(people_data, on='people_id', how='left', left_index=True)
test  = act_test_data.merge(people_data, on='people_id', how='left', left_index=True)

del act_train_data
del act_test_data
del people_data

train=train.sort_values(['people_id'], ascending=[1])
test=test.sort_values(['people_id'], ascending=[1])

train_columns = train.columns.values
test_columns = test.columns.values
features = list(set(train_columns) & set(test_columns))

train.fillna('NA', inplace=True)
test.fillna('NA', inplace=True)

y = train.outcome
train=train.drop('outcome',axis=1)

whole=pd.concat([train,test],ignore_index=True)
categorical=['group_1','activity_category','char_1_x','char_2_x','char_3_x','char_4_x','char_5_x','char_6_x','char_7_x','char_8_x','char_9_x','char_2_y','char_3_y','char_4_y','char_5_y','char_6_y','char_7_y','char_8_y','char_9_y']
for category in categorical:
    whole=reduce_dimen(whole,category,9999999)

Len = int(0.3*len(train))
X_train=whole[:Len]
Y_train=y[:Len]
X=whole[:len(train)]
Y=y[:len(train)]
X_test=whole[len(train):]

del train
del whole
    
X=X.sort_values(['people_id'], ascending=[1])
X_train = X_train.sort_values(['people_id'], ascending=[1])

X_train = X_train[features].drop(['people_id', 'activity_id'], axis = 1)
X = X[features].drop(['people_id', 'activity_id'], axis = 1)
X_test = X_test[features].drop(['people_id', 'activity_id'], axis = 1)

categorical=['group_1','activity_category','char_1_x','char_2_x','char_3_x','char_4_x','char_5_x','char_6_x','char_7_x','char_8_x','char_9_x','char_2_y','char_3_y','char_4_y','char_5_y','char_6_y','char_7_y','char_8_y','char_9_y']
not_categorical=[]
for category in X.columns:
    if category not in categorical:
        not_categorical.append(category)

# enc = OneHotEncoder(handle_unknown='ignore')
# enc=enc.fit(pd.concat([X[categorical],X_test[categorical]]))
# X_cat_sparse=enc.transform(X[categorical])
# X_test_cat_sparse=enc.transform(X_test[categorical])

# from scipy.sparse import hstack
# X_sparse=hstack((X[not_categorical], X_cat_sparse))
# X_test_sparse=hstack((X_test[not_categorical], X_test_cat_sparse))

# print("Training data: " + format(X_sparse.shape))
# print("Test data: " + format(X_test_sparse.shape))
# print("###########")
# print("One Hot enconded Test Dataset Script")

# dtrain = xgb.DMatrix(X_sparse,label=y)
# dtest = xgb.DMatrix(X_test_sparse)

# param = {'max_depth':10, 'eta':0.02, 'silent':1, 'objective':'binary:logistic' }
# param['nthread'] = 4
# param['eval_metric'] = 'auc'
# param['subsample'] = 0.7
# param['colsample_bytree']= 0.7
# param['min_child_weight'] = 0
# param['booster'] = "gblinear"

# watchlist  = [(dtrain,'train')]
# num_round = 300
# early_stopping_rounds=10
# bst = xgb.train(param, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds)

dtrain2 = xgb.DMatrix(X_train,label=Y_train)
dtrain = xgb.DMatrix(X,label=Y)
dtest = xgb.DMatrix(X_test)

eta = 0.9
max_depth = 5
subsample = 0.8
colsample_bytree = 0.8

print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
params = {
    "objective": "binary:logistic",
    "booster" : "gbtree",
    "eval_metric": "auc",
    "eta": eta,
    "max_depth": max_depth,
    "subsample": subsample,
    "colsample_bytree": colsample_bytree,
    "silent": 1,
    "seed": 19960429
}

watchlist  = [(dtrain,'train'),(dtrain2,'val')]
num_round = 300
early_stopping_rounds=10
bst = xgb.train(params, dtrain, num_round, watchlist, early_stopping_rounds=early_stopping_rounds)

ypred = bst.predict(dtest)
output = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred })
output.head()
output.to_csv('output/without_leak.csv', index = False)

Train data shape: (2197291, 14)
Test data shape: (498687, 13)
People data shape: (189118, 41)


  if __name__ == '__main__':


XGBoost params. ETA: 0.9, MAX_DEPTH: 5, SUBSAMPLE: 0.8, COLSAMPLE_BY_TREE: 0.8
[0]	train-auc:0.914123	val-auc:0.912273
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 10 rounds.
[1]	train-auc:0.921904	val-auc:0.919891
[2]	train-auc:0.924052	val-auc:0.923247
[3]	train-auc:0.930314	val-auc:0.928975
[4]	train-auc:0.933408	val-auc:0.932319
[5]	train-auc:0.93707	val-auc:0.936188
[6]	train-auc:0.939046	val-auc:0.93745
[7]	train-auc:0.940209	val-auc:0.938412
[8]	train-auc:0.941683	val-auc:0.940202
[9]	train-auc:0.94309	val-auc:0.941547
[10]	train-auc:0.944024	val-auc:0.941847
[11]	train-auc:0.944903	val-auc:0.943164
[12]	train-auc:0.945512	val-auc:0.943586
[13]	train-auc:0.946775	val-auc:0.94491
[14]	train-auc:0.947603	val-auc:0.945508
[15]	train-auc:0.947978	val-auc:0.945747
[16]	train-auc:0.948387	val-auc:0.946162
[17]	train-auc:0.949227	val-auc:0.947387
[18]	train-auc:0.949438	val-auc:0.947772
[19]	train-auc:0.

[194]	train-auc:0.987038	val-auc:0.986407
[195]	train-auc:0.987089	val-auc:0.986437
[196]	train-auc:0.987168	val-auc:0.986504
[197]	train-auc:0.987229	val-auc:0.986552
[198]	train-auc:0.987309	val-auc:0.986604
[199]	train-auc:0.987354	val-auc:0.986641
[200]	train-auc:0.987425	val-auc:0.986724
[201]	train-auc:0.987492	val-auc:0.986783
[202]	train-auc:0.987562	val-auc:0.986839
[203]	train-auc:0.987615	val-auc:0.986902
[204]	train-auc:0.987641	val-auc:0.986955
[205]	train-auc:0.987679	val-auc:0.986986
[206]	train-auc:0.987734	val-auc:0.987065
[207]	train-auc:0.987771	val-auc:0.987122
[208]	train-auc:0.987818	val-auc:0.987155
[209]	train-auc:0.98784	val-auc:0.987169
[210]	train-auc:0.987901	val-auc:0.987267
[211]	train-auc:0.987986	val-auc:0.987329
[212]	train-auc:0.988062	val-auc:0.987395
[213]	train-auc:0.988119	val-auc:0.987467
[214]	train-auc:0.988149	val-auc:0.987511
[215]	train-auc:0.9882	val-auc:0.987541
[216]	train-auc:0.988271	val-auc:0.987643
[217]	train-auc:0.988303	val-auc:0.98

IOError: [Errno 2] No such file or directory: 'output/without_leak.csv'