In [44]:
import numpy as np
import xgboost as xgb
import pandas as pd
import math

from sklearn.cross_validation import train_test_split
from ml_metrics import rmsle

print ('')
print ('Loading Data...')

def evalerror(preds, dtrain):

    labels = dtrain.get_label()
    assert len(preds) == len(labels)
    labels = labels.tolist()
    preds = preds.tolist()
    terms_to_sum = [(math.log(labels[i] + 1) - math.log(max(0,preds[i]) + 1)) ** 2.0 for i,pred in enumerate(labels)]
    return 'error', (sum(terms_to_sum) * (1.0/len(preds))) ** 0.5

def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i = i + 1
    outfile.close()

train = pd.read_csv('../kaggle/train_downsample.csv')
#eight_ave = (train[train['Semana'] == 8]['Demanda_uni_equil']).mean(axis=1)
#nine_ave = (train[train['Semana'] == 9]['Demanda_uni_equil']).mean(axis=1)
#train = train[train['Semana'] > 8].reset_index()
#print(train)
#train = train.drop(['Semana'])
test = pd.read_csv('../kaggle/test.csv')

freq_dict = {}
freq_file = open('../kaggle/frequent.csv')

first = True
for line in freq_file:
    if first:
        first = False
        continue
    key, value = line.split(',')
    key = int(key)
    value = int(value)
    freq_dict[key] = value
    
#print(freq_dict)
    
train['id_f'] = train['Producto_ID'].map(freq_dict)
test['id_f'] = test['Producto_ID'].map(freq_dict)
#train = train.drop(['Producto_ID'], axis=1)
#test = test.drop(['Producto_ID'], axis=1)

print ('')
print ('Training_Shape:', train.shape)

ids = test['id']
test = test.drop(['id'],axis = 1)

y = train['Demanda_uni_equil']
X = train[test.columns.values]

features = list(X.keys())
create_feature_map(features)

params = {}
params['objective'] = "reg:linear"
params['booster'] = "gbtree"
params['eta'] = 0.1
params['max_depth'] = 10
params['subsample'] = 0.85
params['colsample_bytree'] = 0.7
params['silent'] = True

from sklearn.cross_validation import KFold

n_folds = 5
num_rounds = 100
rmsle_scores = []

for train_index, test_index in KFold(n=len(X), n_folds=n_folds, shuffle=True, random_state=1729):
    # Xをnparrayへ変更
    x_train = X.as_matrix()[train_index]
    y_train = y[train_index]
    x_test = X.as_matrix()[test_index]
    y_test = y[test_index]
    
    test_preds = np.zeros(test.shape[0])
    xg_train = xgb.DMatrix(x_train, label=y_train)
    xg_test = xgb.DMatrix(x_test)
    
    watchlist = [(xg_train, 'train')]
    
    xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist, feval = evalerror, early_stopping_rounds= 200, verbose_eval = 10)
    preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_iteration)
    
    print ('RMSLE Score:', rmsle(y_test, preds))
    rmsle_scores.append
    
importance = xgclassifier.get_fscore(fmap='xgb.fmap')
#importance = sorted(importance.items(), key=operator.itemgetter(1))
print('importances:')
print(importance)


Loading Data...

Training_Shape: (499999, 12)
[0]	train-error:1.1303
Will train until train-error hasn't improved in 200 rounds.
[10]	train-error:0.639165
[20]	train-error:0.655765
[30]	train-error:0.669052
[40]	train-error:0.6665
[50]	train-error:0.661348
[60]	train-error:0.659004
[70]	train-error:0.655437
[80]	train-error:0.653366
[90]	train-error:0.650769
RMSLE Score: 0.644997678648
[0]	train-error:1.13001
Will train until train-error hasn't improved in 200 rounds.
[10]	train-error:0.640128
[20]	train-error:0.656878
[30]	train-error:0.667428
[40]	train-error:0.663573
[50]	train-error:0.659006
[60]	train-error:0.658155
[70]	train-error:0.653571
[80]	train-error:0.651238
[90]	train-error:0.649863
RMSLE Score: 0.648579874353
[0]	train-error:1.13087
Will train until train-error hasn't improved in 200 rounds.
[10]	train-error:0.639809
[20]	train-error:0.656741
[30]	train-error:0.668701
[40]	train-error:0.664869
[50]	train-error:0.659389
[60]	train-error:0.657056
[70]	train-error:0.65289