In [75]:
import pandas as pd


import numpy as np


import xgboost as xgb
# machine learning
from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation


In [76]:
# get training & test csv files as a DataFrame
df_train = pd.read_csv("../input/train.csv" )
df_test    = pd.read_csv("../input/test.csv")

In [77]:
y_train = np.log(df_train['loss'].ravel())

X_train = df_train.copy()
X_test = df_test.copy()

X_train.drop(['id', 'loss'], axis=1, inplace=True)
X_test.drop(['id'], axis=1, inplace=True)



In [78]:
print("{},{}".format(X_train.shape, X_test.shape))

ntrain = X_train.shape[0]
train_test = pd.concat((X_train, X_test)).reset_index(drop=True)

features = X_train.columns

cats = [feat for feat in features if 'cat' in feat]
for feat in cats:
    train_test[feat] = pd.factorize(train_test[feat], sort=True)[0]

print(train_test.head())

X_train = np.array(train_test.iloc[:ntrain,:])
X_test = np.array(train_test.iloc[ntrain:,:])

print("{},{}".format(X_train.shape, X_test.shape))


(188318, 130),(125546, 130)
   cat1  cat2  cat3  cat4  cat5  cat6  cat7  cat8  cat9  cat10    ...     \
0     0     1     0     1     0     0     0     0     1      0    ...      
1     0     1     0     0     0     0     0     0     1      1    ...      
2     0     1     0     0     1     0     0     0     1      1    ...      
3     1     1     0     1     0     0     0     0     1      0    ...      
4     0     1     0     1     0     0     0     0     1      1    ...      

      cont5     cont6     cont7    cont8    cont9   cont10    cont11  \
0  0.310061  0.718367  0.335060  0.30260  0.67135  0.83510  0.569745   
1  0.885834  0.438917  0.436585  0.60087  0.35127  0.43919  0.338312   
2  0.397069  0.289648  0.315545  0.27320  0.26076  0.32446  0.381398   
3  0.422268  0.440945  0.391128  0.31796  0.32128  0.44467  0.327915   
4  0.704268  0.178193  0.247408  0.24564  0.22089  0.21230  0.204687   

     cont12    cont13    cont14  
0  0.594646  0.822493  0.714843  
1  0.366307  0

In [79]:
# Find the features that really matter in data set using Random Forest Classifier
'''
feat_labels = X_train.columns
forest = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
importances
'''

'\nfeat_labels = X_train.columns\nforest = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)\nforest.fit(X_train, y_train)\nimportances = forest.feature_importances_\nindices = np.argsort(importances)[::-1]\nimportances\n'

In [80]:
# identify the list of top features
'''
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
'''

'\nfor f in range(X_train.shape[1]):\n    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))\n'

In [81]:
# Use only top features
'''
X_train = forest.transform(X_train, threshold=.005)
X_test = forest.transform(X_test, threshold=.005)
'''

'\nX_train = forest.transform(X_train, threshold=.005)\nX_test = forest.transform(X_test, threshold=.005)\n'

In [82]:
# in addition to the original data sets for training (train_orig)and testing (test_orig)
# split train_orig data into training and testing sets randomly so we can obtain a practice test set with outcomes
'''
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.20, random_state=0)
'''

'\nfrom sklearn.cross_validation import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.20, random_state=0)\n'

In [83]:
# Xgboost 

params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'reg:linear',
    'max_depth': 6,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'mae',
}


T_train_xgb = xgb.DMatrix(X_train, label=y_train)
X_test_xgb  = xgb.DMatrix(X_test)





In [84]:
from sklearn.metrics import mean_absolute_error

def xg_eval_mae(yhat, T_train_xgb):
    y = T_train_xgb.get_label()
    return 'mae', mean_absolute_error(np.exp(y), np.exp(yhat))

res = xgb.cv(params, T_train_xgb, num_boost_round=750, nfold=4, seed=2016, stratified=False,
             early_stopping_rounds=15, verbose_eval=10, show_stdv=True, feval=xg_eval_mae, maximize=False)

best_nrounds = res.shape[0] - 1
cv_mean = res.iloc[-1, 0]
cv_std = res.iloc[-1, 1]
print('CV-Mean: {0}+{1}'.format(cv_mean, cv_std))

[0]	train-mae:1520.58+1513.93	test-mae:1520.58+1513.93
[10]	train-mae:1466.45+1463.4	test-mae:1466.45+1463.4
[20]	train-mae:1227.98+1226.57	test-mae:1228.13+1226.73
[30]	train-mae:908.754+908.031	test-mae:910.17+909.449
[40]	train-mae:714.059+713.55	test-mae:717.94+717.434
[50]	train-mae:633.703+633.252	test-mae:639.877+639.427
[60]	train-mae:601.668+601.235	test-mae:609.71+609.277
[70]	train-mae:587.401+586.975	test-mae:597.233+596.806
[80]	train-mae:579.622+579.201	test-mae:590.932+590.507
[90]	train-mae:574.819+574.401	test-mae:587.372+586.949
[100]	train-mae:571.05+570.635	test-mae:584.806+584.384
[110]	train-mae:568.044+567.631	test-mae:582.942+582.521
[120]	train-mae:565.44+565.028	test-mae:581.534+581.115
[130]	train-mae:563.407+562.997	test-mae:580.494+580.075
[140]	train-mae:561.364+560.955	test-mae:579.554+579.135
[150]	train-mae:559.599+559.192	test-mae:578.813+578.395
[160]	train-mae:557.825+557.418	test-mae:578.073+577.656
[170]	train-mae:556.212+555.807	test-mae:577.494+5

In [86]:
gbm = xgb.train(params, T_train_xgb, best_nrounds)
y_pred = gbm.predict(X_test_xgb)

In [88]:
y_pred = np.exp(y_pred)
output = pd.DataFrame({
        "ID": df_test["id"],
        "loss": y_pred[:]
    })
output.to_csv("../input/xgb_output.csv", index=False)