In [1]:
import sys
sys.path.append("../")
import pandas as pd
import numpy as np
from dateutil.parser import parse
from sklearn import metrics
import datetime

import model_ml as mm
import feat_engineering as fe
from param_config import config

In [2]:
# Read the raw training and prediction tables from the CSV paths held in the
# project configuration.
dfTrain, dfPred = (
    pd.read_csv(csv_path)
    for csv_path in (config.original_train_data_path, config.original_pred_data_path)
)
# Candidate feature columns: every column of the prediction table except the
# first one (presumably the ID column -- verify against the CSV layout).
predictors = dfPred.columns[1:].tolist()

In [5]:
def FeatAll(train,pred,predictors=None):
    """Build the engineered feature tables for the training and prediction sets.

    Both frames are stacked so that quantile bins and the ``fe`` helper see the
    same rows, then split back apart by ``ID``.

    Parameters
    ----------
    train, pred : pandas.DataFrame
        Raw tables. Both must contain ``ID``, ``age``, ``date`` and the
        ``PartI_*`` / ``PartII_*`` columns referenced below.
    predictors : list of str, optional
        Columns handed to ``fe.pcent_single_col``. Defaults to every column of
        ``pred`` except the first, so the result no longer depends on whatever
        the notebook-level ``predictors`` variable holds when the cell is run.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Engineered rows whose ``ID`` occurs in ``train`` and in ``pred``
        respectively. ``age`` and ``date`` are replaced by decile dummies.
    """
    if predictors is None:
        # Same expression the notebook uses on the raw prediction table.
        predictors = pred.columns.tolist()[1:]

    dfAll = pd.concat([train,pred])
    # Express the date as whole days elapsed since 2017-10-09.
    dfAll['date'] = (pd.to_datetime(dfAll['date']) - pd.Timestamp('2017-10-09')).dt.days
    # Computed before binning, so the helper sees numeric age / date values.
    dfPcent = fe.pcent_single_col(dfAll,predictors)

    # Replace age and date by one-hot encoded decile bins (age first, then date).
    for col in ('age','date'):
        bin_labels = ['%s_bin_%d'%(col,i) for i in range(10)]
        binned = pd.qcut(dfAll[col],q=10,labels=bin_labels)
        dfAll = pd.concat([dfAll,pd.get_dummies(binned)],axis=1)
        del dfAll[col]
    dfAll = pd.concat([dfAll,dfPcent],axis=1)

    # Pairwise ratios; a zero denominator yields inf/NaN, which is passed through.
    for numerator,denominator in [('PartI_6','PartI_5'),('PartI_7','PartI_5'),('PartII_3','PartII_2'),('PartII_4','PartII_2')]:
        dfAll[numerator+'_divided_'+denominator] = dfAll[numerator]/dfAll[denominator]

    # Row-wise summary statistics per column group.
    # NOTE(review): the PartII group omits PartII_2 while PartI uses 1-4 -- confirm intended.
    groups = {'PartI':['PartI_1','PartI_2','PartI_3','PartI_4'],'PartII':['PartII_1','PartII_3','PartII_4']}
    for key,group in groups.items():
        dfAll['group_'+key+'_std'] = dfAll[group].std(axis=1)
        dfAll['group_'+key+'_mean'] = dfAll[group].mean(axis=1)
        dfAll['group_'+key+'_median'] = dfAll[group].median(axis=1)

    # Split by ID membership; an ID present in both inputs lands in both outputs.
    train_feats = dfAll.loc[dfAll['ID'].isin(train['ID'])]
    pred_feats = dfAll.loc[dfAll['ID'].isin(pred['ID'])]

    return train_feats,pred_feats
    

In [6]:
# Swap the raw frames for their engineered versions. This cell is not
# idempotent: FeatAll expects the raw 'age' / 'date' columns it deletes.
dfTrain, dfPred = FeatAll(train=dfTrain, pred=dfPred)

# Model inputs are all engineered columns except the identifier and the target.
predictors = list(dfPred.columns)
for non_feature in ('ID', 'Y'):
    predictors.remove(non_feature)

In [None]:
# Fit the regression model through the project's k-fold XGBoost helper.
# Later cells read 'ID', 'target' and 'score' from test_result and per-fold
# 'Score_<k>' columns from result; imp is presumably feature importance --
# confirm against model_ml.xgb_kfold.
test_result,result,imp = mm.xgb_kfold(dfTrain,dfPred,predictors)

[0]	train-rmse:5.26819	eval-rmse:5.47414
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 10 rounds.
[100]	train-rmse:2.28926	eval-rmse:2.63201
[200]	train-rmse:1.44285	eval-rmse:1.91475
[300]	train-rmse:1.2433	eval-rmse:1.77715
[400]	train-rmse:1.17687	eval-rmse:1.74853
[500]	train-rmse:1.13721	eval-rmse:1.73594
[600]	train-rmse:1.10709	eval-rmse:1.73079
[700]	train-rmse:1.08008	eval-rmse:1.72596
Stopping. Best iteration:
[691]	train-rmse:1.0827	eval-rmse:1.72582

Best tree is 692, performance is 1.082703, 1.725818
[0]	train-rmse:5.34442	eval-rmse:5.16923
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 10 rounds.
[100]	train-rmse:2.37039	eval-rmse:2.17527
[200]	train-rmse:1.53004	eval-rmse:1.3759
[300]	train-rmse:1.32505	eval-rmse:1.25087
Stopping. Best iteration:
[352]	train-rmse:1.28005	eval-rmse:1.24197

Best tree is 

In [None]:
# Derive a binary-classification copy of the training set: Y becomes 1 for rows
# at or above the 85th percentile of the original target and 0 otherwise.
dfTrain_bin = dfTrain.copy()
high_target_cut = dfTrain_bin['Y'].quantile(0.85)
dfTrain_bin['Y'] = (dfTrain_bin['Y'] >= high_target_cut).astype('int64')

In [None]:
# XGBoost settings for the "is this a high-target row" classifier.
params = {
    'max_depth': 3,
    'eta': 0.01,
    'silent': 0,
    'objective': 'binary:logistic',
    'lambda': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'logloss',
}
# Note: imp is rebound here, replacing the value returned by the regression run.
test_result_bin, result_bin, imp = mm.xgb_kfold(
    dfTrain_bin,
    dfPred,
    predictors,
    params=params,
)

In [None]:
# Attach the classifier output to the regression hold-out results by ID.
test_result_bin['bin_score'] = test_result_bin['score']
total_test = test_result.merge(
    test_result_bin[['ID', 'bin_score']],
    how='inner',
    on='ID',
)

# Reference value: 85th percentile of the true target. (An alternative once
# tried here was the median of the targets at or above that percentile.)
change_value = total_test['target'].quantile(0.85)

# Rows the classifier ranks in its top 15% are lifted up to the reference value.
likely_high = total_test['bin_score'].ge(total_test['bin_score'].quantile(0.85))
total_test.loc[likely_high & total_test['score'].lt(change_value), 'score'] = change_value

# Rows the classifier ranks in its bottom 70% are capped at the reference value.
likely_low = total_test['bin_score'].lt(total_test['bin_score'].quantile(0.7))
total_test.loc[likely_low & total_test['score'].ge(change_value), 'score'] = change_value

blended_mse = metrics.mean_squared_error(total_test['target'], total_test['score'])
print("Test MSE:", blended_mse)

In [None]:
# NOTE(review): this rebuilds total_test from the unadjusted regression scores,
# so any score overrides applied to the previous total_test are dropped before
# the frame is exported -- confirm that is intended.
test_result_bin['bin_score'] = test_result_bin['score']
total_test = test_result.merge(
    test_result_bin[['ID', 'bin_score']],
    how='inner',
    on='ID',
)

In [None]:
# Number of per-fold prediction columns to average; presumably must match the
# fold count used inside mm.xgb_kfold -- verify.
n_splits = 5
other_note = ''
fold_columns = ['Score_%d'%i for i in range(1,n_splits+1)]

# Average the per-fold predictions of both models and join them by ID.
result['score'] = result[fold_columns].mean(axis=1)
result_bin['bin_score'] = result_bin[fold_columns].mean(axis=1)
total_result = result.merge(result_bin[['ID', 'bin_score']], how='inner', on='ID')

# Apply the same lift / cap rule as on the hold-out set. change_value comes
# from the earlier evaluation cell, so that cell must have been run first.
pred_likely_high = total_result['bin_score'].ge(total_result['bin_score'].quantile(0.85))
total_result.loc[pred_likely_high & total_result['score'].lt(change_value), 'score'] = change_value
pred_likely_low = total_result['bin_score'].lt(total_result['bin_score'].quantile(0.7))
total_result.loc[pred_likely_low & total_result['score'].ge(change_value), 'score'] = change_value

# Write the headerless submission file and the hold-out results, both stamped
# with today's date.
submit = total_result[['ID', 'score']]
today = datetime.date.today().strftime('%Y-%m-%d')
submit_path = '../../Submission/submit_%s'%today+other_note+'.csv'
test_path = '../../Submission/test/test_result_%s'%today+other_note+'.csv'
submit.to_csv(submit_path, header=False, index=False)
total_test.to_csv(test_path, index=False)

In [None]:
# Mean squared error of the regression model on hold-out rows whose true
# target is above 7. The slice is copied so that adding the helper column does
# not write into a view of test_result (avoids SettingWithCopyWarning).
tmp = test_result.loc[test_result['target']>7].copy()
tmp['power'] = np.power(tmp['target']-tmp['score'],2)
print(tmp['power'].mean())

In [None]:
# Mean squared error of the regression model on hold-out rows whose true
# target is at most 7. The slice is copied so that adding the helper column
# does not write into a view of test_result (avoids SettingWithCopyWarning).
tmp = test_result.loc[test_result['target']<=7].copy()
tmp['power'] = np.power(tmp['target']-tmp['score'],2)
print(tmp['power'].mean())

In [None]:
# Summary statistics of the hold-out result columns.
test_result.describe()

In [None]:
# Show only the highest-scored hold-out rows instead of dumping the whole frame.
test_result.sort_values('score',ascending=False).head(20)