In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold as SKF
from sklearn.metrics import log_loss as ll
import gc
%matplotlib inline
from matplotlib import pyplot as plt

In [4]:
# Read the training and test tables from the local data/ directory.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [5]:
# Summary statistics (count, mean, std, quartiles) for the training frame.
df_train.describe()

Unnamed: 0,AngleOfSign,SignAspectRatio,SignWidth,SignHeight
count,38485.0,38485.0,38485.0,38485.0
mean,132.930986,0.901241,92.922957,104.99314
std,98.042472,0.226068,52.399274,53.347424
min,1.0,0.26,19.0,23.0
25%,42.0,0.77,54.0,65.0
50%,80.0,0.93,80.0,96.0
75%,228.0,1.04,118.0,132.0
max,360.0,2.79,589.0,513.0


In [7]:
# Show the first training row to inspect the column layout and raw values.
df_train.head(1)

Unnamed: 0,Id,DetectedCamera,AngleOfSign,SignAspectRatio,SignWidth,SignHeight,SignFacing (Target)
0,2c9180975a056a64015a1e0a52e57021,Rear,195,1.02,46,45,Rear


In [8]:
# (rows, columns) of the training frame.
df_train.shape

(38485, 7)

In [9]:
# (rows, columns) of the test frame.
df_test.shape

(31485, 6)

In [10]:
# Show the first test row to inspect the column layout and raw values.
df_test.head(1)

Unnamed: 0,Id,DetectedCamera,AngleOfSign,SignAspectRatio,SignWidth,SignHeight
0,2c9180975a056a64015a1e10d3f270fe,Right,67,0.63,107,169


In [17]:
# List the training column names.
# NOTE(review): this cell sits above the cell that renames the columns, but its
# execution count is higher, so the stored output was produced after the rename.
# On a fresh top-to-bottom run the last column would still carry its raw CSV name.
df_train.columns

Index([u'Id', u'DetectedCamera', u'AngleOfSign', u'SignAspectRatio',
       u'SignWidth', u'SignHeight', u'Target'],
      dtype='object')

In [15]:
# Replace the training column labels positionally; the only effective change
# is that the label column gets the short name 'Target'.
df_train.columns = [
    'Id',
    'DetectedCamera',
    'AngleOfSign',
    'SignAspectRatio',
    'SignWidth',
    'SignHeight',
    'Target',
]

In [16]:
# Set the row identifiers and the label column aside before they are removed
# from the feature frames.
train_id = df_train['Id'].values
test_id = df_test['Id'].values
Y = df_train['Target']

In [18]:
# Strip the columns that must not be used as model inputs.  The drop mutates
# each frame in place, so re-running this cell raises because the columns are
# already gone.
for frame, non_features in ((df_train, ['Id', 'Target']), (df_test, ['Id'])):
    frame.drop(non_features, axis=1, inplace=True)

In [19]:
# Id_Perc = row position divided by the number of rows in the same frame,
# i.e. the relative position of each record within its file (0 <= value < 1).
for frame in (df_train, df_test):
    n_rows = len(frame)
    frame['Id_Perc'] = pd.Series(range(n_rows)).astype('float') / n_rows

In [21]:
# Stack train on top of test so that every feature transformation below is
# applied to both in one pass.  pd.concat is used instead of DataFrame.append,
# which is deprecated and no longer exists in pandas 2.x.  Row labels are kept
# as-is (so they repeat across the two parts); the later split back into
# train/test is positional and does not depend on them.
df_full = pd.concat([df_train, df_test])

In [22]:
# Encode the camera name as an integer code.  The result is assigned back to
# the column explicitly: an in-place replace on `df_full.DetectedCamera` acts
# on an intermediate Series and is not guaranteed to reach df_full (it is a
# silent no-op under pandas copy-on-write).  Replacing already-encoded values
# changes nothing, so the cell can be re-run safely.
df_full['DetectedCamera'] = df_full['DetectedCamera'].replace(
    {'Front': 0, 'Left': 1, 'Rear': 2, 'Right': 3}
)

In [25]:
# Sanity check: distinct values left in DetectedCamera after the encoding step.
list(set(df_full.DetectedCamera.values))

[0, 1, 2, 3]

In [26]:
# Encode the class labels with the same name -> code mapping used for
# DetectedCamera and keep them as a plain integer array for sklearn / xgboost.
# Wrapping in pd.Series makes the cell re-runnable (Y is an ndarray after the
# first run) and avoids mutating, in place, a Series that was taken from
# df_train.  The explicit cast fails loudly if an unexpected label is present.
Y = pd.Series(Y).replace({'Front': 0, 'Left': 1, 'Rear': 2, 'Right': 3}).astype('int64').values

In [28]:
# Sanity check: distinct encoded class labels.
list(set(Y))

[0, 1, 2, 3]

In [29]:
# Convert AngleOfSign from degrees to radians, overwriting the column.
# Caution: the conversion is applied to whatever the column currently holds,
# so executing this cell twice converts twice.
df_full['AngleOfSign'] = np.radians(df_full['AngleOfSign'])

In [32]:
# Inspect the first five angles after the conversion to radians.
df_full.AngleOfSign[:5]

0    3.403392
1    3.543018
2    0.453786
3    3.473205
4    3.630285
Name: AngleOfSign, dtype: float64

In [33]:
# Basic derived features: sine / cosine of the (radian) sign angle and the
# sign's bounding-box area.
angle_rad = df_full['AngleOfSign']
df_full['Sin'] = np.sin(angle_rad)
df_full['Cos'] = np.cos(angle_rad)
df_full['SignArea'] = df_full['SignWidth'] * df_full['SignHeight']


In [34]:
# Further angle-derived features.  AngleOfSign holds radians at this point.
angle = df_full['AngleOfSign']

# Angle mirrored about the largest angle present in the combined data (this
# equals "full turn minus angle" only if a 360-degree reading exists).
# Series.max() is vectorised; the builtin max() would loop over every element
# in Python and gives order-dependent results if a NaN is present.
df_full['MirrorAngle'] = angle.max() - angle
df_full['sqrtAngle'] = angle ** 0.5

# Interactions between the angle and the sign geometry.
df_full['Angle_Ht'] = angle * df_full['SignHeight']
df_full['Angle_AR'] = angle * df_full['SignAspectRatio']

# Tangent (as Sin / Cos; very large in magnitude where Cos is close to zero)
# and the Sin * Cos product.
df_full['Tan'] = df_full['Sin'] / df_full['Cos']
df_full['SinCos'] = df_full['Sin'] * df_full['Cos']

In [35]:
# One-hot encode the camera code into dense indicator columns
# (DetectedCamera_<code>); the original column is replaced by them.
camera_columns = ['DetectedCamera']
df_full = pd.get_dummies(df_full, columns=camera_columns, sparse=False)

In [36]:
# Undo the stacking: the first len(df_train) rows are the training records,
# everything after them is the test set.  The split is positional.
n_train_rows = len(df_train)
df_train = df_full.iloc[:n_train_rows]
df_test = df_full.iloc[n_train_rows:]

In [37]:
# Wrap the engineered test features in an xgboost DMatrix (no labels); it is
# used for the final prediction step.
dtest=xgb.DMatrix(df_test)
# Booster configuration shared by the cross-validation folds and the final fit.
xgb_params = {
    # Task: 4-class classification returning per-class probabilities,
    # evaluated with multi-class log loss.
    'objective': 'multi:softprob',
    'num_class': 4,
    'eval_metric': 'mlogloss',

    # Tree shape and step size.
    'max_depth': 4,
    'min_child_weight': 3,
    'learning_rate': 0.05,

    # Row / column subsampling and L1 regularisation.
    'subsample': 0.9,
    'colsample_bytree': 0.67,
    'alpha': 0.02,

    # Reproducibility and logging.
    'seed': 619,
    'silent': 1,
}

In [38]:
# Upper bound on boosting rounds per fold (early stopping normally ends
# training sooner) and the number of stratified CV folds.
nrounds, kfolds = 2000, 5

In [39]:
# Out-of-fold class probabilities, one row per training Id.  The probability
# columns are created as floats (0.0) because the cross-validation loop fills
# them with float predictions; starting from integer zeros would rely on an
# implicit int -> float upcast on assignment, which newer pandas deprecates.
oof_train = pd.DataFrame({'ID': train_id, 'Front': 0.0, 'Left': 0.0, 'Rear': 0.0, 'Right': 0.0})


In [40]:
# Per-fold accumulators: best tree count and validation log loss.
best, score = [], []


In [41]:
# Stratified K-fold cross-validation.  Each fold trains with early stopping on
# its validation part, stores the out-of-fold probabilities in oof_train and
# records the fold's best tree count and validation log loss.
skf = SKF(n_splits=kfolds, shuffle=True, random_state=123)
class_columns = ["Front", "Left", "Rear", "Right"]  # order matches class ids 0..3

for fold_no, (fit_idx, val_idx) in enumerate(skf.split(df_train, Y), start=1):
    print('Fold {0}'.format(fold_no))

    y_val = Y[val_idx]
    dtrain = xgb.DMatrix(df_train.iloc[fit_idx], Y[fit_idx])
    dval = xgb.DMatrix(df_train.iloc[val_idx], y_val)

    # The last watchlist entry ('eval') is the one early stopping monitors.
    gbdt = xgb.train(xgb_params, dtrain, nrounds,
                     [(dtrain, 'train'), (dval, 'eval')],
                     verbose_eval=50,
                     early_stopping_rounds=25)

    # Predict the held-out rows using only the trees up to the best iteration.
    bst = gbdt.best_ntree_limit
    pred = gbdt.predict(dval, ntree_limit=bst)
    for col_pos, label in enumerate(class_columns):
        oof_train.loc[val_idx, label] = pred[:, col_pos]

    fold_loss = ll(y_val, pred)
    best.append(bst)
    score.append(fold_loss)

    # Release the fold's matrices and booster before the next fold.
    del dtrain, dval, gbdt
    gc.collect()

# Mean validation log loss and mean best tree count over all folds.
print(np.mean(score))
print(np.mean(best))


Fold 1
[0]	train-mlogloss:1.2953	eval-mlogloss:1.29548
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[50]	train-mlogloss:0.167513	eval-mlogloss:0.172438
[100]	train-mlogloss:0.094189	eval-mlogloss:0.101351
[150]	train-mlogloss:0.083485	eval-mlogloss:0.093707
[200]	train-mlogloss:0.078747	eval-mlogloss:0.092174
[250]	train-mlogloss:0.074537	eval-mlogloss:0.091433
[300]	train-mlogloss:0.070751	eval-mlogloss:0.091239
[350]	train-mlogloss:0.067008	eval-mlogloss:0.090772
Stopping. Best iteration:
[352]	train-mlogloss:0.066848	eval-mlogloss:0.090737

Fold 2
[0]	train-mlogloss:1.29517	eval-mlogloss:1.29536
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 25 rounds.
[50]	train-mlogloss:0.165759	eval-mlogloss:0.175093
[100]	train-mlogloss:0.091945	eval-mlogloss:0.108717
[150]	train-mlogloss:0.081377	ev

In [43]:
# Refit on the complete training set for the average number of rounds selected
# by cross-validation, then write the test-set class probabilities to CSV.
best_nrounds = int(round(np.mean(best)))
dtrain = xgb.DMatrix(df_train, Y)

# The watchlist holds only the training data and is there for progress logging.
# Early stopping is deliberately not requested here: with no held-out set it
# would monitor train-mlogloss, which carries no information about
# generalisation.  The round budget is fixed by best_nrounds instead.
watchlist = [(dtrain, 'train')]
gbdt = xgb.train(xgb_params, dtrain, best_nrounds, watchlist, verbose_eval=50)

# One probability column per class, in class-id order (0..3), which is the
# Front / Left / Rear / Right encoding used for Y.
pred = gbdt.predict(dtest)
pred = pd.DataFrame(pred, columns=['Front', 'Left', 'Rear', 'Right'])
pred['Id'] = test_id
pred = pred[['Id', 'Front', 'Left', 'Rear', 'Right']]
pred.to_csv('xgb_submission.csv', index=False)

[0]	train-mlogloss:1.29538
Will train until train-mlogloss hasn't improved in 25 rounds.
[50]	train-mlogloss:0.167823
[100]	train-mlogloss:0.094742
[150]	train-mlogloss:0.084706
[200]	train-mlogloss:0.080272
[250]	train-mlogloss:0.076562
