In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import gc

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


from tqdm import tqdm_notebook

import time
import xgboost as xgb

# Use seaborn's 'dark' style for figures drawn in this notebook.
sns.set_style('dark')

# One seed, reused below for NumPy's global RNG and for the models'
# `seed` / `random_state` arguments.
SEED = 53525
np.random.seed(SEED)

# Execute the project's helper scripts from this kernel.
# NOTE(review): `load_file` and `get_train_test_split`, used further down, are
# not defined in this notebook and presumably come from these scripts -- confirm.
%run ../src/data/make_dataset.py
%run ../src/features/util.py
%run ../src/models/cross_validation.py



In [2]:
# Integer code for each camera name; the position in the list is the code.
detected_camera_map = {
    camera: code
    for code, camera in enumerate(['Front', 'Left', 'Rear', 'Right'])
}

In [3]:
def reload_data():
    """Load the processed dataset and flag its labelled rows.

    The ``DetectedCamera`` column is replaced by its integer code from
    ``detected_camera_map``.

    Returns
    -------
    (data, train_mask)
        The loaded frame, and a boolean mask that is True where ``Target``
        is not null.
    """
    frame = load_file('../data/processed/processed.feather')
    frame['DetectedCamera'] = frame['DetectedCamera'].map(detected_camera_map)

    labelled = frame['Target'].notnull()

    return frame, labelled

In [4]:
def prepare_data(data, train_mask=None):
    """Return a copy of ``data`` with engineered features added.

    Added columns:

    * ``sign_area``         -- ``SignHeight * SignWidth``
    * ``sign_perimeter``    -- ``2 * (SignHeight + SignWidth)``
    * ``diff_height_width`` -- ``SignHeight - SignWidth``
    * ``closest_quadrant``  -- ``AngleOfSign`` snapped to 0/90/180/270/360
    * ``mean_target``       -- mean of ``Target`` over the training rows that
      share the row's ``(DetectedCamera, closest_quadrant)`` pair

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``SignHeight``, ``SignWidth``, ``AngleOfSign``,
        ``DetectedCamera`` and ``Target``.
    train_mask : boolean Series/array aligned with ``data``, optional
        Rows whose ``Target`` may be used for ``mean_target``. Defaults to
        ``data.Target.notnull()``, so the function no longer depends on a
        notebook-level ``train_mask`` variable.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified. Rows whose
        ``(DetectedCamera, closest_quadrant)`` pair never occurs among the
        training rows get ``NaN`` in ``mean_target``.
    """

    def get_closest_quadrant(angle):
        """Snap ``angle`` to the nearer bound of its 90-degree sector (ties go up)."""
        # Sectors are [0, 90], (90, 180], (180, 270]; every other value,
        # including anything outside [0, 360], is treated as the 270-360 sector.
        if 0 <= angle <= 90:
            lower = 0
        elif 90 < angle <= 180:
            lower = 90
        elif 180 < angle <= 270:
            lower = 180
        else:
            lower = 270

        upper = lower + 90
        return lower if (angle - lower) < (upper - angle) else upper

    st = time.time()

    if train_mask is None:
        train_mask = data.Target.notnull()

    data = data.assign(
        sign_area=data.SignHeight * data.SignWidth,
        sign_perimeter=2 * (data.SignHeight + data.SignWidth),
        diff_height_width=data.SignHeight - data.SignWidth,
        closest_quadrant=data.AngleOfSign.map(get_closest_quadrant),
    )

    group_keys = ['DetectedCamera', 'closest_quadrant']

    # NOTE(review): each training row's own Target contributes to its group
    # mean, so this encoding is not out-of-fold.
    mean_target_by_quadrant = (
        data.loc[train_mask]
            .groupby(group_keys)['Target']
            .mean()
            .rename('mean_target')
    )

    # A left join on the group keys replaces the former row-wise `.ix` lookup
    # (`.ix` was removed in pandas 1.0). Dropping any existing `mean_target`
    # first keeps the function re-runnable on its own output.
    data = (
        data.drop('mean_target', axis=1, errors='ignore')
            .join(mean_target_by_quadrant, on=group_keys)
    )

    et = time.time()
    print('Took: {} seconds to prepare data'.format((et - st)))

    return data

In [5]:
data, train_mask = reload_data()
data = prepare_data(data)

Took: 8.300810098648071 seconds to prepare data


In [6]:
def get_rf_predictions(X_train, y_train, X_test, params=None, ntrees=1000):
    """Fit a RandomForestClassifier and return class probabilities for ``X_test``.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test : features passed to ``predict_proba``.
    params : dict, optional
        Keyword arguments for ``RandomForestClassifier``. ``None`` means no
        extra arguments. The dict is copied, so the caller's object is never
        modified.
    ntrees : int
        Value used for ``n_estimators`` when ``params`` does not set it.

    Returns
    -------
    The result of ``clf.predict_proba(X_test)``.
    """
    # Copy first: the original crashed on params=None and wrote the default
    # `n_estimators` back into the caller's dict.
    params = dict(params or {})
    params.setdefault('n_estimators', ntrees)

    clf = RandomForestClassifier(**params)
    clf.fit(X_train, y_train)

    return clf.predict_proba(X_test)

def get_xgb_predictions(X_train, y_train, X_test, params=None, ntrees=1000):
    """Fit an ``xgb.XGBClassifier`` and return class probabilities for ``X_test``.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test : features passed to ``predict_proba``.
    params : dict, optional
        Keyword arguments for ``xgb.XGBClassifier``. ``None`` means no extra
        arguments. The dict is copied, so the caller's object is never
        modified.
    ntrees : int
        Value used for ``n_estimators`` when ``params`` does not set it.

    Returns
    -------
    The result of ``clf.predict_proba(X_test)``.
    """
    # Copy first: the original crashed on params=None and wrote the default
    # `n_estimators` back into the caller's dict.
    params = dict(params or {})
    params.setdefault('n_estimators', ntrees)

    clf = xgb.XGBClassifier(**params)
    clf.fit(X_train, y_train)

    return clf.predict_proba(X_test)

def get_etc_predictions(X_train, y_train, X_test, params=None, ntrees=1000):
    """Fit an ExtraTreesClassifier and return class probabilities for ``X_test``.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test : features passed to ``predict_proba``.
    params : dict, optional
        Keyword arguments for ``ExtraTreesClassifier``. ``None`` means no
        extra arguments. The dict is copied, so the caller's object is never
        modified.
    ntrees : int
        Value used for ``n_estimators`` when ``params`` does not set it.

    Returns
    -------
    The result of ``clf.predict_proba(X_test)``.
    """
    # Copy first: the original crashed on params=None and wrote the default
    # `n_estimators` back into the caller's dict.
    params = dict(params or {})
    params.setdefault('n_estimators', ntrees)

    clf = ExtraTreesClassifier(**params)
    clf.fit(X_train, y_train)

    return clf.predict_proba(X_test)

In [7]:
def predict_out_of_folds(X, y, Xtest=None, params=None, func_name='rf', n_splits=10):
    """Produce class-probability predictions from one base model.

    Two modes, selected by ``Xtest``:

    * ``Xtest is None`` -- out-of-fold predictions for ``X``: the rows are
      split with ``StratifiedKFold(n_splits)``, and each fold is predicted by
      a model fitted on the remaining folds.
    * ``Xtest`` given -- the model is fitted once on all of ``X, y`` and used
      to predict ``Xtest``.

    Parameters
    ----------
    X, y : pandas objects indexed positionally with ``.iloc``.
    Xtest : features to predict, or None for out-of-fold mode.
    params : dict of keyword arguments forwarded to the model helper.
    func_name : 'rf' or 'xgb'; any other value selects the extra-trees helper.
    n_splits : number of stratified folds in out-of-fold mode.

    Returns
    -------
    numpy array of predicted probabilities, one row per predicted sample.
    """
    # Single dispatch point; unknown names keep falling back to extra-trees,
    # exactly as the original if/elif/else chains did.
    predictors = {'rf': get_rf_predictions, 'xgb': get_xgb_predictions}
    predict = predictors.get(func_name, get_etc_predictions)

    if Xtest is not None:
        return predict(X, y, Xtest, params)

    kf = StratifiedKFold(n_splits=n_splits)

    # One column per class present in y (was hard-coded to 4).
    n_classes = len(np.unique(y))
    y_pred = np.zeros(shape=(len(y), n_classes))

    # kf.split() is a generator, so pass the total for a meaningful progress bar.
    for train_inds, test_inds in tqdm_notebook(kf.split(X, y), total=n_splits):
        y_pred[test_inds, :] = predict(X.iloc[train_inds], y.iloc[train_inds],
                                       X.iloc[test_inds], params)

    return y_pred

In [8]:
# Columns fed to the base models.
ff = [
    'AngleOfSign',
    'DetectedCamera',
    'mean_target',
    'sign_area',
    'sign_perimeter',
    'diff_height_width',
]

# Rows selected by train_mask form the training set; the remaining rows are
# the ones to predict.
X, y = data.loc[train_mask, ff], data.loc[train_mask, 'Target']

Xtest = data.loc[~train_mask, ff]

In [9]:
# Hold out 20% of the labelled rows, stratified on the target.
# NOTE(review): get_train_test_split is not defined in this notebook;
# presumably it comes from one of the %run scripts -- confirm its behaviour there.
params = dict(stratify=y, test_size=.2, random_state=SEED)

X_train, X_test, y_train, y_test = get_train_test_split(X, y, **params)

In [20]:
st = time.time()

# Hyper-parameters of the three base models.
params_xgb = {
    'n_estimators': 300,
    'learning_rate': .1,
    'max_depth': 2,
    'seed': SEED
}

params_rf = {
    'n_estimators': 1000,
    'max_depth': 10,
    'min_samples_split': 2,
    'random_state': SEED,
    'n_jobs': -1
}

params_etc = {
    'n_estimators': 1000,
    'max_depth': 15,
    'min_samples_split': 2,
    'random_state': SEED,
    'n_jobs': -1
}

# (func_name understood by predict_out_of_folds, its parameters), in the
# order the meta-feature columns are laid out.
base_models = [('xgb', params_xgb), ('rf', params_rf), ('etc', params_etc)]

# Each base model contributes one probability column per class:
# y_<model>_0 ... y_<model>_3.
n_classes = 4
stack_columns = ['y_{}_{}'.format(model_name, k)
                 for model_name, _ in base_models
                 for k in range(n_classes)]

# Float dtype up front: without it the frames are created with object dtype.
train_stack = pd.DataFrame(index=X_train.index, columns=stack_columns, dtype=float)
test_stack  = pd.DataFrame(index=X_test.index, columns=stack_columns, dtype=float)

for model_name, model_params in base_models:
    model_columns = ['y_{}_{}'.format(model_name, k) for k in range(n_classes)]

    # Out-of-fold predictions for the training part of the split ...
    train_stack.loc[:, model_columns] = predict_out_of_folds(X_train, y_train, Xtest=None,
                                                             params=model_params, func_name=model_name)

    # ... and predictions for the hold-out part from a model fitted on all of X_train.
    test_stack.loc[:, model_columns] = predict_out_of_folds(X_train, y_train, Xtest=X_test,
                                                            params=model_params, func_name=model_name)

et = time.time()

print('Took: {} seconds to create out of fold predictions'.format((et - st)))




Took: 233.0376102924347 seconds to create out of fold predictions


In [21]:
# Log loss of each base model's out-of-fold predictions on the training split.
for label, tag in (('XGB: ', 'xgb'), ('RF: ', 'rf'), ('ETC: ', 'etc')):
    proba_columns = ['y_{}_{}'.format(tag, k) for k in range(4)]
    print(label, log_loss(y_train, train_stack[proba_columns]))

XGB:  0.104972179347
RF:  0.106489975217
ETC:  0.116166066104


**Fit a linear model**

In [22]:
# Level-2 model: logistic regression on the stacked base-model probabilities,
# scored on the hold-out split.
lr = LogisticRegression(C=1.).fit(train_stack, y_train)

y_pred = lr.predict_proba(test_stack)
print(log_loss(y_test, y_pred))

0.120741755188


In [13]:
# Coefficients of the logistic-regression stacker `lr`.
# NOTE(review): rows presumably follow lr.classes_ and columns follow
# train_stack's column order (scikit-learn's coef_ layout) -- confirm.
pd.DataFrame(lr.coef_)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.382144,-0.750422,-1.741581,-0.119068,2.321783,-0.356215,-2.517021,-0.677472,2.176221,-1.482651,0.247159,-2.169655
1,-0.07488,3.354511,-3.261247,-1.543044,-1.437488,0.869907,0.204865,-1.161944,-2.728959,2.748042,-0.255869,-1.287874
2,-0.719547,-2.156341,2.610192,-1.07182,-1.200954,-0.230935,0.440337,-0.345965,-2.897556,-1.287473,2.829013,0.0185
3,-0.526299,-1.739988,-1.251682,1.923614,-1.297479,-0.896975,-1.20065,1.800749,-1.617711,-0.86272,-1.921488,2.807564


**Fit a non-linear model**

Concatenate meta-features with original features

In [23]:
# Level-2 design matrices: original features plus the stacked probabilities.
# The builtin `float` replaces the `np.float` alias (removed in NumPy 1.24).
_train = pd.concat((X_train, train_stack.astype(float)), axis='columns')
_test  = pd.concat((X_test,  test_stack.astype(float)), axis='columns')

# No random_state here: the folds are not shuffled, so a seed has no effect,
# and current scikit-learn raises when random_state is given with shuffle=False.
skf = StratifiedKFold(n_splits=10)

# 10-fold CV of the XGBoost stacker on the training part of the split.
fold_scores = []
for itr, ite in tqdm_notebook(skf.split(_train, y_train)):
    model = xgb.XGBClassifier(n_estimators=100, max_depth=2, seed=SEED)
    model.fit(_train.iloc[itr], y_train.iloc[itr])
    y_pred = model.predict_proba(_train.iloc[ite])

    fold_score = log_loss(y_train.iloc[ite], y_pred)
    print('Fold score: {}'.format(fold_score))
    fold_scores.append(fold_score)
    print('='*75)

print('Mean cv score: {0}, std cv score: {1}'.format(np.mean(fold_scores), np.std(fold_scores)))

Fold score: 0.10056597830125966
Fold score: 0.11535855715345521
Fold score: 0.10128152306944334
Fold score: 0.10899580670367157
Fold score: 0.10754092219528026
Fold score: 0.10225083642317764
Fold score: 0.08325148095708303
Fold score: 0.10357332693210478
Fold score: 0.10788029757200646
Fold score: 0.11120837921647114

Mean cv score: 0.10419071085239531, std cv score: 0.008283067406186981


In [24]:
# Fit the XGBoost stacker on the whole training split and score it on the
# hold-out split.
model = xgb.XGBClassifier(max_depth=2, seed=SEED).fit(_train, y_train)
y_pred = model.predict_proba(_test)

print('Log loss on hold out set: {}'.format(log_loss(y_test, y_pred)))

Log loss on hold out set: 0.10685389464947592


**Full Training**

In [25]:
st = time.time()

# Hyper-parameters of the three base models.
params_xgb = {
    'n_estimators': 300,
    'learning_rate': .1,
    'max_depth': 2,
    'seed': SEED
}

params_rf = {
    'n_estimators': 1000,
    'max_depth': 10,
    'min_samples_split': 2,
    'random_state': SEED,
    'n_jobs': -1
}

params_etc = {
    'n_estimators': 1000,
    'max_depth': 15,
    'min_samples_split': 2,
    'random_state': SEED,
    'n_jobs': -1
}

# (func_name understood by predict_out_of_folds, its parameters), in the
# order the meta-feature columns are laid out.
base_models = [('xgb', params_xgb), ('rf', params_rf), ('etc', params_etc)]

# Each base model contributes one probability column per class:
# y_<model>_0 ... y_<model>_3.
n_classes = 4
stack_columns = ['y_{}_{}'.format(model_name, k)
                 for model_name, _ in base_models
                 for k in range(n_classes)]

# Float dtype up front: without it the frames are created with object dtype.
train_stack = pd.DataFrame(index=X.index, columns=stack_columns, dtype=float)
test_stack  = pd.DataFrame(index=Xtest.index, columns=stack_columns, dtype=float)

for model_name, model_params in base_models:
    model_columns = ['y_{}_{}'.format(model_name, k) for k in range(n_classes)]

    # Out-of-fold predictions for every labelled row ...
    train_stack.loc[:, model_columns] = predict_out_of_folds(X, y, Xtest=None,
                                                             params=model_params, func_name=model_name)

    # ... and predictions for the unlabelled rows from a model fitted on all of X.
    test_stack.loc[:, model_columns] = predict_out_of_folds(X, y, Xtest=Xtest,
                                                            params=model_params, func_name=model_name)

et = time.time()

print('Took: {} seconds to create out of fold predictions'.format((et - st)))




Took: 276.25753140449524 seconds to create out of fold predictions


In [26]:
# Level-2 design matrices for the full run: original features plus the
# stacked probabilities. The builtin `float` replaces the `np.float` alias
# (removed in NumPy 1.24).
_train = pd.concat((X, train_stack.astype(float)), axis='columns')
_test  = pd.concat((Xtest,  test_stack.astype(float)), axis='columns')

# Final stacker, fitted on every labelled row and applied to the unlabelled rows.
model = xgb.XGBClassifier(max_depth=2, seed=SEED)
model.fit(_train, y)
y_pred = model.predict_proba(_test)

In [27]:
# Fill the sample submission with the final predicted probabilities and save it.
# NOTE(review): assumes the columns of y_pred are ordered Front, Left, Rear,
# Right -- confirm against how Target was encoded.
class_columns = ['Front', 'Left', 'Rear', 'Right']

sub = pd.read_csv('../data/raw/4b699168-4-here_dataset/sample_submission.csv')
sub.loc[:, class_columns] = y_pred
sub.to_csv('../submissions/predict_sign/xgb_stacking_1.csv', index=False)