In [34]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp
import gc

from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

sns.set_style('dark')

import warnings
warnings.filterwarnings('ignore')

SEED = 12313
np.random.seed(SEED)

# Load Files

In [19]:
train = pd.read_csv('../data/raw/4b699168-4-here_dataset/train.csv')
test  = pd.read_csv('../data/raw/4b699168-4-here_dataset/test.csv')
sub   = pd.read_csv('../data/raw/4b699168-4-here_dataset/sample_submission.csv')

In [21]:
train.columns

Index(['Id', 'DetectedCamera', 'AngleOfSign', 'SignAspectRatio', 'SignWidth',
       'SignHeight', 'SignFacing (Target)'],
      dtype='object')

In [22]:
# encode target variable
lbl = LabelEncoder()
lbl.fit(train['SignFacing (Target)'])

train['SignFacing (Target)'] = lbl.transform(train['SignFacing (Target)'])

In [23]:
# encoded classes
lbl.classes_

array(['Front', 'Left', 'Rear', 'Right'], dtype=object)

In [24]:
# rename target column
train = train.rename(columns={
    'SignFacing (Target)': 'Target'
})

In [25]:
# concatenate train and test data
data = pd.concat((train, test))
del train, test
gc.collect()

train_mask = data.Target.notnull()

** Plot decision boundary **

In [9]:
from matplotlib.colors import ListedColormap

c0=sns.color_palette()[0]
c1=sns.color_palette()[1]
c2=sns.color_palette()[2]

cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

def points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=True, colorscale=cmap_light, cdiscrete=cmap_bold, alpha=0.1, psize=10, zfunc=False, predicted=False):
    h = .02
    X=np.concatenate((Xtr, Xte))
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))

    #plt.figure(figsize=(10,6))
    if zfunc:
        p0 = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 0]
        p1 = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        Z=zfunc(p0, p1)
    else:
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    ZZ = Z.reshape(xx.shape)
    if mesh:
        plt.pcolormesh(xx, yy, ZZ, cmap=cmap_light, alpha=alpha, axes=ax)
    if predicted:
        showtr = clf.predict(Xtr)
        showte = clf.predict(Xte)
    else:
        showtr = ytr
        showte = yte
    ax.scatter(Xtr[:, 0], Xtr[:, 1], c=showtr-1, cmap=cmap_bold, s=psize, alpha=alpha,edgecolor="k")
    # and testing points
    ax.scatter(Xte[:, 0], Xte[:, 1], c=showte-1, cmap=cmap_bold, alpha=alpha, marker="s", s=psize+10)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    return ax,xx,yy

** Details of this experiment **

1. Only consider AngleOfSign as the predictor variable.
2. Plot decision surface to check how smooth it is.
3. Evaluate model using different metrics like accuracy, logloss etc.
4. Go throught the instances which were misclassified.

In [11]:
# class distribution
data.loc[train_mask, 'Target'].value_counts(normalize=True)

Front    0.554164
Rear     0.406964
Right    0.021333
Left     0.017539
Name: Target, dtype: float64

** 55% of the signs are in front of the vehicle, 40% are at rear end, 2% on right and 1.7% on left. **

In [26]:
X = data.loc[train_mask, ['AngleOfSign']]
y = data.loc[train_mask, 'Target']

Xtest = data.loc[~train_mask, ['AngleOfSign']]

** Split data into training and test set **

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.25, random_state=SEED)

** Train a RF model **

In [31]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=2)
rf = rf.fit(X_train, y_train)

preds = rf.predict(X_test)
pred_prob = rf.predict_proba(X_test)
print('Classification Report: \n', classification_report(y_test, preds))
print('\nLogloss on test examples: ', log_loss(y_test, pred_prob))

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.96      0.99      0.97      5332
        1.0       0.50      0.14      0.22       169
        2.0       0.97      0.98      0.98      3916
        3.0       0.50      0.35      0.41       205

avg / total       0.95      0.95      0.95      9622


Logloss on test examples:  0.265303645568


In [35]:
# Visualize one of the tree in the random forest
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
tree.export_graphviz(clf, out_file='../reports/figures/tree.dot')

In [47]:
# analyze predictions
pred_analysis = pd.DataFrame({
    'predictions': lbl.inverse_transform(y_test.astype(np.int)),
    'front_prob':  pred_prob[:, 0],
    'left_prob':   pred_prob[:, 1],
    'rear_prob':  pred_prob[:, 2],
    'right_prob' :  pred_prob[:, 3]
})

In [83]:
X_test.iloc[663:664]

Unnamed: 0,AngleOfSign
16890,271


In [84]:
y_test.iloc[663]

1.0

In [85]:
data.loc[train_mask, :].iloc[663]

AngleOfSign                                      24
DetectedCamera                                Front
Id                 2c91809e5aba7d25015acdcb0e75299b
SignAspectRatio                                0.98
SignHeight                                      112
SignWidth                                       110
Target                                            0
Name: 663, dtype: object

In [58]:
pred_analysis[['predictions', 'front_prob', 'left_prob', 'rear_prob', 'right_prob']]\
            .to_csv('../reports/prediction_angle.csv', index=False)