In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report

#models
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, 
                              AdaBoostClassifier, BaggingRegressor, VotingClassifier)
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression

# Echo the module docstring; inside an IPython/Jupyter kernel this is the
# auto-generated "Automatically created module ..." text, not project docs.
print(__doc__)

Automatically created module for IPython interactive environment


In [2]:
# Load the dataset from disk; the first CSV column is used as the row index.
data = pd.read_csv(
    'data/df.csv',
    index_col=0,
)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,number_1,number_2,intersect_loc,time+loc_15,target,time+log+angle
0,158500411793,158500437133,0.0,0,0,0
1,158500411793,158500611579,0.0,0,0,0
2,158500411793,158500670731,0.0,0,0,0
3,158500411793,158500730293,0.0,0,0,0
4,158500411793,158500730487,0.0,0,0,0


In [4]:
# Feature matrix: the three columns used as model inputs.
X = data.loc[:, ['intersect_loc', 'time+loc_15', 'time+log+angle']]

In [5]:
# Target vector: the `target` column.
y = data.loc[:, 'target']

In [6]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=2,
)

The two best models, after training a number of candidates and tuning their parameters, are shown below:

In [7]:
# Build an XGBoost classifier with default hyper-parameters, then train it.
# fit() returns the estimator itself, so `xgb` ends up bound to the fitted model.
xgb = XGBClassifier()
xgb = xgb.fit(X_train, y_train)

In [8]:
# Per-class precision / recall / F1 of the XGBoost model on the held-out set.
print(
    classification_report(
        y_true=y_test,
        y_pred=xgb.predict(X_test),
    )
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8568
           1       0.74      0.56      0.63        36

   micro avg       1.00      1.00      1.00      8604
   macro avg       0.87      0.78      0.82      8604
weighted avg       1.00      1.00      1.00      8604



In [9]:
# Soft-voting ensemble of four base classifiers.
# The random forest and XGBoost votes are weighted 1.5, KNN and SVC 1.
clf1 = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=0, min_samples_leaf=1)
clf2 = KNeighborsClassifier(n_neighbors=2)
# probability=True gives the SVC the predict_proba that soft voting needs.
# Those probability estimates come from an internal cross-validation that
# shuffles the data, so random_state is pinned (matching the forest's seed)
# to keep the ensemble reproducible across kernel restarts.
clf3 = SVC(kernel='linear', probability=True, random_state=0)
clf4 = XGBClassifier()
eclf = VotingClassifier(estimators=[('rf', clf1), ('knn', clf2), ('svc', clf3), ('xgb', clf4)],
                        voting='soft', weights=[1.5, 1, 1, 1.5])


# Fit every base model on its own so each can be inspected separately.
# NOTE: VotingClassifier.fit trains its own clones of the estimators, so these
# standalone fits are not what `eclf` uses for prediction.
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train, y_train)
clf4.fit(X_train, y_train)
# Last expression of the cell: fits the ensemble and displays its repr.
eclf.fit(X_train, y_train)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))],
         flatten_transform=None, n_jobs=None, voting='soft',
         weights=[1.5, 1, 1, 1.5])

In [10]:
# Per-class precision / recall / F1 of the voting ensemble on the held-out set.
print(
    classification_report(
        y_true=y_test,
        y_pred=eclf.predict(X_test),
    )
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8568
           1       0.86      0.50      0.63        36

   micro avg       1.00      1.00      1.00      8604
   macro avg       0.93      0.75      0.82      8604
weighted avg       1.00      1.00      1.00      8604



To see which test observations were labeled incorrectly by the XGBoost model:

In [11]:
# True labels of the test rows where the XGBoost prediction disagrees with them.
y_test[xgb.predict(X_test) != y_test]

10223    1
11688    1
1495     1
3159     1
26502    0
17458    1
24439    1
1971     1
23381    0
9034     0
14558    1
6214     1
12450    0
9271     1
23921    1
22653    0
23418    0
7994     1
25837    1
27475    0
5076     1
28027    1
775      1
Name: target, dtype: int64

Most of the numbers incorrectly labeled here have only 3-4 observations - this is the main reason for the low recall score (the dataset includes only 4 days of data).