OPTIMIZATION

In [43]:
from IPython.display import Image
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()

import warnings
warnings.simplefilter("ignore")

In [44]:
# Read the raw incident log; the occurrence timestamp is parsed as a datetime.
df = pd.read_csv(
    'train.csv',
    parse_dates=['Occurrence Local Date Time'],
)
df.head()

Unnamed: 0,EventId,Occurrence Local Date Time,Reporting Agency,Cause,Subcause,Status,longitude,latitude,road_segment_id
0,60558,2016-01-01 00:53:00,Cam,Stationary Vehicle,Vehicle On Shoulder,Closed,18.5408955032,-33.888275,S0B3CGQ
1,60559,2016-01-01 00:54:00,CAMERA,Accident,With A Fixed Object,Closed,18.9307563219,-34.140857,RYJYAPI
2,60560,2016-01-01 02:26:00,Law Enforcement,Accident,Multi Vehicle,Closed,18.5533575029,-33.959154,U3KP57C
3,60561,2016-01-01 02:56:00,CAMERA,Stationary Vehicle,Vehicle On Shoulder,Closed,18.6775561589,-33.895258,RY0TRQ8
4,60562,2016-01-01 03:40:00,CAMERA,Accident,Multi Vehicle,Closed,18.8371319682,-34.087051,8LOVJZ3


In [45]:
# (rows, columns) of the raw frame
df.shape

(53845, 9)

In [46]:
# Inspect the inferred column dtypes (longitude comes back as object, not float)
df.dtypes

EventId                                int64
Occurrence Local Date Time    datetime64[ns]
Reporting Agency                      object
Cause                                 object
Subcause                              object
Status                                object
longitude                             object
latitude                             float64
road_segment_id                       object
dtype: object

In [47]:
# Convert longitude to a numeric dtype; values that cannot be parsed become NaN
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

MISSING VALUES

In [48]:
# Number of missing values per column
df.isna().sum()

EventId                          0
Occurrence Local Date Time       0
Reporting Agency              1619
Cause                            0
Subcause                         0
Status                           0
longitude                        5
latitude                         0
road_segment_id                  0
dtype: int64

In [49]:
# Reporting Agency is removed: it is not considered useful for prediction
df = df.drop(columns=['Reporting Agency'])
df.head()

Unnamed: 0,EventId,Occurrence Local Date Time,Cause,Subcause,Status,longitude,latitude,road_segment_id
0,60558,2016-01-01 00:53:00,Stationary Vehicle,Vehicle On Shoulder,Closed,18.540896,-33.888275,S0B3CGQ
1,60559,2016-01-01 00:54:00,Accident,With A Fixed Object,Closed,18.930756,-34.140857,RYJYAPI
2,60560,2016-01-01 02:26:00,Accident,Multi Vehicle,Closed,18.553358,-33.959154,U3KP57C
3,60561,2016-01-01 02:56:00,Stationary Vehicle,Vehicle On Shoulder,Closed,18.677556,-33.895258,RY0TRQ8
4,60562,2016-01-01 03:40:00,Accident,Multi Vehicle,Closed,18.837132,-34.087051,8LOVJZ3


In [50]:
# Drop rows whose location is unknown (an event is of no use if we cannot
# place it). Only the coordinate columns are checked, so a row is never
# discarded merely because an unrelated column happens to be empty.
df = df.dropna(subset=['longitude', 'latitude'])

In [51]:
# Re-check missing values per column after the cleaning steps
df.isna().sum()

EventId                       0
Occurrence Local Date Time    0
Cause                         0
Subcause                      0
Status                        0
longitude                     0
latitude                      0
road_segment_id               0
dtype: int64

In [52]:
# Number of distinct road segments present in the data
df['road_segment_id'].nunique(dropna=False)

544

First, we split by date:
2017 is used for training, and the last part of 2018 (September to December) for local testing.

In [53]:
occurred = df['Occurrence Local Date Time']

# Train on 2017: 2017-01-01 <= t < 2018-01-01
train = df.loc[(occurred >= '2017-01-01') & (occurred < '2018-01-01')]

# Test locally on the last part of 2018: 2018-09-01 <= t < 2019-01-01
local_test = df.loc[(occurred >= '2018-09-01') & (occurred < '2019-01-01')]

Reshaping for optimization

In [54]:
# Create a dataframe with a column for each segment_id (sid).
# Each row represents an hour; a cell is 1 when at least one training event
# on that segment rounds to that hour, and 0 otherwise.

sids = df['road_segment_id'].unique()

# Hourly grid for the training window (both endpoints are included)
dts = pd.date_range('2017-01-01',
                    '2018-01-01',
                    freq="1h")

# Round every event to the nearest hour once, and keep the result as
# datetimes: matching a datetime column against *strings* relies on implicit
# parsing inside isin, which recent pandas releases deprecate.
event_hours = train['Occurrence Local Date Time'].dt.round('h')

# Collect all indicator columns first and build the frame in one go instead
# of inserting hundreds of columns one at a time.
columns = {'datetime': dts}
for sid in sids:
    on_segment = train['road_segment_id'] == sid
    columns[sid] = dts.isin(event_hours[on_segment]).astype('int64')
tr = pd.DataFrame(columns)
tr.head()

Unnamed: 0,datetime,S0B3CGQ,RYJYAPI,U3KP57C,RY0TRQ8,8LOVJZ3,X4UA382,0QR8FDW,DZABHQW,EKZN1VM,...,YVR8GT6,ZAVM3PJ,DS4NLQE,HR19LL7,1451FOG,2ON8NSO,NFUEAN5,4T821GV,J6A19TW,43RCYZH
0,2017-01-01 00:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2017-01-01 01:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2017-01-01 02:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2017-01-01 03:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2017-01-01 04:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# (hours in the grid, 1 datetime column + one column per segment)
tr.shape

(8761, 545)

In [56]:
# Reshape this as in sample submission: one row per (hour, segment) pair.
# I add some extra columns that may be useful.
n_hours, n_sids = len(tr), len(sids)

# Row-major order (every segment for the first hour, then the next hour, ...)
# so the labels line up with tr[sids].values.flatten().
hour_labels = np.repeat(
    tr['datetime'].dt.strftime('%Y-%m-%d %H:%M:%S').astype(object).values,
    n_sids)
sid_labels = np.tile(np.array([str(c) for c in sids], dtype=object), n_hours)

train = pd.DataFrame({
    'datetime x segment_id': hour_labels + " x " + sid_labels,
    'datetime': hour_labels,
    'segment_id': sid_labels,
    'y': tr[sids].values.flatten()
})
train.head()

Unnamed: 0,datetime x segment_id,datetime,segment_id,y
0,2017-01-01 00:00:00 x S0B3CGQ,2017-01-01 00:00:00,S0B3CGQ,0
1,2017-01-01 00:00:00 x RYJYAPI,2017-01-01 00:00:00,RYJYAPI,0
2,2017-01-01 00:00:00 x U3KP57C,2017-01-01 00:00:00,U3KP57C,0
3,2017-01-01 00:00:00 x RY0TRQ8,2017-01-01 00:00:00,RY0TRQ8,0
4,2017-01-01 00:00:00 x 8LOVJZ3,2017-01-01 00:00:00,8LOVJZ3,0


In [57]:
# Same for local test (test from now on)
# The grid runs through 2019-01-01 00:00 so that it covers every event kept in
# local_test; stopping at '2018-12-31' would end the grid at midnight and
# silently drop almost all of December 31st.
dts = pd.date_range('2018-09-01', '2019-01-01',
                    freq="1h")

# Events rounded to the nearest hour, kept as datetimes (no string round-trip)
event_hours = local_test['Occurrence Local Date Time'].dt.round('h')

# One 0/1 column per segment, assembled into the frame in a single step
columns = {'datetime': dts}
for sid in sids:
    on_segment = local_test['road_segment_id'] == sid
    columns[sid] = dts.isin(event_hours[on_segment]).astype('int64')
tr = pd.DataFrame(columns)

# Long format: one row per (hour, segment) pair, in the same row-major order
# as tr[sids].values.flatten().
n_hours, n_sids = len(tr), len(sids)
hour_labels = np.repeat(
    tr['datetime'].dt.strftime('%Y-%m-%d %H:%M:%S').astype(object).values,
    n_sids)
sid_labels = np.tile(np.array([str(c) for c in sids], dtype=object), n_hours)

test = pd.DataFrame({
    'datetime x segment_id': hour_labels + " x " + sid_labels,
    'datetime': hour_labels,
    'segment_id': sid_labels,
    'y': tr[sids].values.flatten()
})
test.head()

Unnamed: 0,datetime x segment_id,datetime,segment_id,y
0,2018-09-01 00:00:00 x S0B3CGQ,2018-09-01 00:00:00,S0B3CGQ,0
1,2018-09-01 00:00:00 x RYJYAPI,2018-09-01 00:00:00,RYJYAPI,0
2,2018-09-01 00:00:00 x U3KP57C,2018-09-01 00:00:00,U3KP57C,0
3,2018-09-01 00:00:00 x RY0TRQ8,2018-09-01 00:00:00,RY0TRQ8,0
4,2018-09-01 00:00:00 x 8LOVJZ3,2018-09-01 00:00:00,8LOVJZ3,0


In [58]:
# Sanity check: list the distinct values taken by the target
train.y.unique()

array([0, 1])

What features can we add?

In [59]:
# Derive calendar features from the timestamp:
#   day -> day of the week (.dt.weekday)
#   min -> minutes elapsed since midnight
train = train.assign(datetime=pd.to_datetime(train['datetime']))
stamp = train['datetime'].dt
train = train.assign(day=stamp.weekday,
                     min=stamp.hour * 60 + stamp.minute)
train.head()

Unnamed: 0,datetime x segment_id,datetime,segment_id,y,day,min
0,2017-01-01 00:00:00 x S0B3CGQ,2017-01-01,S0B3CGQ,0,6,0
1,2017-01-01 00:00:00 x RYJYAPI,2017-01-01,RYJYAPI,0,6,0
2,2017-01-01 00:00:00 x U3KP57C,2017-01-01,U3KP57C,0,6,0
3,2017-01-01 00:00:00 x RY0TRQ8,2017-01-01,RY0TRQ8,0,6,0
4,2017-01-01 00:00:00 x 8LOVJZ3,2017-01-01,8LOVJZ3,0,6,0


In [60]:
# Look again at the event-level columns available in df
df.head()

Unnamed: 0,EventId,Occurrence Local Date Time,Cause,Subcause,Status,longitude,latitude,road_segment_id
0,60558,2016-01-01 00:53:00,Stationary Vehicle,Vehicle On Shoulder,Closed,18.540896,-33.888275,S0B3CGQ
1,60559,2016-01-01 00:54:00,Accident,With A Fixed Object,Closed,18.930756,-34.140857,RYJYAPI
2,60560,2016-01-01 02:26:00,Accident,Multi Vehicle,Closed,18.553358,-33.959154,U3KP57C
3,60561,2016-01-01 02:56:00,Stationary Vehicle,Vehicle On Shoulder,Closed,18.677556,-33.895258,RY0TRQ8
4,60562,2016-01-01 03:40:00,Accident,Multi Vehicle,Closed,18.837132,-34.087051,8LOVJZ3


In [61]:
# Mean recorded coordinates of the events on each road segment.
# The coordinate columns are selected *before* aggregating, so the mean is
# never attempted on the text columns (which raises in pandas >= 2.0).
df1 = df.groupby('road_segment_id')[['longitude', 'latitude']].mean()

In [62]:
# Attach each segment's mean coordinates to the grid rows
# (inner join; road_segment_id is the index of df1).
train = train.merge(df1, how='inner',
                    left_on='segment_id', right_on='road_segment_id')
train.head()

Unnamed: 0,datetime x segment_id,datetime,segment_id,y,day,min,longitude,latitude
0,2017-01-01 00:00:00 x S0B3CGQ,2017-01-01 00:00:00,S0B3CGQ,0,6,0,18.541422,-33.888613
1,2017-01-01 01:00:00 x S0B3CGQ,2017-01-01 01:00:00,S0B3CGQ,0,6,60,18.541422,-33.888613
2,2017-01-01 02:00:00 x S0B3CGQ,2017-01-01 02:00:00,S0B3CGQ,0,6,120,18.541422,-33.888613
3,2017-01-01 03:00:00 x S0B3CGQ,2017-01-01 03:00:00,S0B3CGQ,0,6,180,18.541422,-33.888613
4,2017-01-01 04:00:00 x S0B3CGQ,2017-01-01 04:00:00,S0B3CGQ,0,6,240,18.541422,-33.888613


In [63]:
# Column dtypes of the training frame after adding the features
train.dtypes

datetime x segment_id            object
datetime                 datetime64[ns]
segment_id                       object
y                                 int64
day                               int64
min                               int64
longitude                       float64
latitude                        float64
dtype: object

In [64]:
# Distinct weekday codes present in the training frame
train.day.unique()

array([6, 0, 1, 2, 3, 4, 5])

TRAIN DATA EXPLORATION

In [66]:
# categorical: segment_id
# numerical: 'min', 'day', 'longitude', 'latitude'
# not used, because it only repeats the other columns: datetime x segment_id

In [67]:
# Numeric model inputs
numeric = ['min', 'day', 'longitude', 'latitude']
# All candidate inputs, including the categorical segment id
x_cols = ['day', 'segment_id', 'min', 'longitude', 'latitude']
numerical_data = train.loc[:, numeric]

LOGISTIC REGRESSION

1) Let's split our data set into training and test sets

In [70]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Feature matrix (numeric columns only) and binary target
X = train[numeric]
y = train['y']

# Hold out 20% of the rows. The seed is fixed so that re-running the notebook
# reproduces the same split, and the split is stratified on y so that both
# parts keep the same share of positive rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

2) Fit the logistic regression model 

In [71]:
# Train a logistic regression with default settings on the training part
clf = LogisticRegression().fit(X_train, y_train)
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

3) Use the fitted model to predict on the test set

In [72]:
# Predicted class labels for the held-out rows
predictions = clf.predict(X_test)
predictions

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

4) Evaluate our Model

In [73]:
from sklearn.model_selection import cross_val_score

In [74]:
# Cross-validated precision: pass the estimator, the features, the target
# and the name of the metric.
cv_res = cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    scoring="precision",
)

In [33]:
# Mean precision over the cross-validation folds
cv_res.mean()

0.0

In [75]:
# Cross-validated accuracy: same estimator and data, different metric
cv_res1 = cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    scoring="accuracy",
)
cv_res1.mean()

0.9970142577063662

In [76]:
# Cross-validated ROC AUC: same estimator and data, different metric
cv_res2 = cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    scoring="roc_auc",
)
cv_res2.mean()

0.6953190796070251