OPTIMIZATION

In [1]:
from IPython.display import Image
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()

import warnings
warnings.simplefilter("ignore")

load data for 'train.csv'

In [151]:
df = pd.read_csv('train.csv', parse_dates = ['Occurrence Local Date Time'])#use parse_dates to turn the column to datetime type
df.head()

Unnamed: 0,EventId,Occurrence Local Date Time,Reporting Agency,Cause,Subcause,Status,longitude,latitude,road_segment_id
0,60558,2016-01-01 00:53:00,Cam,Stationary Vehicle,Vehicle On Shoulder,Closed,18.5408955032,-33.888275,S0B3CGQ
1,60559,2016-01-01 00:54:00,CAMERA,Accident,With A Fixed Object,Closed,18.9307563219,-34.140857,RYJYAPI
2,60560,2016-01-01 02:26:00,Law Enforcement,Accident,Multi Vehicle,Closed,18.5533575029,-33.959154,U3KP57C
3,60561,2016-01-01 02:56:00,CAMERA,Stationary Vehicle,Vehicle On Shoulder,Closed,18.6775561589,-33.895258,RY0TRQ8
4,60562,2016-01-01 03:40:00,CAMERA,Accident,Multi Vehicle,Closed,18.8371319682,-34.087051,8LOVJZ3


In [152]:
df.shape

(53845, 9)

In [153]:
df.dtypes

EventId                                int64
Occurrence Local Date Time    datetime64[ns]
Reporting Agency                      object
Cause                                 object
Subcause                              object
Status                                object
longitude                             object
latitude                             float64
road_segment_id                       object
dtype: object

In [154]:
#turn longitude into numeric
df['longitude'] = pd.to_numeric(df.longitude, errors='coerce')

MISSING VALUES

In [155]:
df.isna().sum()

EventId                          0
Occurrence Local Date Time       0
Reporting Agency              1619
Cause                            0
Subcause                         0
Status                           0
longitude                        5
latitude                         0
road_segment_id                  0
dtype: int64

In [156]:
#drop Reporting Agency column (not as useful to predict the aciidents and there are too many missing values)
df = df.drop(columns = 'Reporting Agency', axis=1)
df.head()

Unnamed: 0,EventId,Occurrence Local Date Time,Cause,Subcause,Status,longitude,latitude,road_segment_id
0,60558,2016-01-01 00:53:00,Stationary Vehicle,Vehicle On Shoulder,Closed,18.540896,-33.888275,S0B3CGQ
1,60559,2016-01-01 00:54:00,Accident,With A Fixed Object,Closed,18.930756,-34.140857,RYJYAPI
2,60560,2016-01-01 02:26:00,Accident,Multi Vehicle,Closed,18.553358,-33.959154,U3KP57C
3,60561,2016-01-01 02:56:00,Stationary Vehicle,Vehicle On Shoulder,Closed,18.677556,-33.895258,RY0TRQ8
4,60562,2016-01-01 03:40:00,Accident,Multi Vehicle,Closed,18.837132,-34.087051,8LOVJZ3


In [157]:
# drop missing rows with longitude data (no use if we cant know the location)
df = df.dropna(how="any")

In [158]:
df.isna().sum()

EventId                       0
Occurrence Local Date Time    0
Cause                         0
Subcause                      0
Status                        0
longitude                     0
latitude                      0
road_segment_id               0
dtype: int64

First, we split by date.
Using 2017 for training, and part of 2018 for testing.

In [160]:
# Train on 2017
train = df.loc[df['Occurrence Local Date Time'] < '2018-01-01']
train = train.loc[train['Occurrence Local Date Time'] >= '2017-01-01']

# Test locally on the last part of 2018
local_test = df.loc[df['Occurrence Local Date Time'] < '2019-01-01']
local_test = local_test.loc[local_test['Occurrence Local Date Time'] >= '2018-09-01']

reshaping for optimization

In [161]:
# Create a dataframe with a column for each segment_id
# Each row represents an hour.

segment_ids = df['road_segment_id'].unique()

dts = pd.date_range('2017-01-01',
                    '2018-01-01',
                    freq="1h")
tr = pd.DataFrame({'datetime':dts}) #tr is a dataframe with one hour diffrence per row

for segment_id in segment_ids:
    tr[str(segment_id)] = 0
    events = train.loc[train['road_segment_id'] == segment_id]
    dts = events['Occurrence Local Date Time'].dt.round('H')
    dates = dts.astype(str).unique() #returns and array of unique dates in dts as strings
    tr.loc[tr['datetime'].isin(dates), segment_id] = 1 # puts 1 in place of the segment_id and hour an accident happened
tr.head()

Unnamed: 0,datetime,S0B3CGQ,RYJYAPI,U3KP57C,RY0TRQ8,8LOVJZ3,X4UA382,0QR8FDW,DZABHQW,EKZN1VM,...,YVR8GT6,ZAVM3PJ,DS4NLQE,HR19LL7,1451FOG,2ON8NSO,NFUEAN5,4T821GV,J6A19TW,43RCYZH
0,2017-01-01 00:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2017-01-01 01:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2017-01-01 02:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2017-01-01 03:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2017-01-01 04:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [162]:
tr.shape

(8761, 545)

In [163]:
# Reshape this as in sample submission
# I add some extra columns that may be useful
train = pd.DataFrame({
    'datetime x segment_id':np.concatenate([[str(x) + " x " + str(c) 
                                             for c in segment_ids] 
                                            for x in tr['datetime']]),
    'datetime':np.concatenate([[str(x) for c in segment_ids] for x in tr['datetime']]),
    'segment_id':np.concatenate([[str(c) for c in segment_ids] for x in tr['datetime']]),
    'y':tr[segment_ids].values.flatten() # .values return. a numpy() representation of tr[segment_ids] 
})# and flatten() turns all the values into a list
train.head()

Unnamed: 0,datetime x segment_id,datetime,segment_id,y
0,2017-01-01 00:00:00 x S0B3CGQ,2017-01-01 00:00:00,S0B3CGQ,0
1,2017-01-01 00:00:00 x RYJYAPI,2017-01-01 00:00:00,RYJYAPI,0
2,2017-01-01 00:00:00 x U3KP57C,2017-01-01 00:00:00,U3KP57C,0
3,2017-01-01 00:00:00 x RY0TRQ8,2017-01-01 00:00:00,RY0TRQ8,0
4,2017-01-01 00:00:00 x 8LOVJZ3,2017-01-01 00:00:00,8LOVJZ3,0


In [165]:
# Same for local test (test from now on)
dts = pd.date_range('2018-09-01','2018-12-31',
                    freq="1h")
tr = pd.DataFrame({'datetime':dts})

for segment_id in segment_ids:
    tr[str(segment_id)] = 0
    events = local_test.loc[local_test['road_segment_id'] == segment_id]
    dts = events['Occurrence Local Date Time'].dt.round('H')
    dates = dts.astype(str).unique()
    tr.loc[tr['datetime'].isin(dates), segment_id] = 1
    
test = pd.DataFrame({
    'datetime x segment_id':np.concatenate([[str(x) + " x " + str(c) for c in segment_ids] for x in tr['datetime']]),
    'datetime':np.concatenate([[str(x) for c in segment_ids] for x in tr['datetime']]),
    'segment_id':np.concatenate([[str(c) for c in segment_ids] for x in tr['datetime']]),
    'y':tr[segment_ids].values.flatten()
})
test.head()

Unnamed: 0,datetime x segment_id,datetime,segment_id,y
0,2018-09-01 00:00:00 x S0B3CGQ,2018-09-01 00:00:00,S0B3CGQ,0
1,2018-09-01 00:00:00 x RYJYAPI,2018-09-01 00:00:00,RYJYAPI,0
2,2018-09-01 00:00:00 x U3KP57C,2018-09-01 00:00:00,U3KP57C,0
3,2018-09-01 00:00:00 x RY0TRQ8,2018-09-01 00:00:00,RY0TRQ8,0
4,2018-09-01 00:00:00 x 8LOVJZ3,2018-09-01 00:00:00,8LOVJZ3,0


In [108]:
train.y.unique()

array([0, 1])

what features can we add?

TRAIN DATA

In [166]:
# add day, minutes, month, longitude and latitude columns
train['datetime'] = pd.to_datetime(train['datetime'])
train['month'] = train['datetime'].dt.month
train['day'] = train['datetime'].dt.weekday_name
train['min'] = train['datetime'].dt.hour*60+train['datetime'].dt.minute
location = df.groupby('road_segment_id').mean()[['longitude', 'latitude']]
train = pd.merge(train, location, left_on='segment_id', right_on='road_segment_id')
train.head()

Unnamed: 0,datetime x segment_id,datetime,segment_id,y,month,day,min,longitude,latitude
0,2017-01-01 00:00:00 x S0B3CGQ,2017-01-01 00:00:00,S0B3CGQ,0,1,Sunday,0,18.541422,-33.888613
1,2017-01-01 01:00:00 x S0B3CGQ,2017-01-01 01:00:00,S0B3CGQ,0,1,Sunday,60,18.541422,-33.888613
2,2017-01-01 02:00:00 x S0B3CGQ,2017-01-01 02:00:00,S0B3CGQ,0,1,Sunday,120,18.541422,-33.888613
3,2017-01-01 03:00:00 x S0B3CGQ,2017-01-01 03:00:00,S0B3CGQ,0,1,Sunday,180,18.541422,-33.888613
4,2017-01-01 04:00:00 x S0B3CGQ,2017-01-01 04:00:00,S0B3CGQ,0,1,Sunday,240,18.541422,-33.888613


TEST DATA

In [167]:
# Pre-process the test to match train: add day, minutes, month, longitude and latitude columns
test['datetime'] = pd.to_datetime(test['datetime'])
test['month'] = test['datetime'].dt.month
test['day'] = test['datetime'].dt.weekday_name
test['min'] = test['datetime'].dt.hour*60+test['datetime'].dt.minute
location = df.groupby('road_segment_id').mean()[['longitude', 'latitude']]
test = pd.merge(test, location, left_on='segment_id', right_on='road_segment_id')
test.head()

Unnamed: 0,datetime x segment_id,datetime,segment_id,y,month,day,min,longitude,latitude
0,2018-09-01 00:00:00 x S0B3CGQ,2018-09-01 00:00:00,S0B3CGQ,0,9,Saturday,0,18.541422,-33.888613
1,2018-09-01 01:00:00 x S0B3CGQ,2018-09-01 01:00:00,S0B3CGQ,0,9,Saturday,60,18.541422,-33.888613
2,2018-09-01 02:00:00 x S0B3CGQ,2018-09-01 02:00:00,S0B3CGQ,0,9,Saturday,120,18.541422,-33.888613
3,2018-09-01 03:00:00 x S0B3CGQ,2018-09-01 03:00:00,S0B3CGQ,0,9,Saturday,180,18.541422,-33.888613
4,2018-09-01 04:00:00 x S0B3CGQ,2018-09-01 04:00:00,S0B3CGQ,0,9,Saturday,240,18.541422,-33.888613


Adding weather info

In [168]:
w= pd.read_csv('weather.csv', sep=";", skiprows=6, usecols=range(14),
                      parse_dates=['Local time in Cape Town (airport)']) 
w.head()

Unnamed: 0,Local time in Cape Town (airport),T,P0,P,U,DD,Ff,ff10,WW,W'W',c,VV,Td,Unnamed: 13
0,2019-04-04 23:00:00,15.0,762.0,765.8,77.0,Wind blowing from the south-southeast,6.0,,,,"Scattered clouds (40-50%) 720 m, broken clouds...",10.0 and more,11.0,
1,2019-04-04 22:00:00,16.0,761.2,765.0,77.0,Wind blowing from the south-southeast,6.0,,,,"Scattered clouds (40-50%) 570 m, broken clouds...",10.0 and more,12.0,
2,2019-04-04 21:59:00,16.0,761.2,765.0,77.0,Wind blowing from the south-southeast,6.0,,,,"Scattered clouds (40-50%) 570 m, broken clouds...",10.0 and more,12.0,
3,2019-04-04 21:01:00,15.0,761.2,765.0,94.0,Wind blowing from the south-southeast,6.0,,,,"Scattered clouds (40-50%) 150 m, broken clouds...",10.0 and more,14.0,
4,2019-04-04 21:00:00,15.0,761.2,765.0,94.0,Wind blowing from the south-southeast,6.0,,,,"Scattered clouds (40-50%) 150 m, broken clouds...",10.0 and more,14.0,


In [169]:
# Add dt column
w['dt'] = w['Local time in Cape Town (airport)'].dt.round('H')
w_cols = ['dt', 'T', 'P0', 'P', 'U', 'Ff']
w.head()

Unnamed: 0,Local time in Cape Town (airport),T,P0,P,U,DD,Ff,ff10,WW,W'W',c,VV,Td,Unnamed: 13,dt
0,2019-04-04 23:00:00,15.0,762.0,765.8,77.0,Wind blowing from the south-southeast,6.0,,,,"Scattered clouds (40-50%) 720 m, broken clouds...",10.0 and more,11.0,,2019-04-04 23:00:00
1,2019-04-04 22:00:00,16.0,761.2,765.0,77.0,Wind blowing from the south-southeast,6.0,,,,"Scattered clouds (40-50%) 570 m, broken clouds...",10.0 and more,12.0,,2019-04-04 22:00:00
2,2019-04-04 21:59:00,16.0,761.2,765.0,77.0,Wind blowing from the south-southeast,6.0,,,,"Scattered clouds (40-50%) 570 m, broken clouds...",10.0 and more,12.0,,2019-04-04 22:00:00
3,2019-04-04 21:01:00,15.0,761.2,765.0,94.0,Wind blowing from the south-southeast,6.0,,,,"Scattered clouds (40-50%) 150 m, broken clouds...",10.0 and more,14.0,,2019-04-04 21:00:00
4,2019-04-04 21:00:00,15.0,761.2,765.0,94.0,Wind blowing from the south-southeast,6.0,,,,"Scattered clouds (40-50%) 150 m, broken clouds...",10.0 and more,14.0,,2019-04-04 21:00:00


In [170]:
#merge train with weather data
train = pd.merge(train, w[w_cols], left_on='datetime', right_on='dt', how='left')

In [171]:
#merge test with weather data
test = pd.merge(test, w[w_cols], left_on='datetime', right_on='dt', how='left')

In [172]:
train.head()

Unnamed: 0,datetime x segment_id,datetime,segment_id,y,month,day,min,longitude,latitude,dt,T,P0,P,U,Ff
0,2017-01-01 00:00:00 x S0B3CGQ,2017-01-01 00:00:00,S0B3CGQ,0,1,Sunday,0,18.541422,-33.888613,2017-01-01 00:00:00,22.0,756.8,760.5,73.0,8.0
1,2017-01-01 01:00:00 x S0B3CGQ,2017-01-01 01:00:00,S0B3CGQ,0,1,Sunday,60,18.541422,-33.888613,2017-01-01 01:00:00,22.0,756.0,759.7,69.0,7.0
2,2017-01-01 02:00:00 x S0B3CGQ,2017-01-01 02:00:00,S0B3CGQ,0,1,Sunday,120,18.541422,-33.888613,2017-01-01 02:00:00,22.0,756.0,759.7,69.0,5.0
3,2017-01-01 03:00:00 x S0B3CGQ,2017-01-01 03:00:00,S0B3CGQ,0,1,Sunday,180,18.541422,-33.888613,2017-01-01 03:00:00,21.0,755.3,759.0,73.0,4.0
4,2017-01-01 04:00:00 x S0B3CGQ,2017-01-01 04:00:00,S0B3CGQ,0,1,Sunday,240,18.541422,-33.888613,2017-01-01 04:00:00,20.0,755.3,759.0,78.0,4.0


Create an equivalent dataset for submission

In [173]:
# Make the dataframe - dates based on sample submission file
dts = pd.date_range('2019-01-01 01:00:00',
                    '2019-03-31 23:00:00',
                    freq="1h")
tr = pd.DataFrame({'datetime':dts})

for segment_id in segment_ids:
    tr[str(segment_id)] = 0
    
ss = pd.DataFrame({
    'datetime x segment_id':np.concatenate([[str(x) + " x " + str(c)  for x in tr['datetime']for c in segment_ids]]),
    'datetime':np.concatenate([[str(x) for x in tr['datetime']for c in segment_ids]]),
    'segment_id':np.concatenate([[str(c) for x in tr['datetime']for c in segment_ids]])
})
ss.head()

Unnamed: 0,datetime x segment_id,datetime,segment_id
0,2019-01-01 01:00:00 x S0B3CGQ,2019-01-01 01:00:00,S0B3CGQ
1,2019-01-01 01:00:00 x RYJYAPI,2019-01-01 01:00:00,RYJYAPI
2,2019-01-01 01:00:00 x U3KP57C,2019-01-01 01:00:00,U3KP57C
3,2019-01-01 01:00:00 x RY0TRQ8,2019-01-01 01:00:00,RY0TRQ8
4,2019-01-01 01:00:00 x 8LOVJZ3,2019-01-01 01:00:00,8LOVJZ3


In [174]:
# Add the extra features
ss['datetime'] = pd.to_datetime(ss['datetime'])
ss['day'] = ss['datetime'].dt.weekday_name
ss['month'] = ss['datetime'].dt.month
ss['min'] = ss['datetime'].dt.hour*60+ss['datetime'].dt.minute
location = df.groupby('road_segment_id').mean()[['longitude', 'latitude']]
ss = pd.merge(ss, location, left_on='segment_id', right_on='road_segment_id', how='left')
ss = pd.merge(ss, w[w_cols], left_on='datetime', right_on='dt', how='left')
ss['prediction'] = 0
ss.head()

Unnamed: 0,datetime x segment_id,datetime,segment_id,day,month,min,longitude,latitude,dt,T,P0,P,U,Ff,prediction
0,2019-01-01 01:00:00 x S0B3CGQ,2019-01-01 01:00:00,S0B3CGQ,Tuesday,1,60,18.541422,-33.888613,2019-01-01 01:00:00,19.0,756.8,760.5,88.0,5.0,0
1,2019-01-01 01:00:00 x RYJYAPI,2019-01-01 01:00:00,RYJYAPI,Tuesday,1,60,18.931088,-34.13993,2019-01-01 01:00:00,19.0,756.8,760.5,88.0,5.0,0
2,2019-01-01 01:00:00 x U3KP57C,2019-01-01 01:00:00,U3KP57C,Tuesday,1,60,18.550771,-33.958279,2019-01-01 01:00:00,19.0,756.8,760.5,88.0,5.0,0
3,2019-01-01 01:00:00 x RY0TRQ8,2019-01-01 01:00:00,RY0TRQ8,Tuesday,1,60,18.677475,-33.896611,2019-01-01 01:00:00,19.0,756.8,760.5,88.0,5.0,0
4,2019-01-01 01:00:00 x 8LOVJZ3,2019-01-01 01:00:00,8LOVJZ3,Tuesday,1,60,18.836121,-34.086109,2019-01-01 01:00:00,19.0,756.8,760.5,88.0,5.0,0


In [175]:
ss.dtypes

datetime x segment_id            object
datetime                 datetime64[ns]
segment_id                       object
day                              object
month                             int64
min                               int64
longitude                       float64
latitude                        float64
dt                       datetime64[ns]
T                               float64
P0                              float64
P                               float64
U                               float64
Ff                              float64
prediction                        int64
dtype: object

MODELING - using catboost

In [182]:
x_cols = ['day', 'segment_id', 'min', 'longitude', 'latitude', 'month', 'T', 'P0', 'P', 'U', 'Ff']
cat_cols = ['day', 'month','segment_id']

In [183]:
# Create the model
model2 = CatBoostClassifier(iterations=20, 
                           loss_function='Logloss', 
                           verbose=False) 

model2.fit(train[x_cols], train['y'], cat_features=cat_cols) # Takes about 

<catboost.core.CatBoostClassifier at 0x4f7b2ec50>

predicting model

In [184]:
f1_score(test['y'], model2.predict(test[x_cols]))

0.008827364830976027

LGBMClassifier MODEL

In [196]:
from lightgbm import LGBMClassifier
clf_lgbm = LGBMClassifier(n_estimators=100)

In [212]:
train['day'] = train['datetime'].dt.weekday
test['day'] = test['datetime'].dt.weekday
train.dtypes

datetime x segment_id            object
datetime                 datetime64[ns]
segment_id                       object
y                                 int64
month                             int64
day                               int64
min                               int64
longitude                       float64
latitude                        float64
dt                       datetime64[ns]
T                               float64
P0                              float64
P                               float64
U                               float64
Ff                              float64
dtype: object

In [198]:
x_cols = ['day', 'min', 'longitude', 'latitude', 'month', 'T', 'P0', 'P', 'U', 'Ff']

In [199]:
clf_lgbm.fit(train[x_cols], train['y'])

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [203]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

In [204]:
cv_results = cross_validate(clf_lgbm,
                    X=train[x_cols], # X = independant variables
                    y=train['y'], # y = target variable (aka answer)
                    scoring="neg_mean_absolute_error", # what error metric to use to compare
                    cv=3)

In [210]:
cv_results['test_score'].mean()

-0.2032153621655853

In [211]:
cv_results = cross_validate(clf_lgbm,
                    X=train[x_cols], # X = independant variables
                    y=train['y'], # y = target variable (aka answer)
                    scoring="accuracy", # what error metric to use to compare
                    cv=3)

In [213]:
cv_results['test_score'].mean()

0.7967846378344147

In [215]:
cv_results = cross_validate(clf_lgbm,
                    X=train[x_cols], # X = independant variables
                    y=train['y'], # y = target variable (aka answer)
                    scoring="precision", # what error metric to use to compare
                    cv=3)

In [216]:
cv_results['test_score'].mean()

0.0013632608251477477

In [214]:
f1_score(test['y'], clf_lgbm.predict(test[x_cols]))

0.01050534323492121