In [29]:
import pandas as pd

from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [30]:
# Location of the raw transport dataset, relative to the notebook's working directory.
tr_name = Path("transport_data.csv")
tr_path = Path("./data").joinpath(tr_name)

In [31]:
# Load the raw CSV into a DataFrame; every later cell works from `tr_data`.
tr_data = pd.read_csv(filepath_or_buffer=tr_path)

In [32]:
# Print column names, non-null counts and dtypes of the freshly loaded frame.
tr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58665 entries, 0 to 58664
Data columns (total 5 columns):
log           58665 non-null float64
lat           58665 non-null float64
request_ts    58665 non-null int64
trans_ts      58665 non-null int64
label         58665 non-null object
dtypes: float64(2), int64(2), object(1)
memory usage: 2.2+ MB


In [33]:
# Preview the first five raw rows (five is also the default for head()).
tr_data.head(n=5)

Unnamed: 0,log,lat,request_ts,trans_ts,label
0,30.29578,59.932167,1511956827,1511956594,0
1,30.219528,59.954617,1511956827,1511956746,-
2,30.409101,59.931679,1511956827,1511956747,-
3,30.293602,59.938892,1511956827,1511956759,-
4,30.360102,59.931103,1511956827,1511956767,-


In [34]:
# Feature engineering on the raw frame.
#
# NOTE: this cell consumes the `trans_ts` / `request_ts` columns, so it can only
# be run once per load of `tr_data`; re-run the loading cell before re-running it.

# Keep only rows whose label is not the '-' placeholder. The original row index
# is preserved. `.copy()` yields an independent frame so the column assignments
# below do not raise SettingWithCopyWarning.
tr_data = tr_data.loc[tr_data.label != '-'].copy()

# `trans_ts` is parsed as seconds since the epoch (unit='s'); parse it once and
# derive the calendar features from the parsed series.
trans_time = pd.to_datetime(tr_data.trans_ts, unit='s')
tr_data['second_trans_ts'] = trans_time.dt.second
tr_data['minute_trans_ts'] = trans_time.dt.minute
tr_data['hour_trans_ts'] = trans_time.dt.hour
tr_data['dayofweek_trans_ts'] = trans_time.dt.dayofweek

# The raw timestamp columns are not used as features. `request_ts` is dropped
# without being parsed first, because nothing is derived from it.
tr_data = tr_data.drop(columns=['trans_ts', 'request_ts'])

In [35]:
# Rows labelled '?' are the ones to predict: keep their features, drop the label.
unlabelled_mask = tr_data.label == '?'
target = tr_data.loc[unlabelled_mask].drop(columns='label')
target.head()

Unnamed: 0,log,lat,second_trans_ts,minute_trans_ts,hour_trans_ts,dayofweek_trans_ts
11,30.35919,59.931217,47,59,11,2
18,30.355488,59.931679,11,0,12,2
41,30.312258,59.937222,45,1,12,2
42,30.367332,59.931988,47,1,12,2
56,30.329239,59.934929,47,2,12,2


In [36]:
# Remove the '?' rows (already captured in `target`) so only labelled rows
# remain, then renumber the rows 0..n-1.
tr_data = tr_data.loc[tr_data.label != '?'].reset_index(drop=True)
tr_data.head(2)

Unnamed: 0,log,lat,label,second_trans_ts,minute_trans_ts,hour_trans_ts,dayofweek_trans_ts
0,30.29578,59.932167,0,34,56,11,2
1,30.385973,59.94426,0,42,59,11,2


In [40]:
# Random-forest classifier; the fixed random_state keeps runs repeatable.
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=100,
    random_state=8,
)

In [41]:
# Fit on every labelled row: all columns except `label` are features.
# NOTE(review): the repr saved beneath this cell reports n_estimators=1000 and
# max_depth=2000, which does not match the constructor cell (500 / 100) - the
# stored output looks stale; re-run the notebook to refresh it.
train_features = tr_data.drop(columns=['label'])
train_labels = tr_data.label
model.fit(X=train_features, y=train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2000, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=8, verbose=0, warm_start=False)

In [42]:
# 5-fold cross-validation on the labelled rows; the cell displays the mean of
# the per-fold scores.
fold_scores = cross_val_score(
    model,
    tr_data.drop(columns=['label']),
    tr_data.label,
    cv=5,
)
fold_scores.mean()

0.4178879848969962

In [27]:
# Predict a label for every '?' row collected in `target`.
# NOTE(review): the saved execution counts of this cell and the next (27, 28)
# are lower than those of the training cells (40-42), so the stored run was out
# of order; use Restart & Run All to confirm the results.
preds = model.predict(X=target)

In [28]:
# Save the predictions, one label per line, in the same order as `target`.
with open('random_forest.txt', 'w') as out_file:
    out_file.writelines(label + "\n" for label in preds)