In [240]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

import glob
import os

In [241]:
pd.set_option("display.max_columns", 100)
%matplotlib inline

In [242]:
RANDOM_SEED = 6

In [256]:
DATA_PATH = Path.cwd() / ""
train_df = pd.read_csv(DATA_PATH / "train.csv", index_col="tripid")
train_df['checkout_datetime'] = pd.to_datetime(train_df['drop_time'])
train_df['checkin_datetime'] = pd.to_datetime(train_df['pickup_time'])
#train_df['duration'] = train_df['checkout_datetime']-train_df['checkin_datetime']
train_df['duration_in_minutes'] = (train_df['checkout_datetime']-train_df['checkin_datetime'])/pd.Timedelta(minutes=1)
train_df['lat']  = abs(train_df['pick_lat']-train_df['drop_lat'])
train_df['lon']  = abs(train_df['pick_lon']-train_df['drop_lon'])

features_df = train_df.drop(["pickup_time", "drop_time","duration","drop_lat","drop_lon","checkin_datetime", "checkout_datetime", "label"], axis = 1)


label_df = train_df[["label"]]
label_df = label_df.replace({'label': {'correct': 1, 'incorrect': 0}})

features_df

Unnamed: 0_level_0,additional_fare,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pick_lat,pick_lon,fare,duration_in_minutes,lat,lon
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
189123628,10.5,56.0,0.000000,64.0,6.86252,79.8993,270.32,14.0,0.04078,0.0210
189125358,10.5,47.0,0.000000,134.0,6.88589,79.8984,197.85,13.0,0.02784,0.0061
189125719,10.5,80.0,0.000000,61.0,6.90839,79.8651,301.64,18.0,0.02830,0.0495
189127273,10.5,271.0,15.663800,68.0,6.92570,79.8895,82.30,10.0,0.00178,0.0076
189128020,,,,,6.87441,79.8615,358.39,17.0,0.02963,0.0675
189129552,10.5,182.0,0.000000,112.0,7.13402,79.8969,1065.02,57.0,0.21537,0.0320
189132829,10.5,487.0,0.000000,133.0,6.84371,79.9051,266.62,20.0,0.00698,0.0427
189135103,10.5,295.0,17.198500,212.0,6.90760,79.9524,318.05,22.0,0.00126,0.0482
189139296,10.5,80.0,4.664000,3.0,7.26706,80.6064,100.32,6.0,0.00716,0.0060
189138671,10.5,588.0,33.986400,43.0,6.85137,79.9537,257.89,26.0,0.00358,0.0263


In [257]:
features_list = features_df.columns.tolist()
features_list

['additional_fare',
 'meter_waiting',
 'meter_waiting_fare',
 'meter_waiting_till_pickup',
 'pick_lat',
 'pick_lon',
 'fare',
 'duration_in_minutes',
 'lat',
 'lon']

In [258]:
## chain numerical preprocessing into a pipeline object
numeric_preprocessing_steps = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='median')),
    ('minmax_scaler', MinMaxScaler())
])


## create preprocessor stage of the final pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_preprocessing_steps, features_list)
    ],
    remainder = 'drop'
)



In [259]:
estimator =  RandomForestClassifier( class_weight='balanced')


In [260]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('estimators', estimator)
])


In [261]:
features_df

Unnamed: 0_level_0,additional_fare,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pick_lat,pick_lon,fare,duration_in_minutes,lat,lon
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
189123628,10.5,56.0,0.000000,64.0,6.86252,79.8993,270.32,14.0,0.04078,0.0210
189125358,10.5,47.0,0.000000,134.0,6.88589,79.8984,197.85,13.0,0.02784,0.0061
189125719,10.5,80.0,0.000000,61.0,6.90839,79.8651,301.64,18.0,0.02830,0.0495
189127273,10.5,271.0,15.663800,68.0,6.92570,79.8895,82.30,10.0,0.00178,0.0076
189128020,,,,,6.87441,79.8615,358.39,17.0,0.02963,0.0675
189129552,10.5,182.0,0.000000,112.0,7.13402,79.8969,1065.02,57.0,0.21537,0.0320
189132829,10.5,487.0,0.000000,133.0,6.84371,79.9051,266.62,20.0,0.00698,0.0427
189135103,10.5,295.0,17.198500,212.0,6.90760,79.9524,318.05,22.0,0.00126,0.0482
189139296,10.5,80.0,4.664000,3.0,7.26706,80.6064,100.32,6.0,0.00716,0.0060
189138671,10.5,588.0,33.986400,43.0,6.85137,79.9537,257.89,26.0,0.00358,0.0263


In [262]:
X_train, X_eval, y_train, y_eval = train_test_split(features_df, label_df, test_size=0.3, shuffle=True, stratify=label_df, random_state=RANDOM_SEED)

## Train the model
pipeline.fit(X_train, y_train)

# Predict for the evaluation set

print("Training Accuracy: %.2f" % (pipeline.score(X_eval, y_eval)*100), "%")
preds = pipeline.predict_proba(X_eval)


  self._final_estimator.fit(Xt, y, **fit_params)


Training Accuracy: 94.57 %


In [263]:
pipeline.fit(features_df, label_df)

None


  self._final_estimator.fit(Xt, y, **fit_params)


In [264]:
test_set = pd.read_csv(DATA_PATH / "test.csv", index_col="tripid")
test_set.head()


Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
213284604,10.5,924,42,2.4486,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.875,6.7749,79.884,289.27
213286352,10.5,4249,20,0.0,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.7
213293973,10.5,1552,255,2.6588,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.0
213294622,10.5,462,16,0.0,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
213298687,10.5,814,392,12.3692,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.913,6.98875,79.8914,147.47


In [265]:
test_set['checkout_datetime'] = pd.to_datetime(test_set['drop_time'])
test_set['checkin_datetime'] = pd.to_datetime(test_set['pickup_time'])
#train_df['duration'] = train_df['checkout_datetime']-train_df['checkin_datetime']
test_set['duration_in_minutes'] = (test_set['checkout_datetime']-test_set['checkin_datetime'])/pd.Timedelta(minutes=1)
test_set['lat']  = abs(test_set['pick_lat']-test_set['drop_lat'])
test_set['lon']  = abs(test_set['pick_lon']-test_set['drop_lon'])

new_test_set = test_set.drop(["pickup_time", "drop_time","duration","drop_lat","drop_lon","checkin_datetime", "checkout_datetime"], axis = 1)
new_test_set

ValueError: cannot reindex from a duplicate axis

In [None]:
test_probs = pipeline.predict_proba(new_test_set)


In [210]:
test_probs

array([[0. , 1. ],
       [0.3, 0.7],
       [0.2, 0.8],
       ...,
       [0. , 1. ],
       [0. , 1. ],
       [0. , 1. ]])

In [212]:
submission_set = pd.read_csv(DATA_PATH / "sample_submission.csv", index_col="tripid")
submission_set.head()


Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,0
213293973,0
213294622,1
213298687,1


In [213]:
try:
    np.testing.assert_array_equal(test_set.index.values, submission_set.index.values)
    print("rows in the same order")
except:
    print("rows not in the same order or error")


rows in the same order


In [214]:
submission_set['prediction'] = test_probs[:,1]


In [215]:
submission_set

Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1.0
213286352,0.7
213293973,0.8
213294622,1.0
213298687,1.0
213299545,0.4
213302332,1.0
213302671,0.9
213305594,1.0
213305134,0.9


In [216]:
submission_set.loc[submission_set['prediction'] > 0.5, 'prediction'] = 1
submission_set.loc[submission_set['prediction'] <= 0.5, 'prediction'] = 0
submission_set['prediction'] = submission_set['prediction'].astype(np.int64)


#submission_set['presiction'].convert_dtypes(convert_integer: bool = True)
submission_set



Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,1
213293973,1
213294622,1
213298687,1
213299545,0
213302332,1
213302671,1
213305594,1
213305134,1


In [217]:
submission_set.to_csv('160374E_submission_05.csv', index=True)
print("Completed!")


Completed!
