In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

import glob
import os

In [2]:
pd.set_option("display.max_columns", 100)
%matplotlib inline

In [3]:
RANDOM_SEED = 6

In [41]:
DATA_PATH = Path.cwd() / ""
train_df = pd.read_csv(DATA_PATH / "train.csv", index_col="tripid")
train_df['checkout_datetime'] = pd.to_datetime(train_df['drop_time'])
train_df['checkin_datetime'] = pd.to_datetime(train_df['pickup_time'])
#train_df['duration'] = train_df['checkout_datetime']-train_df['checkin_datetime']
train_df['duration_in_minutes'] = (train_df['checkout_datetime']-train_df['checkin_datetime'])/pd.Timedelta(minutes=1)

features_df = train_df.drop(["pickup_time", "drop_time","duration","checkin_datetime", "checkout_datetime", "label"], axis = 1)


label_df = train_df[["label"]]
label_df = label_df.replace({'label': {'correct': 1, 'incorrect': 0}})

features_df

Unnamed: 0_level_0,additional_fare,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pick_lat,pick_lon,drop_lat,drop_lon,fare,duration_in_minutes
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
189123628,10.5,56.0,0.000000,64.0,6.86252,79.8993,6.90330,79.8783,270.32,14.0
189125358,10.5,47.0,0.000000,134.0,6.88589,79.8984,6.91373,79.8923,197.85,13.0
189125719,10.5,80.0,0.000000,61.0,6.90839,79.8651,6.93669,79.9146,301.64,18.0
189127273,10.5,271.0,15.663800,68.0,6.92570,79.8895,6.92748,79.8971,82.30,10.0
189128020,,,,,6.87441,79.8615,6.84478,79.9290,358.39,17.0
189129552,10.5,182.0,0.000000,112.0,7.13402,79.8969,6.91865,79.8649,1065.02,57.0
189132829,10.5,487.0,0.000000,133.0,6.84371,79.9051,6.85069,79.8624,266.62,20.0
189135103,10.5,295.0,17.198500,212.0,6.90760,79.9524,6.90634,79.9042,318.05,22.0
189139296,10.5,80.0,4.664000,3.0,7.26706,80.6064,7.27422,80.6124,100.32,6.0
189138671,10.5,588.0,33.986400,43.0,6.85137,79.9537,6.84779,79.9274,257.89,26.0


In [32]:
features_list = features_df.columns.tolist()
features_list

['additional_fare',
 'meter_waiting',
 'meter_waiting_fare',
 'meter_waiting_till_pickup',
 'pick_lat',
 'pick_lon',
 'drop_lat',
 'drop_lon',
 'fare',
 'duration_in_minutes']

In [33]:
## chain numerical preprocessing into a pipeline object
numeric_preprocessing_steps = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='median')),
    ('minmax_scaler', MinMaxScaler())
])


## create preprocessor stage of the final pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_preprocessing_steps, features_list)
    ],
    remainder = 'drop'
)



In [34]:
estimator =  RandomForestClassifier( class_weight='balanced')


In [45]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('estimators', estimator)
])


In [40]:
features_df

Unnamed: 0_level_0,additional_fare,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pick_lat,pick_lon,drop_lat,drop_lon,fare,duration_in_minutes
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
189123628,10.5,56.0,0.000000,64.0,6.86252,79.8993,6.90330,79.8783,270.32,14.0
189125358,10.5,47.0,0.000000,134.0,6.88589,79.8984,6.91373,79.8923,197.85,13.0
189125719,10.5,80.0,0.000000,61.0,6.90839,79.8651,6.93669,79.9146,301.64,18.0
189127273,10.5,271.0,15.663800,68.0,6.92570,79.8895,6.92748,79.8971,82.30,10.0
189128020,,,,,6.87441,79.8615,6.84478,79.9290,358.39,17.0
189129552,10.5,182.0,0.000000,112.0,7.13402,79.8969,6.91865,79.8649,1065.02,57.0
189132829,10.5,487.0,0.000000,133.0,6.84371,79.9051,6.85069,79.8624,266.62,20.0
189135103,10.5,295.0,17.198500,212.0,6.90760,79.9524,6.90634,79.9042,318.05,22.0
189139296,10.5,80.0,4.664000,3.0,7.26706,80.6064,7.27422,80.6124,100.32,6.0
189138671,10.5,588.0,33.986400,43.0,6.85137,79.9537,6.84779,79.9274,257.89,26.0


In [46]:
X_train, X_eval, y_train, y_eval = train_test_split(features_df, label_df, test_size=0.3, shuffle=True, stratify=label_df, random_state=RANDOM_SEED)

## Train the model
pipeline.fit(X_train, y_train)

# Predict for the evaluation set

print("Training Accuracy: %.2f" % (pipeline.score(X_eval, y_eval)*100), "%")
preds = pipeline.predict_proba(X_eval)


  self._final_estimator.fit(Xt, y, **fit_params)


Training Accuracy: 93.79 %


In [51]:
pipeline.fit(features_df, label_df)

None


  self._final_estimator.fit(Xt, y, **fit_params)


In [52]:
test_set = pd.read_csv(DATA_PATH / "test.csv", index_col="tripid")
test_set.head()


Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
213284604,10.5,924,42,2.4486,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.875,6.7749,79.884,289.27
213286352,10.5,4249,20,0.0,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.7
213293973,10.5,1552,255,2.6588,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.0
213294622,10.5,462,16,0.0,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
213298687,10.5,814,392,12.3692,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.913,6.98875,79.8914,147.47


In [54]:
test_set['checkout_datetime'] = pd.to_datetime(test_set['drop_time'])
test_set['checkin_datetime'] = pd.to_datetime(test_set['pickup_time'])
#train_df['duration'] = train_df['checkout_datetime']-train_df['checkin_datetime']
test_set['duration_in_minutes'] = (test_set['checkout_datetime']-test_set['checkin_datetime'])/pd.Timedelta(minutes=1)

new_test_set = test_set.drop(["pickup_time", "drop_time","duration","checkin_datetime", "checkout_datetime"], axis = 1)
new_test_set

Unnamed: 0_level_0,additional_fare,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pick_lat,pick_lon,drop_lat,drop_lon,fare,duration_in_minutes
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
213284604,10.5,42,2.448600,148,6.83454,79.8750,6.77490,79.8840,289.27,15.0
213286352,10.5,20,0.000000,91,6.91168,79.8723,6.55091,79.9706,1912.70,71.0
213293973,10.5,255,2.658800,23,6.92145,79.8478,6.90539,79.8989,394.00,26.0
213294622,10.5,16,0.000000,198,6.77433,79.9416,6.80401,79.9407,154.32,8.0
213298687,10.5,392,12.369200,69,6.97968,79.9130,6.98875,79.8914,147.47,14.0
213299545,10.5,351,16.530800,9,6.99819,79.9378,7.13916,79.8726,1156.97,42.0
213302332,10.5,454,23.929200,43,6.79064,79.8878,6.81875,79.8859,196.81,18.0
213302671,10.5,320,18.496000,17,6.81545,79.9707,6.82144,79.8662,688.43,45.0
213305594,10.5,29,0.000000,130,6.82920,79.9798,6.79732,79.9309,288.77,19.0
213305134,10.5,277,16.046498,63,6.05588,80.2391,6.04033,80.2043,199.57,23.0


In [55]:
test_probs = pipeline.predict_proba(new_test_set)


In [56]:
test_probs

array([[0. , 1. ],
       [0.2, 0.8],
       [0.2, 0.8],
       ...,
       [0. , 1. ],
       [0.1, 0.9],
       [0. , 1. ]])

In [57]:
submission_set = pd.read_csv(DATA_PATH / "sample_submission.csv", index_col="tripid")
submission_set.head()


Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,0
213293973,0
213294622,1
213298687,1


In [58]:
try:
    np.testing.assert_array_equal(test_set.index.values, submission_set.index.values)
    print("rows in the same order")
except:
    print("rows not in the same order or error")


rows in the same order


In [70]:
submission_set['prediction'] = test_probs[:,1]


In [71]:
submission_set

Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1.0
213286352,0.8
213293973,0.8
213294622,1.0
213298687,1.0
213299545,0.7
213302332,1.0
213302671,0.8
213305594,0.9
213305134,1.0


In [72]:
submission_set.loc[submission_set['prediction'] > 0.5, 'prediction'] = 1
submission_set.loc[submission_set['prediction'] <= 0.5, 'prediction'] = 0
submission_set['prediction'] = submission_set['prediction'].astype(np.int64)


#submission_set['presiction'].convert_dtypes(convert_integer: bool = True)
submission_set



<class 'pandas.core.frame.DataFrame'>
Int64Index: 8576 entries, 213284604 to 222860703
Data columns (total 1 columns):
prediction    8576 non-null int64
dtypes: int64(1)
memory usage: 134.0 KB


In [63]:
submission_set.to_csv('160374E_submission_02.csv', index=True)
print("Completed!")


Completed!
