In [12]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF



from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

import glob
import os

In [13]:
pd.set_option("display.max_columns", 100)
%matplotlib inline

In [14]:
RANDOM_SEED = 6

In [111]:
DATA_PATH = Path.cwd() / ""
train_df = pd.read_csv(DATA_PATH / "train.csv", index_col="tripid")
train_df['checkout_datetime'] = pd.to_datetime(train_df['drop_time'])
train_df['checkin_datetime'] = pd.to_datetime(train_df['pickup_time'])
#train_df['duration'] = train_df['checkout_datetime']-train_df['checkin_datetime']
train_df['duration_in_minutes'] = (train_df['checkout_datetime']-train_df['checkin_datetime'])/pd.Timedelta(minutes=1)
train_df['lat']  = abs(train_df['pick_lat']-train_df['drop_lat'])
train_df['lon']  = abs(train_df['pick_lon']-train_df['drop_lon'])
#train_df['distance'] = (((train_df['lat'])**2 + (train_df['lon'])**2)**(1/2))*100
#train_df['manhatton_distance'] = (train_df['lat'] + train_df['lat'])

#train_df.loc[train_df['meter_waiting_fare'] > 0, 'is_meter_waiting_fare'] = 1
#train_df.loc[train_df['meter_waiting_fare'] <= 0, 'is_meter_waiting_fare'] = 0

#train_df['meter_waiting_fare'] = train_df['meter_waiting_fare'].astype(np.int64)


#train_df['is_waiting_fare'] = train_df['meter_waiting']*train_df['meter_waiting_fare']
#train_df['waiting_till_pickup_fare'] = train_df['meter_waiting_till_pickup']*train_df['meter_waiting_fare']

features_df_num = train_df.drop(["pickup_time","drop_time","duration","checkin_datetime","drop_lat","drop_lon","checkout_datetime", "label"], axis = 1)
features_df = train_df.drop(["duration","checkin_datetime","drop_lat","drop_lon","checkout_datetime", "label"], axis = 1)

pure_train_df = train_df.drop(["label"], axis = 1)
categorical_features = pure_train_df.columns[pure_train_df.dtypes == "object"].values


label_df = train_df[["label"]]
label_df = label_df.replace({'label': {'correct': 1, 'incorrect': 0}})

categorical_features

array(['pickup_time', 'drop_time'], dtype=object)

In [112]:
features_list = features_df_num.columns.tolist()
features_list

['additional_fare',
 'meter_waiting',
 'meter_waiting_fare',
 'meter_waiting_till_pickup',
 'pick_lat',
 'pick_lon',
 'fare',
 'duration_in_minutes',
 'lat',
 'lon']

In [113]:
## chain numerical preprocessing into a pipeline object
numeric_preprocessing_steps = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='most_frequent')),
    ('minmax_scaler', MinMaxScaler())
])

## chain non-numerical preprocessing into a pipeline object
non_numeric_preprocessing_steps = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore')),
#     ('label_encoder', LabelEncoder())
])

## create preprocessor stage of the final pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_preprocessing_steps, features_list)
    ],
    remainder = 'drop'
)



In [119]:
estimator =  RandomForestClassifier( class_weight='balanced', n_estimators=500, criterion="entropy")
#estimator =  RandomForestClassifier( class_weight='balanced', n_estimators=400)

#estimator = KNeighborsClassifier(n_neighbors=100, weights="uniform", algorithm="brute")
#estimator = AdaBoostClassifier()
#estimator =  RandomForestClassifier( class_weight='balanced', n_estimators=400, max_depth=200, max_features=10)

In [120]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('estimators', estimator)
])


In [121]:
features_df

Unnamed: 0_level_0,additional_fare,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,fare,duration_in_minutes,lat,lon
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
189123628,10.5,56.0,0.000000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,270.32,14.0,0.04078,0.0210
189125358,10.5,47.0,0.000000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,197.85,13.0,0.02784,0.0061
189125719,10.5,80.0,0.000000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,301.64,18.0,0.02830,0.0495
189127273,10.5,271.0,15.663800,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,82.30,10.0,0.00178,0.0076
189128020,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,358.39,17.0,0.02963,0.0675
189129552,10.5,182.0,0.000000,112.0,11/1/2019 5:38,11/1/2019 6:35,7.13402,79.8969,1065.02,57.0,0.21537,0.0320
189132829,10.5,487.0,0.000000,133.0,11/1/2019 6:29,11/1/2019 6:49,6.84371,79.9051,266.62,20.0,0.00698,0.0427
189135103,10.5,295.0,17.198500,212.0,11/1/2019 6:50,11/1/2019 7:12,6.90760,79.9524,318.05,22.0,0.00126,0.0482
189139296,10.5,80.0,4.664000,3.0,11/1/2019 7:00,11/1/2019 7:06,7.26706,80.6064,100.32,6.0,0.00716,0.0060
189138671,10.5,588.0,33.986400,43.0,11/1/2019 7:02,11/1/2019 7:28,6.85137,79.9537,257.89,26.0,0.00358,0.0263


In [None]:
X_train, X_eval, y_train, y_eval = train_test_split(features_df, label_df, test_size=0.3, shuffle=True, stratify=label_df, random_state=RANDOM_SEED)

## Train the model
pipeline.fit(X_train, y_train)

# Predict for the evaluation set

print("Training Accuracy: %.2f" % (pipeline.score(X_eval, y_eval)*100), "%")
preds = pipeline.predict_proba(X_eval)


In [None]:
pipeline.fit(features_df, label_df)

None


  self._final_estimator.fit(Xt, y, **fit_params)


In [217]:
test_set = pd.read_csv(DATA_PATH / "test.csv", index_col="tripid")
test_set.head()


Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
213284604,10.5,924,42,2.4486,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.875,6.7749,79.884,289.27
213286352,10.5,4249,20,0.0,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.7
213293973,10.5,1552,255,2.6588,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.0
213294622,10.5,462,16,0.0,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
213298687,10.5,814,392,12.3692,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.913,6.98875,79.8914,147.47


In [221]:
test_set['checkout_datetime'] = pd.to_datetime(test_set['drop_time'])
test_set['checkin_datetime'] = pd.to_datetime(test_set['pickup_time'])
#train_df['duration'] = train_df['checkout_datetime']-train_df['checkin_datetime']
test_set['duration_in_minutes'] = (test_set['checkout_datetime']-test_set['checkin_datetime'])/pd.Timedelta(minutes=1)
test_set['lat']  = abs(test_set['pick_lat']-test_set['drop_lat'])
test_set['lon']  = abs(test_set['pick_lon']-test_set['drop_lon'])
test_set.loc[test_set['meter_waiting_fare'] > 0, 'is_meter_waiting_fare'] = 1
test_set.loc[test_set['meter_waiting_fare'] <= 0, 'is_meter_waiting_fare'] = 0

#test_set['distance'] = ((test_set['lat'])**2 + (test_set['lat'])**2)**(1/2)


new_test_set = test_set.drop(["pickup_time", "drop_time","duration","meter_waiting_fare","drop_lat","drop_lon","checkin_datetime", "checkout_datetime"], axis = 1)
new_test_set

Unnamed: 0_level_0,additional_fare,meter_waiting,meter_waiting_till_pickup,pick_lat,pick_lon,fare,duration_in_minutes,lat,lon,is_meter_waiting_fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
213284604,10.5,42,148,6.83454,79.8750,289.27,15.0,0.05964,0.0090,1.0
213286352,10.5,20,91,6.91168,79.8723,1912.70,71.0,0.36077,0.0983,0.0
213293973,10.5,255,23,6.92145,79.8478,394.00,26.0,0.01606,0.0511,1.0
213294622,10.5,16,198,6.77433,79.9416,154.32,8.0,0.02968,0.0009,0.0
213298687,10.5,392,69,6.97968,79.9130,147.47,14.0,0.00907,0.0216,1.0
213299545,10.5,351,9,6.99819,79.9378,1156.97,42.0,0.14097,0.0652,1.0
213302332,10.5,454,43,6.79064,79.8878,196.81,18.0,0.02811,0.0019,1.0
213302671,10.5,320,17,6.81545,79.9707,688.43,45.0,0.00599,0.1045,1.0
213305594,10.5,29,130,6.82920,79.9798,288.77,19.0,0.03188,0.0489,0.0
213305134,10.5,277,63,6.05588,80.2391,199.57,23.0,0.01555,0.0348,1.0


In [222]:
test_probs = pipeline.predict_proba(new_test_set)


In [223]:
test_probs

array([[0.02 , 0.98 ],
       [0.298, 0.702],
       [0.126, 0.874],
       ...,
       [0.008, 0.992],
       [0.026, 0.974],
       [0.014, 0.986]])

In [224]:
submission_set = pd.read_csv(DATA_PATH / "sample_submission.csv", index_col="tripid")
submission_set.head()


Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,0
213293973,0
213294622,1
213298687,1


In [225]:
try:
    np.testing.assert_array_equal(test_set.index.values, submission_set.index.values)
    print("rows in the same order")
except:
    print("rows not in the same order or error")


rows in the same order


In [226]:
submission_set['prediction'] = test_probs[:,1]


In [227]:
submission_set

Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,0.980
213286352,0.702
213293973,0.874
213294622,0.992
213298687,0.992
213299545,0.784
213302332,0.984
213302671,0.858
213305594,0.998
213305134,0.954


In [228]:
submission_set.loc[submission_set['prediction'] > 0.5, 'prediction'] = 1
submission_set.loc[submission_set['prediction'] <= 0.5, 'prediction'] = 0
submission_set['prediction'] = submission_set['prediction'].astype(np.int64)


#submission_set['presiction'].convert_dtypes(convert_integer: bool = True)
submission_set



Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,1
213293973,1
213294622,1
213298687,1
213299545,1
213302332,1
213302671,1
213305594,1
213305134,1


In [229]:
submission_set.to_csv('160374E_submission_07.csv', index=True)
print("Completed!")


Completed!
