In [4]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier



from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

import glob
import os

In [5]:
pd.set_option("display.max_columns", 100)
%matplotlib inline

In [6]:
# Seed used for reproducibility; passed as `random_state` to train_test_split below.
RANDOM_SEED = 6

In [7]:
# Directory holding train.csv / test.csv / sample_submission.csv (the working directory).
DATA_PATH = Path.cwd() / ""

# Load the labelled trips, indexed by trip id.
train_df = pd.read_csv(DATA_PATH / "train.csv", index_col="tripid")

# Parse each timestamp column exactly once; everything time-based derives from these.
train_df['checkout_datetime'] = pd.to_datetime(train_df['drop_time'])
train_df['checkin_datetime'] = pd.to_datetime(train_df['pickup_time'])

# Trip length in minutes, computed from the pickup/drop timestamps.
train_df['duration_in_minutes'] = (
    (train_df['checkout_datetime'] - train_df['checkin_datetime']) / pd.Timedelta(minutes=1)
)

# Absolute coordinate deltas between pickup and drop-off points.
train_df['lat'] = (train_df['pick_lat'] - train_df['drop_lat']).abs()
train_df['lon'] = (train_df['pick_lon'] - train_df['drop_lon']).abs()

# `date_time` is the parsed drop time; reuse the column parsed above instead of
# running pd.to_datetime over `drop_time` a second time.
train_df['date_time'] = train_df['checkout_datetime']
# Hour of the drop-off shifted by one (1..24), via the vectorised `.dt` accessor
# rather than a row-by-row Python lambda.
train_df['drop_Hour'] = train_df['date_time'].dt.hour + 1

# Fare left after subtracting the additional and waiting components.
train_df['remaining_fare'] = (
    train_df['fare'] - train_df['additional_fare'] - train_df['meter_waiting_fare']
)
# Reported duration minus both waiting periods.
train_df['mobile_time'] = (
    train_df['duration'] - train_df['meter_waiting'] - train_df['meter_waiting_till_pickup']
)

# Model inputs: drop raw timestamps/coordinates, the raw duration, helper columns
# and the target, leaving only the numeric engineered features.
features_df = train_df.drop(
    columns=[
        "date_time", "duration", "pickup_time", "drop_time", "checkin_datetime",
        "drop_lat", "drop_lon", "pick_lat", "pick_lon", "checkout_datetime", "label",
    ]
)

# Names of the object-dtype (string) columns among all non-target columns.
pure_train_df = train_df.drop(columns=["label"])
categorical_features = pure_train_df.columns[pure_train_df.dtypes == "object"].values

# Target as a one-column frame with 'correct' -> 1 and 'incorrect' -> 0.
label_df = train_df[["label"]]
label_df = label_df.replace({'label': {'correct': 1, 'incorrect': 0}})

features_df

Unnamed: 0_level_0,additional_fare,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,duration_in_minutes,lat,lon,drop_Hour,remaining_fare,mobile_time
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
189123628,10.5,56.0,0.0000,64.0,270.32,14.0,0.04078,0.0210,1,259.8200,714.0
189125358,10.5,47.0,0.0000,134.0,197.85,13.0,0.02784,0.0061,2,187.3500,610.0
189125719,10.5,80.0,0.0000,61.0,301.64,18.0,0.02830,0.0495,2,291.1400,946.0
189127273,10.5,271.0,15.6638,68.0,82.30,10.0,0.00178,0.0076,3,56.1362,259.0
189128020,,,,,358.39,17.0,0.02963,0.0675,4,,
...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.5,93.0,5.4219,451.0,198.26,14.0,0.00182,0.0190,23,182.3381,294.0
213812756,10.5,428.0,0.0000,39.0,581.23,36.0,0.04520,0.0873,24,570.7300,1684.0
213813930,10.5,9.0,0.0000,110.0,76.20,4.0,0.00925,0.0017,24,65.7000,144.0
213815405,10.5,115.0,0.0000,317.0,133.31,14.0,0.00966,0.0242,24,122.8100,426.0


In [8]:
# Names of the model input columns, in frame order; handed to the ColumnTransformer later.
features_list = list(features_df.columns)
features_list

['additional_fare',
 'meter_waiting',
 'meter_waiting_fare',
 'meter_waiting_till_pickup',
 'fare',
 'duration_in_minutes',
 'lat',
 'lon',
 'drop_Hour',
 'remaining_fare',
 'mobile_time']

In [9]:
## Numeric branch: fill gaps with each column's most frequent value, then rescale with MinMaxScaler.
numeric_preprocessing_steps = Pipeline(
    steps=[
        ('simple_imputer', SimpleImputer(strategy='most_frequent')),
        ('minmax_scaler', MinMaxScaler()),
    ]
)

## Non-numeric branch: fill gaps with the literal 'missing', then one-hot encode
## (categories unseen at fit time are ignored). It is defined here but is not
## wired into `preprocessor` below.
non_numeric_preprocessing_steps = Pipeline(
    steps=[
        ('simple_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot_encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

## Preprocessing stage of the final pipeline: apply the numeric branch to every
## column in `features_list` and drop any column not listed.
preprocessor = ColumnTransformer(
    transformers=[('numeric', numeric_preprocessing_steps, features_list)],
    remainder='drop',
)



In [10]:
# Classifier used by the pipeline: a multi-layer perceptron trained with Adam.
# `learning_rate='adaptive'` was removed: scikit-learn only consults that
# schedule when solver='sgd', so with Adam it had no effect and was misleading.
# NOTE(review): the last hidden layer has 14000 units, which makes fitting slow;
# confirm this size is intentional.
estimator = MLPClassifier(
    solver='adam',
    alpha=1e-5,
    hidden_layer_sizes=(11, 121, 1331, 14000),
    random_state=1,
)

# Alternatives tried previously (kept for reference):
#estimator =  RandomForestClassifier( class_weight='balanced', n_estimators=500, criterion="entropy")
#estimator =  RandomForestClassifier( class_weight='balanced', n_estimators=400)
#estimator = KNeighborsClassifier(n_neighbors=100, weights="uniform", algorithm="brute")
#estimator = AdaBoostClassifier()
#estimator =  RandomForestClassifier( class_weight='balanced', n_estimators=400, max_depth=200, max_features=10)

In [11]:
# End-to-end model: column preprocessing followed by the classifier.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('estimators', estimator),
    ]
)


In [12]:
# Display the engineered feature frame again before it is split for training.
features_df

Unnamed: 0_level_0,additional_fare,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,duration_in_minutes,lat,lon,drop_Hour,remaining_fare,mobile_time
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
189123628,10.5,56.0,0.0000,64.0,270.32,14.0,0.04078,0.0210,1,259.8200,714.0
189125358,10.5,47.0,0.0000,134.0,197.85,13.0,0.02784,0.0061,2,187.3500,610.0
189125719,10.5,80.0,0.0000,61.0,301.64,18.0,0.02830,0.0495,2,291.1400,946.0
189127273,10.5,271.0,15.6638,68.0,82.30,10.0,0.00178,0.0076,3,56.1362,259.0
189128020,,,,,358.39,17.0,0.02963,0.0675,4,,
...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.5,93.0,5.4219,451.0,198.26,14.0,0.00182,0.0190,23,182.3381,294.0
213812756,10.5,428.0,0.0000,39.0,581.23,36.0,0.04520,0.0873,24,570.7300,1684.0
213813930,10.5,9.0,0.0000,110.0,76.20,4.0,0.00925,0.0017,24,65.7000,144.0
213815405,10.5,115.0,0.0000,317.0,133.31,14.0,0.00966,0.0242,24,122.8100,426.0


In [13]:
# Hold out 30% of the labelled trips for evaluation, stratified on the label so
# both splits keep the same class balance.
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    label_df,
    test_size=0.3,
    shuffle=True,
    stratify=label_df,
    random_state=RANDOM_SEED,
)

## Train the model
# Pass the label column as a 1-D Series; a one-column DataFrame makes
# scikit-learn emit a DataConversionWarning (`column_or_1d(y, warn=True)`).
pipeline.fit(X_train, y_train['label'])

# Score on the held-out split. This is evaluation accuracy, not training
# accuracy, so the printed label says so.
eval_accuracy = pipeline.score(X_eval, y_eval['label'])
print("Evaluation Accuracy: %.2f" % (eval_accuracy*100), "%")

# Class probabilities for the evaluation rows.
preds = pipeline.predict_proba(X_eval)


  y = column_or_1d(y, warn=True)


Training Accuracy: 94.22 %


In [14]:
# Refit on every labelled row before scoring the test set.
# The label column is passed as a 1-D Series so scikit-learn does not emit a
# DataConversionWarning for a column-vector `y`. The trailing semicolon
# suppresses the estimator repr in the cell output.
pipeline.fit(features_df, label_df['label']);


  y = column_or_1d(y, warn=True)


In [15]:
# Read the unlabelled trips to be scored, indexed by trip id like the training frame.
test_csv = DATA_PATH / "test.csv"
test_set = pd.read_csv(test_csv, index_col="tripid")
test_set.head()


Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
213284604,10.5,924,42,2.4486,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.875,6.7749,79.884,289.27
213286352,10.5,4249,20,0.0,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.7
213293973,10.5,1552,255,2.6588,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.0
213294622,10.5,462,16,0.0,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
213298687,10.5,814,392,12.3692,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.913,6.98875,79.8914,147.47


In [16]:
# Derive the same engineered columns on the test trips as were built for training.

# Parse each timestamp column exactly once.
test_set['checkout_datetime'] = pd.to_datetime(test_set['drop_time'])
test_set['checkin_datetime'] = pd.to_datetime(test_set['pickup_time'])

# Trip length in minutes, computed from the pickup/drop timestamps.
test_set['duration_in_minutes'] = (
    (test_set['checkout_datetime'] - test_set['checkin_datetime']) / pd.Timedelta(minutes=1)
)

# Absolute coordinate deltas between pickup and drop-off points.
test_set['lat'] = (test_set['pick_lat'] - test_set['drop_lat']).abs()
test_set['lon'] = (test_set['pick_lon'] - test_set['drop_lon']).abs()

# `date_time` is the parsed drop time; reuse it instead of parsing `drop_time` again.
test_set['date_time'] = test_set['checkout_datetime']
# Hour of the drop-off shifted by one (1..24), via the vectorised `.dt` accessor.
test_set['drop_Hour'] = test_set['date_time'].dt.hour + 1

# Fare left after subtracting the additional and waiting components.
test_set['remaining_fare'] = (
    test_set['fare'] - test_set['additional_fare'] - test_set['meter_waiting_fare']
)
# Reported duration minus both waiting periods.
test_set['mobile_time'] = (
    test_set['duration'] - test_set['meter_waiting'] - test_set['meter_waiting_till_pickup']
)

# Keep only the model inputs (same drops as the training frame, minus the label).
new_test_set = test_set.drop(
    columns=[
        "date_time", "duration", "pickup_time", "drop_time", "pick_lat", "pick_lon",
        "checkin_datetime", "drop_lat", "drop_lon", "checkout_datetime",
    ]
)

# Fail early if any column the fitted pipeline selects is missing here.
missing_features = [col for col in features_list if col not in new_test_set.columns]
assert not missing_features, f"test features missing columns: {missing_features}"

new_test_set

Unnamed: 0_level_0,additional_fare,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,duration_in_minutes,lat,lon,drop_Hour,remaining_fare,mobile_time
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
213284604,10.5,42,2.44860,148,289.27,15.0,0.05964,0.0090,1,276.32140,734
213286352,10.5,20,0.00000,91,1912.70,71.0,0.36077,0.0983,3,1902.20000,4138
213293973,10.5,255,2.65880,23,394.00,26.0,0.01606,0.0511,6,380.84120,1274
213294622,10.5,16,0.00000,198,154.32,8.0,0.02968,0.0009,6,143.82000,248
213298687,10.5,392,12.36920,69,147.47,14.0,0.00907,0.0216,8,124.60080,353
...,...,...,...,...,...,...,...,...,...,...,...
222856243,10.5,429,24.83332,3,388.48,28.0,0.00485,0.0353,22,353.14668,1291
222857785,10.5,80,0.00000,125,379.85,23.0,0.00819,0.0676,23,369.35000,1173
222858416,10.5,56,3.28440,93,112.79,7.0,0.01850,0.0002,23,99.00560,269
222858691,10.5,548,31.67440,17,248.46,27.0,0.01870,0.0299,23,206.28560,1039


In [17]:
# Class-membership probabilities for every test trip from the fitted pipeline.
test_probs = pipeline.predict_proba(new_test_set)


In [18]:
# Inspect the probability array; column 1 is what later becomes the prediction.
test_probs

array([[0.02183086, 0.97816914],
       [0.18612845, 0.81387155],
       [0.01066879, 0.98933121],
       ...,
       [0.0235792 , 0.9764208 ],
       [0.02090565, 0.97909435],
       [0.02025748, 0.97974252]])

In [311]:
# Load the submission template (trip id index plus a `prediction` column).
submission_csv = DATA_PATH / "sample_submission.csv"
submission_set = pd.read_csv(submission_csv, index_col="tripid")
submission_set.head()


Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,0
213293973,0
213294622,1
213298687,1


In [312]:
# Confirm the test rows and the submission template list trip ids in the same order.
# Only the comparison failure (AssertionError) is caught; any other exception,
# e.g. a NameError from a cell that was never run, now surfaces instead of being
# hidden behind the mismatch message.
try:
    np.testing.assert_array_equal(test_set.index.values, submission_set.index.values)
except AssertionError:
    print("rows not in the same order or error")
else:
    print("rows in the same order")


rows in the same order


In [313]:
# The assignment below is positional, so enforce that the submission rows line up
# with the rows the probabilities were computed from. Otherwise probabilities
# would be attached to the wrong trip ids without any error.
assert submission_set.index.equals(new_test_set.index), (
    "submission_set and new_test_set trip ids differ or are in a different order"
)
# Probability of class 1 (column 1 of predict_proba) for every trip.
submission_set['prediction'] = test_probs[:, 1]


In [314]:
# Inspect the submission frame after the probabilities were written into `prediction`.
submission_set

Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,0.972
213286352,0.568
213293973,0.800
213294622,0.970
213298687,0.976
213299545,0.656
213302332,0.994
213302671,0.954
213305594,1.000
213305134,0.972


In [315]:
# Probability cut-off: strictly above it becomes 1, everything else 0.
DECISION_THRESHOLD = 0.5

# Convert probabilities to hard 0/1 labels with a single vectorised comparison
# instead of two in-place `.loc` writes followed by a cast.
submission_set['prediction'] = (
    submission_set['prediction'] > DECISION_THRESHOLD
).astype(np.int64)

submission_set



Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,1
213293973,1
213294622,1
213298687,1
213299545,1
213302332,1
213302671,1
213305594,1
213305134,1


In [316]:
# Write the final predictions to disk, keeping the trip id index as the first column.
submission_path = '160374E_submission_10_95_25.csv'
submission_set.to_csv(submission_path, index=True)
print("Completed!")


Completed!
