In [4]:
from pathlib import Path

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
import xgboost as xgb

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

pd.set_option("display.max_columns", 100)

In [5]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier 

In [6]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()

In [7]:
#!pip install catboost

In [8]:
from catboost import CatBoostClassifier

In [9]:
def distance(lat1, lon1, lat2, lon2):
    p = 0.017453292519943295 # Pi/180
    a = 0.5 - np.cos((lat2 - lat1) * p)/2 + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2
    return 0.6213712 * 12742 * np.arcsin(np.sqrt(a)) 

In [10]:
DATA_PATH = Path.cwd() / ""
features_df = pd.read_csv(DATA_PATH / "train.csv", index_col="tripid")


In [11]:
features_df.drop( features_df[ features_df['pick_lat'] < 5.9 ].index , inplace=True)
features_df.drop( features_df[ features_df['pick_lat'] > 9.82 ].index , inplace=True)	
features_df.drop( features_df[ features_df['pick_lon'] < 79 ].index , inplace=True)
features_df.drop( features_df[ features_df['pick_lon'] > 82 ].index , inplace=True)
features_df.drop( features_df[ features_df['drop_lat'] < 5.9 ].index , inplace=True)
features_df.drop( features_df[ features_df['drop_lat'] > 9.82 ].index , inplace=True)	
features_df.drop( features_df[ features_df['drop_lon'] < 79 ].index , inplace=True)
features_df.drop( features_df[ features_df['drop_lon'] > 82 ].index , inplace=True)

In [12]:

features_df = features_df.drop_duplicates()
features_df.isnull().sum()

additional_fare              201
duration                     201
meter_waiting                201
meter_waiting_fare           201
meter_waiting_till_pickup    201
pickup_time                    0
drop_time                      0
pick_lat                       0
pick_lon                       0
drop_lat                       0
drop_lon                       0
fare                         137
label                          0
dtype: int64

In [13]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,correct
189125358,10.5,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
189125719,10.5,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,correct
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,correct
...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.5,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,correct
213812756,10.5,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,correct
213813930,10.5,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,correct
213815405,10.5,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,correct


In [14]:
features_df["distance"] =  distance(features_df["pick_lat"],features_df["pick_lon"],features_df["drop_lat"],features_df["drop_lon"])

In [15]:
features_df["pickup_time"] = pd.to_datetime(features_df["pickup_time"],errors = "coerce")
features_df["drop_time"] = pd.to_datetime(features_df["drop_time"],errors = "coerce")
features_df['duration'] = features_df['duration'].fillna((features_df['drop_time'] - features_df['pickup_time']).astype('timedelta64[s]'))

In [16]:
features_df['additional_fare'] = features_df['additional_fare'].fillna(features_df['additional_fare'].mode().iloc[0])

In [17]:
features_df_correct = features_df[features_df.label=='correct']
features_df_incorrect = features_df[features_df.label=='incorrect']

In [18]:
features_df_correct['meter_waiting'] = features_df_correct['meter_waiting'].fillna(features_df_correct['meter_waiting'].mean())
features_df_incorrect['meter_waiting'] = features_df_incorrect['meter_waiting'].fillna(features_df_incorrect['meter_waiting'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [19]:
features_df_correct['meter_waiting_fare'] = features_df_correct['meter_waiting_fare'].fillna(features_df_correct['meter_waiting_fare'].mean())
features_df_incorrect['meter_waiting_fare'] = features_df_incorrect['meter_waiting_fare'].fillna(features_df_incorrect['meter_waiting_fare'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [20]:
features_df_correct['fare'] = features_df_correct['fare'].fillna((92.33)*features_df_correct['distance']+features_df_correct['additional_fare']+features_df_correct['meter_waiting_fare'])
features_df_incorrect['fare'] = features_df_incorrect['fare'].fillna((221)*features_df_incorrect['distance']+features_df_incorrect['additional_fare']+features_df_incorrect['meter_waiting_fare'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
#features_df = pd.concat([features_df_correct,features_df_incorrect],sort=True)

In [22]:
# features_df.describe()

In [23]:
y= features_df["label"]

In [24]:
y =y.replace(to_replace="correct",value=1)
y =y.replace(to_replace="incorrect",value=0)

In [25]:
y[189157607]


0

In [26]:
X = features_df.drop(columns=["label"], axis=1)

In [27]:
test_features_df = pd.read_csv("test.csv", 
                               index_col="tripid")

In [28]:
test_features_df
test_features_df.isnull().sum()

additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
dtype: int64

In [29]:
X

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0000,64.0,2019-11-01 00:20:00,2019-11-01 00:34:00,6.86252,79.8993,6.90330,79.8783,270.32,3.164501
189125358,10.5,791.0,47.0,0.0000,134.0,2019-11-01 00:56:00,2019-11-01 01:09:00,6.88589,79.8984,6.91373,79.8923,197.85,1.968540
189125719,10.5,1087.0,80.0,0.0000,61.0,2019-11-01 01:08:00,2019-11-01 01:26:00,6.90839,79.8651,6.93669,79.9146,301.64,3.917991
189127273,10.5,598.0,271.0,15.6638,68.0,2019-11-01 02:27:00,2019-11-01 02:37:00,6.92570,79.8895,6.92748,79.8971,82.30,0.535588
189128020,10.5,1020.0,,,,2019-11-01 03:34:00,2019-11-01 03:51:00,6.87441,79.8615,6.84478,79.9290,358.39,5.062797
...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.5,838.0,93.0,5.4219,451.0,2020-01-31 22:07:00,2020-01-31 22:21:00,7.29073,80.6367,7.28891,80.6557,198.26,1.308220
213812756,10.5,2151.0,428.0,0.0000,39.0,2020-01-31 23:07:00,2020-01-31 23:43:00,6.90569,79.8516,6.95089,79.9389,581.23,6.753297
213813930,10.5,263.0,9.0,0.0000,110.0,2020-01-31 23:21:00,2020-01-31 23:25:00,7.09210,79.9000,7.10135,79.9017,76.20,0.649655
213815405,10.5,858.0,115.0,0.0000,317.0,2020-01-31 23:39:00,2020-01-31 23:53:00,6.94540,79.8768,6.93574,79.9010,133.31,1.788975


In [30]:
#X = X.dropna()

In [31]:
X

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0000,64.0,2019-11-01 00:20:00,2019-11-01 00:34:00,6.86252,79.8993,6.90330,79.8783,270.32,3.164501
189125358,10.5,791.0,47.0,0.0000,134.0,2019-11-01 00:56:00,2019-11-01 01:09:00,6.88589,79.8984,6.91373,79.8923,197.85,1.968540
189125719,10.5,1087.0,80.0,0.0000,61.0,2019-11-01 01:08:00,2019-11-01 01:26:00,6.90839,79.8651,6.93669,79.9146,301.64,3.917991
189127273,10.5,598.0,271.0,15.6638,68.0,2019-11-01 02:27:00,2019-11-01 02:37:00,6.92570,79.8895,6.92748,79.8971,82.30,0.535588
189128020,10.5,1020.0,,,,2019-11-01 03:34:00,2019-11-01 03:51:00,6.87441,79.8615,6.84478,79.9290,358.39,5.062797
...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.5,838.0,93.0,5.4219,451.0,2020-01-31 22:07:00,2020-01-31 22:21:00,7.29073,80.6367,7.28891,80.6557,198.26,1.308220
213812756,10.5,2151.0,428.0,0.0000,39.0,2020-01-31 23:07:00,2020-01-31 23:43:00,6.90569,79.8516,6.95089,79.9389,581.23,6.753297
213813930,10.5,263.0,9.0,0.0000,110.0,2020-01-31 23:21:00,2020-01-31 23:25:00,7.09210,79.9000,7.10135,79.9017,76.20,0.649655
213815405,10.5,858.0,115.0,0.0000,317.0,2020-01-31 23:39:00,2020-01-31 23:53:00,6.94540,79.8768,6.93574,79.9010,133.31,1.788975


In [32]:
# X = X.fillna(X.mean())
#features_df = features_df.fillna(features_df.mean())


In [33]:
test_features_df = test_features_df.fillna(test_features_df.mean())

In [48]:
X["pickup_time"] = pd.to_datetime(X["pickup_time"],errors = "coerce")

In [49]:
X["drop_time"] = pd.to_datetime(X["drop_time"],errors = "coerce")

In [50]:
X["pickup_time_hour"] = X["pickup_time"].dt.hour

In [51]:
X["pickup_time_minute"] = X["pickup_time"].dt.minute

In [52]:
X["drop_time_hour"] =X["drop_time"].dt.hour

In [53]:
X["drop_time_minute"] =X["drop_time"].dt.minute
X['m_time'] = X['duration']-X['meter_waiting_till_pickup']


In [54]:
X["pickup_time_day"] = X["pickup_time"].dt.day
# X["pickup_time_month"] = X["pickup_time"].dt.month
X["drop_time_day"] = X["drop_time"].dt.day
# X["drop_time_month"] = X["drop_time"].dt.month
# X["pick_up_year"] = X["drop_time"].dt.year
# X["drop_year"] = X["drop_time"].dt.year

In [55]:
# X["distance"] = (X["pick_lat"]-X["drop_lat"])**2 + (X["pick_lon"]-X["drop_lon"])**2
# X["distance"] =  distance(X["pick_lat"],X["pick_lon"],X["drop_lat"],X["drop_lon"])

In [56]:
X.dtypes

additional_fare                     float64
duration                            float64
meter_waiting                       float64
meter_waiting_fare                  float64
meter_waiting_till_pickup           float64
pickup_time                  datetime64[ns]
drop_time                    datetime64[ns]
pick_lat                            float64
pick_lon                            float64
drop_lat                            float64
drop_lon                            float64
fare                                float64
distance                            float64
pickup_time_hour                      int64
pickup_time_minute                    int64
drop_time_hour                        int64
drop_time_minute                      int64
m_time                              float64
pickup_time_day                       int64
drop_time_day                         int64
effective_time                      float64
speed                               float64
fare_outlier                    

In [57]:
X["effective_time"] = X["duration"]-X["meter_waiting"]-X["meter_waiting_till_pickup"]
X['speed']= X['distance']/X['effective_time']

In [58]:
X.loc[X['fare'] <= 2000, 'fare_outlier'] = 1
X.loc[X['fare'] > 2000, 'fare_outlier'] = 0
X['fare'].isnull().sum()

137

In [59]:
X['meter_waiting_fare_diff'] = X['meter_waiting_fare'] - X['meter_waiting']*0.057

In [60]:
X['fare_mean'] = (X['fare'] - X['meter_waiting_fare'])/(X['duration'] - X['meter_waiting'])
#X['remaining_fare'] = X['fare']-X['additional_fare']-X['meter_waiting_fare']

In [61]:
# X.loc[X['additional_fare'] <= 100, 'additional_fare_outlier'] = 1
# X.loc[X['additional_fare'] > 100, 'additional_fare_outlier'] = 0

In [62]:
# X.loc[X['meter_waiting'] <= 2000, 'meter_waiting_outlier'] = 1
# X.loc[X['meter_waiting'] > 2000, 'meter_waiting_outlier'] = 0

In [63]:
# X.loc[X['duration'] <= 6000, 'duration_outlier'] = 1
# X.loc[X['duration'] > 6000, 'duration_outlier'] = 0

In [64]:
# X.loc[X['meter_waiting_fare'] <= 114, 'meter_waiting_fare_outlier'] = 1
# X.loc[X['meter_waiting_fare'] > 114, 'meter_waiting_fare_outlier'] = 0

In [65]:
#X["effective_time"] = (X["effective_time"] -X["effective_time"].mean() )/X["effective_time"].std() 

In [66]:
# X["effective_fare"] = X["fare"]+X["additional_fare"]

In [67]:
#X["effective_fare"] = (X["effective_fare"] -X["effective_fare"].mean() )/X["effective_fare"].std() 

In [68]:
# X["mean_fare_for_unit_length"] =X["fare"]/ X["distance"]

In [69]:
#X["mean_fare_for_unit_length"] = (X["mean_fare_for_unit_length"] -X["mean_fare_for_unit_length"].mean() )/X["mean_fare_for_unit_length"].std() 

In [70]:
# X["duration_fare"] = X["duration"]*X["fare"]

In [71]:
#X["duration_fare"] = (X["duration_fare"] -X["duration_fare"].mean() )/X["duration_fare"].std() 

In [72]:
# X = X.drop(columns=["pickup_time","drop_time"],axis=1)

In [73]:
# X['speed'] = X['distance']/X['effective_time']

In [74]:
# X['duration'] = X['duration'].fillna((X['drop_time'] - X['pickup_time']).astype('timedelta64[s]'))
#X['fare'] = X['fare'].fillna(0)
X['fare'].isnull().sum()

137

In [75]:
# X2 = X.values 
# x2_scaled = min_max_scaler.fit_transform(X2)
# X = pd.DataFrame(x2_scaled)
X

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,pickup_time_hour,pickup_time_minute,drop_time_hour,drop_time_minute,m_time,pickup_time_day,drop_time_day,effective_time,speed,fare_outlier,meter_waiting_fare_diff,fare_mean
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
189123628,10.5,834.0,56.0,0.0000,64.0,2019-11-01 00:20:00,2019-11-01 00:34:00,6.86252,79.8993,6.90330,79.8783,270.32,3.164501,0,20,0,34,770.0,1,1,714.0,0.004432,1.0,-3.1920,0.347455
189125358,10.5,791.0,47.0,0.0000,134.0,2019-11-01 00:56:00,2019-11-01 01:09:00,6.88589,79.8984,6.91373,79.8923,197.85,1.968540,0,56,1,9,657.0,1,1,610.0,0.003227,1.0,-2.6790,0.265927
189125719,10.5,1087.0,80.0,0.0000,61.0,2019-11-01 01:08:00,2019-11-01 01:26:00,6.90839,79.8651,6.93669,79.9146,301.64,3.917991,1,8,1,26,1026.0,1,1,946.0,0.004142,1.0,-4.5600,0.299543
189127273,10.5,598.0,271.0,15.6638,68.0,2019-11-01 02:27:00,2019-11-01 02:37:00,6.92570,79.8895,6.92748,79.8971,82.30,0.535588,2,27,2,37,530.0,1,1,259.0,0.002068,1.0,0.2168,0.203780
189128020,10.5,1020.0,,,,2019-11-01 03:34:00,2019-11-01 03:51:00,6.87441,79.8615,6.84478,79.9290,358.39,5.062797,3,34,3,51,,1,1,,,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.5,838.0,93.0,5.4219,451.0,2020-01-31 22:07:00,2020-01-31 22:21:00,7.29073,80.6367,7.28891,80.6557,198.26,1.308220,22,7,22,21,387.0,31,31,294.0,0.004450,1.0,0.1209,0.258843
213812756,10.5,2151.0,428.0,0.0000,39.0,2020-01-31 23:07:00,2020-01-31 23:43:00,6.90569,79.8516,6.95089,79.9389,581.23,6.753297,23,7,23,43,2112.0,31,31,1684.0,0.004010,1.0,-24.3960,0.337336
213813930,10.5,263.0,9.0,0.0000,110.0,2020-01-31 23:21:00,2020-01-31 23:25:00,7.09210,79.9000,7.10135,79.9017,76.20,0.649655,23,21,23,25,153.0,31,31,144.0,0.004511,1.0,-0.5130,0.300000
213815405,10.5,858.0,115.0,0.0000,317.0,2020-01-31 23:39:00,2020-01-31 23:53:00,6.94540,79.8768,6.93574,79.9010,133.31,1.788975,23,39,23,53,541.0,31,31,426.0,0.004199,1.0,-6.5550,0.179421


In [76]:
test_features_df["pickup_time"] = pd.to_datetime(test_features_df["pickup_time"],errors = "coerce")
test_features_df["drop_time"] = pd.to_datetime(test_features_df["drop_time"],errors = "coerce")
test_features_df["pickup_time_hour"] = test_features_df["pickup_time"].dt.hour
test_features_df["pickup_time_minute"] = test_features_df["pickup_time"].dt.minute
test_features_df["drop_time_hour"] =test_features_df["drop_time"].dt.hour
test_features_df["drop_time_minute"] =test_features_df["drop_time"].dt.minute
test_features_df["effective_time"] = test_features_df["duration"]-test_features_df["meter_waiting"]-test_features_df['meter_waiting_till_pickup']
test_features_df["pickup_time_day"] = test_features_df["pickup_time"].dt.day

# test_features_df["pickup_time_month"] = test_features_df["pickup_time"].dt.month
test_features_df["drop_time_day"] = test_features_df["drop_time"].dt.day
# test_features_df["drop_time_month"] = test_features_df["drop_time"].dt.month
# test_features_df["pick_up_year"] = test_features_df["drop_time"].dt.year
# test_features_df["drop_year"] = test_features_df["drop_time"].dt.year
# test_features_df["distance"] =(test_features_df["pick_lat"]-test_features_df["drop_lat"])**2 + (test_features_df["pick_lon"]-test_features_df["drop_lon"])**2
test_features_df["distance"] =  distance(test_features_df["pick_lat"],test_features_df["pick_lon"],test_features_df["drop_lat"],test_features_df["drop_lon"])
test_features_df['m_time'] = test_features_df['duration']-test_features_df['meter_waiting_till_pickup']
test_features_df['speed']= test_features_df['distance']/test_features_df['effective_time']
#test_features_df['remaining_fare'] = test_features_df['fare']-test_features_df['additional_fare']-test_features_df['meter_waiting_fare']
# test_features_df["effective_fare"] = test_features_df["fare"]+test_features_df["additional_fare"]
# test_features_df["mean_fare_for_unit_length"] = test_features_df["fare"]/test_features_df["distance"]
# test_features_df["duration_fare"] = test_features_df["duration"]*test_features_df["fare"]

In [77]:
# test_features_df["effective_time"] = (test_features_df["effective_time"] -test_features_df["effective_time"].mean() )/test_features_df["effective_time"].std() 
# test_features_df["effective_fare"] = (test_features_df["effective_fare"] -test_features_df["effective_fare"].mean() )/test_features_df["effective_fare"].std() 
# test_features_df["mean_fare_for_unit_length"] = (test_features_df["mean_fare_for_unit_length"] -test_features_df["mean_fare_for_unit_length"].mean() )/test_features_df["mean_fare_for_unit_length"].std() 
# test_features_df["duration_fare"] = (test_features_df["duration_fare"] -test_features_df["duration_fare"].mean() )/test_features_df["duration_fare"].std() 
test_features_df.isnull().sum()

additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
pickup_time_hour             0
pickup_time_minute           0
drop_time_hour               0
drop_time_minute             0
effective_time               0
pickup_time_day              0
drop_time_day                0
distance                     0
m_time                       0
speed                        1
dtype: int64

In [78]:
# test_features_df = test_features_df.drop(columns=["pickup_time","drop_time"],axis=1)
test_features_df.loc[test_features_df['fare'] <= 2000, 'fare_outlier'] = 1
test_features_df.loc[test_features_df['fare'] > 2000, 'fare_outlier'] = 0

In [79]:
test_features_df['meter_waiting_fare_diff'] = test_features_df['meter_waiting_fare'] - test_features_df['meter_waiting']*0.057

In [80]:
test_features_df['fare_mean'] = (test_features_df['fare'] - test_features_df['meter_waiting_fare'])/(test_features_df['duration'] - test_features_df['meter_waiting'])

In [81]:
# test_features_df.loc[test_features_df['additional_fare'] <= 100, 'additional_fare_outlier'] = 1
# test_features_df.loc[test_features_df['additional_fare'] > 100, 'additional_fare_outlier'] = 0

In [82]:
# test_features_df.loc[test_features_df['meter_waiting'] <= 2000, 'meter_waiting_outlier'] = 1
# test_features_df.loc[test_features_df['meter_waiting'] > 2000, 'meter_waiting_outlier'] = 0

In [83]:
# test_features_df.loc[test_features_df['duration'] <= 6000, 'duration_outlier'] = 1
# test_features_df.loc[test_features_df['duration'] > 6000, 'duration_outlier'] = 0

In [84]:
# test_features_df.loc[test_features_df['meter_waiting_fare'] <= 114, 'meter_waiting_fare_outlier'] = 1
# test_features_df.loc[test_features_df['meter_waiting_fare'] > 114, 'meter_waiting_fare_outlier'] = 0

In [85]:
# test_features_df['speed'] = test_features_df['distance']/test_features_df['effective_time']
#test_features_df['fare'] = test_features_df['fare'].fillna(0)

In [86]:
# X2 = test_features_df.values 
# x2_scaled = min_max_scaler.fit_transform(X2)
# test_features_df = pd.DataFrame(x2_scaled)
test_features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,pickup_time_hour,pickup_time_minute,drop_time_hour,drop_time_minute,effective_time,pickup_time_day,drop_time_day,distance,m_time,speed,fare_outlier,meter_waiting_fare_diff,fare_mean
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
213284604,10.5,924,42,2.44860,148,2020-02-01 00:38:00,2020-02-01 00:53:00,6.83454,79.8750,6.77490,79.8840,289.27,0,38,0,53,734,1,1,4.166730,776,0.005677,1.0,0.05460,0.325194
213286352,10.5,4249,20,0.00000,91,2020-02-01 01:02:00,2020-02-01 02:13:00,6.91168,79.8723,6.55091,79.9706,1912.70,1,2,2,13,4138,1,1,25.823263,4158,0.006241,1.0,-1.14000,0.452282
213293973,10.5,1552,255,2.65880,23,2020-02-01 05:02:00,2020-02-01 05:28:00,6.92145,79.8478,6.90539,79.8989,394.00,5,2,5,28,1274,1,1,3.676453,1529,0.002886,1.0,-11.87620,0.301728
213294622,10.5,462,16,0.00000,198,2020-02-01 05:30:00,2020-02-01 05:38:00,6.77433,79.9416,6.80401,79.9407,154.32,5,30,5,38,248,1,1,2.051619,264,0.008273,1.0,-0.91200,0.346009
213298687,10.5,814,392,12.36920,69,2020-02-01 07:00:00,2020-02-01 07:14:00,6.97968,79.9130,6.98875,79.8914,147.47,7,0,7,14,353,1,1,1.608445,745,0.004557,1.0,-9.97480,0.320144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222856243,10.5,1723,429,24.83332,3,2020-03-16 21:28:00,2020-03-16 21:56:00,6.85103,79.9567,6.85588,79.9214,388.48,21,28,21,56,1291,16,16,2.444643,1720,0.001894,1.0,0.38032,0.281025
222857785,10.5,1378,80,0.00000,125,2020-03-16 21:59:00,2020-03-16 22:22:00,6.91293,79.9656,6.92112,79.8980,379.85,21,59,22,22,1173,16,16,4.671116,1253,0.003982,1.0,-4.56000,0.292643
222858416,10.5,418,56,3.28440,93,2020-03-16 22:02:00,2020-03-16 22:09:00,6.85718,79.9081,6.83868,79.9083,112.79,22,2,22,9,269,16,16,1.278300,325,0.004752,1.0,0.09240,0.302502
222858691,10.5,1604,548,31.67440,17,2020-03-16 22:07:00,2020-03-16 22:34:00,6.91289,79.8846,6.93159,79.9145,248.46,22,7,22,34,1039,16,16,2.423900,1587,0.002333,1.0,0.43840,0.205289


In [87]:
categorical_var1 = np.where(test_features_df.dtypes != np.float )[0]

In [88]:
categorical_var1

array([ 1,  2,  4,  5,  6, 12, 13, 14, 15, 16, 17, 18, 20], dtype=int64)

In [89]:
X

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,pickup_time_hour,pickup_time_minute,drop_time_hour,drop_time_minute,m_time,pickup_time_day,drop_time_day,effective_time,speed,fare_outlier,meter_waiting_fare_diff,fare_mean
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
189123628,10.5,834.0,56.0,0.0000,64.0,2019-11-01 00:20:00,2019-11-01 00:34:00,6.86252,79.8993,6.90330,79.8783,270.32,3.164501,0,20,0,34,770.0,1,1,714.0,0.004432,1.0,-3.1920,0.347455
189125358,10.5,791.0,47.0,0.0000,134.0,2019-11-01 00:56:00,2019-11-01 01:09:00,6.88589,79.8984,6.91373,79.8923,197.85,1.968540,0,56,1,9,657.0,1,1,610.0,0.003227,1.0,-2.6790,0.265927
189125719,10.5,1087.0,80.0,0.0000,61.0,2019-11-01 01:08:00,2019-11-01 01:26:00,6.90839,79.8651,6.93669,79.9146,301.64,3.917991,1,8,1,26,1026.0,1,1,946.0,0.004142,1.0,-4.5600,0.299543
189127273,10.5,598.0,271.0,15.6638,68.0,2019-11-01 02:27:00,2019-11-01 02:37:00,6.92570,79.8895,6.92748,79.8971,82.30,0.535588,2,27,2,37,530.0,1,1,259.0,0.002068,1.0,0.2168,0.203780
189128020,10.5,1020.0,,,,2019-11-01 03:34:00,2019-11-01 03:51:00,6.87441,79.8615,6.84478,79.9290,358.39,5.062797,3,34,3,51,,1,1,,,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.5,838.0,93.0,5.4219,451.0,2020-01-31 22:07:00,2020-01-31 22:21:00,7.29073,80.6367,7.28891,80.6557,198.26,1.308220,22,7,22,21,387.0,31,31,294.0,0.004450,1.0,0.1209,0.258843
213812756,10.5,2151.0,428.0,0.0000,39.0,2020-01-31 23:07:00,2020-01-31 23:43:00,6.90569,79.8516,6.95089,79.9389,581.23,6.753297,23,7,23,43,2112.0,31,31,1684.0,0.004010,1.0,-24.3960,0.337336
213813930,10.5,263.0,9.0,0.0000,110.0,2020-01-31 23:21:00,2020-01-31 23:25:00,7.09210,79.9000,7.10135,79.9017,76.20,0.649655,23,21,23,25,153.0,31,31,144.0,0.004511,1.0,-0.5130,0.300000
213815405,10.5,858.0,115.0,0.0000,317.0,2020-01-31 23:39:00,2020-01-31 23:53:00,6.94540,79.8768,6.93574,79.9010,133.31,1.788975,23,39,23,53,541.0,31,31,426.0,0.004199,1.0,-6.5550,0.179421


In [90]:
categorical_var = np.where(X.dtypes != np.float )[0]

In [91]:
categorical_var

array([ 5,  6, 13, 14, 15, 16, 18, 19], dtype=int64)

In [92]:
model = CatBoostClassifier(iterations=500000,nan_mode='Max')

In [None]:
model.fit(X,y,plot=False)

Learning rate set to 0.000116
0:	learn: 0.6929906	total: 321ms	remaining: 1d 20h 34m 1s
1:	learn: 0.6928060	total: 385ms	remaining: 1d 2h 45m 52s
2:	learn: 0.6926358	total: 436ms	remaining: 20h 10m 53s
3:	learn: 0.6924740	total: 498ms	remaining: 17h 18m 30s
4:	learn: 0.6923138	total: 565ms	remaining: 15h 41m 58s
5:	learn: 0.6921594	total: 633ms	remaining: 14h 39m 48s
6:	learn: 0.6920009	total: 688ms	remaining: 13h 39m 13s
7:	learn: 0.6918365	total: 745ms	remaining: 12h 56m 9s
8:	learn: 0.6916651	total: 831ms	remaining: 12h 49m 20s
9:	learn: 0.6914908	total: 911ms	remaining: 12h 38m 45s
10:	learn: 0.6913094	total: 964ms	remaining: 12h 10m 7s
11:	learn: 0.6911479	total: 1.07s	remaining: 12h 26m 22s
12:	learn: 0.6909803	total: 1.15s	remaining: 12h 17m 31s
13:	learn: 0.6908067	total: 1.21s	remaining: 11h 58m 19s
14:	learn: 0.6906509	total: 1.27s	remaining: 11h 46m 47s
15:	learn: 0.6904829	total: 1.34s	remaining: 11h 40m 34s
16:	learn: 0.6903095	total: 1.41s	remaining: 11h 32m 22s
17:	learn

147:	learn: 0.6687661	total: 5.64s	remaining: 5h 17m 15s
148:	learn: 0.6686057	total: 5.66s	remaining: 5h 16m 23s
149:	learn: 0.6684602	total: 5.67s	remaining: 5h 15m 11s
150:	learn: 0.6682944	total: 5.7s	remaining: 5h 14m 12s
151:	learn: 0.6681333	total: 5.71s	remaining: 5h 13m 9s
152:	learn: 0.6679715	total: 5.74s	remaining: 5h 12m 33s
153:	learn: 0.6678195	total: 5.76s	remaining: 5h 11m 25s
154:	learn: 0.6676693	total: 5.78s	remaining: 5h 10m 26s
155:	learn: 0.6675248	total: 5.79s	remaining: 5h 9m 18s
156:	learn: 0.6673799	total: 5.82s	remaining: 5h 8m 40s
157:	learn: 0.6672121	total: 5.84s	remaining: 5h 7m 44s
158:	learn: 0.6670568	total: 5.86s	remaining: 5h 6m 58s
159:	learn: 0.6668893	total: 5.88s	remaining: 5h 5m 57s
160:	learn: 0.6667302	total: 5.9s	remaining: 5h 5m 31s
161:	learn: 0.6665743	total: 5.93s	remaining: 5h 4m 54s
162:	learn: 0.6664039	total: 5.95s	remaining: 5h 4m 7s
163:	learn: 0.6662458	total: 5.97s	remaining: 5h 3m 3s
164:	learn: 0.6660832	total: 5.98s	remaining:

303:	learn: 0.6440399	total: 8.31s	remaining: 3h 47m 45s
304:	learn: 0.6438799	total: 8.33s	remaining: 3h 47m 29s
305:	learn: 0.6437169	total: 8.35s	remaining: 3h 47m 14s
306:	learn: 0.6435608	total: 8.37s	remaining: 3h 46m 57s
307:	learn: 0.6434109	total: 8.38s	remaining: 3h 46m 39s
308:	learn: 0.6432621	total: 8.4s	remaining: 3h 46m 21s
309:	learn: 0.6431070	total: 8.41s	remaining: 3h 46m 4s
310:	learn: 0.6429413	total: 8.43s	remaining: 3h 45m 50s
311:	learn: 0.6427988	total: 8.45s	remaining: 3h 45m 39s
312:	learn: 0.6426519	total: 8.47s	remaining: 3h 45m 22s
313:	learn: 0.6425129	total: 8.49s	remaining: 3h 45m 4s
314:	learn: 0.6423595	total: 8.51s	remaining: 3h 44m 57s
315:	learn: 0.6422063	total: 8.53s	remaining: 3h 44m 41s
316:	learn: 0.6420500	total: 8.54s	remaining: 3h 44m 30s
317:	learn: 0.6419024	total: 8.56s	remaining: 3h 44m 13s
318:	learn: 0.6417335	total: 8.58s	remaining: 3h 43m 58s
319:	learn: 0.6415752	total: 8.6s	remaining: 3h 43m 42s
320:	learn: 0.6414117	total: 8.61s	

453:	learn: 0.6215293	total: 11s	remaining: 3h 21m 26s
454:	learn: 0.6213999	total: 11s	remaining: 3h 21m 18s
455:	learn: 0.6212474	total: 11s	remaining: 3h 21m 15s
456:	learn: 0.6211213	total: 11s	remaining: 3h 21m 6s
457:	learn: 0.6209616	total: 11.1s	remaining: 3h 20m 58s
458:	learn: 0.6208272	total: 11.1s	remaining: 3h 20m 49s
459:	learn: 0.6206829	total: 11.1s	remaining: 3h 20m 40s
460:	learn: 0.6205346	total: 11.1s	remaining: 3h 20m 31s
461:	learn: 0.6203838	total: 11.1s	remaining: 3h 20m 22s
462:	learn: 0.6202290	total: 11.1s	remaining: 3h 20m 14s
463:	learn: 0.6200805	total: 11.2s	remaining: 3h 20m 6s
464:	learn: 0.6199349	total: 11.2s	remaining: 3h 20m
465:	learn: 0.6197756	total: 11.2s	remaining: 3h 20m 4s
466:	learn: 0.6196354	total: 11.2s	remaining: 3h 19m 57s
467:	learn: 0.6194870	total: 11.2s	remaining: 3h 19m 48s
468:	learn: 0.6193503	total: 11.2s	remaining: 3h 19m 39s
469:	learn: 0.6191902	total: 11.3s	remaining: 3h 19m 31s
470:	learn: 0.6190375	total: 11.3s	remaining: 

603:	learn: 0.5999698	total: 13.9s	remaining: 3h 11m 6s
604:	learn: 0.5998247	total: 13.9s	remaining: 3h 11m 1s
605:	learn: 0.5996913	total: 13.9s	remaining: 3h 10m 56s
606:	learn: 0.5995505	total: 13.9s	remaining: 3h 10m 50s
607:	learn: 0.5994016	total: 13.9s	remaining: 3h 10m 46s
608:	learn: 0.5992697	total: 14s	remaining: 3h 10m 40s
609:	learn: 0.5991360	total: 14s	remaining: 3h 10m 40s
610:	learn: 0.5989952	total: 14s	remaining: 3h 10m 35s
611:	learn: 0.5988557	total: 14s	remaining: 3h 10m 30s
612:	learn: 0.5987138	total: 14s	remaining: 3h 10m 24s
613:	learn: 0.5985798	total: 14s	remaining: 3h 10m 20s
614:	learn: 0.5984388	total: 14.1s	remaining: 3h 10m 15s
615:	learn: 0.5982912	total: 14.1s	remaining: 3h 10m 13s
616:	learn: 0.5981492	total: 14.1s	remaining: 3h 10m 11s
617:	learn: 0.5979966	total: 14.1s	remaining: 3h 10m 7s
618:	learn: 0.5978495	total: 14.1s	remaining: 3h 10m 2s
619:	learn: 0.5977081	total: 14.2s	remaining: 3h 9m 57s
620:	learn: 0.5975807	total: 14.2s	remaining: 3h

756:	learn: 0.5791326	total: 16.5s	remaining: 3h 1m 37s
757:	learn: 0.5790090	total: 16.5s	remaining: 3h 1m 33s
758:	learn: 0.5788781	total: 16.6s	remaining: 3h 1m 29s
759:	learn: 0.5787414	total: 16.6s	remaining: 3h 1m 25s
760:	learn: 0.5785986	total: 16.6s	remaining: 3h 1m 21s
761:	learn: 0.5784674	total: 16.6s	remaining: 3h 1m 17s
762:	learn: 0.5783287	total: 16.6s	remaining: 3h 1m 12s
763:	learn: 0.5781874	total: 16.6s	remaining: 3h 1m 8s
764:	learn: 0.5780467	total: 16.6s	remaining: 3h 1m 5s
765:	learn: 0.5779093	total: 16.7s	remaining: 3h 1m 2s
766:	learn: 0.5777773	total: 16.7s	remaining: 3h 1m 2s
767:	learn: 0.5776478	total: 16.7s	remaining: 3h 59s
768:	learn: 0.5775144	total: 16.7s	remaining: 3h 58s
769:	learn: 0.5773922	total: 16.7s	remaining: 3h 53s
770:	learn: 0.5772720	total: 16.8s	remaining: 3h 51s
771:	learn: 0.5771325	total: 16.8s	remaining: 3h 50s
772:	learn: 0.5770067	total: 16.8s	remaining: 3h 50s
773:	learn: 0.5768781	total: 16.8s	remaining: 3h 46s
774:	learn: 0.576

915:	learn: 0.5584073	total: 19.1s	remaining: 2h 53m 41s
916:	learn: 0.5582767	total: 19.1s	remaining: 2h 53m 40s
917:	learn: 0.5581479	total: 19.2s	remaining: 2h 53m 38s
918:	learn: 0.5580141	total: 19.2s	remaining: 2h 53m 35s
919:	learn: 0.5578776	total: 19.2s	remaining: 2h 53m 32s
920:	learn: 0.5577609	total: 19.2s	remaining: 2h 53m 29s
921:	learn: 0.5576236	total: 19.2s	remaining: 2h 53m 26s
922:	learn: 0.5575114	total: 19.2s	remaining: 2h 53m 22s
923:	learn: 0.5573821	total: 19.3s	remaining: 2h 53m 19s
924:	learn: 0.5572574	total: 19.3s	remaining: 2h 53m 16s
925:	learn: 0.5571297	total: 19.3s	remaining: 2h 53m 13s
926:	learn: 0.5570144	total: 19.3s	remaining: 2h 53m 10s
927:	learn: 0.5568781	total: 19.3s	remaining: 2h 53m 7s
928:	learn: 0.5567459	total: 19.3s	remaining: 2h 53m 8s
929:	learn: 0.5566142	total: 19.4s	remaining: 2h 53m 5s
930:	learn: 0.5564837	total: 19.4s	remaining: 2h 53m 4s
931:	learn: 0.5563513	total: 19.4s	remaining: 2h 53m 1s
932:	learn: 0.5562384	total: 19.4s	r

1066:	learn: 0.5396432	total: 21.6s	remaining: 2h 48m 14s
1067:	learn: 0.5395357	total: 21.6s	remaining: 2h 48m 13s
1068:	learn: 0.5394096	total: 21.6s	remaining: 2h 48m 11s
1069:	learn: 0.5392903	total: 21.6s	remaining: 2h 48m 8s
1070:	learn: 0.5391730	total: 21.7s	remaining: 2h 48m 6s
1071:	learn: 0.5390500	total: 21.7s	remaining: 2h 48m 4s
1072:	learn: 0.5389253	total: 21.7s	remaining: 2h 48m 1s
1073:	learn: 0.5388031	total: 21.7s	remaining: 2h 47m 59s
1074:	learn: 0.5386834	total: 21.7s	remaining: 2h 47m 57s
1075:	learn: 0.5385765	total: 21.7s	remaining: 2h 47m 54s
1076:	learn: 0.5384581	total: 21.7s	remaining: 2h 47m 52s
1077:	learn: 0.5383419	total: 21.8s	remaining: 2h 47m 50s
1078:	learn: 0.5382179	total: 21.8s	remaining: 2h 47m 47s
1079:	learn: 0.5380926	total: 21.8s	remaining: 2h 47m 49s
1080:	learn: 0.5379719	total: 21.8s	remaining: 2h 47m 47s
1081:	learn: 0.5378508	total: 21.8s	remaining: 2h 47m 44s
1082:	learn: 0.5377310	total: 21.8s	remaining: 2h 47m 42s
1083:	learn: 0.537

1216:	learn: 0.5218930	total: 24s	remaining: 2h 44m 15s
1217:	learn: 0.5217745	total: 24.1s	remaining: 2h 44m 14s
1218:	learn: 0.5216551	total: 24.1s	remaining: 2h 44m 12s
1219:	learn: 0.5215449	total: 24.1s	remaining: 2h 44m 10s
1220:	learn: 0.5214285	total: 24.1s	remaining: 2h 44m 8s
1221:	learn: 0.5213066	total: 24.1s	remaining: 2h 44m 6s
1222:	learn: 0.5211884	total: 24.1s	remaining: 2h 44m 4s
1223:	learn: 0.5210790	total: 24.2s	remaining: 2h 44m 2s
1224:	learn: 0.5209586	total: 24.2s	remaining: 2h 44m
1225:	learn: 0.5208471	total: 24.2s	remaining: 2h 43m 59s
1226:	learn: 0.5207338	total: 24.2s	remaining: 2h 43m 59s
1227:	learn: 0.5206108	total: 24.2s	remaining: 2h 43m 57s
1228:	learn: 0.5204942	total: 24.2s	remaining: 2h 43m 56s
1229:	learn: 0.5203771	total: 24.3s	remaining: 2h 43m 56s
1230:	learn: 0.5202582	total: 24.3s	remaining: 2h 43m 54s
1231:	learn: 0.5201403	total: 24.3s	remaining: 2h 43m 52s
1232:	learn: 0.5200231	total: 24.3s	remaining: 2h 43m 54s
1233:	learn: 0.5199007	t

1365:	learn: 0.5050754	total: 26.5s	remaining: 2h 41m 14s
1366:	learn: 0.5049635	total: 26.5s	remaining: 2h 41m 12s
1367:	learn: 0.5048451	total: 26.5s	remaining: 2h 41m 11s
1368:	learn: 0.5047330	total: 26.5s	remaining: 2h 41m 10s
1369:	learn: 0.5046273	total: 26.6s	remaining: 2h 41m 8s
1370:	learn: 0.5045189	total: 26.6s	remaining: 2h 41m 7s
1371:	learn: 0.5044033	total: 26.6s	remaining: 2h 41m 5s
1372:	learn: 0.5043008	total: 26.6s	remaining: 2h 41m 3s
1373:	learn: 0.5041843	total: 26.6s	remaining: 2h 41m 2s
1374:	learn: 0.5040695	total: 26.6s	remaining: 2h 41m
1375:	learn: 0.5039712	total: 26.7s	remaining: 2h 40m 58s
1376:	learn: 0.5038594	total: 26.7s	remaining: 2h 40m 57s
1377:	learn: 0.5037470	total: 26.7s	remaining: 2h 40m 56s
1378:	learn: 0.5036419	total: 26.7s	remaining: 2h 40m 56s
1379:	learn: 0.5035258	total: 26.7s	remaining: 2h 40m 55s
1380:	learn: 0.5034112	total: 26.7s	remaining: 2h 40m 54s
1381:	learn: 0.5033002	total: 26.8s	remaining: 2h 40m 52s
1382:	learn: 0.5032004	

1519:	learn: 0.4886141	total: 29.1s	remaining: 2h 39m 15s
1520:	learn: 0.4885073	total: 29.2s	remaining: 2h 39m 16s
1521:	learn: 0.4884025	total: 29.2s	remaining: 2h 39m 15s
1522:	learn: 0.4882945	total: 29.2s	remaining: 2h 39m 14s
1523:	learn: 0.4881894	total: 29.2s	remaining: 2h 39m 13s
1524:	learn: 0.4880842	total: 29.2s	remaining: 2h 39m 12s
1525:	learn: 0.4879871	total: 29.2s	remaining: 2h 39m 10s
1526:	learn: 0.4878901	total: 29.3s	remaining: 2h 39m 9s
1527:	learn: 0.4877953	total: 29.3s	remaining: 2h 39m 9s
1528:	learn: 0.4876876	total: 29.3s	remaining: 2h 39m 8s
1529:	learn: 0.4875991	total: 29.3s	remaining: 2h 39m 7s
1530:	learn: 0.4874927	total: 29.3s	remaining: 2h 39m 5s
1531:	learn: 0.4873868	total: 29.3s	remaining: 2h 39m 7s
1532:	learn: 0.4872937	total: 29.4s	remaining: 2h 39m 6s
1533:	learn: 0.4871973	total: 29.4s	remaining: 2h 39m 8s
1534:	learn: 0.4870934	total: 29.4s	remaining: 2h 39m 8s
1535:	learn: 0.4869932	total: 29.4s	remaining: 2h 39m 7s
1536:	learn: 0.4868873	t

1670:	learn: 0.4732405	total: 31.8s	remaining: 2h 38m 8s
1671:	learn: 0.4731371	total: 31.8s	remaining: 2h 38m 12s
1672:	learn: 0.4730315	total: 31.9s	remaining: 2h 38m 11s
1673:	learn: 0.4729374	total: 31.9s	remaining: 2h 38m 10s
1674:	learn: 0.4728433	total: 31.9s	remaining: 2h 38m 9s
1675:	learn: 0.4727508	total: 31.9s	remaining: 2h 38m 8s
1676:	learn: 0.4726476	total: 31.9s	remaining: 2h 38m 8s
1677:	learn: 0.4725511	total: 31.9s	remaining: 2h 38m 7s
1678:	learn: 0.4724612	total: 32s	remaining: 2h 38m 5s
1679:	learn: 0.4723788	total: 32s	remaining: 2h 38m 4s
1680:	learn: 0.4722767	total: 32s	remaining: 2h 38m 3s
1681:	learn: 0.4721759	total: 32s	remaining: 2h 38m 2s
1682:	learn: 0.4720820	total: 32s	remaining: 2h 38m 2s
1683:	learn: 0.4719889	total: 32s	remaining: 2h 38m 1s
1684:	learn: 0.4718888	total: 32.1s	remaining: 2h 38m
1685:	learn: 0.4717842	total: 32.1s	remaining: 2h 37m 59s
1686:	learn: 0.4716772	total: 32.1s	remaining: 2h 37m 58s
1687:	learn: 0.4715847	total: 32.1s	remai

1817:	learn: 0.4591827	total: 34.3s	remaining: 2h 36m 35s
1818:	learn: 0.4590841	total: 34.3s	remaining: 2h 36m 35s
1819:	learn: 0.4589819	total: 34.3s	remaining: 2h 36m 34s
1820:	learn: 0.4588976	total: 34.3s	remaining: 2h 36m 34s
1821:	learn: 0.4588009	total: 34.4s	remaining: 2h 36m 33s
1822:	learn: 0.4587041	total: 34.4s	remaining: 2h 36m 32s
1823:	learn: 0.4586081	total: 34.4s	remaining: 2h 36m 31s
1824:	learn: 0.4585061	total: 34.4s	remaining: 2h 36m 31s
1825:	learn: 0.4584069	total: 34.4s	remaining: 2h 36m 30s
1826:	learn: 0.4583140	total: 34.4s	remaining: 2h 36m 29s
1827:	learn: 0.4582270	total: 34.4s	remaining: 2h 36m 28s
1828:	learn: 0.4581283	total: 34.5s	remaining: 2h 36m 27s
1829:	learn: 0.4580276	total: 34.5s	remaining: 2h 36m 29s
1830:	learn: 0.4579278	total: 34.5s	remaining: 2h 36m 28s
1831:	learn: 0.4578332	total: 34.5s	remaining: 2h 36m 27s
1832:	learn: 0.4577407	total: 34.5s	remaining: 2h 36m 26s
1833:	learn: 0.4576434	total: 34.6s	remaining: 2h 36m 25s
1834:	learn: 0

1961:	learn: 0.4458594	total: 36.7s	remaining: 2h 35m 20s
1962:	learn: 0.4457644	total: 36.7s	remaining: 2h 35m 20s
1963:	learn: 0.4456773	total: 36.8s	remaining: 2h 35m 19s
1964:	learn: 0.4455895	total: 36.8s	remaining: 2h 35m 21s
1965:	learn: 0.4455014	total: 36.8s	remaining: 2h 35m 20s
1966:	learn: 0.4454099	total: 36.8s	remaining: 2h 35m 20s
1967:	learn: 0.4453177	total: 36.8s	remaining: 2h 35m 19s
1968:	learn: 0.4452271	total: 36.8s	remaining: 2h 35m 18s
1969:	learn: 0.4451410	total: 36.9s	remaining: 2h 35m 17s
1970:	learn: 0.4450480	total: 36.9s	remaining: 2h 35m 16s
1971:	learn: 0.4449541	total: 36.9s	remaining: 2h 35m 15s
1972:	learn: 0.4448648	total: 36.9s	remaining: 2h 35m 15s
1973:	learn: 0.4447821	total: 36.9s	remaining: 2h 35m 15s
1974:	learn: 0.4446942	total: 36.9s	remaining: 2h 35m 16s
1975:	learn: 0.4446099	total: 37s	remaining: 2h 35m 16s
1976:	learn: 0.4445266	total: 37s	remaining: 2h 35m 15s
1977:	learn: 0.4444452	total: 37s	remaining: 2h 35m 14s
1978:	learn: 0.44435

In [None]:
prediction = model.predict(test_features_df)

In [None]:
prediction

In [None]:
submission_df = pd.read_csv("sample_submission.csv", 
                            index_col="tripid")

In [None]:
X.head()


In [None]:
np.testing.assert_array_equal(test_features_df.index.values, 
                              submission_df.index.values)


In [None]:
submission_df["prediction"] = prediction[:]

In [None]:
submission_df.head()
submission_df.isnull().sum()

In [857]:
submission_df.to_csv('160374E_catboost_speed_mtime_new.csv', index=True)

In [0]:

# model.save_model("model_trained_features_added_month_dropped_cbm", format="cbm")


In [0]:
cd /content/drive/My Drive/Machine learning

/content/drive/My Drive/Machine learning


In [0]:
from xgboost import XGBClassifier

In [0]:
model1 = XGBClassifier()

In [0]:
model1.fit(X, y)

ValueError: ignored

In [0]:
y_pred = model.predict(test_features_df)

In [0]:
X.shape

In [0]:
test_features_df.shape
