In [18]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from catboost import CatBoostClassifier

In [19]:
# changed distance calculation
def harv_distance(lat1, lon1, lat2, lon2):
    p = 0.017453292519943295 # Pi/180
    a = 0.5 - np.cos((lat2 - lat1) * p)/2 + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2
    return 0.6213712 * 12742 * np.arcsin(np.sqrt(a)) 

In [20]:
train_df = pd.read_csv(
    "../data/train.csv", 
    index_col="tripid",
    parse_dates=['pickup_time','drop_time']
)


In [21]:
train_df['label'].replace(to_replace=['incorrect','correct'], value=[0,1],inplace=True)
y = train_df['label']

In [22]:
X = train_df.drop(columns=["label"], axis=1)

In [23]:
test_df = pd.read_csv("../data/test.csv", 
                               index_col="tripid",
                     parse_dates=['pickup_time','drop_time'])

In [24]:
#X = X.dropna() // reduced the accuracy 

In [25]:
X = X.fillna(X.mean())
test_df = test_df.fillna(test_df.mean())

In [26]:
X["pickup_hour"] = X["pickup_time"].dt.hour
X["pickup_minute"] = X["pickup_time"].dt.minute
X["pickup_date"] = X["pickup_time"].dt.day
X["drop_hour"] =X["drop_time"].dt.hour
X["drop_minute"] =X["drop_time"].dt.minute
X["drop_date"] = X["drop_time"].dt.day
# X["pickup_month"] = X["pickup_time"].dt.month
# X["drop_month"] = X["drop_time"].dt.month
# X["pickup_year"] = X["drop_time"].dt.year
# X["drop_year"] = X["drop_time"].dt.year
X["distance"] =  harv_distance(X["pick_lat"],X["pick_lon"],X["drop_lat"],X["drop_lon"])
X["time_taken"] = X["duration"]-X["meter_waiting"]

In [27]:
X.dtypes

additional_fare                     float64
duration                            float64
meter_waiting                       float64
meter_waiting_fare                  float64
meter_waiting_till_pickup           float64
pickup_time                  datetime64[ns]
drop_time                    datetime64[ns]
pick_lat                            float64
pick_lon                            float64
drop_lat                            float64
drop_lon                            float64
fare                                float64
pickup_hour                           int64
pickup_minute                         int64
pickup_date                           int64
drop_hour                             int64
drop_minute                           int64
drop_date                             int64
distance                            float64
time_taken                          float64
dtype: object

In [28]:
test_df["pickup_hour"] = test_df["pickup_time"].dt.hour
test_df["pickup_minute"] = test_df["pickup_time"].dt.minute
test_df["drop_hour"] =test_df["drop_time"].dt.hour
test_df["drop_minute"] =test_df["drop_time"].dt.minute
test_df["pickup_date"] = test_df["pickup_time"].dt.day
test_df["drop_date"] = test_df["drop_time"].dt.day
# test_df["pickup_month"] = test_df["pickup_time"].dt.month
# test_df["drop_month"] = test_df["drop_time"].dt.month
# test_df["pickup_year"] = test_df["pickup_time"].dt.year
# test_df["drop_year"] = test_df["drop_time"].dt.year
test_df["distance"] =  harv_distance(test_df["pick_lat"],test_df["pick_lon"],test_df["drop_lat"],test_df["drop_lon"])
test_df["time_taken"] = test_df["duration"]-test_df["meter_waiting"]

In [64]:
model = CatBoostClassifier(iterations=600000)

In [65]:
model.fit(X,y,plot=False,verbose=False)

<catboost.core.CatBoostClassifier at 0x7f1bedf5f278>

In [66]:
prediction = model.predict(test_df)

In [67]:
prediction

array([1, 1, 1, ..., 1, 1, 1])

In [68]:

data = {'tripid':test_df.index.values}
df_res = pd.DataFrame(data)
df_res['prediction'] = prediction
df_res.to_csv('data/catboost_12.csv', index=False)
df_res['prediction'].value_counts()

1    8202
0     374
Name: prediction, dtype: int64