In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from pyproj import Geod
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score

In [3]:
# Read the train and test tables; both timestamp columns are parsed as
# datetimes on load so the `.dt` accessor can be used on them later.
timestamp_columns = ['pickup_time', 'drop_time']
train_df = pd.read_csv('../data/train.csv', parse_dates=timestamp_columns)
test_df = pd.read_csv('../data/test.csv', parse_dates=timestamp_columns)

In [4]:
# Number of missing values in each column of the training frame.
train_df.isna().sum()

tripid                         0
additional_fare              202
duration                     202
meter_waiting                202
meter_waiting_fare           202
meter_waiting_till_pickup    202
pickup_time                    0
drop_time                      0
pick_lat                       0
pick_lon                       0
drop_lat                       0
drop_lon                       0
fare                         137
label                          0
dtype: int64

In [5]:
# Shared geodesic calculator on the WGS84 ellipsoid.
wgs84_geod = Geod(ellps='WGS84')


def Distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance between two sets of coordinates.

    Arguments are given latitude-first, but ``Geod.inv`` is called
    longitude-first, so they are reordered in the call below.  The inputs
    are forwarded unchanged, so whatever scalar or sequence types
    ``Geod.inv`` accepts can be used here.

    Returns the third element of the ``Geod.inv`` result, i.e. the distance
    (in metres per pyproj's documentation); the two azimuths are discarded.
    """
    # inv() yields (forward azimuth, back azimuth, distance).
    return wgs84_geod.inv(lon1, lat1, lon2, lat2)[2]

In [6]:
# Pickup -> drop distance for every trip, computed separately for the
# training and the test frame.  The coordinate columns are passed to
# Distance() as plain Python lists, in (lat1, lon1, lat2, lon2) order.
coordinate_columns = ('pick_lat', 'pick_lon', 'drop_lat', 'drop_lon')

distances_train = Distance(
    *(train_df[name].to_list() for name in coordinate_columns)
)

distances_test = Distance(
    *(test_df[name].to_list() for name in coordinate_columns)
)

In [7]:
# Store the computed trip distances as a new "distance" column on each frame.
train_df["distance"], test_df["distance"] = distances_train, distances_test

In [8]:
def add_time_features(df, column, prefix):
    """Return a copy of ``df`` with calendar features derived from ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the datetime column ``column``.
    column : str
        Name of the datetime column to decompose (read through ``.dt``).
    prefix : str
        Prefix for the new column names, e.g. ``'pickup'`` gives
        ``pickup_minute``, ``pickup_hour``, ``pickup_date`` (day of month),
        ``pickup_day`` (day of week), ``pickup_month`` and ``pickup_year``.

    Returns
    -------
    pandas.DataFrame
        New frame (``df`` itself is not modified) with the six extra columns.
    """
    stamps = df[column].dt
    return df.assign(**{
        prefix + '_minute': stamps.minute,
        prefix + '_hour': stamps.hour,
        prefix + '_date': stamps.day,
        prefix + '_day': stamps.dayofweek,
        prefix + '_month': stamps.month,
        prefix + '_year': stamps.year,
    })


# Each frame derives its features from its OWN timestamps.  Previously the
# training pickup features were read from test_df.pickup_time, which
# index-aligned test values onto training rows (and left NaN for training
# rows with no matching test index).
# Both frames receive both the pickup_* and drop_* features so that the same
# feature list can be selected from either of them.

# pickup timestamp
train_df = add_time_features(train_df, 'pickup_time', 'pickup')
test_df = add_time_features(test_df, 'pickup_time', 'pickup')

# drop timestamp
train_df = add_time_features(train_df, 'drop_time', 'drop')
test_df = add_time_features(test_df, 'drop_time', 'drop')

In [9]:
# Columns fed to the model; the time-of-day features are currently disabled.
feature_columns = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'fare',
    'distance',
#     'pickup_hour',
#     'pickup_minute',
#     'drop_hour',
#     'drop_minute'
]

# Training feature matrix.
X = train_df[feature_columns]

In [10]:
# Encode the target as integers: 'incorrect' -> 0, 'correct' -> 1.
#
# The result is assigned back to the frame rather than calling
# replace(..., inplace=True) on the selected column: a chained in-place call
# operates on an intermediate object, and under pandas Copy-on-Write it
# leaves train_df (and therefore y) unchanged.
# The explicit int64 cast keeps the target numeric regardless of how
# replace() infers the result dtype, and fails loudly if an unexpected label
# value is present.  Re-running the cell is a no-op on already-encoded data.
label_codes = {'incorrect': 0, 'correct': 1}
train_df['label'] = train_df['label'].replace(label_codes).astype('int64')
y = train_df['label']

In [11]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Report the shapes of both partitions.
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

Train set: (13740, 7) (13740,)
Test set: (3436, 7) (3436,)


In [12]:
# Names of the int64 / float64 columns, i.e. those routed through the numeric
# preprocessing pipeline.
numeric_dtypes = ['int64', 'float64']
scalable = [
    name
    for name, dtype in X_train.dtypes.items()
    if dtype in numeric_dtypes
]

In [13]:
# Numeric preprocessing, applied in this order:
#   1. standardise each column,
#   2. fill missing values with the column median.
numeric_steps = [
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='median')),
]
scal_imput = Pipeline(numeric_steps)

In [14]:
# Apply the numeric pipeline to the `scalable` columns; any column not listed
# there is dropped from the transformed output.
numeric_transformer = ("numeric", scal_imput, scalable)
preprocessor = ColumnTransformer(
    transformers=[numeric_transformer],
    remainder="drop",
)

In [15]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier: 1000 trees of depth 5 with a 0.01
# learning rate, trained using 6 parallel jobs.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.01,
    n_jobs=6,
    max_depth=5,
)
model = XGBClassifier(**xgb_params)



In [16]:
# End-to-end estimator: column preprocessing followed by the classifier.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
pipeline = Pipeline(steps=pipeline_steps)

In [17]:
# Fit the preprocessing steps and the classifier on the training partition.
# As the last expression of the cell, the fitted pipeline is also displayed.
pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numeric',
                                                  Pipeline(memory=None,
                                                           steps=[('standard_scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True)),
                                                                  ('simple_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                 

In [18]:
# Predicted labels for the held-out partition.
y_pred = pipeline.predict(X=X_test)

In [19]:
# Mean absolute error on the held-out partition.  When both the true and the
# predicted labels are 0/1, this equals the fraction of misclassified rows.
score = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print('MAE:', score)

MAE: 0.050058207217694994


In [21]:
# Predictions on the training partition, used to compare train vs. held-out
# performance.
y_train_pred = pipeline.predict(X_train)

# Compute the metrics first, then report them.
train_f1 = f1_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)

print('train acc f1 ', train_f1)
print('test acc', test_accuracy)
print('test acc f1 ', test_f1)

train acc f1  0.9791319527096722
test acc 0.9499417927823051
test acc f1  0.9728706624605679


In [53]:
# from sklearn.impute import SimpleImputer
# imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
# X= imp_mean.fit(X).transform(X)

In [54]:
# np.isnan(X) == True

In [55]:
# X = StandardScaler().fit(X).transform(X.astype(float))

In [56]:
# from xgboost import XGBClassifier

# model = XGBClassifier(n_estimators=600, learning_rate=0.01, n_jobs=4, max_depth=14)

In [57]:
# model.fit(X_train,y_train)
# y_pred = model.predict(X_test)

In [58]:
# from sklearn.metrics import mean_absolute_error
# score = mean_absolute_error(y_test, y_pred)
# print('MAE:', score)

In [59]:
# print('train acc',accuracy_score(y_train,y_train_pred))
# from sklearn.metrics import accuracy_score, f1_score
# y_train_pred = model.predict(X_train)
# print('train acc f1 ',f1_score(y_train,y_train_pred))
# print('test acc',accuracy_score(y_test,y_pred))
# print('test acc f1 ',f1_score(y_test,y_pred))

In [22]:
# Same feature columns as used for training, selected from the test frame;
# the time-of-day features are currently disabled.
test_feature_columns = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'fare',
    'distance',
#     'pickup_hour',
#     'pickup_minute',
#     'drop_hour',
#     'drop_minute'
]

# Feature matrix for the rows to be predicted.
X_ = test_df[test_feature_columns]


In [24]:
# Predict labels for the test frame with the fitted pipeline.
yhat = pipeline.predict(X_)

# Build the output table: one row per trip id with its predicted label.
data = {'tripid': test_df['tripid'].values}
df_res = pd.DataFrame(data).assign(prediction=yhat)

# Write the table to disk without the index column.
df_res.to_csv('../data/xgboost_16.csv', index=False)

# Show how many rows were predicted as each class.
df_res['prediction'].value_counts()

1    8232
0     344
Name: prediction, dtype: int64