In [21]:
import pickle
import pandas as pd
import math
from sklearn import preprocessing

def get_season(month):
    """Return the season label for a month number.

    Months 3-5 map to 'spring', 6-8 to 'summer' and 9-11 to 'autumn'.
    Every other value (12, 1, 2, and anything outside those ranges)
    falls through to 'winter'.
    """
    # (first month, last month, label) -- both bounds are inclusive.
    season_ranges = (
        (3, 5, 'spring'),
        (6, 8, 'summer'),
        (9, 11, 'autumn'),
    )
    for first, last, label in season_ranges:
        if first <= month <= last:
            return label
    return 'winter'

def distance(origin, destination, radius=6371):
    """Great-circle distance between two points using the haversine formula.

    Args:
        origin: (latitude, longitude) pair in decimal degrees.
        destination: (latitude, longitude) pair in decimal degrees.
        radius: Radius of the sphere. The result is expressed in the same
            unit; the default of 6371 is the Earth radius in kilometres.

    Returns:
        The distance along the sphere's surface as a float. NaN coordinates
        propagate to a NaN result.
    """
    lat1, lon1 = origin
    lat2, lon2 = destination

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    sin_half_dlat = math.sin(dlat / 2)
    sin_half_dlon = math.sin(dlon / 2)
    a = sin_half_dlat * sin_half_dlat + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * sin_half_dlon * sin_half_dlon
    # For (near-)antipodal points rounding can leave `a` a hair above 1,
    # which would make sqrt(1 - a) raise ValueError. Only the upper bound is
    # clamped; a comparison (not min/max) is used so that NaN stays NaN.
    if a > 1.0:
        a = 1.0
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return radius * c

In [9]:
# Read the raw test set and convert the pickup timestamps from text to
# datetime values in one chained expression.
test_data = pd.read_csv("test.csv").assign(
    pickup_datetime=lambda frame: pd.to_datetime(frame['pickup_datetime'])
)
test_data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [25]:
# Calendar features derived from the pickup timestamp. The vectorised `.dt`
# accessor replaces a per-row Python lambda.
test_data['pickup_Month'] = test_data['pickup_datetime'].dt.month
test_data['pickup_Hour'] = test_data['pickup_datetime'].dt.hour
# Bucket the hour of day; pd.cut bins are right-inclusive, so
# hours 0-12 -> 'Morning', 13-16 -> 'Afternoon', 17-23 -> 'Evening'.
test_data['pickup_time'] = pd.cut(
    test_data['pickup_Hour'],
    [-1, 12, 16, 24],
    labels=['Morning', 'Afternoon', 'Evening'],
)

In [12]:
# Translate each pickup month into its season label via get_season.
test_data['pickup_season'] = test_data['pickup_Month'].map(get_season)

In [17]:
# 'jarak' holds the haversine distance between pickup and dropoff, computed
# by distance() for every row. The coordinate columns are iterated in
# lockstep, which avoids the removed DataFrame.ix accessor and does not
# depend on the frame having a default 0..n-1 index.
test_data['jarak'] = [
    distance((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon))
    for pickup_lat, pickup_lon, dropoff_lat, dropoff_lon in zip(
        test_data['pickup_latitude'],
        test_data['pickup_longitude'],
        test_data['dropoff_latitude'],
        test_data['dropoff_longitude'],
    )
]

In [24]:
# Preview the first rows of test_data, including the engineered columns.
test_data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_Month,pickup_Hour,pickup_season,jarak
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N,6,23,summer,2.746426
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N,6,23,summer,2.759239
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N,6,23,summer,1.306155
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N,6,23,summer,5.269088
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N,6,23,summer,0.960842


In [26]:
# Feature matrix: every column of test_data except the identifier, the
# vendor id and the raw timestamp.
X = test_data.drop(['id', 'vendor_id', 'pickup_datetime'], axis=1)

# One LabelEncoder per categorical column, turning its labels into integers.
# NOTE(review): the encoders are fitted on the test set itself; the integer
# codes only line up with training if the same categories occur here --
# confirm, or reuse the encoders fitted at training time.
# NOTE(review): the feature-importance table further down lists ten features
# and no pickup_time; confirm the pickled model expects this column.
le_flag = preprocessing.LabelEncoder()
le_pseason = preprocessing.LabelEncoder()
le_ptime = preprocessing.LabelEncoder()

X["store_and_fwd_flag"] = le_flag.fit_transform(X["store_and_fwd_flag"])
X["pickup_season"] = le_pseason.fit_transform(X["pickup_season"])
X["pickup_time"] = le_ptime.fit_transform(X["pickup_time"])

# The test set has no trip_duration column, so no target vector Y is built.

In [28]:
# Load the pickled regressor and predict a value for every row of X.
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('Regengbrtaxi.pkl', 'rb') as model_pkl:
    model = pickle.load(model_pkl)
y_pred = model.predict(X)
print ("Loaded model : ", model)
print('Prediction : ',y_pred)

Loaded model :  RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Prediction :  [ 9029.    316.2   396.9 ...,   642.2   995.3   412.4]


In [34]:
# Start the result table from the trip identifier alone. Selecting the one
# wanted column (instead of dropping every other column by name) keeps this
# cell working regardless of which feature columns exist on test_data.
# .copy() makes it an independent frame so columns can be added safely.
compiled_result = test_data[['id']].copy()

In [35]:
# Store the model predictions (y_pred) as the trip_duration column.
compiled_result['trip_duration'] = y_pred

In [36]:
# Show only the first ten rows rather than dumping the whole frame.
compiled_result.head(10)

Unnamed: 0,id,trip_duration
0,id3004672,9029.0
1,id3505355,316.2
2,id1217141,396.9
3,id2150126,361.6
4,id1598245,506.4
5,id0668992,328.1
6,id1765014,287.5
7,id0898117,540.7
8,id3905224,9221.4
9,id1543102,404.0


In [37]:
# Write the predictions to disk. index=False keeps the positional row index
# out of the file, so the CSV contains only the frame's own columns.
# NOTE(review): confirm no downstream reader relies on the old index column.
compiled_result.to_csv('predict.csv', index=False)

In [39]:
# Pair every column of X with the importance the fitted model assigned to it.
df_importance = pd.DataFrame({
    "Column name": X.columns,
    "Feature Importance": model.feature_importances_,
})

In [40]:
# Display the feature-importance table built from X.columns and the model.
df_importance

Unnamed: 0,Column name,Feature Importance
0,passenger_count,0.025342
1,pickup_longitude,0.079492
2,pickup_latitude,0.229026
3,dropoff_longitude,0.083875
4,dropoff_latitude,0.122387
5,store_and_fwd_flag,0.000101
6,pickup_Month,0.052782
7,pickup_Hour,0.071426
8,pickup_season,0.005321
9,jarak,0.019145
