In [16]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [17]:
# import data
data_sample = pd.read_csv("data/train_sample2.csv")

In [18]:
# Change datetime
data_sample["pickup_datetime"] = pd.DatetimeIndex(data_sample["pickup_datetime"])

In [19]:
from keras.utils.np_utils import to_categorical
import geopy
from geopy import distance

data_sample.dropna()
data_sample = data_sample[(data_sample['pickup_latitude'] <=90)
            & (data_sample['pickup_latitude'] >=-90)
            & (data_sample['pickup_longitude'] <=90)
            & (data_sample['pickup_longitude'] >=-90)
            & (data_sample['dropoff_latitude'] <=90)
            & (data_sample['dropoff_latitude'] >=-90)
            & (data_sample['dropoff_longitude'] <=90)
            & (data_sample['dropoff_longitude'] >=-90)
            & (data_sample['dropoff_longitude'] !=0)
            & (data_sample['dropoff_latitude'] !=0)
            & (data_sample['pickup_longitude'] !=0)
            & (data_sample['pickup_latitude'] !=0)
            & (data_sample['fare_amount']>0)]

coordonnes_geo = data_sample.values[:,3:7]

def dist(row):
    coords_1 = (row[1],row[0])
    coords_2 = (row[3],row[2])
    return geopy.distance.distance(coords_1,coords_2).km

distances  = []

for trajet in coordonnes_geo:
    distance_trajet = dist(trajet)/1000
    distances.append(distance_trajet)

X = data_sample.iloc[:, 3:]
X = X.drop(columns = "passenger_count")
X['year'] = data_sample["pickup_datetime"].dt.year
#X['day_of_week'] = data_sample["pickup_datetime"].dt.dayofweek
X['hour'] = data_sample["pickup_datetime"].dt.hour
X['month'] = data_sample["pickup_datetime"].dt.month
X["distances(km)"] = distances
print(X)

Y = data_sample.iloc[:, 1]

        pickup_longitude  pickup_latitude  dropoff_longitude  \
0             -73.982178        40.765060         -73.978310   
1             -73.956352        40.769759         -73.985643   
2             -73.994662        40.750358         -73.991467   
3             -73.987130        40.733143         -73.991567   
4             -73.947868        40.779554         -74.006704   
5             -73.969501        40.760041         -73.998951   
6             -73.984140        40.754670         -73.865088   
7             -74.431217        41.346233         -73.953667   
8             -73.967725        40.771409         -73.983998   
9             -73.955731        40.779478         -73.974705   
10            -73.958953        40.767867         -73.964362   
11            -74.002267        40.724853         -74.000773   
12            -73.989639        40.734341         -73.988152   
13            -73.992917        40.752082         -73.782372   
14            -73.962350        40.77904

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [45]:
X_train = X
Y_train = Y

In [55]:
GBM = GradientBoostingRegressor(n_estimators=1000,
                                         max_depth=8,
                                         min_samples_split= 4,
                                         learning_rate=0.05,
                                         loss='ls')
GBM.fit(X_train, Y_train)
y_pred = GBM.predict(X_test)
GBM

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.05, loss='ls', max_depth=8,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=4,
                          min_weight_fraction_leaf=0.0, n_estimators=1000,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [58]:
from sklearn import tree

DTR = tree.DecisionTreeRegressor()
DTR.fit(X_train, Y_train)
y_pred = DTR.predict(X_test)
DTR

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [56]:
print(GBM.score(X_test, Y_test))
mse = mean_squared_error(Y_test, GBM.predict(X_test))
print(mse)

0.7379016557220651
25.866760750320278


In [59]:
print(DTR.score(X_test, Y_test))
mse = mean_squared_error(Y_test, DTR.predict(X_test))
print(mse)

0.5462614031553155
44.77993846203612


In [57]:
GBM.feature_importances_

array([0.23994257, 0.13031938, 0.4207653 , 0.14100605, 0.00392697,
       0.03272377, 0.00843598, 0.02287998])

In [60]:
DTR.feature_importances_

array([0.23097749, 0.12666182, 0.41921688, 0.13757158, 0.01022826,
       0.03158601, 0.01705957, 0.02669838])

In [6]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 500, n_jobs = 6)

In [34]:
from sklearn.model_selection import GridSearchCV

max_features_values = [3,4,5,6]


param_grid = {"max_features":max_features_values}
grid_search = GridSearchCV(rf, param_grid, n_jobs = 6)
grid_result = grid_search.fit(X_train, Y_train)



In [35]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
    
print(grid_result.best_params_)

0.799302 (0.017758) with: {'max_features': 3}
0.797806 (0.017609) with: {'max_features': 4}
0.795330 (0.016324) with: {'max_features': 5}
0.789572 (0.016172) with: {'max_features': 6}
{'max_features': 3}


In [27]:
from sklearn.model_selection import GridSearchCV

rf = RandomForestRegressor(max_features = 3, n_estimators = 400, n_jobs = 6)

max_depth_values = [3,5,10]

param_grid = {"max_depth":max_depth_values}
grid_search = GridSearchCV(rf, param_grid, n_jobs = 6)
grid_result = grid_search.fit(X_train, Y_train)

KeyboardInterrupt: 

In [None]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
    
print(grid_result.best_params_)

In [21]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 400, n_jobs = 6, max_features = 3)

rf.fit(X_train, Y_train)
y_pred = rf.predict(X_test)

In [22]:
rmse = sqrt(mean_squared_error(Y_test, rf.predict(X_test)))
print(rmse)

3.8065690326717565


In [25]:
print(rf.feature_importances_)

[0.1364576  0.06475473 0.13184188 0.07024449 0.0268377  0.01836205
 0.01283556 0.53866599]


In [23]:
X_sub_data = pd.read_csv('data/test.csv',header=0)
keys = X_sub_data.key

X_sub_data.dropna()
X_sub_data = X_sub_data[(X_sub_data['pickup_latitude'] <=90)
            & (X_sub_data['pickup_latitude'] >=-90)
            & (X_sub_data['pickup_longitude'] <=90)
            & (X_sub_data['pickup_longitude'] >=-90)
            & (X_sub_data['dropoff_latitude'] <=90)
            & (X_sub_data['dropoff_latitude'] >=-90)
            & (X_sub_data['dropoff_longitude'] <=90)
            & (X_sub_data['dropoff_longitude'] >=-90)
            & (X_sub_data['dropoff_longitude'] !=0)
            & (X_sub_data['dropoff_latitude'] !=0)
            & (X_sub_data['pickup_longitude'] !=0)
            & (X_sub_data['pickup_latitude'] !=0)]

X_sub_data["pickup_datetime"] = pd.DatetimeIndex(X_sub_data["pickup_datetime"])

coordonnes_geo = X_sub_data.values[:,2:6]

distances  = []

for trajet in coordonnes_geo:
    distance_trajet = dist(trajet)/1000
    distances.append(distance_trajet)

In [24]:
X_sub = X_sub_data.iloc[:, 2:]
X_sub = X_sub.drop(columns = "passenger_count")
X_sub['year'] = X_sub_data["pickup_datetime"].dt.year
#X['day_of_week'] = data_sample["pickup_datetime"].dt.dayofweek
X_sub['hour'] = X_sub_data["pickup_datetime"].dt.hour
X_sub['month'] = X_sub_data["pickup_datetime"].dt.month
X_sub["distances(km)"] = distances

print(X_sub)

Y_sub = rf.predict(X_sub)
X_sub['fare_amount']=Y_sub
X_sub['key']=keys
Submission = X_sub[['key','fare_amount']]
Submission.to_csv('Sub_test.csv',index=False)

      pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude  \
0           -73.973320        40.763805         -73.981430         40.743835   
1           -73.986862        40.719383         -73.998886         40.739201   
2           -73.982524        40.751260         -73.979654         40.746139   
3           -73.981160        40.767807         -73.990448         40.751635   
4           -73.966046        40.789775         -73.988565         40.744427   
5           -73.960983        40.765547         -73.979177         40.740053   
6           -73.949013        40.773204         -73.959622         40.770893   
7           -73.777282        40.646636         -73.985083         40.759368   
8           -74.014099        40.709638         -73.995106         40.741365   
9           -73.969582        40.765519         -73.980686         40.770725   
10          -73.989374        40.741973         -73.999300         40.722534   
11          -74.001614        40.740893 