In [56]:
import numpy as np
import pandas as pd

from geopy.geocoders import Nominatim
import googlemaps
import functools as ft
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostRegressor
from haversine import haversine
from sklearn.cluster import MiniBatchKMeans

import matplotlib.pyplot as plt

## Sample submission

In [57]:
pd.read_csv("sample_submission.csv").head()

Unnamed: 0,id,trip_duration
0,id3004672,959
1,id3505355,959
2,id1217141,959
3,id2150126,959
4,id1598245,959


## Solution

In [58]:
def prepare_data(): 
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv") 
    y = train['trip_duration'].values
    train.drop(['trip_duration'], axis=1, inplace=True)
    data = pd.concat([train, test])
    return train, test, y, data

In [59]:
train, test, y, data = prepare_data()

In [60]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N


### Подготовка данных

In [61]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 10 columns):
id                    1458644 non-null object
vendor_id             1458644 non-null int64
pickup_datetime       1458644 non-null object
dropoff_datetime      1458644 non-null object
passenger_count       1458644 non-null int64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null object
dtypes: float64(4), int64(2), object(4)
memory usage: 111.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 9 columns):
id                    625134 non-null object
vendor_id             625134 non-null int64
pickup_datetime       625134 non-null object
passenger_count       625134 non-null int64
pickup_longitude      625134 non-null float64
pickup_latitude       62513

#### Пропущенных значений нет, будем генерить фичи

In [62]:
data['pickup_longitude'] = np.round(data['pickup_longitude'],4)
data['pickup_latitude'] =  np.round(data['pickup_latitude'],4)
data['dropoff_longitude'] =  np.round(data['dropoff_longitude'],4)
data['dropoff_latitude'] =  np.round(data['dropoff_latitude'],4)

data['pickup_datetime'] = pd.to_datetime(data.pickup_datetime)
data['dropoff_datetime'] = pd.to_datetime(data.dropoff_datetime)

data['weekday'] = data['pickup_datetime'].map(lambda x:x.isoweekday())
data['month'] = data['pickup_datetime'].map(lambda x:x.month)
data['day'] = data['pickup_datetime'].map(lambda x:x.day)

data['pickup_hour'] = data['pickup_datetime'].map(lambda x:x.hour)

In [63]:
coords = np.vstack((data[['pickup_latitude', 'pickup_longitude']].values,
                    data[['dropoff_latitude', 'dropoff_longitude']].values))

pca = PCA().fit(coords)
data['pickup_pca0'] = pca.transform(data[['pickup_latitude', 'pickup_longitude']])[:, 0]
data['pickup_pca1'] = pca.transform(data[['pickup_latitude', 'pickup_longitude']])[:, 1]
data['dropoff_pca0'] = pca.transform(data[['dropoff_latitude', 'dropoff_longitude']])[:, 0]
data['dropoff_pca1'] = pca.transform(data[['dropoff_latitude', 'dropoff_longitude']])[:, 1]

In [64]:
def haversine_array(lat1, lng1, lat2, lng2):
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    AVG_EARTH_RADIUS = 6371  # in km
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
    return h

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    a = haversine_array(lat1, lng1, lat1, lng2)
    b = haversine_array(lat1, lng1, lat2, lng1)
    return a + b

def bearing_array(lat1, lng1, lat2, lng2):
    AVG_EARTH_RADIUS = 6371  # in km
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

data.loc[:, 'distance_haversine'] = haversine_array(data['pickup_latitude'].values, data['pickup_longitude'].values, data['dropoff_latitude'].values, data['dropoff_longitude'].values)
data.loc[:, 'distance_dummy_manhattan'] = dummy_manhattan_distance(data['pickup_latitude'].values, data['pickup_longitude'].values, data['dropoff_latitude'].values, data['dropoff_longitude'].values)
data.loc[:, 'direction'] = bearing_array(data['pickup_latitude'].values, data['pickup_longitude'].values, data['dropoff_latitude'].values, data['dropoff_longitude'].values)
data.loc[:, 'pca_manhattan'] = np.abs(data['dropoff_pca1'] - data['pickup_pca1']) + np.abs(data['dropoff_pca0'] - data['pickup_pca0'])

data.loc[:, 'center_latitude'] = (data['pickup_latitude'].values + data['dropoff_latitude'].values) / 2
data.loc[:, 'center_longitude'] = (data['pickup_longitude'].values + data['dropoff_longitude'].values) / 2

In [65]:
#fig, ax = plt.subplots(ncols=1, nrows=1,figsize=(12,10))
#plt.ylim(40.6, 40.9)
#plt.xlim(-74.1,-73.7)
#ax.scatter(data['pickup_longitude'],data['pickup_latitude'], s=0.0002, alpha=1)
#plt.show()

In [66]:
#data[data['pickup_longitude']>-73.76]

In [67]:
data.head()

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,id,passenger_count,pickup_datetime,pickup_latitude,pickup_longitude,store_and_fwd_flag,vendor_id,...,pickup_pca0,pickup_pca1,dropoff_pca0,dropoff_pca1,distance_haversine,distance_dummy_manhattan,direction,pca_manhattan,center_latitude,center_longitude
0,2016-03-14 17:32:30,40.7656,-73.9646,id2875421,1,2016-03-14 17:24:55,40.7679,-73.9822,N,2,...,0.007739,0.017019,-0.009696,0.013691,1.504107,1.737927,99.784013,0.020763,40.76675,-73.9734
1,2016-06-12 00:54:38,40.7312,-73.9995,id2377394,1,2016-06-12 00:43:35,40.7386,-73.9804,N,1,...,0.00766,-0.012336,0.027161,-0.018603,1.807461,2.432052,-117.074646,0.025768,40.7349,-73.98995
2,2016-01-19 12:10:48,40.7101,-74.0053,id3858529,1,2016-01-19 11:35:24,40.7639,-73.979,N,2,...,0.004779,0.012839,0.034189,-0.039326,6.379488,8.197267,-159.666475,0.081575,40.737,-73.99215
3,2016-04-06 19:39:40,40.7067,-74.0123,id3504673,1,2016-04-06 19:32:31,40.72,-74.01,N,2,...,0.0383,-0.029168,0.041376,-0.04231,1.491543,1.672726,-172.531514,0.016218,40.71335,-74.01115
4,2016-03-26 13:38:10,40.7825,-73.9729,id2181028,1,2016-03-26 13:30:55,40.7932,-73.9731,N,2,...,-0.00283,0.041742,-0.002402,0.031049,1.189905,1.206622,179.189137,0.011121,40.78785,-73.973


In [68]:
#lyon = (45.7597, 4.8422)
#paris = (48.8567, 2.3508)
#haversine(lyon, paris)
print(haversine((train.pickup_latitude[0], train.pickup_longitude[0]),(train.dropoff_latitude[0], train.dropoff_longitude[0])))

1.4985207796458557


In [69]:
a = (48.8561, 2.3501)
b = (48.8560, 2.3500)
haversine(a,b)

0.013310465919017278

In [70]:
sample_ind = np.random.permutation(len(coords))
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000).fit(coords[sample_ind])

data.loc[:, 'pickup_cluster'] = kmeans.predict(data[['pickup_latitude', 'pickup_longitude']])
data.loc[:, 'dropoff_cluster'] = kmeans.predict(data[['dropoff_latitude', 'dropoff_longitude']])

In [79]:
data.TimeGrouper('1day').head()

AttributeError: 'DataFrame' object has no attribute 'TimeGrouper'

In [80]:
# Count how many trips are going to each cluster over time
group_freq = '60min'

df_dropoff_counts = data \
    .set_index('pickup_datetime') \
    .groupby([pd.TimeGrouper(group_freq), 'dropoff_cluster']) \
    .agg({'id': 'count'}) \
    .reset_index().set_index('pickup_datetime') \
    .groupby('dropoff_cluster').rolling('240min').mean() \
    .drop('dropoff_cluster', axis=1) \
    .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
    .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': 'dropoff_cluster_count'})
    
data['pickup_datetime_group'] = data['pickup_datetime'].dt.round(group_freq)

data['dropoff_cluster_count'] = \
    data[['pickup_datetime_group', 'dropoff_cluster']].merge(df_dropoff_counts, 
        on=['pickup_datetime_group', 'dropoff_cluster'], how='left')['dropoff_cluster_count'].fillna(0)

In [81]:
data.head(20)

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,id,passenger_count,pickup_datetime,pickup_latitude,pickup_longitude,store_and_fwd_flag,vendor_id,...,distance_haversine,distance_dummy_manhattan,direction,pca_manhattan,center_latitude,center_longitude,pickup_cluster,dropoff_cluster,pickup_datetime_group,dropoff_cluster_count
0,2016-03-14 17:32:30,40.7656,-73.9646,id2875421,1,2016-03-14 17:24:55,40.7679,-73.9822,N,2,...,1.504107,1.737927,99.784013,0.020763,40.76675,-73.9734,70,17,2016-03-14 17:00:00,18.0
1,2016-06-12 00:54:38,40.7312,-73.9995,id2377394,1,2016-06-12 00:43:35,40.7386,-73.9804,N,1,...,1.807461,2.432052,-117.074646,0.025768,40.7349,-73.98995,27,69,2016-06-12 01:00:00,12.75
2,2016-01-19 12:10:48,40.7101,-74.0053,id3858529,1,2016-01-19 11:35:24,40.7639,-73.979,N,2,...,6.379488,8.197267,-159.666475,0.081575,40.737,-73.99215,97,3,2016-01-19 12:00:00,11.75
3,2016-04-06 19:39:40,40.7067,-74.0123,id3504673,1,2016-04-06 19:32:31,40.72,-74.01,N,2,...,1.491543,1.672726,-172.531514,0.016218,40.71335,-74.01115,25,3,2016-04-06 20:00:00,10.75
4,2016-03-26 13:38:10,40.7825,-73.9729,id2181028,1,2016-03-26 13:30:55,40.7932,-73.9731,N,2,...,1.189905,1.206622,179.189137,0.011121,40.78785,-73.973,29,73,2016-03-26 14:00:00,11.75
5,2016-01-30 22:09:03,40.7492,-73.9921,id0801584,6,2016-01-30 22:01:40,40.7422,-73.9829,N,2,...,1.098421,1.553439,-44.874157,0.016301,40.7457,-73.9875,27,74,2016-01-30 22:00:00,24.0
6,2016-06-17 22:40:40,40.7659,-73.9574,id1813257,4,2016-06-17 22:34:59,40.7578,-73.969,N,1,...,1.328801,1.877718,47.323206,0.019461,40.76185,-73.9632,38,86,2016-06-17 23:00:00,10.5
7,2016-05-21 08:20:49,40.7606,-73.9225,id1324603,1,2016-05-21 07:54:58,40.7978,-73.9693,N,2,...,5.713001,8.075925,136.373904,0.084418,40.7792,-73.9459,29,22,2016-05-21 08:00:00,2.25
8,2016-05-27 23:16:38,40.7328,-73.9858,id1301050,1,2016-05-27 23:12:23,40.7384,-73.9995,N,1,...,1.311548,1.776945,118.340301,0.019742,40.7356,-73.99265,36,65,2016-05-27 23:00:00,9.0
9,2016-03-10 22:05:26,40.79,-73.973,id0012891,1,2016-03-10 21:45:01,40.7443,-73.981,N,2,...,5.126075,5.755565,7.549681,0.055819,40.76715,-73.977,27,73,2016-03-10 22:00:00,12.75


In [71]:
def submit(preds,model):
    submission = pd.DataFrame({
        "id": test["id"],
        "trip_duration": preds.astype(int)
        })
    submission.to_csv('NYC_Taxi_'+str(model)+'.csv', index=False)

### Google maps

In [97]:
geolocator = Nominatim()
address = geolocator.reverse("52.509669, 13.376294")
destination = geolocator.reverse("52.509669, 13.376294")
print(address)
print(destination)

Gallo, Potsdamer Platz, Tiergarten, Mitte, Berlin, 10785, Deutschland
Gallo, Potsdamer Platz, Tiergarten, Mitte, Berlin, 10785, Deutschland


In [98]:
k = 1
geolocator = Nominatim()
address = geolocator.reverse(str(train.pickup_latitude[k])+", "+str(train.pickup_longitude[k]))#()"52.509669, 13.376294")
destination = geolocator.reverse(str(train.dropoff_latitude[k])+", "+str(train.dropoff_longitude[k]))#("52.509669, 13.376294")
print(address)
print(destination)
directions = gmaps.directions(address, destination)
print(directions, y[k])

423B, 2nd Avenue, Kips Bay, Manhattan, Manhattan Community Board 6, New York County, NYC, New York, 10010, United States of America
37, Washington Square West, Washington Square Village, Manhattan, Manhattan Community Board 2, New York County, NYC, New York, 10011, United States of America


ValueError: could not convert string to float: '423B, 2nd Avenue, Kips Bay, Manhattan, Manhattan Community Board 6, New York County, NYC, New York, 10010, United States of America'

In [92]:
directions

[{'bounds': {'northeast': {'lat': 37.4232205, 'lng': -122.0853998},
   'southwest': {'lat': 37.4232205, 'lng': -122.0853998}},
  'copyrights': 'Map data ©2017 Google',
  'legs': [{'distance': {'text': '1 ft', 'value': 0},
    'duration': {'text': '1 min', 'value': 0},
    'end_address': 'Google Bldg 41, 1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA',
    'end_location': {'lat': 37.4232205, 'lng': -122.0853998},
    'start_address': 'Google Bldg 41, 1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA',
    'start_location': {'lat': 37.4232205, 'lng': -122.0853998},
    'steps': [{'distance': {'text': '1 ft', 'value': 0},
      'duration': {'text': '1 min', 'value': 0},
      'end_location': {'lat': 37.4232205, 'lng': -122.0853998},
      'html_instructions': 'Head on <b>Amphitheatre Pkwy</b>',
      'polyline': {'points': 'cflcFvxchV'},
      'start_location': {'lat': 37.4232205, 'lng': -122.0853998},
      'travel_mode': 'DRIVING'}],
    'traffic_speed_entry': [],
    'via_wayp

In [89]:
gmaps = googlemaps.Client(key='AIzaSyB1Svr6SrQ6LDKBQcy4O5ELScPszrk6jyY')

# Geocoding an address
geocode_result = gmaps.geocode('1600 Amphitheatre Parkway, Mountain View, CA')

# Look up an address with reverse geocoding
reverse_geocode_result = gmaps.reverse_geocode((40.714224, -73.961452))

# Request directions via public transit
now = datetime.now()
directions_result = gmaps.directions("Sydney Town Hall",
                                     "Parramatta, NSW",
                                     mode="transit",
                                     departure_time=now)

ApiError: REQUEST_DENIED (This API project is not authorized to use this API. Please ensure this API is activated in the Google Developers Console: https://console.developers.google.com/apis/api/geocoding_backend?project=_)

### KNN

In [64]:
knn_cols = ['pickup_latitude',
            'pickup_longitude',
            'dropoff_latitude',
            'dropoff_longitude',
            #'passenger_count',
            'vendor_id',
            'weekday',
            'month',
            'day',
            'pickup_hour']
train_for_knn = data[knn_cols][:train.shape[0]]
test_for_knn = data[knn_cols][train.shape[0]:]

In [65]:
knn = KNeighborsClassifier(n_neighbors=1, weights='distance', algorithm='auto', leaf_size=30, p=1)
knn.fit(train_for_knn,y)
preds = knn.predict(test_for_knn)

In [66]:
submit(preds,"KNN")

### CatBoost

In [72]:
y = np.log(y+1)

In [75]:
CB_cols = ['pickup_latitude', #0
            'pickup_longitude', #1
            'dropoff_latitude', #2
            'dropoff_longitude', #3
            'passenger_count', #4
            'vendor_id', #5
            'weekday', #6
            'month', #7
            'day', #8
            'pickup_hour', #9
            'pickup_pca0', #10
            'pickup_pca1', #11
            'dropoff_pca0', #12
            'dropoff_pca1', #13
            'distance_haversine', #14
            'distance_dummy_manhattan', #15
            'direction', #16
            'pca_manhattan', #17
            'center_latitude', #18
            'center_longitude', #19
            'pickup_cluster', #20
            'dropoff_cluster', #21
            'dropoff_cluster_count' #22
          ]
train_for_CB = data[CB_cols][:train.shape[0]]
test_for_CB = data[CB_cols][train.shape[0]:]

In [86]:
CatBoost = CatBoostRegressor(random_state = 42, 
                             iterations = 1000, 
                             depth = 10, 
                             learning_rate = 0.03, 
                             loss_function='RMSE', 
                             thread_count = 4,
                             verbose=True)
CatBoost.fit(train_for_CB, y, cat_features=[4,5,6,7,8,9,20,21])
preds = CatBoost.predict(test_for_CB)
preds = np.exp(preds)-1

Borders generated
0:	learn 6.321708918passed: 12.1 sec	total: 43.1s	remaining: 11h 58m 21s
1:	learn 6.133856769passed: 16.2 sec	total: 59.3s	remaining: 8h 13m 15s
2:	learn 5.951838614passed: 9.46 sec	total: 1m 8s	remaining: 6h 20m 52s
3:	learn 5.774967341passed: 15.5 sec	total: 1m 24s	remaining: 5h 49m 51s
4:	learn 5.603492476passed: 15.3 sec	total: 1m 39s	remaining: 5h 30m 29s
5:	learn 5.437406752passed: 1.4 sec	total: 1m 41s	remaining: 4h 39m
6:	learn 5.276413212passed: 1.38 sec	total: 1m 42s	remaining: 4h 2m 10s
7:	learn 5.120196826passed: 10.7 sec	total: 1m 53s	remaining: 3h 53m 45s
8:	learn 4.968552094passed: 15.1 sec	total: 2m 8s	remaining: 3h 55m 21s
9:	learn 4.821257507passed: 16.9 sec	total: 2m 25s	remaining: 3h 59m 29s
10:	learn 4.678421571passed: 7.74 sec	total: 2m 32s	remaining: 3h 49m 6s
11:	learn 4.54038611passed: 1.48 sec	total: 2m 34s	remaining: 3h 31m 49s
12:	learn 4.406111883passed: 10.2 sec	total: 2m 44s	remaining: 3h 28m 10s
13:	learn 4.275948375passed: 9.87 sec	tot

112:	learn 0.4868722584passed: 13 sec	total: 22m 44s	remaining: 2h 58m 30s
113:	learn 0.4838647711passed: 13.4 sec	total: 22m 57s	remaining: 2h 58m 28s
114:	learn 0.4809361561passed: 11.6 sec	total: 23m 9s	remaining: 2h 58m 12s
115:	learn 0.4782286931passed: 12.9 sec	total: 23m 22s	remaining: 2h 58m 7s
116:	learn 0.47544572passed: 10.7 sec	total: 23m 33s	remaining: 2h 57m 44s
117:	learn 0.4728131572passed: 11.3 sec	total: 23m 44s	remaining: 2h 57m 26s
118:	learn 0.470430853passed: 13.8 sec	total: 23m 58s	remaining: 2h 57m 27s
119:	learn 0.4680815004passed: 10.4 sec	total: 24m 8s	remaining: 2h 57m 2s
120:	learn 0.4658594681passed: 13.8 sec	total: 24m 22s	remaining: 2h 57m 3s
121:	learn 0.4637710997passed: 10.1 sec	total: 24m 32s	remaining: 2h 56m 37s
122:	learn 0.4618699362passed: 15.7 sec	total: 24m 48s	remaining: 2h 56m 50s
123:	learn 0.4599799384passed: 15.1 sec	total: 25m 3s	remaining: 2h 56m 59s
124:	learn 0.4582998981passed: 13.2 sec	total: 25m 16s	remaining: 2h 56m 55s
125:	learn

220:	learn 0.4158174448passed: 13.2 sec	total: 45m 15s	remaining: 2h 39m 30s
221:	learn 0.4156807653passed: 10.6 sec	total: 45m 25s	remaining: 2h 39m 12s
222:	learn 0.4155605404passed: 10.7 sec	total: 45m 36s	remaining: 2h 38m 54s
223:	learn 0.4154321828passed: 9.15 sec	total: 45m 45s	remaining: 2h 38m 31s
224:	learn 0.415321314passed: 15.5 sec	total: 46m	remaining: 2h 38m 29s
225:	learn 0.4152073463passed: 12.8 sec	total: 46m 13s	remaining: 2h 38m 19s
226:	learn 0.4151111509passed: 12.2 sec	total: 46m 25s	remaining: 2h 38m 6s
227:	learn 0.4150361018passed: 14.6 sec	total: 46m 40s	remaining: 2h 38m 2s
228:	learn 0.4149712274passed: 15.8 sec	total: 46m 56s	remaining: 2h 38m 2s
229:	learn 0.4148791379passed: 14 sec	total: 47m 10s	remaining: 2h 37m 55s
230:	learn 0.4147560979passed: 13.3 sec	total: 47m 23s	remaining: 2h 37m 46s
231:	learn 0.4146511153passed: 12.4 sec	total: 47m 36s	remaining: 2h 37m 34s
232:	learn 0.4145632251passed: 13.5 sec	total: 47m 49s	remaining: 2h 37m 26s
233:	lear

327:	learn 0.4080536343passed: 15 sec	total: 1h 7m 56s	remaining: 2h 19m 12s
328:	learn 0.4080209197passed: 12.9 sec	total: 1h 8m 9s	remaining: 2h 19m 1s
329:	learn 0.407956813passed: 9.42 sec	total: 1h 8m 19s	remaining: 2h 18m 42s
330:	learn 0.4078987612passed: 10.4 sec	total: 1h 8m 29s	remaining: 2h 18m 25s
331:	learn 0.4078243189passed: 9.34 sec	total: 1h 8m 38s	remaining: 2h 18m 7s
332:	learn 0.4077777157passed: 13.9 sec	total: 1h 8m 52s	remaining: 2h 17m 57s
333:	learn 0.4077436834passed: 16.3 sec	total: 1h 9m 9s	remaining: 2h 17m 53s
334:	learn 0.4076867262passed: 13.9 sec	total: 1h 9m 23s	remaining: 2h 17m 43s
335:	learn 0.4076228887passed: 13.8 sec	total: 1h 9m 36s	remaining: 2h 17m 34s
336:	learn 0.4075615135passed: 9.34 sec	total: 1h 9m 46s	remaining: 2h 17m 15s
337:	learn 0.4075005023passed: 11.3 sec	total: 1h 9m 57s	remaining: 2h 17m
338:	learn 0.4074266272passed: 9.69 sec	total: 1h 10m 7s	remaining: 2h 16m 43s
339:	learn 0.4073647209passed: 10.4 sec	total: 1h 10m 17s	remai

432:	learn 0.403306755passed: 14.4 sec	total: 1h 29m 59s	remaining: 1h 57m 50s
433:	learn 0.4032853203passed: 13.3 sec	total: 1h 30m 12s	remaining: 1h 57m 38s
434:	learn 0.4032395865passed: 13.1 sec	total: 1h 30m 25s	remaining: 1h 57m 27s
435:	learn 0.4031866429passed: 11.3 sec	total: 1h 30m 37s	remaining: 1h 57m 13s
436:	learn 0.4031412896passed: 15.6 sec	total: 1h 30m 52s	remaining: 1h 57m 4s
437:	learn 0.4031003286passed: 11.9 sec	total: 1h 31m 4s	remaining: 1h 56m 51s
438:	learn 0.4030676587passed: 14.4 sec	total: 1h 31m 18s	remaining: 1h 56m 41s
439:	learn 0.4030381298passed: 9.25 sec	total: 1h 31m 28s	remaining: 1h 56m 25s
440:	learn 0.4030040258passed: 9.15 sec	total: 1h 31m 37s	remaining: 1h 56m 8s
441:	learn 0.4029802239passed: 13.9 sec	total: 1h 31m 51s	remaining: 1h 55m 57s
442:	learn 0.4029394179passed: 12.9 sec	total: 1h 32m 4s	remaining: 1h 55m 45s
443:	learn 0.4028893866passed: 14.3 sec	total: 1h 32m 18s	remaining: 1h 55m 35s
444:	learn 0.4028494858passed: 12.7 sec	total

536:	learn 0.3996906044passed: 10.7 sec	total: 1h 51m 14s	remaining: 1h 35m 54s
537:	learn 0.3996554311passed: 12.6 sec	total: 1h 51m 27s	remaining: 1h 35m 42s
538:	learn 0.399625682passed: 11.3 sec	total: 1h 51m 38s	remaining: 1h 35m 29s
539:	learn 0.3995784349passed: 12.6 sec	total: 1h 51m 51s	remaining: 1h 35m 16s
540:	learn 0.3995463505passed: 9.1 sec	total: 1h 52m	remaining: 1h 35m 1s
541:	learn 0.3995085837passed: 14.4 sec	total: 1h 52m 14s	remaining: 1h 34m 50s
542:	learn 0.3994844559passed: 14.6 sec	total: 1h 52m 29s	remaining: 1h 34m 40s
543:	learn 0.3994524682passed: 9.11 sec	total: 1h 52m 38s	remaining: 1h 34m 25s
544:	learn 0.3994250916passed: 12.2 sec	total: 1h 52m 50s	remaining: 1h 34m 12s
545:	learn 0.3993973269passed: 9.42 sec	total: 1h 52m 59s	remaining: 1h 33m 57s
546:	learn 0.3993795739passed: 13 sec	total: 1h 53m 12s	remaining: 1h 33m 45s
547:	learn 0.3993617797passed: 14.3 sec	total: 1h 53m 27s	remaining: 1h 33m 34s
548:	learn 0.3993352453passed: 10.6 sec	total: 1h

641:	learn 0.3968298234passed: 15.9 sec	total: 2h 12m 44s	remaining: 1h 14m 1s
642:	learn 0.3968074697passed: 10 sec	total: 2h 12m 54s	remaining: 1h 13m 47s
643:	learn 0.3967672822passed: 14.2 sec	total: 2h 13m 8s	remaining: 1h 13m 36s
644:	learn 0.3967229225passed: 11.4 sec	total: 2h 13m 20s	remaining: 1h 13m 23s
645:	learn 0.3966917692passed: 13.2 sec	total: 2h 13m 33s	remaining: 1h 13m 11s
646:	learn 0.3966656806passed: 9.67 sec	total: 2h 13m 43s	remaining: 1h 12m 57s
647:	learn 0.3966429228passed: 12.2 sec	total: 2h 13m 55s	remaining: 1h 12m 44s
648:	learn 0.3966173358passed: 14.6 sec	total: 2h 14m 9s	remaining: 1h 12m 33s
649:	learn 0.3965904922passed: 14.1 sec	total: 2h 14m 24s	remaining: 1h 12m 22s
650:	learn 0.3965685733passed: 9.62 sec	total: 2h 14m 33s	remaining: 1h 12m 8s
651:	learn 0.3965517995passed: 13.8 sec	total: 2h 14m 47s	remaining: 1h 11m 56s
652:	learn 0.3965354013passed: 13.6 sec	total: 2h 15m 1s	remaining: 1h 11m 44s
653:	learn 0.396512486passed: 14.1 sec	total: 2

747:	learn 0.3944409258passed: 13.1 sec	total: 2h 34m 11s	remaining: 51m 56s
748:	learn 0.3944276981passed: 9.69 sec	total: 2h 34m 21s	remaining: 51m 43s
749:	learn 0.3944026891passed: 12.9 sec	total: 2h 34m 34s	remaining: 51m 31s
750:	learn 0.3943826359passed: 13.3 sec	total: 2h 34m 47s	remaining: 51m 19s
751:	learn 0.3943626339passed: 9.92 sec	total: 2h 34m 57s	remaining: 51m 6s
752:	learn 0.3943376257passed: 10.3 sec	total: 2h 35m 8s	remaining: 50m 53s
753:	learn 0.3943130221passed: 9.2 sec	total: 2h 35m 17s	remaining: 50m 39s
754:	learn 0.3942913797passed: 12.5 sec	total: 2h 35m 29s	remaining: 50m 27s
755:	learn 0.3942689236passed: 9.05 sec	total: 2h 35m 38s	remaining: 50m 14s
756:	learn 0.3942461846passed: 9.06 sec	total: 2h 35m 47s	remaining: 50m
757:	learn 0.394220521passed: 15.3 sec	total: 2h 36m 3s	remaining: 49m 49s
758:	learn 0.3941930595passed: 12.8 sec	total: 2h 36m 16s	remaining: 49m 37s
759:	learn 0.39417382passed: 12.3 sec	total: 2h 36m 28s	remaining: 49m 24s
760:	learn

855:	learn 0.3924347754passed: 11 sec	total: 2h 56m 28s	remaining: 29m 41s
856:	learn 0.392419154passed: 10 sec	total: 2h 56m 38s	remaining: 29m 28s
857:	learn 0.3924107257passed: 15.1 sec	total: 2h 56m 53s	remaining: 29m 16s
858:	learn 0.3923933622passed: 14 sec	total: 2h 57m 7s	remaining: 29m 4s
859:	learn 0.3923729811passed: 13 sec	total: 2h 57m 20s	remaining: 28m 52s
860:	learn 0.3923514184passed: 15 sec	total: 2h 57m 35s	remaining: 28m 40s
861:	learn 0.3923328549passed: 10.5 sec	total: 2h 57m 46s	remaining: 28m 27s
862:	learn 0.3923184285passed: 13.1 sec	total: 2h 57m 59s	remaining: 28m 15s
863:	learn 0.3923028353passed: 12 sec	total: 2h 58m 11s	remaining: 28m 2s
864:	learn 0.3922844007passed: 11.5 sec	total: 2h 58m 22s	remaining: 27m 50s
865:	learn 0.3922694063passed: 12.9 sec	total: 2h 58m 35s	remaining: 27m 38s
866:	learn 0.3922586906passed: 13.5 sec	total: 2h 58m 49s	remaining: 27m 25s
867:	learn 0.3922497584passed: 13.5 sec	total: 2h 59m 2s	remaining: 27m 13s
868:	learn 0.392

964:	learn 0.3906101956passed: 13 sec	total: 3h 19m 3s	remaining: 7m 13s
965:	learn 0.3905922001passed: 11.1 sec	total: 3h 19m 14s	remaining: 7m
966:	learn 0.390570017passed: 14.4 sec	total: 3h 19m 28s	remaining: 6m 48s
967:	learn 0.3905530683passed: 10.7 sec	total: 3h 19m 39s	remaining: 6m 36s
968:	learn 0.3905325284passed: 10.2 sec	total: 3h 19m 49s	remaining: 6m 23s
969:	learn 0.390517292passed: 9.57 sec	total: 3h 19m 58s	remaining: 6m 11s
970:	learn 0.3904951496passed: 13.7 sec	total: 3h 20m 12s	remaining: 5m 58s
971:	learn 0.3904787569passed: 15.4 sec	total: 3h 20m 27s	remaining: 5m 46s
972:	learn 0.3904628718passed: 10.7 sec	total: 3h 20m 38s	remaining: 5m 34s
973:	learn 0.3904557014passed: 11.3 sec	total: 3h 20m 49s	remaining: 5m 21s
974:	learn 0.3904369484passed: 14.2 sec	total: 3h 21m 4s	remaining: 5m 9s
975:	learn 0.3904133465passed: 11.6 sec	total: 3h 21m 15s	remaining: 4m 56s
976:	learn 0.3903847031passed: 13.1 sec	total: 3h 21m 28s	remaining: 4m 44s
977:	learn 0.3903621299

In [87]:
preds

array([  866.05810455,   599.14768286,   451.74159481, ...,  1521.69599241,
        1955.19978828,  1274.07970157])

In [88]:
submit(preds,"CatBoost_add_cluster_1000_numtrips")

### XGBoost

In [17]:
y = np.log(y+1)

In [18]:
XGB_cols = ['pickup_latitude',
            'pickup_longitude',
            'dropoff_latitude',
            'dropoff_longitude',
            'passenger_count',
            'vendor_id',
            'weekday',
            'month',
            'day',
            'pickup_hour',
            'pickup_pca0',
            'pickup_pca1',
            'dropoff_pca0',
            'dropoff_pca1',
            'distance_haversine',
            'distance_dummy_manhattan',
            'direction',
            'pca_manhattan',
            'center_latitude',
            'center_longitude']
train_for_XGB = data[XGB_cols][:train.shape[0]]
test_for_XGB = data[XGB_cols][train.shape[0]:]

In [None]:
xgb_params = {'min_child_weight': 50, 
            'eta': 0.3, 
            'colsample_bytree': 0.3, 
            'max_depth': 6,
            'subsample': 0.8, 
            'lambda': 1., 
            'nthread': -1, 
            'booster' : 'gbtree', 
            'silent': 1,
            'eval_metric': 'rmse', 
            'objective': 'reg:linear'}

Mtrain = xgb.DMatrix(train_for_XGB, label=y)
Mtest = xgb.DMatrix(test_for_XGB)
watchlist = [(Mtrain, 'train'), (Mtest, 'test')]

XGB_model = xgb.train(xgb_params, Mtrain, 50, watchlist, early_stopping_rounds=50, maximize=False, verbose_eval=20)
preds = XGB_model.predict(Mtest)
preds = np.exp(preds)-1

#XGB_model.fit(train_for_CB,y)
#preds = XGB_model.predict(test_for_CB)
#xgb_model = xgb.train(xgb_pars, dtrain, 50, watchlist, early_stopping_rounds=50, maximize=False, verbose_eval=20)


In [None]:
submit(preds,"XGB")