In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

np.random.seed(601)

from datetime import datetime, timedelta

from tqdm import tqdm_notebook
import time

from haversine import haversine

import pickle

from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans

# 1. 데이터 Load

In [2]:
# Read the competition CSVs; the listed timestamp columns are parsed to datetime while loading.
train = pd.read_csv('data/train.csv', parse_dates=['pickup_datetime','dropoff_datetime'])
# Only the pickup timestamp is parsed for the test file.
test = pd.read_csv('data/test.csv', parse_dates=['pickup_datetime'])
sample_submission = pd.read_csv('data/sample_submission.csv')

In [3]:
# Pickles of the freshly loaded frames with only the distance column added.
# The triple-quoted string below is the disabled snippet that wrote those pickles.
"""
with open("train.pkl", "wb") as f:
    pickle.dump(train, f)
    
with open("test.pkl", "wb") as f:
    pickle.dump(test, f)    
"""
# Reload both frames from disk; this replaces whatever `train` / `test` currently hold.
# NOTE(review): pickle.load can execute arbitrary code - only load pickle files you created yourself.
with open("train.pkl", "rb") as f:
    train = pickle.load(f)    
    
with open("test.pkl", "rb") as f:
    test = pickle.load(f)        

'\nwith open("train.pkl", "wb") as f:\n    pickle.dump(train, f)\n    \nwith open("test.pkl", "wb") as f:\n    pickle.dump(test, f)    \n'

## 외부데이터

In [4]:
# External fastest-route data: keep only the join key and the three route summary columns.
route_cols = ['id', 'total_distance', 'total_travel_time', 'number_of_steps']

fr1 = pd.read_csv('new-york-city-taxi-with-osrm/fastest_routes_train_part_1.csv', usecols=route_cols)
fr2 = pd.read_csv('new-york-city-taxi-with-osrm/fastest_routes_train_part_2.csv', usecols=route_cols)
test_street_info = pd.read_csv('new-york-city-taxi-with-osrm/fastest_routes_test.csv', usecols=route_cols)

In [5]:
# Stack the two training route files, then attach the route columns to every trip by id.
train_street_info = pd.concat([fr1, fr2])

# Left joins keep every trip, even one without a matching route row.
train = pd.merge(train, train_street_info, how='left', on='id')
test = pd.merge(test, test_street_info, how='left', on='id')
train_street_info.head()

Unnamed: 0,id,total_distance,total_travel_time,number_of_steps
0,id2875421,2009.1,164.9,5
1,id2377394,2513.2,332.0,6
2,id3504673,1779.4,235.8,4
3,id2181028,1614.9,140.1,5
4,id0801584,1393.5,189.4,5


In [6]:
train.isnull().sum()
test.isnull().sum().sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dist                  0
total_distance        1
total_travel_time     1
number_of_steps       1
dtype: int64

0

In [7]:
# Drop the training rows that have no total_distance after the route merge.
idx = train.index[train['total_distance'].isnull()]
train = train.drop(index=idx)

In [8]:
train.head()
test.head()
sample_submission.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,dist,total_distance,total_travel_time,number_of_steps
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1.498523,2009.1,164.9,5.0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1.80551,2513.2,332.0,6.0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,6.385107,11060.8,767.6,16.0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1.4855,1779.4,235.8,4.0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1.18859,1614.9,140.1,5.0


Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,dist,total_distance,total_travel_time,number_of_steps
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N,2.74643,3795.9,424.6,4
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N,2.759243,2904.5,200.0,4
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N,1.306157,1499.5,193.2,4
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N,5.269095,7023.9,494.8,11
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N,0.960843,1108.2,103.2,4


Unnamed: 0,id,trip_duration
0,id3004672,959
1,id3505355,959
2,id1217141,959
3,id2150126,959
4,id1598245,959


# 2. 데이터 전처리
## 2.1 EDA

In [None]:
train.shape
test.shape
sample_submission.shape

In [None]:
train.columns
test.columns
sample_submission.columns

In [None]:
train.info()

In [None]:
train.describe()

0

0

### 기간

In [None]:
train.pickup_datetime.min(), train.pickup_datetime.max()
# 년도는 동일, 월, 일, 시간, 요일 추출 

test.pickup_datetime.min(), test.pickup_datetime.max()

In [None]:
train.head()

### Outlier - 운행기간

In [11]:
# Remove trip-span outliers: rows whose dropoff - pickup has a day component greater than 1.
span_too_long = (train.dropoff_datetime - train.pickup_datetime).dt.days > 1
train[span_too_long]
idx = train.index[span_too_long]

train = train.drop(idx)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,dist,total_distance,total_travel_time,number_of_steps
355003,id1864733,1,2016-01-05 00:19:42,2016-01-27 11:08:38,1,-73.78965,40.643559,-73.95681,40.773087,N,1939736,20.148692,29232.6,1635.0,19.0
680594,id0369307,1,2016-02-13 22:38:00,2016-03-08 15:57:38,2,-73.921677,40.735252,-73.984749,40.759979,N,2049578,5.982495,7503.0,506.4,10.0
924150,id1325766,1,2016-01-05 06:14:15,2016-01-31 01:01:07,1,-73.983788,40.742325,-73.985489,40.727676,N,2227612,1.63513,2318.9,315.0,6.0
978383,id0053347,1,2016-02-13 22:46:52,2016-03-25 18:18:14,1,-73.783905,40.648632,-73.978271,40.750202,N,3526282,19.900689,27223.8,1406.9,14.0


In [None]:
train.shape # (1458644, 12) -> (1458640, 12)

### Target Data 시각화

In [None]:
train.trip_duration.plot()

### Missing Value

In [None]:
train.isnull().sum().sum()
test.isnull().sum().sum()

### Value Counts

In [None]:
def value_counts(df):
    """Print every column name of ``df`` followed by that column's value counts.

    Args:
        df: DataFrame whose columns are summarised one by one.

    Returns:
        None. The summary is written to standard output.
    """
    for col in df.columns:
        print(col)
        # Count on the frame that was passed in, not on the global `train`.
        print(df[col].value_counts())
        
value_counts(train[['passenger_count']])        

In [None]:
train[train['passenger_count'] >= 7]
test[test['passenger_count'] >= 7]

In [None]:
train[train['dist'] == 0].head()
test[test['dist'] == 0].head()

In [12]:
# Delete the trips whose pickup and dropoff coordinates coincide (distance of exactly 0).
idx = train.index[train['dist'] == 0]

train = train.drop(index=idx)

In [None]:
train

In [None]:
pd.crosstab(train['vendor_id'], train['store_and_fwd_flag'], margins=True)

### trip_duration dropoff_datetime pickup_datetime 관계 확인 -> dropoff_datetime 빼고 학습?

In [None]:
# Count how often trip_duration equals dropoff - pickup expressed in seconds.
# total_seconds() returns plain numbers on every pandas version; astype('timedelta64[s]')
# keeps a timedelta dtype on recent pandas, which never compares equal to an integer column.
(train['trip_duration'] == (train['dropoff_datetime'] - train['pickup_datetime']).dt.total_seconds()).value_counts()

## 시각화
### 지역: 경도/위도

#### x: 위도(latitude)
#### y: 경도(longitude) 

(pickup_latitude, pickup_longitude) -> (dropoff_latitude, dropoff_longitude)

In [None]:
InteractiveShell.ast_node_interactivity = "none"

In [None]:
# Pickup and dropoff locations side by side on shared axes (x = latitude, y = longitude).
fig, ax = plt.subplots(1,2, figsize=(20, 10), sharex=True, sharey=True)
panels = (('pickup point', train.pickup_latitude, train.pickup_longitude),
          ('dropoff point', train.dropoff_latitude, train.dropoff_longitude))
for axis, (panel_title, lat, lon) in zip(ax, panels):
    axis.set_title(panel_title)
    axis.set_xlabel('latitude')
    axis.set_ylabel('longitude')
    axis.scatter(lat, lon, marker='o')
plt.show()

In [None]:
InteractiveShell.ast_node_interactivity = "all"

In [14]:
# pickup point outlier: drop the rows holding the minimum pickup longitude or the maximum pickup latitude
min_lon_rows = train.pickup_longitude == train.pickup_longitude.min()
max_lat_rows = train.pickup_latitude == train.pickup_latitude.max()
idx = train.index[min_lon_rows | max_lat_rows]
train = train.drop(idx)

In [None]:
train.shape # (1458640, 12) -> (1458638, 12)

In [None]:
InteractiveShell.ast_node_interactivity = "none"

In [None]:
# Pickup and dropoff locations side by side, each panel with its own axes (x = latitude, y = longitude).
fig, ax = plt.subplots(1,2, figsize=(20, 10))
panels = (('pickup point', train.pickup_latitude, train.pickup_longitude),
          ('dropoff point', train.dropoff_latitude, train.dropoff_longitude))
for axis, (panel_title, lat, lon) in zip(ax, panels):
    axis.set_title(panel_title)
    axis.set_xlabel('latitude')
    axis.set_ylabel('longitude')
    axis.scatter(lat, lon, marker='o')
plt.show()

In [None]:
InteractiveShell.ast_node_interactivity = "all"

In [None]:
len(train) # 1458637
train.index.size

Outlier를 그대로 가져갈 것인가?

In [None]:
start = time.time()  # remember when the cell started

# total = tqdm_notebook(range(len(train))) # after the outlier drops a loop over the full row count no longer works

fig, ax = plt.subplots(1,1, figsize=(20, 10))

# One arrow per trip from its pickup point to its dropoff point (x = latitude, y = longitude).
trip_points = zip(train.pickup_latitude.values, train.pickup_longitude.values,
                  train.dropoff_latitude.values, train.dropoff_longitude.values)
for lat0, lon0, lat1, lon1 in trip_points:
    ax.arrow(lat0, lon0, (lat1 - lat0), (lon1 - lon0),
             head_width=0.01, head_length=0.01, fc='k', ec='k')

ax.set_title('start -> end arrow')
ax.set_xlim(32, 42)
ax.set_ylim(-90, -60)

print("time :", time.time() - start)  # now - start = elapsed run time

In [None]:
train['dist'].sort_values()

In [None]:
train[train['dist'] == train['dist'].max()]

In [13]:
# Outlier: the distance is far too long for the time taken, so drop the row(s) with the maximum dist.
idx = train.index[train['dist'] == train['dist'].max()]
train = train.drop(index=idx)

In [None]:
train.shape # (1458638, 12) -> (1458637, 12)

In [None]:
start = time.time()  # remember when the cell started

# total = tqdm_notebook(range(len(train))) # after the outlier drops a loop over the full row count no longer works

fig, ax = plt.subplots(1,1, figsize=(20, 10))

# One arrow per trip from its pickup point to its dropoff point (x = latitude, y = longitude).
trip_points = zip(train.pickup_latitude.values, train.pickup_longitude.values,
                  train.dropoff_latitude.values, train.dropoff_longitude.values)
for lat0, lon0, lat1, lon1 in trip_points:
    ax.arrow(lat0, lon0, (lat1 - lat0), (lon1 - lon0),
             head_width=0.01, head_length=0.01, fc='k', ec='k')

ax.set_title('start -> end arrow')
ax.set_xlim(32, 42)
ax.set_ylim(-90, -60)

print("time :", time.time() - start)  # now - start = elapsed run time

### 위도 경도로 거리 구하기 -> haversine

In [None]:
# Haversine distance between the pickup and dropoff point of every training trip.
start = time.time()  # remember when the cell started

# Walk the raw coordinate arrays instead of writing `train['dist'][i] = ...` in a range loop:
# chained assignment may write to a temporary copy, and integer labels stop matching row
# positions once rows have been dropped from the frame.
coord_rows = zip(train.pickup_latitude.values, train.pickup_longitude.values,
                 train.dropoff_latitude.values, train.dropoff_longitude.values)
total = tqdm_notebook(coord_rows, total=len(train))

# A plain list is assigned by position, so every row receives its own distance.
train['dist'] = [haversine((lat0, lon0), (lat1, lon1)) for lat0, lon0, lat1, lon1 in total]
print("time :", time.time() - start)  # now - start = elapsed run time

In [None]:
# Haversine distance between the pickup and dropoff point of every test trip.
start = time.time()  # remember when the cell started

# Walk the raw coordinate arrays instead of writing `test['dist'][i] = ...` in a range loop:
# chained assignment may write to a temporary copy, and integer labels only match row
# positions while the frame still has its default 0..n-1 index.
coord_rows = zip(test.pickup_latitude.values, test.pickup_longitude.values,
                 test.dropoff_latitude.values, test.dropoff_longitude.values)
total = tqdm_notebook(coord_rows, total=len(test))

# A plain list is assigned by position, so every row receives its own distance.
test['dist'] = [haversine((lat0, lon0), (lat1, lon1)) for lat0, lon0, lat1, lon1 in total]
print("time :", time.time() - start)  # now - start = elapsed run time

In [None]:
InteractiveShell.ast_node_interactivity = "all"

In [None]:
train.head()
test.head()

### 시간: 픽업시간 (시간 -> 요일(평일/주말(공휴일)))

In [None]:
InteractiveShell.ast_node_interactivity = "none"

In [None]:
fig, ax = plt.subplots(1,3, figsize=(20, 10))

# Trip counts per calendar date, per weekday (0:Mon, 1:Tue, 2:Wed, 3:Thu, 4:Fri, 5:Sat, 6:Sun)
# and per hour of day, each sorted by its key.
ymd = train.pickup_datetime.dt.strftime('%Y-%m-%d').value_counts().sort_index()
wk = train.pickup_datetime.dt.weekday.value_counts().sort_index()
hr = train.pickup_datetime.dt.strftime('%H').value_counts().sort_index()

panels = ((ymd, 'date hist', 'date'),
          (wk, 'weekday hist', 'weekday'),
          (hr, 'hour hist', 'hour'))
for axis, (counts, panel_title, x_name) in zip(ax, panels):
    axis.set_title(panel_title)
    axis.set_xlabel(x_name)
    axis.set_ylabel('count')
    axis.plot(counts)

plt.show()

In [None]:
InteractiveShell.ast_node_interactivity = "all"

#### 1) date hist: 건수가 거의 없는 특정 날짜가 있음 → 제외 고려. 년/월/일/시로 쪼개서 넣을 것인가? 저 특이한 날은 제외하는 것이 학습에 도움이 되는가, 아니면 상관이 없는가? count가 적으므로 dist도 적어질 테니 없애는 게 나을 것 같은데...
#### 2) weekday hist(0:월, 1:화, 2:수, 3:목, 4:금, 5:토, 6:일): 요일별 편차가 존재함
#### 3) hour hist: 새벽시간 건수가 적음 -> 시간별로 묶는 변수를 추가할까? 

In [None]:
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Line plot of how many trips start in each date-and-hour bucket.
fig, ax = plt.subplots(1,1, figsize=(20, 10))

# Ride count per 'YYYY-MM-DD HH' bucket, sorted chronologically by that string key
datetimeCnt = train.pickup_datetime.dt.strftime('%Y-%m-%d %H').value_counts().sort_index()

ax.set_title('datetime - count')
ax.set_xlabel('datetime')
ax.set_ylabel('count')
ax.plot(datetimeCnt)

plt.show()

In [None]:
# 거리 분포
train.dist.plot()

In [None]:
train['dist'].groupby(train.pickup_datetime.dt.strftime('%Y-%m-%d')).sum()

In [None]:
# Line plot of the summed trip distance for each pickup date.
fig, ax = plt.subplots(1,1, figsize=(20, 10))

# Total ride distance per date
dateDist = train['dist'].groupby(train.pickup_datetime.dt.strftime('%Y-%m-%d')).sum()

ax.set_title('date - dist')
ax.set_xlabel('date')
ax.set_ylabel('dist')
ax.plot(dateDist)

plt.show()

In [None]:
dateDist[dateDist == dateDist.min()]

In [None]:
# Same per-date distance plot, but computed from `train_drop`.
# NOTE(review): `train_drop` is not defined in any visible cell of this notebook, so this cell
# raises NameError on a fresh kernel - define it (presumably `train` minus some rows; confirm) first.
fig, ax = plt.subplots(1,1, figsize=(20, 10))

# Total ride distance per date
dateDist_drop = train_drop['dist'].groupby(train_drop.pickup_datetime.dt.strftime('%Y-%m-%d')).sum()

ax.set_title('date - dist')
ax.set_xlabel('date')
ax.set_ylabel('dist')
ax.plot(dateDist_drop)

plt.show()

In [None]:
dateDist_drop[dateDist_drop == dateDist_drop.min()]

In [None]:
# Per-date distance plot computed from `train_drop`; the code repeats the earlier `train_drop` plot cell verbatim.
# NOTE(review): `train_drop` is not defined in any visible cell of this notebook, so this cell
# raises NameError on a fresh kernel - define it first or remove this repeated cell.
fig, ax = plt.subplots(1,1, figsize=(20, 10))

# Total ride distance per date
dateDist_drop = train_drop['dist'].groupby(train_drop.pickup_datetime.dt.strftime('%Y-%m-%d')).sum()

ax.set_title('date - dist')
ax.set_xlabel('date')
ax.set_ylabel('dist')
ax.plot(dateDist_drop)

plt.show()

## 클러스터링
### 지역 클러스터: 경도/위도
-> 거리 
### 시간 클러스터: 픽업시간 시간 
-> 요일(평일/주말(공휴일)))

=> 클러스터링 결과로?
1) 클러스터로 묶어서 레코드를 추가할 수도 있고
2) 클러스터끼리 학습할 수도 있을 것 같고

In [15]:
# Feature Extraction
# Pool every (latitude, longitude) pair - train pickup, train dropoff, test pickup, test dropoff -
# into one two-column array.
coord_blocks = []
for frame in (train, test):
    for endpoint in ('pickup', 'dropoff'):
        coord_blocks.append(frame[[endpoint + '_latitude', endpoint + '_longitude']].values)
coords = np.concatenate(coord_blocks, axis=0)

In [16]:
# Fit three spatial clusters on the pooled coordinates, using k-means++ initialisation.
# NOTE(review): no random_state is passed - presumably the np.random.seed call in the first cell
# is what makes this repeatable; confirm.
kmeans = MiniBatchKMeans(n_clusters=3, init='k-means++').fit(coords)

In [17]:
# Label the pickup and dropoff point of every trip with its k-means cluster id.
for frame in (train, test):
    for endpoint in ('pickup', 'dropoff'):
        endpoint_coords = frame[[endpoint + '_latitude', endpoint + '_longitude']]
        frame.loc[:, endpoint + '_cluster'] = kmeans.predict(endpoint_coords)

In [None]:
# Project each pickup / dropoff coordinate pair onto the principal axes fitted on the pooled coordinates.
pca = PCA().fit(coords)
for frame in (train, test):
    for endpoint in ('pickup', 'dropoff'):
        # Transform once per endpoint and reuse both components, instead of projecting
        # the same rows a second time just to read the other column.
        projected = pca.transform(frame[[endpoint + '_latitude', endpoint + '_longitude']])
        frame[endpoint + '_pca0'] = projected[:, 0]
        frame[endpoint + '_pca1'] = projected[:, 1]

In [None]:
InteractiveShell.ast_node_interactivity = "all"

In [18]:
# The year is the same throughout, so only month, day, hour and weekday are extracted.
for frame in (train, test):
    pickup_ts = frame['pickup_datetime']
    frame['pickup_mm'] = pickup_ts.dt.strftime('%m').astype('int')
    frame['pickup_dd'] = pickup_ts.dt.strftime('%d').astype('int')
    #frame['pickup_date'] = pickup_ts.dt.strftime('%m%d')
    frame['pickup_day'] = pickup_ts.dt.day
    frame['pickup_hour'] = pickup_ts.dt.hour
    frame['pickup_weekday'] = pickup_ts.dt.weekday

In [19]:
# Day of week of each pickup, computed separately for each frame.
train['pickup_week'] = train.pickup_datetime.dt.dayofweek
# Use test's own timestamps: assigning the train series would align on index labels and give
# each test row the weekday of an unrelated training trip (or NaN where no label matches).
test['pickup_week'] = test.pickup_datetime.dt.dayofweek

## 인코딩

In [None]:
train.columns

In [None]:
# One-hot encode passenger_count into passenger_count_<value> columns in both frames.
# NOTE(review): get_dummies(columns=...) replaces the original column, while the feature lists
# further down still name 'passenger_count' - running this cell makes those selections fail.
train = pd.get_dummies(train, columns=['passenger_count'], prefix='passenger_count')
test = pd.get_dummies(test, columns=['passenger_count'], prefix='passenger_count')

In [20]:
# One-hot encode vendor_id in both frames (columns named vendor_id_<value>).
vendor_dummy_kwargs = {'columns': ['vendor_id'], 'prefix': 'vendor_id'}
train = pd.get_dummies(train, **vendor_dummy_kwargs)
test = pd.get_dummies(test, **vendor_dummy_kwargs)

In [None]:
# One-hot encode pickup_weekday into pickup_weekday_<value> columns in both frames.
# NOTE(review): get_dummies(columns=...) replaces the original column, while the feature lists
# further down still name 'pickup_weekday' - running this cell makes those selections fail.
train = pd.get_dummies(train, columns=['pickup_weekday'], prefix='pickup_weekday')
test = pd.get_dummies(test, columns=['pickup_weekday'], prefix='pickup_weekday')

In [21]:
# One-hot encode store_and_fwd_flag in both frames (columns named store_and_fwd_flag_<value>).
flag_dummy_kwargs = {'columns': ['store_and_fwd_flag'], 'prefix': 'store_and_fwd_flag'}
train = pd.get_dummies(train, **flag_dummy_kwargs)
test = pd.get_dummies(test, **flag_dummy_kwargs)

In [None]:
# Binary 0/1 version of store_and_fwd_flag (1 where the flag is 'Y').
# When the raw column has already been replaced by one-hot columns, the flag is rebuilt from
# store_and_fwd_flag_Y, so this cell no longer fails after the get_dummies cell has run.
for frame in (train, test):
    if 'store_and_fwd_flag_Y' in frame.columns:
        # `== 1` holds for both boolean and 0/1 integer dummy columns.
        frame['store_and_fwd_flag'] = 1 * (frame['store_and_fwd_flag_Y'].values == 1)
    else:
        frame['store_and_fwd_flag'] = 1 * (frame.store_and_fwd_flag.values == 'Y')

## corr

In [None]:
# Columns that go into the correlation matrix, ending with the target trip_duration.
columns = [
    'vendor_id_1', 'vendor_id_2',
    'passenger_count',
    'pickup_mm',
    'pickup_day', 'pickup_hour', 'pickup_weekday',
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
    'dist',
    'store_and_fwd_flag',
    'trip_duration',
]
train_corr = train[columns]
train_corr.head()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns selected into train_corr.
plt.figure(figsize=(14, 12))
# Each cell shows its coefficient with three decimals; the colour scale is capped at 1.0.
sns.heatmap(train_corr.corr(), linewidths=0.1, vmax=1.0,
           square=True, cmap=plt.cm.RdBu, linecolor='white', annot=True, fmt='.3f', annot_kws={"size": 16})

# 3. 모델학습

In [22]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
train.columns

In [None]:
test.columns

In [23]:
# Model inputs; the order of this list is the column order of X_train and X_test.
columns = [
    # one-hot vendor
    'vendor_id_1', 'vendor_id_2',
    # raw passenger count (the passenger_count_0..9 one-hot columns are not used)
    'passenger_count',
    # pickup time parts (pickup_dd, pickup_week and the pickup_weekday_0..6 columns are left out)
    'pickup_mm', 'pickup_day', 'pickup_hour', 'pickup_weekday',
    # raw coordinates
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
    'dist',
    # one-hot store-and-forward flag
    'store_and_fwd_flag_N', 'store_and_fwd_flag_Y',
    # k-means cluster ids (the pickup/dropoff PCA columns are not used)
    'pickup_cluster', 'dropoff_cluster',
    # route columns merged from the external files
    'total_distance', 'total_travel_time', 'number_of_steps',
]
X_train = train[columns]
# The target is trained on the log1p scale rather than as raw trip_duration.
y_train = np.log1p(train['trip_duration'])

X_test = test[columns]

In [None]:
X_train.head()
X_train.info()  # 1458637 

In [26]:
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

In [None]:
# 5-fold cross-validation of a default XGBRegressor on the log1p-transformed target.
n_iter = 0
cv_score = []
xgb_reg = XGBRegressor()

kfold = KFold(n_splits=5)
for train_index, test_index in kfold.split(X_train):
    _X_train, _X_test = X_train.iloc[train_index], X_train.iloc[test_index]
    _y_train, _y_test = y_train.iloc[train_index], y_train.iloc[test_index]

    xgb_reg.fit(_X_train, _y_train)
    pred = xgb_reg.predict(_X_test)
    n_iter += 1

    # y_train is already log1p(trip_duration), so the RMSE on this scale is the RMSLE of the
    # durations. mean_squared_log_error would apply the log a second time (and it rejects
    # negative predictions), so the fold score is computed directly.
    cv_score.append(np.sqrt(np.mean((_y_test.values - pred) ** 2)))

print(np.mean(cv_score))

In [52]:
from sklearn.model_selection import cross_val_score, cross_validate

In [None]:
# 3-fold cross-validation of a default XGBRegressor, scored with negated mean squared error.
xgb_reg = XGBRegressor()
scores = cross_val_score(xgb_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=3)

# Flip the sign and take the root to get one RMSE per fold (on the scale of y_train).
rmse_score = np.sqrt(-scores) 
print('mean', np.mean(rmse_score))

In [59]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid search over max_depth for a default RandomForestRegressor with 3-fold cross-validation.
model = RandomForestRegressor()

# Candidate depths; the commented-out 'subsample' entry is not part of the search.
grid_parameters = {'max_depth':[18,21]
                  #,'subsample': [0.6, 0.8, 1.0]
                  }

# No `scoring` argument is given, so candidates are ranked by the estimator's default score;
# refit=True retrains the best setting on all of X_train afterwards.
grid_model = GridSearchCV(model, param_grid=grid_parameters, cv=3, refit=True)
grid_model.fit(X_train, y_train)

In [78]:
score_df = pd.DataFrame(grid_model.cv_results_)
score_df[score_df['rank_test_score'] == 1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
1,981.993699,3.745463,11.245164,0.044161,18,{'max_depth': 18},0.755745,0.755218,0.75702,0.755994,0.000757,1


In [79]:
# Predict with the refitted grid-search model and write the submission file.
# NOTE(review): despite the name y_pred_xgb, grid_model is built around a RandomForestRegressor.
y_pred_xgb = grid_model.predict(X_test)
#sample_submission['trip_duration'] = y_pred_xgb
# Undo the log1p applied to the training target before writing durations.
sample_submission['trip_duration'] = np.expm1(y_pred_xgb)
sample_submission.to_csv('./output/20200825-3.csv', index=False)

In [None]:
rf_reg = RandomForestRegressor(n_jobs=-1)
rf_reg.fit(X_train, y_train)

In [None]:
xgb_reg = XGBRegressor()
xgb_reg.fit(X_train, y_train)

In [None]:
lgb_reg = LGBMRegressor()
lgb_reg.fit(X_train, y_train)

oneHot-store_and_fwd_flag-LinearRegression
oneHot-store_and_fwd_flag-RandomForestRegressor
oneHot-store_and_fwd_flag-XGBRegressor
oneHot-store_and_fwd_flag-LGBMRegressor

In [None]:
# RandomForestRegressor
y_pred_rf = rf_reg.predict(X_test)
#sample_submission['trip_duration'] = y_pred_rf
sample_submission['trip_duration'] = np.expm1(y_pred_rf)
sample_submission.to_csv('./output/20200824-4.csv', index=False)

In [57]:
# XGBRegressor
y_pred_xgb = xgb_reg.predict(X_test)
#sample_submission['trip_duration'] = y_pred_xgb
sample_submission['trip_duration'] = np.expm1(y_pred_xgb)
sample_submission.to_csv('./output/20200825-1.csv', index=False)

XGBoostError: need to call fit or load_model beforehand

In [None]:
# LGBMRegressor
y_pred_lgb = lgb_reg.predict(X_test)
#sample_submission['trip_duration'] = y_pred_lgb
sample_submission['trip_duration'] = np.expm1(y_pred_lgb)
sample_submission.to_csv('./output/20200824-6.csv', index=False)

dayofweek - RandomForestRegressor

dayofweek - XGBRegressor
dayofweek - LGBMRegressor

In [None]:
# Ten largest feature importances of the random forest, one bar per feature.
model = rf_reg
frt_importances = pd.Series(data=model.feature_importances_, index=X_test.columns)
ftr_top = frt_importances.sort_values(ascending=False).head(10)
sns.barplot(x=ftr_top, y=ftr_top.index)

In [None]:
# Ten largest feature importances of the XGBoost model, one bar per feature.
model = xgb_reg
frt_importances = pd.Series(data=model.feature_importances_, index=X_test.columns)
ftr_top = frt_importances.sort_values(ascending=False).head(10)
sns.barplot(x=ftr_top, y=ftr_top.index)

In [None]:
# Ten largest feature importances of the LightGBM model, one bar per feature.
model = lgb_reg
frt_importances = pd.Series(data=model.feature_importances_, index=X_test.columns)
ftr_top = frt_importances.sort_values(ascending=False).head(10)
sns.barplot(x=ftr_top, y=ftr_top.index)

In [71]:
import gc
gc.collect()

20