In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

np.random.seed(601)

from datetime import datetime, timedelta

from tqdm import tqdm_notebook
import time

from haversine import haversine

import pickle

# 1. 데이터 Load

In [2]:
# Read the raw CSV files; the listed timestamp columns are parsed to datetimes on load.
train = pd.read_csv(
    'data/train.csv',
    parse_dates=['pickup_datetime', 'dropoff_datetime'],
)
# Only the pickup timestamp is parsed for the test file.
test = pd.read_csv(
    'data/test.csv',
    parse_dates=['pickup_datetime'],
)
sample_submission = pd.read_csv('data/sample_submission.csv')

In [None]:
train.head()
test.head()
sample_submission.head()

# 2. 데이터 전처리
## 2.1 EDA

In [None]:
train.shape
test.shape
sample_submission.shape

In [None]:
train.columns
test.columns
sample_submission.columns

In [None]:
train.info()

In [None]:
train.describe()

### 기간

In [None]:
train.pickup_datetime.min(), train.pickup_datetime.max()
# 년도는 동일, 월, 일, 시간, 요일 추출 

test.pickup_datetime.min(), test.pickup_datetime.max()

In [None]:
train.head()

### Outlier - 운행기간

In [5]:
# Remove trip-duration outliers: rows whose dropoff - pickup span has a
# whole-day component greater than 1.
trip_span = train.dropoff_datetime - train.pickup_datetime
too_long = trip_span.dt.days > 1
train[too_long]  # shown for inspection before the rows are removed
idx = train[too_long].index

train = train.drop(idx)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,dist
355003,id1864733,1,2016-01-05 00:19:42,2016-01-27 11:08:38,1,-73.78965,40.643559,-73.95681,40.773087,N,1939736,20.148692
680594,id0369307,1,2016-02-13 22:38:00,2016-03-08 15:57:38,2,-73.921677,40.735252,-73.984749,40.759979,N,2049578,5.982495
924150,id1325766,1,2016-01-05 06:14:15,2016-01-31 01:01:07,1,-73.983788,40.742325,-73.985489,40.727676,N,2227612,1.63513
978383,id0053347,1,2016-02-13 22:46:52,2016-03-25 18:18:14,1,-73.783905,40.648632,-73.978271,40.750202,N,3526282,19.900689


In [4]:
train.shape # (1458644, 12) -> (1458640, 12)

(1458644, 12)

### Target Data 시각화

In [None]:
train.trip_duration.plot()

### Missing Value

In [None]:
train.isnull().sum().sum()

### Value Counts

In [None]:
def value_counts(df):
    """Print the value counts of every column in ``df``.

    For each column the column name is printed, followed by the result of
    ``Series.value_counts()`` for that column.

    Args:
        df: DataFrame whose columns are summarised.

    Returns:
        None. The summary is written to standard output.
    """
    for col in df.columns:
        print(col)
        # Count on the frame that was passed in, not on the global ``train``.
        print(df[col].value_counts())
        
value_counts(train[['passenger_count']])        

In [None]:
train[train['passenger_count'] >= 7]
test[test['passenger_count'] >= 7]

In [None]:
train[train['dist'] == 0].head()
test[test['dist'] == 0].head()

In [None]:
# Drop trips whose distance is exactly zero (original note: pickup and dropoff
# latitude/longitude are the same).
zero_dist = train['dist'] == 0
idx = train.loc[zero_dist].index

train = train.drop(idx)

In [None]:
train

In [None]:
pd.crosstab(train['vendor_id'], train['store_and_fwd_flag'], margins=True)

### trip_duration dropoff_datetime pickup_datetime 관계 확인 -> dropoff_datetime 빼고 학습?

In [None]:
# Check that trip_duration equals the dropoff - pickup interval in seconds.
# .dt.total_seconds() returns plain float seconds on every pandas version;
# astype('timedelta64[s]') returned floats before pandas 2.0 but keeps a
# timedelta dtype from 2.0 on, which makes the comparison with the integer
# trip_duration column meaningless.
elapsed_sec = (train['dropoff_datetime'] - train['pickup_datetime']).dt.total_seconds()
(train['trip_duration'] == elapsed_sec).value_counts()

## 시각화
### 지역: 경도/위도

#### x: 위도(latitude)
#### y: 경도(longitude) 

(pickup_latitude, pickup_longitude) -> (dropoff_latitude, dropoff_longitude)

In [None]:
InteractiveShell.ast_node_interactivity = "none"

In [None]:
# Side-by-side scatter of pickup and dropoff coordinates
# (latitude on the x axis, longitude on the y axis).
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
panels = [
    ('pickup point', train.pickup_latitude, train.pickup_longitude),
    ('dropoff point', train.dropoff_latitude, train.dropoff_longitude),
]
for axis, (title, lat, lon) in zip(ax, panels):
    axis.set_title(title)
    axis.set_xlabel('latitude')
    axis.set_ylabel('longitude')
    axis.scatter(lat, lon, marker='o')
plt.show()

In [None]:
InteractiveShell.ast_node_interactivity = "all"

In [6]:
# Pickup-point outliers: drop the row(s) sitting at the minimum pickup
# longitude or at the maximum pickup latitude.
at_min_lon = train.pickup_longitude == train.pickup_longitude.min()
at_max_lat = train.pickup_latitude == train.pickup_latitude.max()
idx = train[at_min_lon | at_max_lat].index
train = train.drop(idx)

In [7]:
train.shape # (1458640, 12) -> (1458638, 12)

(1458638, 12)

In [None]:
InteractiveShell.ast_node_interactivity = "none"

In [None]:
# Same pickup/dropoff scatter as before, redrawn on the current `train`
# (latitude on the x axis, longitude on the y axis).
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
panels = [
    ('pickup point', train.pickup_latitude, train.pickup_longitude),
    ('dropoff point', train.dropoff_latitude, train.dropoff_longitude),
]
for axis, (title, lat, lon) in zip(ax, panels):
    axis.set_title(title)
    axis.set_xlabel('latitude')
    axis.set_ylabel('longitude')
    axis.scatter(lat, lon, marker='o')
plt.show()

In [None]:
InteractiveShell.ast_node_interactivity = "all"

In [None]:
len(train) # 1458637
train.index.size

outlier를 가져갈 것인가?

In [None]:
start = time.time()  # remember when drawing started

# A positional range(len(train)) loop cannot be used here: rows were dropped as
# outliers, so the index labels are no longer contiguous. Iterating over the
# column values directly avoids label lookups altogether.
fig, ax = plt.subplots(1, 1, figsize=(20, 10))

trips = zip(train.pickup_latitude, train.pickup_longitude,
            train.dropoff_latitude, train.dropoff_longitude)
for lat0, lon0, lat1, lon1 in trips:
    # One arrow per trip, from the pickup point to the dropoff point.
    plt.arrow(lat0, lon0, lat1 - lat0, lon1 - lon0,
              head_width=0.01, head_length=0.01, fc='k', ec='k')

ax.set_title('start -> end arrow')
ax.set_xlim(32, 42)
ax.set_ylim(-90, -60)

print("time :", time.time() - start)  # elapsed = now - start

In [None]:
train['dist'].sort_values()

In [None]:
train[train['dist'] == train['dist'].max()]

In [8]:
# Outlier: the distance is far too long for the time taken, so drop the
# row(s) holding the maximum distance.
longest = train['dist'].max()
idx = train[train['dist'] == longest].index
train = train.drop(idx)

In [9]:
train.shape # (1458638, 12) -> (1458637, 12)

(1458637, 12)

In [None]:
start = time.time()  # remember when drawing started

# A positional range(len(train)) loop cannot be used here: rows were dropped as
# outliers, so the index labels are no longer contiguous. Iterating over the
# column values directly avoids label lookups altogether.
fig, ax = plt.subplots(1, 1, figsize=(20, 10))

trips = zip(train.pickup_latitude, train.pickup_longitude,
            train.dropoff_latitude, train.dropoff_longitude)
for lat0, lon0, lat1, lon1 in trips:
    # One arrow per trip, from the pickup point to the dropoff point.
    plt.arrow(lat0, lon0, lat1 - lat0, lon1 - lon0,
              head_width=0.01, head_length=0.01, fc='k', ec='k')

ax.set_title('start -> end arrow')
ax.set_xlim(32, 42)
ax.set_ylim(-90, -60)

print("time :", time.time() - start)  # elapsed = now - start

### 위도 경도로 거리 구하기 -> haversine

In [None]:
start = time.time()  # remember when the computation started

# Distance between the pickup and dropoff points of every trip, via haversine().
# The column is built in one assignment from the row values: the previous
# `train['dist'][i] = ...` loop over range(len(train)) used positions as index
# labels (which breaks once rows have been dropped) and relied on chained
# assignment, which pandas does not guarantee to write back to the frame.
train['dist'] = [
    haversine((p_lat, p_lon), (d_lat, d_lon))
    for p_lat, p_lon, d_lat, d_lon in tqdm_notebook(
        zip(train.pickup_latitude, train.pickup_longitude,
            train.dropoff_latitude, train.dropoff_longitude),
        total=len(train),
    )
]
print("time :", time.time() - start)  # elapsed = now - start

In [None]:
start = time.time()  # remember when the computation started

# Distance between the pickup and dropoff points of every trip, via haversine().
# The column is built in one assignment from the row values: the previous
# `test['dist'][i] = ...` loop over range(len(test)) used positions as index
# labels and relied on chained assignment, which pandas does not guarantee to
# write back to the frame.
test['dist'] = [
    haversine((p_lat, p_lon), (d_lat, d_lon))
    for p_lat, p_lon, d_lat, d_lon in tqdm_notebook(
        zip(test.pickup_latitude, test.pickup_longitude,
            test.dropoff_latitude, test.dropoff_longitude),
        total=len(test),
    )
]
print("time :", time.time() - start)  # elapsed = now - start

In [None]:
InteractiveShell.ast_node_interactivity = "all"

순수하게 Load 해서 거리만 넣은 피클 

In [None]:
"""
with open("train.pkl", "wb") as f:
    pickle.dump(train, f)
    
with open("test.pkl", "wb") as f:
    pickle.dump(test, f)    
"""

In [3]:
# Restore `train` and `test` from the pickles written by the (disabled) dump
# cell above, replacing whatever frames those names currently hold.
# NOTE(review): when the notebook is run top to bottom this discards the
# outlier filtering applied to `train` in earlier cells; the recorded
# execution count (In [3]) suggests it was originally run right after the CSV
# load — confirm the intended position.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this notebook.
with open("train.pkl", "rb") as f:
    train = pickle.load(f)

with open("test.pkl", "rb") as f:
    test = pickle.load(f)

In [None]:
train.head()
test.head()

### 시간: 픽업시간 (시간 -> 요일(평일/주말(공휴일)))

In [None]:
InteractiveShell.ast_node_interactivity = "none"

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(20, 10))

pickup = train.pickup_datetime
# Trip counts per calendar date, per weekday (0=Mon, 1=Tue, 2=Wed, 3=Thu,
# 4=Fri, 5=Sat, 6=Sun) and per hour of day, each sorted by its key.
ymd = pickup.dt.strftime('%Y-%m-%d').value_counts().sort_index()
wk = pickup.dt.weekday.value_counts().sort_index()
hr = pickup.dt.strftime('%H').value_counts().sort_index()

for axis, (label, counts) in zip(ax, [('date', ymd), ('weekday', wk), ('hour', hr)]):
    axis.set_title(label + ' hist')
    axis.set_xlabel(label)
    axis.set_ylabel('count')
    axis.plot(counts)

plt.show()

In [None]:
InteractiveShell.ast_node_interactivity = "all"

#### 1) date hist: 특정 날에 건수가 없는 날이 있음 제외 고려, 년/월/일/시 쪼개서 넣을것인가? 저 특이한 날은 제외하는게 학습에 도움이 되는가? 상관이 없는가? count 가 적으므로 dist 도 적어질테니까 없애는게 나을거 같은데... 
#### 2) weekday hist(0:월, 1:화, 2:수, 3:목, 4:금, 5:토, 6:일): 요일별 편차가 존재함
#### 3) hour hist: 새벽시간 건수가 적음 -> 시간별로 묶는 변수를 추가할까? 

In [None]:
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Number of pickups per (date, hour) bucket, sorted chronologically.
pickup_hour_key = train.pickup_datetime.dt.strftime('%Y-%m-%d %H')
datetimeCnt = pickup_hour_key.value_counts().sort_index()

fig, ax = plt.subplots(1, 1, figsize=(20, 10))
ax.set_title('datetime - count')
ax.set_xlabel('datetime')
ax.set_ylabel('count')
ax.plot(datetimeCnt)

plt.show()

In [None]:
# 거리 분포
train.dist.plot()

In [None]:
train['dist'].groupby(train.pickup_datetime.dt.strftime('%Y-%m-%d')).sum()

In [None]:
# Total trip distance per pickup date.
pickup_date = train.pickup_datetime.dt.strftime('%Y-%m-%d')
dateDist = train['dist'].groupby(pickup_date).sum()

fig, ax = plt.subplots(1, 1, figsize=(20, 10))
ax.set_title('date - dist')
ax.set_xlabel('date')
ax.set_ylabel('dist')
ax.plot(dateDist)

plt.show()

In [None]:
dateDist[dateDist == dateDist.min()]

In [None]:
fig, ax = plt.subplots(1,1, figsize=(20, 10))

# NOTE(review): `train_drop` was never assigned anywhere in this notebook, so
# this cell raised NameError on a fresh kernel. It is reconstructed here as
# `train` without the pickup date that has the smallest total distance —
# TODO confirm this matches the original intent.
pickup_date = train.pickup_datetime.dt.strftime('%Y-%m-%d')
daily_dist = train['dist'].groupby(pickup_date).sum()
train_drop = train[pickup_date != daily_dist.idxmin()]

# Total trip distance per pickup date after removing that date.
dateDist_drop = train_drop['dist'].groupby(train_drop.pickup_datetime.dt.strftime('%Y-%m-%d')).sum()

ax.set_title('date - dist')
ax.set_xlabel('date')
ax.set_ylabel('dist')
ax.plot(dateDist_drop)

plt.show()

In [None]:
dateDist_drop[dateDist_drop == dateDist_drop.min()]

In [None]:
# Experiment: also remove every trip picked up on 2016-01-24.
on_jan_24 = train_drop.pickup_datetime.dt.strftime('%Y-%m-%d') == '2016-01-24'
idx = train_drop[on_jan_24].index

train_drop = train_drop.drop(idx)
train_drop

In [None]:
# Total trip distance per pickup date, recomputed on the current `train_drop`.
pickup_date_drop = train_drop.pickup_datetime.dt.strftime('%Y-%m-%d')
dateDist_drop = train_drop['dist'].groupby(pickup_date_drop).sum()

fig, ax = plt.subplots(1, 1, figsize=(20, 10))
ax.set_title('date - dist')
ax.set_xlabel('date')
ax.set_ylabel('dist')
ax.plot(dateDist_drop)

plt.show()

## 클러스터링
### 지역 클러스터: 경도/위도
-> 거리 
### 시간 클러스터: 픽업시간 시간 
-> 요일(평일/주말(공휴일))

=> 클러스터링 결과로?
1) 클러스터로 묶어서 레코드를 추가할 수도 있고
2) 클러스터끼리 학습할 수도 있을것 같고

In [13]:
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans

In [14]:
# Feature extraction: stack every (latitude, longitude) pair into one
# two-column array, in the order train pickup, train dropoff, test pickup,
# test dropoff.
coords = np.vstack([
    frame[[prefix + '_latitude', prefix + '_longitude']].values
    for frame in (train, test)
    for prefix in ('pickup', 'dropoff')
])


In [28]:
# Fit a 5-cluster MiniBatchKMeans with k-means++ initialisation on all of `coords`.
# NOTE(review): a later cell rebinds `kmeans` to a 100-cluster model fitted on
# a subsample, so in top-to-bottom order that later model is the one the
# predict cell uses; the recorded execution counts (In [28] -> In [29]) suggest
# this 5-cluster model was the one actually used — confirm which is intended.
kmeans = MiniBatchKMeans(n_clusters=5, init='k-means++').fit(coords)

In [None]:
kmeans

In [None]:
pca = PCA().fit(coords)

In [None]:
# Clustering: fit 100 clusters on a random subsample of at most 500000
# coordinate rows.
shuffled = np.random.permutation(len(coords))
sample_ind = shuffled[:500000]
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000).fit(coords[sample_ind])

In [29]:
# Label the pickup and dropoff point of every trip with its k-means cluster,
# for train first and then test.
for frame in (train, test):
    for prefix in ('pickup', 'dropoff'):
        point = frame[[prefix + '_latitude', prefix + '_longitude']]
        frame.loc[:, prefix + '_cluster'] = kmeans.predict(point)

In [None]:
train['pickup_cluster'].value_counts()

In [None]:
InteractiveShell.ast_node_interactivity = "all"

In [10]:
# Original note: the year is the same for all rows, so only month, day, hour
# and weekday are extracted from the pickup timestamp.
# The .dt accessors give the same integers as the former
# strftime(...).astype('int') round trip without building string columns, and
# a single loop keeps train and test in sync.
for frame in (train, test):
    pickup = frame['pickup_datetime']
    frame['pickup_mm'] = pickup.dt.month.astype('int')
    frame['pickup_dd'] = pickup.dt.day.astype('int')
    # frame['pickup_date'] = pickup.dt.strftime('%m%d')  # disabled in the original
    frame['pickup_day'] = pickup.dt.day  # same values as pickup_dd; later cells select this name
    frame['pickup_hour'] = pickup.dt.hour
    frame['pickup_weekday'] = pickup.dt.weekday  # 0 = Monday ... 6 = Sunday

In [39]:
# Day of week of the pickup timestamp (0 = Monday ... 6 = Sunday).
train['pickup_week'] = train.pickup_datetime.dt.dayofweek
# Derive the test feature from test's own timestamps; it was previously taken
# from `train`, which filled test with train's values aligned by index label.
test['pickup_week'] = test.pickup_datetime.dt.dayofweek

## 인코딩

In [None]:
train.columns

In [None]:
# One-hot encode passenger_count while keeping the original column: the
# feature lists in later cells still select 'passenger_count' by name, so
# replacing it made those cells fail with KeyError.
train = train.join(pd.get_dummies(train['passenger_count'], prefix='passenger_count'))
# Give test exactly the indicator columns train has (absent categories are
# filled with 0), so both frames can be indexed with the same column list.
passenger_cols = [c for c in train.columns if c.startswith('passenger_count_')]
test = test.join(
    pd.get_dummies(test['passenger_count'], prefix='passenger_count')
    .reindex(columns=passenger_cols, fill_value=0)
)

In [11]:
# One-hot encode vendor_id in both frames; the source column is replaced by
# vendor_id_<value> indicator columns.
train, test = (
    pd.get_dummies(frame, columns=['vendor_id'], prefix='vendor_id')
    for frame in (train, test)
)

In [None]:
# One-hot encode pickup_weekday while keeping the original column: the
# feature lists in later cells still select 'pickup_weekday' by name, so
# replacing it made those cells fail with KeyError.
train = train.join(pd.get_dummies(train['pickup_weekday'], prefix='pickup_weekday'))
# Give test exactly the indicator columns train has (absent categories are
# filled with 0), so both frames can be indexed with the same column list.
weekday_cols = [c for c in train.columns if c.startswith('pickup_weekday_')]
test = test.join(
    pd.get_dummies(test['pickup_weekday'], prefix='pickup_weekday')
    .reindex(columns=weekday_cols, fill_value=0)
)

In [12]:
# One-hot encode store_and_fwd_flag in both frames; the source column is
# replaced by store_and_fwd_flag_<value> indicator columns.
train, test = (
    pd.get_dummies(frame, columns=['store_and_fwd_flag'], prefix='store_and_fwd_flag')
    for frame in (train, test)
)

## corr

In [None]:
# Columns entering the correlation matrix: the features plus the target.
# Left out on purpose in the original: raw 'vendor_id', 'pickup_datetime',
# 'pickup_date'.
columns = [
    'vendor_id_1', 'vendor_id_2',
    'passenger_count',
    'pickup_mm', 'pickup_day', 'pickup_hour', 'pickup_weekday',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'dist',
    'store_and_fwd_flag_N', 'store_and_fwd_flag_Y',
    'trip_duration',
]
train_corr = train[columns]
train_corr.head()

In [None]:
# Annotated heatmap of the pairwise correlations of the selected columns.
corr_matrix = train_corr.corr()
plt.figure(figsize=(14, 12))
sns.heatmap(
    corr_matrix,
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=plt.cm.RdBu,
    linecolor='white',
    annot=True,
    fmt='.3f',
    annot_kws={"size": 16},
)

# 3. 모델학습

In [17]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [18]:
train.columns

Index(['id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'trip_duration', 'dist', 'pickup_mm', 'pickup_dd',
       'pickup_day', 'pickup_hour', 'pickup_weekday', 'vendor_id_1',
       'vendor_id_2', 'store_and_fwd_flag_N', 'store_and_fwd_flag_Y',
       'pickup_cluster', 'dropoff_cluster'],
      dtype='object')

In [None]:
test.columns

In [49]:
#train.columns

# Feature columns fed to every model below. Commented-out names are
# alternatives that were toggled during experiments (raw or one-hot variants of
# the same information); they are kept so the experiments can be repeated.
columns = [
           # raw 'vendor_id' is replaced by its one-hot columns
           #'vendor_id'
           'vendor_id_1','vendor_id_2',
    
           'passenger_count'
       # one-hot alternative to the numeric passenger_count
       #'passenger_count_0', 'passenger_count_1',
       #'passenger_count_2', 'passenger_count_3', 'passenger_count_4',
       #'passenger_count_5', 'passenger_count_6', 'passenger_count_7',
       #'passenger_count_8', 'passenger_count_9'
    
           #, 'pickup_datetime'
           #,'pickup_date'
    
           ,'pickup_mm'#,'pickup_dd'
           ,'pickup_day','pickup_hour'
           #,'pickup_week'
           ,'pickup_weekday'
           # one-hot alternative to the numeric pickup_weekday
           #,'pickup_weekday_0','pickup_weekday_1','pickup_weekday_2'
           #,'pickup_weekday_3','pickup_weekday_4','pickup_weekday_5','pickup_weekday_6'
           ,'pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'
    
           ,'dist'
           
           ,'store_and_fwd_flag_N','store_and_fwd_flag_Y'
    
           ,'pickup_cluster', 'dropoff_cluster'
          ]
X_train = train[columns]
# The target is trained in log1p space; predictions are mapped back with
# np.expm1 in the submission cells.
#y_train = train['trip_duration']
y_train = np.log1p(train['trip_duration'])

# test must expose every column listed above, under the same names.
X_test = test[columns]

In [None]:
X_train.head()
X_train.info()  # 1458637 

In [None]:
li_reg = LinearRegression()
li_reg.fit(X_train, y_train)

In [23]:
from sklearn.linear_model import ElasticNet
el_reg = ElasticNet()
el_reg.fit(X_train, y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [53]:
# Fit a random forest with default hyper-parameters on the log1p target;
# n_jobs=-1 asks scikit-learn to use all processors.
# NOTE(review): the output recorded for this cell is a MemoryError, and the
# later rf_reg.predict cell records an IndexError, so rf_reg was presumably
# not usable in the saved run — confirm before relying on y_pred_rf.
rf_reg = RandomForestRegressor(n_jobs=-1)
rf_reg.fit(X_train, y_train)

MemoryError: could not allocate 58720256 bytes

In [50]:
xgb_reg = XGBRegressor()
xgb_reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [51]:
lgb_reg = LGBMRegressor()
lgb_reg.fit(X_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

oneHot-store_and_fwd_flag-LinearRegression
oneHot-store_and_fwd_flag-RandomForestRegressor
oneHot-store_and_fwd_flag-XGBRegressor
oneHot-store_and_fwd_flag-LGBMRegressor

In [None]:
# LinearRegression
y_pred_li = li_reg.predict(X_test)
#sample_submission['trip_duration'] = y_pred_li
sample_submission['trip_duration'] = np.expm1(y_pred_li)
sample_submission.to_csv('./output/20200820-33.csv', index=False)

In [24]:
# ElasticNet submission: predict in log1p space, map back to seconds with
# expm1 and write the submission file.
# NOTE(review): this writes the same path ('./output/20200820-33.csv') as the
# LinearRegression cell above, so whichever cell runs last overwrites the
# other's file — confirm the intended file name.
y_pred_el = el_reg.predict(X_test)
#sample_submission['trip_duration'] = y_pred_el
sample_submission['trip_duration'] = np.expm1(y_pred_el)
sample_submission.to_csv('./output/20200820-33.csv', index=False)

In [None]:
# RandomForestRegressor
y_pred_rf = rf_reg.predict(X_test)
#sample_submission['trip_duration'] = y_pred_rf
sample_submission['trip_duration'] = np.expm1(y_pred_rf)
sample_submission.to_csv('./output/20200820-42.csv', index=False)

In [55]:
y_pred_rf = rf_reg.predict(X_test)

IndexError: list index out of range

In [None]:
y_pred_xgb = xgb_reg.predict(X_test)
y_pred_lgb = lgb_reg.predict(X_test)

In [52]:
# Soft voting: weighted average of the models' predictions. The averaging
# happens in log1p space (the models were fitted on log1p(trip_duration)) and
# the blend is mapped back to seconds with expm1 before being written.

# Previous blend, with the score noted by the author:
# y_pred = (0.5*y_pred_xgb + 0.5*y_pred_lgb) # 0.41729
# Requires y_pred_rf, y_pred_xgb and y_pred_lgb from the other prediction cells.
y_pred = (0.5*y_pred_rf + 0.3*y_pred_xgb + 0.2*y_pred_lgb) # 


sample_submission['trip_duration'] = np.expm1(y_pred)
sample_submission.to_csv('./output/20200820-43.csv', index=False)

In [43]:
# XGBRegressor
y_pred_xgb = xgb_reg.predict(X_test)
#sample_submission['trip_duration'] = y_pred_xgb
sample_submission['trip_duration'] = np.expm1(y_pred_xgb)
sample_submission.to_csv('./output/20200820-40.csv', index=False)

In [44]:
# LGBMRegressor
y_pred_lgb = lgb_reg.predict(X_test)
#sample_submission['trip_duration'] = y_pred_lgb
sample_submission['trip_duration'] = np.expm1(y_pred_lgb)
sample_submission.to_csv('./output/20200820-41.csv', index=False)

dayofweek - RandomForestRegressor

dayofweek - XGBRegressor
dayofweek - LGBMRegressor