# Fast Campus Data Science School 7th

## Regression Team Project

### Overview
- Team: Team B-5 committer (김선웅, 이영인, 장승우)
- Subject: New York City Taxi Trip Duration
- Dataset: 2016 NYC Cab trip record data (by TLC)
- Objective: Building a model that predicts the duration of each trip in New York City.

#### Data fields
- id - 각 운행별 고유 id
- vendor_id - 운행별 각 택시 회사의 id
- pickup_datetime - 승차 날짜/시각 (미터기 기록 시작)
- dropoff_datetime - 하차 날짜/시각 (미터기 기록 종료)
- passenger_count - 승객 수
- pickup_longitude - 승차 위도
- pickup_latitude - 승차 경도
- dropoff_longitude - 하차 위도
- dropoff_latitude - 하차 경도
- store_and_fwd_flag - 운행 기록 서비 전송 전 차량 메모리 저장 여부 (Y: 저장/전송, N: 미저장/전송)
- trip_duration - 운행 소요 시간 (초)

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms 
import sklearn as sk

from patsy import dmatrix

import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus'] = False

import platform
if platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)

import warnings
warnings.filterwarnings("ignore")

  from pandas.core import datetools


In [2]:
train = pd.read_csv("../dataset/train.csv", parse_dates=["pickup_datetime", "dropoff_datetime"])
print("Shape of Train Data: ", train.shape)

Shape of Train Data:  (1458644, 11)


In [3]:
train["pickup_month"] = train["pickup_datetime"].dt.month
train["pickup_weekday"] = train["pickup_datetime"].dt.weekday
train["pickup_hour"] = train["pickup_datetime"].dt.hour

train.shape

(1458644, 14)

In [4]:
working_day = [0, 1, 2, 3, 4] # Mon-Fri
train["working_day"] = train["pickup_weekday"].isin(working_day).astype(int)

In [5]:
train["store_and_fwd_flag"] = 1 * (train.store_and_fwd_flag.values == 'Y')

In [6]:
lat1, lng1, lat2, lng2 = (train['pickup_latitude'].values, train['pickup_longitude'].values, 
                          train['dropoff_latitude'].values, train['dropoff_longitude'].values)

In [7]:
def haversine_array(lat1, lng1, lat2, lng2):
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    AVG_EARTH_RADIUS = 6371  # in km
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
    return h

In [8]:
def bearing_array(lat1, lng1, lat2, lng2):
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return (np.degrees(np.arctan2(y, x)) + 360) % 360 # from -180/180 to 0/360

In [9]:
train = train.assign(distance = haversine_array(lat1, lng1, lat2, lng2))
train = train.assign(bearing = bearing_array(lat1, lng1, lat2, lng2))

In [10]:
X_cols = [col for col in train.columns if col != "trip_duration"]
train_X0 = train[X_cols]
train_y = train[["trip_duration"]]

In [11]:
col_real = ["passenger_count", "distance", "bearing", "pickup_longitude", 
            "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]

col_cats = [col for col in train.columns if col not in col_real]
for col in ["pickup_datetime", "dropoff_datetime", "id"]:
    col_cats.remove(col)
col_cats_nontime = ["vendor_id", "store_and_fwd_flag", "working_day"]

In [27]:
model = sm.OLS.from_formula("trip_duration ~ passenger_count + distance + bearing + pickup_month +" 
                            "pickup_weekday + pickup_hour + pickup_latitude + pickup_longitude +"
                            "dropoff_latitude + dropoff_longitude + C(vendor_id) + C(store_and_fwd_flag)"
                            "+ C(working_day)", train)
result_01 = model.fit()

In [28]:
# print(result_01.summary())

In [12]:
result_sets = pd.DataFrame(columns=["R_squared", "Adj.R_squared", "AIC", "BIC", "P_fstatics", 
                                    "P_omnibus", "P_jb", "Cond_no", "changed"])

In [13]:
# 수정본 (Team B-5)
# 전역변수로 result_sets을 선언하고 사용

def storage(result, change) :
    
    summary = result.summary()
    table = summary.tables[2]
    
    p_omnibus = float(table.data[1][1])
    p_jb = float(table.data[2][3])
    
    put = {
        "R_squared" : result.rsquared,
        "Adj.R_squared": result.rsquared_adj,
        "AIC" : result.aic,
        "BIC" : result.bic,
        "P_fstatics" : result.f_pvalue,
        "P_omnibus" : p_omnibus,
        "P_jb" : p_jb,
        "Cond_no" : result.condition_number, 
        "changed" : change,
    }
    
    result_sets.loc[len(result_sets)] = put

## 1. basic12와 동일 + pickup_hour 1~2

In [17]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ np.log1p(distance) + pickup_month +" 
                            "scale(bearing) + pickup_hour + I(pickup_hour ** 2) +"
                            "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train)
result_01 = model.fit()
print(result_01.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.590
Model:                                 OLS   Adj. R-squared:                  0.590
Method:                      Least Squares   F-statistic:                 2.626e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             11:57:14   Log-Likelihood:            -1.0858e+06
No. Observations:                  1458644   AIC:                         2.172e+06
Df Residuals:                      1458635   BIC:                         2.172e+06
Df Model:                                8                                         
Covariance Type:                 nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [18]:
storage(result_01, "basic12와 동일 + pickup_hour 1~2")

In [19]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.590242,0.59024,2171605.0,2171715.0,0.0,0.0,0.0,1396.564907,basic12와 동일 + pickup_hour 1~2


## 2. basic12와 동일 + pickup_hour 1~3

In [21]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ np.log1p(distance) + pickup_month +" 
                            "scale(bearing) + pickup_hour + I(pickup_hour ** 2) + I(pickup_hour ** 3) +"
                            "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train)
result_02 = model.fit()
print(result_02.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.594
Model:                                 OLS   Adj. R-squared:                  0.594
Method:                      Least Squares   F-statistic:                 2.368e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             12:02:19   Log-Likelihood:            -1.0797e+06
No. Observations:                  1458644   AIC:                         2.159e+06
Df Residuals:                      1458634   BIC:                         2.160e+06
Df Model:                                9                                         
Covariance Type:                 nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [22]:
storage(result_02, "basic12와 동일 + pickup_hour 1~3")

In [23]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.590242,0.59024,2171605.0,2171715.0,0.0,0.0,0.0,1396.564907,basic12와 동일 + pickup_hour 1~2
1,0.593644,0.593642,2159445.0,2159567.0,0.0,0.0,0.0,29799.564344,basic12와 동일 + pickup_hour 1~3


## 3. basic12와 동일 + pickup_hour 1~4

In [25]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ np.log1p(distance) + pickup_month +" 
                            "scale(bearing) + pickup_hour + I(pickup_hour ** 2) + I(pickup_hour ** 3) + I(pickup_hour ** 4) +"
                            "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train)
result_03 = model.fit()
print(result_03.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.599
Model:                                 OLS   Adj. R-squared:                  0.599
Method:                      Least Squares   F-statistic:                 2.177e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             12:10:08   Log-Likelihood:            -1.0705e+06
No. Observations:                  1458644   AIC:                         2.141e+06
Df Residuals:                      1458633   BIC:                         2.141e+06
Df Model:                               10                                         
Covariance Type:                 nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [26]:
storage(result_03, "basic12와 동일 + pickup_hour 1~4")

In [27]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.590242,0.59024,2171605.0,2171715.0,0.0,0.0,0.0,1396.564907,basic12와 동일 + pickup_hour 1~2
1,0.593644,0.593642,2159445.0,2159567.0,0.0,0.0,0.0,29799.564344,basic12와 동일 + pickup_hour 1~3
2,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,675016.119235,basic12와 동일 + pickup_hour 1~4


## 4. basic12와 동일 + pickup_hour 1~5

In [29]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ np.log1p(distance) + pickup_month +" 
                            "scale(bearing) + pickup_hour + I(pickup_hour ** 2) + I(pickup_hour ** 3) + I(pickup_hour ** 4) + I(pickup_hour ** 5) +"
                            "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train)
result_04 = model.fit()
print(result_04.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.599
Model:                                 OLS   Adj. R-squared:                  0.599
Method:                      Least Squares   F-statistic:                 1.980e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             12:14:15   Log-Likelihood:            -1.0702e+06
No. Observations:                  1458644   AIC:                         2.140e+06
Df Residuals:                      1458632   BIC:                         2.141e+06
Df Model:                               11                                         
Covariance Type:                 nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [30]:
storage(result_04, "basic12와 동일 + pickup_hour 1~5")

In [32]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.590242,0.59024,2171605.0,2171715.0,0.0,0.0,0.0,1396.565,basic12와 동일 + pickup_hour 1~2
1,0.593644,0.593642,2159445.0,2159567.0,0.0,0.0,0.0,29799.56,basic12와 동일 + pickup_hour 1~3
2,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,675016.1,basic12와 동일 + pickup_hour 1~4
3,0.598891,0.598888,2140495.0,2140641.0,0.0,0.0,0.0,16868870.0,basic12와 동일 + pickup_hour 1~5


## 5. basic12와 동일 + pickup_hour 1~4 scaled

In [61]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ np.log1p(distance) + pickup_month +" 
                            "scale(bearing) + scale(pickup_hour) + scale(I(pickup_hour**2)) + scale(I(pickup_hour**3)) + scale(I(pickup_hour**4) )+"
                            "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train)
result_05 = model.fit()
print(result_05.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.599
Model:                                 OLS   Adj. R-squared:                  0.599
Method:                      Least Squares   F-statistic:                 2.177e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             12:42:52   Log-Likelihood:            -1.0705e+06
No. Observations:                  1458644   AIC:                         2.141e+06
Df Residuals:                      1458633   BIC:                         2.141e+06
Df Model:                               10                                         
Covariance Type:                 nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------

In [62]:
storage(result_05, "basic12와 동일 + pickup_hour 1~4 scaled")

In [63]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.590242,0.59024,2171605.0,2171715.0,0.0,0.0,0.0,1396.565,basic12와 동일 + pickup_hour 1~2
1,0.593644,0.593642,2159445.0,2159567.0,0.0,0.0,0.0,29799.56,basic12와 동일 + pickup_hour 1~3
2,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,675016.1,basic12와 동일 + pickup_hour 1~4
3,0.598891,0.598888,2140495.0,2140641.0,0.0,0.0,0.0,16868870.0,basic12와 동일 + pickup_hour 1~5
4,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,690.7615,basic12와 동일 + pickup_hour 1~4 scaled


## 6. basic12와 동일 + pickup_hour 1~3 scaled + 종속변수, 독립변수 모두 scaled

In [65]:
model = sm.OLS.from_formula("scale(np.log1p(trip_duration)) ~ scale(np.log1p(distance)) + scale(pickup_month) +" 
                            "scale(bearing) + scale(pickup_hour) + scale(I(pickup_hour**2)) + scale(I(pickup_hour**3)) +"
                            "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train)
result_06 = model.fit()
print(result_06.summary())

                                  OLS Regression Results                                  
Dep. Variable:     scale(np.log1p(trip_duration))   R-squared:                       0.594
Model:                                        OLS   Adj. R-squared:                  0.594
Method:                             Least Squares   F-statistic:                 2.368e+05
Date:                            Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                                    12:49:56   Log-Likelihood:            -1.4130e+06
No. Observations:                         1458644   AIC:                         2.826e+06
Df Residuals:                             1458634   BIC:                         2.826e+06
Df Model:                                       9                                         
Covariance Type:                        nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.

In [66]:
storage(result_06, "basic12와 동일 + pickup_hour 1~3 scaled + 종속변수, 독립변수 모두 scaled")

In [67]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.590242,0.59024,2171605.0,2171715.0,0.0,0.0,0.0,1396.565,basic12와 동일 + pickup_hour 1~2
1,0.593644,0.593642,2159445.0,2159567.0,0.0,0.0,0.0,29799.56,basic12와 동일 + pickup_hour 1~3
2,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,675016.1,basic12와 동일 + pickup_hour 1~4
3,0.598891,0.598888,2140495.0,2140641.0,0.0,0.0,0.0,16868870.0,basic12와 동일 + pickup_hour 1~5
4,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,690.7615,basic12와 동일 + pickup_hour 1~4 scaled
5,0.593644,0.593642,2825925.0,2826047.0,0.0,0.0,0.0,48.51611,"basic12와 동일 + pickup_hour 1~3 scaled + 종속변수, 독..."


## 7. basic12와 동일 + pickup_hour 1~2 scaled + 종속변수, 독립변수 모두 scaled

In [68]:
model = sm.OLS.from_formula("scale(np.log1p(trip_duration)) ~ scale(np.log1p(distance)) + scale(pickup_month) +" 
                            "scale(bearing) + scale(pickup_hour) + scale(I(pickup_hour**2))+"
                            "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train)
result_07 = model.fit()
print(result_07.summary())

                                  OLS Regression Results                                  
Dep. Variable:     scale(np.log1p(trip_duration))   R-squared:                       0.590
Model:                                        OLS   Adj. R-squared:                  0.590
Method:                             Least Squares   F-statistic:                 2.626e+05
Date:                            Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                                    12:53:59   Log-Likelihood:            -1.4190e+06
No. Observations:                         1458644   AIC:                         2.838e+06
Df Residuals:                             1458635   BIC:                         2.838e+06
Df Model:                                       8                                         
Covariance Type:                        nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.

In [74]:
storage(result_07, "basic12와 동일 + pickup_hour 1~2 scaled + 종속변수, 독립변수 모두 scaled")

In [75]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.590242,0.59024,2171605.0,2171715.0,0.0,0.0,0.0,1396.565,basic12와 동일 + pickup_hour 1~2
1,0.593644,0.593642,2159445.0,2159567.0,0.0,0.0,0.0,29799.56,basic12와 동일 + pickup_hour 1~3
2,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,675016.1,basic12와 동일 + pickup_hour 1~4
3,0.598891,0.598888,2140495.0,2140641.0,0.0,0.0,0.0,16868870.0,basic12와 동일 + pickup_hour 1~5
4,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,690.7615,basic12와 동일 + pickup_hour 1~4 scaled
5,0.593644,0.593642,2825925.0,2826047.0,0.0,0.0,0.0,48.51611,"basic12와 동일 + pickup_hour 1~3 scaled + 종속변수, 독..."
6,0.590242,0.59024,2838085.0,2838195.0,0.0,0.0,0.0,7.6146,"basic12와 동일 + pickup_hour 1~2 scaled + 종속변수, 독..."


## 8. basic12와 동일 + pickup_hour 2 scaled + 종속변수, 독립변수 모두 scaled

In [69]:
model = sm.OLS.from_formula("scale(np.log1p(trip_duration)) ~ scale(np.log1p(distance)) + scale(pickup_month) +" 
                            "scale(bearing) + scale(I(pickup_hour**2))+"
                            "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train)
result_08 = model.fit()
print(result_08.summary())

                                  OLS Regression Results                                  
Dep. Variable:     scale(np.log1p(trip_duration))   R-squared:                       0.573
Model:                                        OLS   Adj. R-squared:                  0.573
Method:                             Least Squares   F-statistic:                 2.792e+05
Date:                            Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                                    12:56:01   Log-Likelihood:            -1.4497e+06
No. Observations:                         1458644   AIC:                         2.899e+06
Df Residuals:                             1458636   BIC:                         2.900e+06
Df Model:                                       7                                         
Covariance Type:                        nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.

In [76]:
storage(result_08, "basic12와 동일 + pickup_hour 2 scaled + 종속변수, 독립변수 모두 scaled")

In [77]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.590242,0.59024,2171605.0,2171715.0,0.0,0.0,0.0,1396.565,basic12와 동일 + pickup_hour 1~2
1,0.593644,0.593642,2159445.0,2159567.0,0.0,0.0,0.0,29799.56,basic12와 동일 + pickup_hour 1~3
2,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,675016.1,basic12와 동일 + pickup_hour 1~4
3,0.598891,0.598888,2140495.0,2140641.0,0.0,0.0,0.0,16868870.0,basic12와 동일 + pickup_hour 1~5
4,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,690.7615,basic12와 동일 + pickup_hour 1~4 scaled
5,0.593644,0.593642,2825925.0,2826047.0,0.0,0.0,0.0,48.51611,"basic12와 동일 + pickup_hour 1~3 scaled + 종속변수, 독..."
6,0.590242,0.59024,2838085.0,2838195.0,0.0,0.0,0.0,7.6146,"basic12와 동일 + pickup_hour 1~2 scaled + 종속변수, 독..."
7,0.572637,0.572635,2899444.0,2899541.0,0.0,0.0,0.0,4.101156,"basic12와 동일 + pickup_hour 2 scaled + 종속변수, 독립변..."


## 9. poly7 + bearing 1~2 scaled

In [104]:
model = sm.OLS.from_formula("scale(np.log1p(trip_duration)) ~ scale(np.log1p(distance)) + scale(pickup_month) +" 
                            "scale(bearing) + scale(I(bearing**2))+ scale(pickup_hour) + scale(I(pickup_hour**2))+"
                            "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train)
result_09 = model.fit()
print(result_09.summary())

                                  OLS Regression Results                                  
Dep. Variable:     scale(np.log1p(trip_duration))   R-squared:                       0.590
Model:                                        OLS   Adj. R-squared:                  0.590
Method:                             Least Squares   F-statistic:                 2.337e+05
Date:                            Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                                    13:28:48   Log-Likelihood:            -1.4187e+06
No. Observations:                         1458644   AIC:                         2.837e+06
Df Residuals:                             1458634   BIC:                         2.837e+06
Df Model:                                       9                                         
Covariance Type:                        nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.

In [105]:
storage(result_09, "poly7 + bearing 1~2 scaled")

In [106]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.590242,0.59024,2171605.0,2171715.0,0.0,0.0,0.0,1396.565,basic12와 동일 + pickup_hour 1~2
1,0.593644,0.593642,2159445.0,2159567.0,0.0,0.0,0.0,29799.56,basic12와 동일 + pickup_hour 1~3
2,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,675016.1,basic12와 동일 + pickup_hour 1~4
3,0.598891,0.598888,2140495.0,2140641.0,0.0,0.0,0.0,16868870.0,basic12와 동일 + pickup_hour 1~5
4,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,690.7615,basic12와 동일 + pickup_hour 1~4 scaled
5,0.593644,0.593642,2825925.0,2826047.0,0.0,0.0,0.0,48.51611,"basic12와 동일 + pickup_hour 1~3 scaled + 종속변수, 독..."
6,0.590242,0.59024,2838085.0,2838195.0,0.0,0.0,0.0,7.6146,"basic12와 동일 + pickup_hour 1~2 scaled + 종속변수, 독..."
7,0.572637,0.572635,2899444.0,2899541.0,0.0,0.0,0.0,4.101156,"basic12와 동일 + pickup_hour 2 scaled + 종속변수, 독립변..."
8,0.590456,0.590454,2837324.0,2837446.0,0.0,0.0,0.0,7.782751,poly7 + bearing 1~2 scaled


## 10. hour 2~4 scaled + bearing제거

In [107]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ np.log1p(distance) +" 
                            "scale(I(pickup_hour**2)) + scale(I(pickup_hour**4))+"
                            "scale(dropoff_latitude) + C(working_day)", train)
result_10 = model.fit()
print(result_10.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.589
Model:                                 OLS   Adj. R-squared:                  0.589
Method:                      Least Squares   F-statistic:                 4.174e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             13:30:26   Log-Likelihood:            -1.0887e+06
No. Observations:                  1458644   AIC:                         2.177e+06
Df Residuals:                      1458638   BIC:                         2.177e+06
Df Model:                                5                                         
Covariance Type:                 nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------

In [108]:
storage(result_10, "hour 2~4 scaled + bearing제거")

In [109]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.590242,0.59024,2171605.0,2171715.0,0.0,0.0,0.0,1396.565,basic12와 동일 + pickup_hour 1~2
1,0.593644,0.593642,2159445.0,2159567.0,0.0,0.0,0.0,29799.56,basic12와 동일 + pickup_hour 1~3
2,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,675016.1,basic12와 동일 + pickup_hour 1~4
3,0.598891,0.598888,2140495.0,2140641.0,0.0,0.0,0.0,16868870.0,basic12와 동일 + pickup_hour 1~5
4,0.598758,0.598756,2140974.0,2141108.0,0.0,0.0,0.0,690.7615,basic12와 동일 + pickup_hour 1~4 scaled
5,0.593644,0.593642,2825925.0,2826047.0,0.0,0.0,0.0,48.51611,"basic12와 동일 + pickup_hour 1~3 scaled + 종속변수, 독..."
6,0.590242,0.59024,2838085.0,2838195.0,0.0,0.0,0.0,7.6146,"basic12와 동일 + pickup_hour 1~2 scaled + 종속변수, 독..."
7,0.572637,0.572635,2899444.0,2899541.0,0.0,0.0,0.0,4.101156,"basic12와 동일 + pickup_hour 2 scaled + 종속변수, 독립변..."
8,0.590456,0.590454,2837324.0,2837446.0,0.0,0.0,0.0,7.782751,poly7 + bearing 1~2 scaled
9,0.588629,0.588627,2177331.0,2177404.0,0.0,0.0,0.0,9.404033,hour 2~4 scaled + bearing제거


In [None]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ np.log1p(distance) +" 
                            "scale(I(pickup_hour**2)) + scale(I(pickup_hour**4))+"
                            "scale(dropoff_latitude) + C(working_day)", train)
result_10 = model.fit()
print(result_10.summary())

In [33]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet

In [35]:
from patsy import dmatrices

In [55]:
y, X = dmatrices("np.log1p(trip_duration) ~ np.log1p(distance) + pickup_month +" 
                "scale(bearing) + pickup_hour + I(pickup_hour ** 2) + I(pickup_hour ** 3) + I(pickup_hour ** 4) + I(pickup_hour ** 5) +"
                "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train, return_type = 'dataframe')

In [41]:
len(X.columns)

12

In [45]:
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
x = diabetes.data

In [48]:
df = pd.DataFrame(x, columns=diabetes.feature_names)
len(df.columns)

10

In [56]:
lasso_03 = Lasso(alpha=0.001).fit(X, y)

In [57]:
lasso_03.intercept_

array([ 4.8673271])

In [59]:
len(lasso_03.coef_)

12

In [60]:
X.columns

Index(['Intercept', 'C(vendor_id)[T.2]', 'C(working_day)[T.1]',
       'np.log1p(distance)', 'pickup_month', 'scale(bearing)', 'pickup_hour',
       'I(pickup_hour ** 2)', 'I(pickup_hour ** 3)', 'I(pickup_hour ** 4)',
       'I(pickup_hour ** 5)', 'scale(dropoff_latitude)'],
      dtype='object')

In [39]:
alpha = np.logspace(-3, 1, 5)

data = []
for i, a in enumerate(alpha):
    lasso = Lasso(alpha=a).fit(X, y)
    data.append(pd.Series(np.hstack([lasso.intercept_, lasso.coef_])))
    
df_lasso = pd.DataFrame(data, index=alpha).T
df_lasso

Unnamed: 0,0.001,0.01,0.1,1.0,10.0
0,4.867327,4.951657,5.343202,6.390808,6.420976
1,0.0,0.0,0.0,0.0,0.0
2,0.01757637,0.0,0.0,0.0,0.0
3,0.09776648,0.05290414,0.0,0.0,0.0
4,0.9653265,0.9429446,0.7145666,0.0,0.0
5,0.01525644,0.01228032,0.0,0.0,0.0
6,0.0340018,0.0260334,0.0,0.0,0.0
7,0.006411375,0.0,0.0,0.0,0.0
8,0.004986961,0.006047639,0.005338297,0.0,0.0
9,-0.0002386925,-0.00028814,-0.0002396156,0.0001365154,0.0


## 2. 종속변수 log

In [34]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ passenger_count + distance + bearing + pickup_month +" 
                            "pickup_weekday + pickup_hour + pickup_latitude + pickup_longitude +"
                            "dropoff_latitude + dropoff_longitude + C(vendor_id) + C(store_and_fwd_flag)"
                            "+ C(working_day)", train)
result_02 = model.fit()
print(result_02.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.344
Model:                                 OLS   Adj. R-squared:                  0.344
Method:                      Least Squares   F-statistic:                 5.895e+04
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             10:48:40   Log-Likelihood:            -1.4285e+06
No. Observations:                  1458644   AIC:                         2.857e+06
Df Residuals:                      1458630   BIC:                         2.857e+06
Df Model:                               13                                         
Covariance Type:                 nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------

In [35]:
storage(result_02, "종속변수 log")
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.009621,0.009612,29107830.0,29108000.0,0.0,0.0,0.0,420621.388753,기본 모델
1,0.344416,0.344411,2857118.0,2857289.0,0.0,0.0,0.0,420621.388753,종속변수 log


## 3. store_and_fwd_flag 제거

In [41]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ passenger_count + distance + bearing + pickup_month +" 
                            "pickup_weekday + pickup_hour + pickup_latitude + pickup_longitude +"
                            "dropoff_latitude + dropoff_longitude + C(vendor_id) + C(working_day)", train)
result_03 = model.fit()
print(result_03.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.344
Model:                                 OLS   Adj. R-squared:                  0.344
Method:                      Least Squares   F-statistic:                 6.386e+04
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             10:51:21   Log-Likelihood:            -1.4285e+06
No. Observations:                  1458644   AIC:                         2.857e+06
Df Residuals:                      1458631   BIC:                         2.857e+06
Df Model:                               12                                         
Covariance Type:                 nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

In [42]:
storage(result_03, "store_and_fwd_flag 제거")

In [43]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.009621,0.009612,29107830.0,29108000.0,0.0,0.0,0.0,420621.388753,기본 모델
1,0.344416,0.344411,2857118.0,2857289.0,0.0,0.0,0.0,420621.388753,종속변수 log
2,0.344416,0.34441,2857118.0,2857276.0,0.0,0.0,0.0,420610.107281,store_and_fwd_flag 제거


## 4. pickup_month 제거

In [44]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ passenger_count + distance + bearing +" 
                            "pickup_weekday + pickup_hour + pickup_latitude + pickup_longitude +"
                            "dropoff_latitude + dropoff_longitude + C(vendor_id) + C(working_day)", train)
result_04 = model.fit()
print(result_04.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.343
Model:                                 OLS   Adj. R-squared:                  0.343
Method:                      Least Squares   F-statistic:                 6.922e+04
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             10:52:27   Log-Likelihood:            -1.4301e+06
No. Observations:                  1458644   AIC:                         2.860e+06
Df Residuals:                      1458632   BIC:                         2.860e+06
Df Model:                               11                                         
Covariance Type:                 nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

In [45]:
storage(result_04, "pickup_month 제거")

In [46]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.009621,0.009612,29107830.0,29108000.0,0.0,0.0,0.0,420621.388753,기본 모델
1,0.344416,0.344411,2857118.0,2857289.0,0.0,0.0,0.0,420621.388753,종속변수 log
2,0.344416,0.34441,2857118.0,2857276.0,0.0,0.0,0.0,420610.107281,store_and_fwd_flag 제거
3,0.342983,0.342978,2860300.0,2860446.0,0.0,0.0,0.0,420562.767611,pickup_month 제거


## 5. 종속변수, distance log

In [50]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ passenger_count + np.log1p(distance) + bearing + pickup_month +" 
                            "pickup_weekday + pickup_hour + pickup_latitude + pickup_longitude +"
                            "dropoff_latitude + dropoff_longitude + C(vendor_id) + C(working_day)", train)
result_05 = model.fit()
print(result_05.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.579
Model:                                 OLS   Adj. R-squared:                  0.579
Method:                      Least Squares   F-statistic:                 1.671e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             10:57:53   Log-Likelihood:            -1.1058e+06
No. Observations:                  1458644   AIC:                         2.212e+06
Df Residuals:                      1458631   BIC:                         2.212e+06
Df Model:                               12                                         
Covariance Type:                 nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

In [51]:
storage(result_05, "종속변수, distance log")

In [52]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.009621,0.009612,29107830.0,29108000.0,0.0,0.0,0.0,420621.388753,기본 모델
1,0.344416,0.344411,2857118.0,2857289.0,0.0,0.0,0.0,420621.388753,종속변수 log
2,0.344416,0.34441,2857118.0,2857276.0,0.0,0.0,0.0,420610.107281,store_and_fwd_flag 제거
3,0.342983,0.342978,2860300.0,2860446.0,0.0,0.0,0.0,420562.767611,pickup_month 제거
4,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,417822.233319,"종속변수, distance log"


## 6. 위경도 scale

In [53]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ passenger_count + np.log1p(distance) + bearing + pickup_month +" 
                            "pickup_weekday + pickup_hour + scale(pickup_latitude) + scale(pickup_longitude) +"
                            "scale(dropoff_latitude) + scale(dropoff_longitude) + C(vendor_id) + C(working_day)", train)
result_06 = model.fit()
print(result_06.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.579
Model:                                 OLS   Adj. R-squared:                  0.579
Method:                      Least Squares   F-statistic:                 1.671e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             11:09:06   Log-Likelihood:            -1.1058e+06
No. Observations:                  1458644   AIC:                         2.212e+06
Df Residuals:                      1458631   BIC:                         2.212e+06
Df Model:                               12                                         
Covariance Type:                 nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------

In [54]:
storage(result_06, "위경도 scale")

In [55]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.009621,0.009612,29107830.0,29108000.0,0.0,0.0,0.0,420621.388753,기본 모델
1,0.344416,0.344411,2857118.0,2857289.0,0.0,0.0,0.0,420621.388753,종속변수 log
2,0.344416,0.34441,2857118.0,2857276.0,0.0,0.0,0.0,420610.107281,store_and_fwd_flag 제거
3,0.342983,0.342978,2860300.0,2860446.0,0.0,0.0,0.0,420562.767611,pickup_month 제거
4,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,417822.233319,"종속변수, distance log"
5,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,1322.093218,위경도 scale


## dropoff_longitude 제거

In [56]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ passenger_count + np.log1p(distance) + bearing + pickup_month +" 
                            "pickup_weekday + pickup_hour + pickup_latitude + pickup_longitude +"
                            "dropoff_latitude + C(vendor_id) + C(working_day)", train)
result_07 = model.fit()
print(result_07.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.579
Model:                                 OLS   Adj. R-squared:                  0.579
Method:                      Least Squares   F-statistic:                 1.821e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             11:20:19   Log-Likelihood:            -1.1063e+06
No. Observations:                  1458644   AIC:                         2.213e+06
Df Residuals:                      1458632   BIC:                         2.213e+06
Df Model:                               11                                         
Covariance Type:                 nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

In [57]:
storage(result_07, "dropoff_longitude 제거")

In [58]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.009621,0.009612,29107830.0,29108000.0,0.0,0.0,0.0,420621.388753,기본 모델
1,0.344416,0.344411,2857118.0,2857289.0,0.0,0.0,0.0,420621.388753,종속변수 log
2,0.344416,0.34441,2857118.0,2857276.0,0.0,0.0,0.0,420610.107281,store_and_fwd_flag 제거
3,0.342983,0.342978,2860300.0,2860446.0,0.0,0.0,0.0,420562.767611,pickup_month 제거
4,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,417822.233319,"종속변수, distance log"
5,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,1322.093218,위경도 scale
6,0.578583,0.57858,2212536.0,2212683.0,0.0,0.0,0.0,386395.42996,dropoff_longitude 제거


## 8. pickup_longitude 제거

In [59]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ passenger_count + np.log1p(distance) + bearing + pickup_month +" 
                            "pickup_weekday + pickup_hour + pickup_latitude + "
                            "dropoff_latitude + C(vendor_id) + C(working_day)", train)
result_08 = model.fit()
print(result_08.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.575
Model:                                 OLS   Adj. R-squared:                  0.575
Method:                      Least Squares   F-statistic:                 1.977e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             11:25:32   Log-Likelihood:            -1.1116e+06
No. Observations:                  1458644   AIC:                         2.223e+06
Df Residuals:                      1458633   BIC:                         2.223e+06
Df Model:                               10                                         
Covariance Type:                 nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

In [60]:
storage(result_08, "pickup_longitude 제거")

In [61]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.009621,0.009612,29107830.0,29108000.0,0.0,0.0,0.0,420621.388753,기본 모델
1,0.344416,0.344411,2857118.0,2857289.0,0.0,0.0,0.0,420621.388753,종속변수 log
2,0.344416,0.34441,2857118.0,2857276.0,0.0,0.0,0.0,420610.107281,store_and_fwd_flag 제거
3,0.342983,0.342978,2860300.0,2860446.0,0.0,0.0,0.0,420562.767611,pickup_month 제거
4,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,417822.233319,"종속변수, distance log"
5,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,1322.093218,위경도 scale
6,0.578583,0.57858,2212536.0,2212683.0,0.0,0.0,0.0,386395.42996,dropoff_longitude 제거
7,0.575494,0.575492,2223185.0,2223319.0,0.0,0.0,0.0,270700.547803,pickup_longitude 제거


## 9. scale(pickup_latitude, dropoff_latitude)

In [62]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ passenger_count + np.log1p(distance) + bearing + pickup_month +" 
                            "pickup_weekday + pickup_hour + scale(pickup_latitude) + "
                            "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train)
result_09 = model.fit()
print(result_09.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.575
Model:                                 OLS   Adj. R-squared:                  0.575
Method:                      Least Squares   F-statistic:                 1.977e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             11:28:01   Log-Likelihood:            -1.1116e+06
No. Observations:                  1458644   AIC:                         2.223e+06
Df Residuals:                      1458633   BIC:                         2.223e+06
Df Model:                               10                                         
Covariance Type:                 nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [63]:
storage(result_09, "pickup_latitude, dropoff_latitude scale")

In [64]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.009621,0.009612,29107830.0,29108000.0,0.0,0.0,0.0,420621.388753,기본 모델
1,0.344416,0.344411,2857118.0,2857289.0,0.0,0.0,0.0,420621.388753,종속변수 log
2,0.344416,0.34441,2857118.0,2857276.0,0.0,0.0,0.0,420610.107281,store_and_fwd_flag 제거
3,0.342983,0.342978,2860300.0,2860446.0,0.0,0.0,0.0,420562.767611,pickup_month 제거
4,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,417822.233319,"종속변수, distance log"
5,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,1322.093218,위경도 scale
6,0.578583,0.57858,2212536.0,2212683.0,0.0,0.0,0.0,386395.42996,dropoff_longitude 제거
7,0.575494,0.575492,2223185.0,2223319.0,0.0,0.0,0.0,270700.547803,pickup_longitude 제거
8,0.575494,0.575492,2223185.0,2223319.0,0.0,0.0,0.0,1302.76227,"pickup_latitude, dropoff_latitude scale"


## 10. pickup_latitude 제거

In [72]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ passenger_count + np.log1p(distance) + bearing + pickup_month +" 
                            "pickup_weekday + pickup_hour +"
                            "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train)
result_10 = model.fit()
print(result_10.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.575
Model:                                 OLS   Adj. R-squared:                  0.575
Method:                      Least Squares   F-statistic:                 2.195e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             11:39:18   Log-Likelihood:            -1.1120e+06
No. Observations:                  1458644   AIC:                         2.224e+06
Df Residuals:                      1458634   BIC:                         2.224e+06
Df Model:                                9                                         
Covariance Type:                 nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [68]:
storage(result_10, "9와 동일 + pickup_latitude 제거")

In [69]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.009621,0.009612,29107830.0,29108000.0,0.0,0.0,0.0,420621.388753,기본 모델
1,0.344416,0.344411,2857118.0,2857289.0,0.0,0.0,0.0,420621.388753,종속변수 log
2,0.344416,0.34441,2857118.0,2857276.0,0.0,0.0,0.0,420610.107281,store_and_fwd_flag 제거
3,0.342983,0.342978,2860300.0,2860446.0,0.0,0.0,0.0,420562.767611,pickup_month 제거
4,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,417822.233319,"종속변수, distance log"
5,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,1322.093218,위경도 scale
6,0.578583,0.57858,2212536.0,2212683.0,0.0,0.0,0.0,386395.42996,dropoff_longitude 제거
7,0.575494,0.575492,2223185.0,2223319.0,0.0,0.0,0.0,270700.547803,pickup_longitude 제거
8,0.575494,0.575492,2223185.0,2223319.0,0.0,0.0,0.0,1302.76227,"pickup_latitude, dropoff_latitude scale"
9,0.575247,0.575244,2224034.0,2224156.0,0.0,0.0,0.0,1302.680917,9와 동일 + pickup_latitude 제거


## 11. passenger_count 제거

In [73]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ np.log1p(distance) + bearing + pickup_month +" 
                            "pickup_weekday + pickup_hour +"
                            "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train)
result_11 = model.fit()
print(result_11.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.575
Model:                                 OLS   Adj. R-squared:                  0.575
Method:                      Least Squares   F-statistic:                 2.468e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             11:40:40   Log-Likelihood:            -1.1122e+06
No. Observations:                  1458644   AIC:                         2.224e+06
Df Residuals:                      1458635   BIC:                         2.225e+06
Df Model:                                8                                         
Covariance Type:                 nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [74]:
storage(result_11, "passenger_count 제거")

In [75]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.009621,0.009612,29107830.0,29108000.0,0.0,0.0,0.0,420621.388753,기본 모델
1,0.344416,0.344411,2857118.0,2857289.0,0.0,0.0,0.0,420621.388753,종속변수 log
2,0.344416,0.34441,2857118.0,2857276.0,0.0,0.0,0.0,420610.107281,store_and_fwd_flag 제거
3,0.342983,0.342978,2860300.0,2860446.0,0.0,0.0,0.0,420562.767611,pickup_month 제거
4,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,417822.233319,"종속변수, distance log"
5,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,1322.093218,위경도 scale
6,0.578583,0.57858,2212536.0,2212683.0,0.0,0.0,0.0,386395.42996,dropoff_longitude 제거
7,0.575494,0.575492,2223185.0,2223319.0,0.0,0.0,0.0,270700.547803,pickup_longitude 제거
8,0.575494,0.575492,2223185.0,2223319.0,0.0,0.0,0.0,1302.76227,"pickup_latitude, dropoff_latitude scale"
9,0.575247,0.575244,2224034.0,2224156.0,0.0,0.0,0.0,1302.680917,9와 동일 + pickup_latitude 제거


## 12. bearing, 시간 scale

In [76]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ np.log1p(distance) + scale(bearing) + scale(pickup_month) +" 
                            "scale(pickup_weekday) + scale(pickup_hour) +"
                            "scale(dropoff_latitude) + C(vendor_id) + C(working_day)", train)
result_12 = model.fit()
print(result_12.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.575
Model:                                 OLS   Adj. R-squared:                  0.575
Method:                      Least Squares   F-statistic:                 2.468e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             11:43:20   Log-Likelihood:            -1.1122e+06
No. Observations:                  1458644   AIC:                         2.224e+06
Df Residuals:                      1458635   BIC:                         2.225e+06
Df Model:                                8                                         
Covariance Type:                 nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [77]:
storage(result_12, "bearing, 시간 scale")

In [78]:
result_sets

Unnamed: 0,R_squared,Adj.R_squared,AIC,BIC,P_fstatics,P_omnibus,P_jb,Cond_no,changed
0,0.009621,0.009612,29107830.0,29108000.0,0.0,0.0,0.0,420621.388753,기본 모델
1,0.344416,0.344411,2857118.0,2857289.0,0.0,0.0,0.0,420621.388753,종속변수 log
2,0.344416,0.34441,2857118.0,2857276.0,0.0,0.0,0.0,420610.107281,store_and_fwd_flag 제거
3,0.342983,0.342978,2860300.0,2860446.0,0.0,0.0,0.0,420562.767611,pickup_month 제거
4,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,417822.233319,"종속변수, distance log"
5,0.578828,0.578825,2211688.0,2211846.0,0.0,0.0,0.0,1322.093218,위경도 scale
6,0.578583,0.57858,2212536.0,2212683.0,0.0,0.0,0.0,386395.42996,dropoff_longitude 제거
7,0.575494,0.575492,2223185.0,2223319.0,0.0,0.0,0.0,270700.547803,pickup_longitude 제거
8,0.575494,0.575492,2223185.0,2223319.0,0.0,0.0,0.0,1302.76227,"pickup_latitude, dropoff_latitude scale"
9,0.575247,0.575244,2224034.0,2224156.0,0.0,0.0,0.0,1302.680917,9와 동일 + pickup_latitude 제거


## 5. vendor_id 제거

In [24]:
model = sm.OLS.from_formula("trip_duration ~ passenger_count + distance + bearing +" 
                            "pickup_weekday + pickup_hour + pickup_latitude + pickup_longitude +"
                            "dropoff_latitude + dropoff_longitude + C(working_day)", train)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:          trip_duration   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     1364.
Date:                Mon, 12 Mar 2018   Prob (F-statistic):               0.00
Time:                        23:13:00   Log-Likelihood:            -1.4554e+07
No. Observations:             1458644   AIC:                         2.911e+07
Df Residuals:                 1458633   BIC:                         2.911e+07
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept            9.643e+04   8

## 5. working_day 제거

In [25]:
model = sm.OLS.from_formula("trip_duration ~ passenger_count + distance + bearing +" 
                            "pickup_weekday + pickup_hour + pickup_latitude + pickup_longitude +"
                            "dropoff_latitude + dropoff_longitude + C(vendor_id)", train)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:          trip_duration   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     1407.
Date:                Mon, 12 Mar 2018   Prob (F-statistic):               0.00
Time:                        23:15:02   Log-Likelihood:            -1.4554e+07
No. Observations:             1458644   AIC:                         2.911e+07
Df Residuals:                 1458633   BIC:                         2.911e+07
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept           9.63e+04   8516.78

## 6. passenger_count 제거

In [27]:
model = sm.OLS.from_formula("trip_duration ~ distance + bearing +" 
                            "pickup_weekday + pickup_hour + pickup_latitude + pickup_longitude +"
                            "dropoff_latitude + dropoff_longitude + C(vendor_id)", train)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:          trip_duration   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     1562.
Date:                Mon, 12 Mar 2018   Prob (F-statistic):               0.00
Time:                        23:18:01   Log-Likelihood:            -1.4554e+07
No. Observations:             1458644   AIC:                         2.911e+07
Df Residuals:                 1458634   BIC:                         2.911e+07
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept          9.634e+04   8516.77

## pickup_weekday 제거

In [28]:
model = sm.OLS.from_formula("trip_duration ~ distance + bearing + pickup_hour + pickup_latitude + pickup_longitude +"
                            "dropoff_latitude + dropoff_longitude + C(vendor_id)", train)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:          trip_duration   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     1757.
Date:                Mon, 12 Mar 2018   Prob (F-statistic):               0.00
Time:                        23:19:20   Log-Likelihood:            -1.4554e+07
No. Observations:             1458644   AIC:                         2.911e+07
Df Residuals:                 1458635   BIC:                         2.911e+07
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept          9.612e+04   8515.62

## dropoff_longitude 제거

In [30]:
model = sm.OLS.from_formula("trip_duration ~ distance + bearing + pickup_hour + pickup_latitude + pickup_longitude +"
                            "dropoff_longitude + C(vendor_id)", train)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:          trip_duration   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     2008.
Date:                Mon, 12 Mar 2018   Prob (F-statistic):               0.00
Time:                        23:20:34   Log-Likelihood:            -1.4554e+07
No. Observations:             1458644   AIC:                         2.911e+07
Df Residuals:                 1458636   BIC:                         2.911e+07
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept          8.946e+04   7843.63

## 위치 데이터에 scale

In [31]:
model = sm.OLS.from_formula("trip_duration ~ distance + bearing + pickup_hour + scale(pickup_latitude) + scale(pickup_longitude) +"
                            "scale(dropoff_longitude) + scale(dropoff_latitude) + C(vendor_id)", train)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:          trip_duration   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     1757.
Date:                Mon, 12 Mar 2018   Prob (F-statistic):               0.00
Time:                        23:22:13   Log-Likelihood:            -1.4554e+07
No. Observations:             1458644   AIC:                         2.911e+07
Df Residuals:                 1458635   BIC:                         2.911e+07
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept               

## scale(dropoff_latitude) 제거

In [32]:
model = sm.OLS.from_formula("trip_duration ~ distance + bearing + pickup_hour + scale(pickup_latitude) + scale(pickup_longitude) +"
                            "scale(dropoff_longitude) + C(vendor_id)", train)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:          trip_duration   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     2008.
Date:                Mon, 12 Mar 2018   Prob (F-statistic):               0.00
Time:                        23:23:44   Log-Likelihood:            -1.4554e+07
No. Observations:             1458644   AIC:                         2.911e+07
Df Residuals:                 1458636   BIC:                         2.911e+07
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept               

## 모든 실수 변수에 scale

In [34]:
model = sm.OLS.from_formula("scale(trip_duration) ~ scale(distance) + scale(bearing) + scale(pickup_hour) + scale(pickup_latitude) + scale(pickup_longitude) +"
                            "scale(dropoff_longitude) + C(vendor_id)", train)
result = model.fit()
print(result.summary())

                             OLS Regression Results                             
Dep. Variable:     scale(trip_duration)   R-squared:                       0.010
Model:                              OLS   Adj. R-squared:                  0.010
Method:                   Least Squares   F-statistic:                     2008.
Date:                  Mon, 12 Mar 2018   Prob (F-statistic):               0.00
Time:                          23:33:10   Log-Likelihood:            -2.0627e+06
No. Observations:               1458644   AIC:                         4.125e+06
Df Residuals:                   1458636   BIC:                         4.126e+06
Df Model:                             7                                         
Covariance Type:              nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Inte

## log(trip_duration + 1)

In [35]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ scale(distance) + scale(bearing) + scale(pickup_hour) + scale(pickup_latitude) + scale(pickup_longitude) +"
                            "scale(dropoff_longitude) + C(vendor_id)", train)
result = model.fit()
print(result.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.339
Model:                                 OLS   Adj. R-squared:                  0.339
Method:                      Least Squares   F-statistic:                 1.067e+05
Date:                     Mon, 12 Mar 2018   Prob (F-statistic):               0.00
Time:                             23:35:23   Log-Likelihood:            -1.4349e+06
No. Observations:                  1458644   AIC:                         2.870e+06
Df Residuals:                      1458636   BIC:                         2.870e+06
Df Model:                                7                                         
Covariance Type:                 nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------

## log(trip_duration+1), log(distance+1)

In [36]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ np.log1p(distance) + bearing + scale(pickup_hour) + scale(pickup_latitude) + scale(pickup_longitude) +"
                            "scale(dropoff_longitude) + C(vendor_id)", train)
result = model.fit()
print(result.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.572
Model:                                 OLS   Adj. R-squared:                  0.572
Method:                      Least Squares   F-statistic:                 2.789e+05
Date:                     Mon, 12 Mar 2018   Prob (F-statistic):               0.00
Time:                             23:37:10   Log-Likelihood:            -1.1169e+06
No. Observations:                  1458644   AIC:                         2.234e+06
Df Residuals:                      1458636   BIC:                         2.234e+06
Df Model:                                7                                         
Covariance Type:                 nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------

## log(trip_duration+1), log(distance+1), sin(pickup_hour)

In [None]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ np.log1p(distance) + bearing + np.sin(pickup_hour) + scale(pickup_latitude) + scale(pickup_longitude) +"
                            "scale(dropoff_longitude) + C(vendor_id)", train)
result = model.fit()
print(result.summary())

## log(trip_duration+1), log(distance+1), sin(pickup_hour), sin(bearing)

In [38]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ np.log1p(distance) + np.sin(bearing) + np.sin(pickup_hour) + scale(pickup_latitude) + scale(pickup_longitude) +"
                            "scale(dropoff_longitude) + C(vendor_id)", train)
result = model.fit()
print(result.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.565
Model:                                 OLS   Adj. R-squared:                  0.565
Method:                      Least Squares   F-statistic:                 2.708e+05
Date:                     Mon, 12 Mar 2018   Prob (F-statistic):               0.00
Time:                             23:39:49   Log-Likelihood:            -1.1291e+06
No. Observations:                  1458644   AIC:                         2.258e+06
Df Residuals:                      1458636   BIC:                         2.258e+06
Df Model:                                7                                         
Covariance Type:                 nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------

In [63]:
train2 = train.copy()

In [65]:
def bearing_array2(lat1, lng1, lat2, lng2):
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

In [66]:
lat1, lng1, lat2, lng2 = (train2['pickup_latitude'].values, train2['pickup_longitude'].values, 
                          train2['dropoff_latitude'].values, train2['dropoff_longitude'].values)

In [67]:
train2 = train.assign(bearing = bearing_array2(lat1, lng1, lat2, lng2))

In [68]:
train2.tail()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_weekday,pickup_hour,working_day,distance,bearing
1458639,id2376096,2,2016-04-08 13:31:04,2016-04-08 13:44:02,4,-73.982201,40.745522,-73.994911,40.74017,0,778,4,4,13,1,1.22508,-119.059338
1458640,id1049543,1,2016-01-10 07:35:15,2016-01-10 07:46:10,1,-74.000946,40.747379,-73.970184,40.796547,0,655,1,6,7,0,6.049836,25.342196
1458641,id2304944,2,2016-04-22 06:57:41,2016-04-22 07:10:25,1,-73.959129,40.768799,-74.004433,40.707371,0,764,4,4,6,1,7.824606,-150.788492
1458642,id2714485,1,2016-01-05 15:56:26,2016-01-05 16:02:39,1,-73.982079,40.749062,-73.974632,40.757107,0,373,1,1,15,1,1.092564,35.033294
1458643,id1209952,1,2016-04-05 14:44:25,2016-04-05 14:47:43,1,-73.979538,40.78175,-73.972809,40.790585,0,198,4,1,14,1,1.134042,29.969486


In [80]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ scale(bearing) + scale(I(bearing ** 2)) + scale(I(bearing ** 3)) +"
                            "scale(I(bearing ** 4)) + scale(I(bearing ** 5))", train2)
result = model.fit()
print(result.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.005
Model:                                 OLS   Adj. R-squared:                  0.005
Method:                      Least Squares   F-statistic:                     1565.
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             00:14:48   Log-Likelihood:            -1.7326e+06
No. Observations:                  1458644   AIC:                         3.465e+06
Df Residuals:                      1458638   BIC:                         3.465e+06
Df Model:                                5                                         
Covariance Type:                 nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------

In [79]:
result2 = model.fit_regularized(alpha=0.01, L1_wt=0.5)
print(result2.params)
print(result2.summary())

Intercept          6.361488e+00
bearing           -2.260954e-04
I(bearing ** 2)    6.674292e-06
I(bearing ** 3)    1.886252e-08
I(bearing ** 4)    0.000000e+00
I(bearing ** 5)    0.000000e+00
dtype: float64
None


In [95]:
model = sm.OLS.from_formula("scale(np.log1p(trip_duration)) ~ scale(np.log1p(distance))"
                            "+ scale(I(pickup_hour ** 2)) + scale(I(pickup_hour ** 4)) + scale(bearing)", train)
result = model.fit()
print(result.summary())

                                  OLS Regression Results                                  
Dep. Variable:     scale(np.log1p(trip_duration))   R-squared:                       0.586
Model:                                        OLS   Adj. R-squared:                  0.586
Method:                             Least Squares   F-statistic:                 5.154e+05
Date:                            Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                                    01:01:08   Log-Likelihood:            -1.4272e+06
No. Observations:                         1458644   AIC:                         2.854e+06
Df Residuals:                             1458639   BIC:                         2.854e+06
Df Model:                                       4                                         
Covariance Type:                        nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.

In [97]:
train_X0.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_month,pickup_weekday,pickup_hour,working_day,distance,bearing
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,3,0,17,1,1.498521,99.970196
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,6,6,0,0,1.805507,242.846232
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,1,1,11,1,6.385098,200.319835
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,4,2,19,1,1.485498,187.2623
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,3,5,13,0,1.188588,179.473585


In [99]:
col_reals = col_real + ["pickup_hour", "pickup_weekday", "pickup_month"]
train_reals = train[col_reals]
train_reals.head()

Unnamed: 0,passenger_count,distance,bearing,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_hour,pickup_weekday,pickup_month
0,1,1.498521,99.970196,-73.982155,40.767937,-73.96463,40.765602,17,0,3
1,1,1.805507,242.846232,-73.980415,40.738564,-73.999481,40.731152,0,6,6
2,1,6.385098,200.319835,-73.979027,40.763939,-74.005333,40.710087,11,1,1
3,1,1.485498,187.2623,-74.01004,40.719971,-74.012268,40.706718,19,2,4
4,1,1.188588,179.473585,-73.973053,40.793209,-73.972923,40.78252,13,5,3


In [100]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(train_reals.values, i) for i in range(train_reals.shape[1])]
vif["features"] = train_reals.columns
vif

Unnamed: 0,VIF Factor,features
0,2.606391,passenger_count
1,1.714442,distance
2,4.411248,bearing
3,3819268.0,pickup_longitude
4,1815531.0,pickup_latitude
5,3590561.0,dropoff_longitude
6,1596748.0,dropoff_latitude
7,5.578616,pickup_hour
8,3.46877,pickup_weekday
9,5.379472,pickup_month


In [120]:
model = sm.OLS.from_formula("np.log1p(trip_duration) ~ np.log1p(distance)"
                            "+ np.sin(pickup_hour)", train)
result = model.fit()
print(result.summary())

                               OLS Regression Results                              
Dep. Variable:     np.log1p(trip_duration)   R-squared:                       0.563
Model:                                 OLS   Adj. R-squared:                  0.563
Method:                      Least Squares   F-statistic:                 9.378e+05
Date:                     Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                             02:46:05   Log-Likelihood:            -1.1335e+06
No. Observations:                  1458644   AIC:                         2.267e+06
Df Residuals:                      1458641   BIC:                         2.267e+06
Df Model:                                2                                         
Covariance Type:                 nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------