# 範例 : 計程車費率預測
https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

# [作業目標]
- 試著模仿範例寫法, 使用計程車費率預測競賽練習時間欄位處理

# [作業重點]
- 新增星期幾(day of week)與第幾周(week of year)這兩項特徵, 觀察有什麼影響 (In[4], Out[4], In[5], Out[5])
- 新增年週期與週週期特徵, 觀察有什麼影響 (In[8], Out[8], In[9], Out[9])

In [2]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

# Load the taxi-fare sample and separate the regression target from the features.
# NOTE(review): the displayed frame carries an 'Unnamed: 0' column, which looks
# like a saved row index that stays in the feature set -- confirm this is intended.
data_path = 'data/'
df = pd.read_csv(data_path + 'taxi_data1.csv')

# pop() removes the target column from df and hands it back as a Series,
# leaving df with the feature columns only.
train_Y = df.pop('fare_amount')
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [3]:
# Decompose the pickup timestamp into numeric calendar / clock features.

# Parse the raw strings (e.g. '2011-10-21 23:54:10 UTC') into datetimes in a
# single vectorized call; the trailing ' UTC' is matched as literal text, so the
# resulting values carry no timezone.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')

# Read each component through the .dt accessor instead of formatting every row
# back to a string and re-parsing it as an integer. The explicit int64 cast keeps
# the column dtype the same regardless of what the accessor returns.
pickup_dt = df['pickup_datetime'].dt
df['pickup_year'] = pickup_dt.year.astype('int64')      # year
df['pickup_month'] = pickup_dt.month.astype('int64')    # month (1-12)
df['pickup_day'] = pickup_dt.day.astype('int64')        # day of month
df['pickup_hour'] = pickup_dt.hour.astype('int64')      # hour (24-hour clock)
df['pickup_minute'] = pickup_dt.minute.astype('int64')  # minute
df['pickup_second'] = pickup_dt.second.astype('int64')  # second

# Reference notes:
#   datetime.datetime.strptime(text, fmt) parses a string into a datetime object.
#   datetime.datetime.strftime(dt, fmt) formats a datetime object as a string.
#   Format directives:
#     %Y year, %m month, %d day, %H hour, %M minute, %S second,
#     %w day of the week, %U week of the year.
#   Series.astype(dtype) converts the values to the requested dtype.

df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [4]:
# Baseline: score linear regression and gradient boosting on the decomposed
# time features. The scaler and both estimators are created here under the
# names scaler / Linear / GDBT so that subsequent scoring cells can reuse them.
scaler = MinMaxScaler()
Linear = LinearRegression()
GDBT = GradientBoostingRegressor()

# The raw datetime column is dropped before scaling; every remaining column is
# rescaled by the min-max scaler.
df_temp = df.drop(columns=['pickup_datetime'])
train_X = scaler.fit_transform(df_temp)

# Mean of the 5-fold cross-validation scores for each model.
linear_score = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {linear_score}')
gdbt_score = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gdbt_score}')

Linear Reg Score : 0.02687687147564244
Gradient Boosting Reg Score : 0.7102229829685841


# 作業1
* 對照範例，試著加入星期幾 (day of week) 與第幾周 (week of year) 這兩項特徵，  
看看結果會比原本只有時間特徵分解的結果更好或更差?

## 依照結果，比原本好一點點，但沒差多少

In [7]:
# Add two calendar features derived from the pickup timestamp:
#   day_of_week  -- strftime '%w': weekday as a number, 0 = Sunday ... 6 = Saturday
#   week_of_year -- strftime '%U': week number of the year
pickup_times = df['pickup_datetime']
for column, directive in (('day_of_week', '%w'), ('week_of_year', "%U")):
    # Format each timestamp with the directive, then cast the digit strings to int64.
    df[column] = pickup_times.map(lambda ts, fmt=directive: ts.strftime(fmt)).astype('int64')
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day_of_week,week_of_year
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,5,42
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,2,5
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,0,11
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,6,23
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,4,23


In [8]:
# Re-score both models now that day_of_week and week_of_year are in the frame.
df_temp = df.drop(columns=['pickup_datetime'])  # the raw datetime column is not fed to the models
train_X = scaler.fit_transform(df_temp)         # refit the scaler on the widened feature set

# Mean of the 5-fold cross-validation scores for each model.
linear_score = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
gdbt_score = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {linear_score}')
print(f'Gradient Boosting Reg Score : {gdbt_score}')

Linear Reg Score : 0.02872263991179307
Gradient Boosting Reg Score : 0.7103396626059206


In [9]:
# Add the "day cycle" feature (cyclical encoding of the time of day).
import math

# day cycle = sin((hour/12 + minute/720 + second/43200) * pi)
# With a 24-hour clock the bracketed sum runs from 0 to just under 2 across a
# day, so multiplying by pi sweeps one full sine period per day.
half_day_units = df['pickup_hour'] / 12 + df['pickup_minute'] / 720 + df['pickup_second'] / 43200
df['day_cycle'] = half_day_units.map(lambda v: math.sin(v * math.pi))

df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day_of_week,week_of_year,day_cycle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,5,42,-0.02545
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,2,5,0.333601
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,0,11,-0.967083
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,6,23,-0.888817
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,4,23,0.782427


In [10]:
# Score both models again with the day_cycle column included.
df_temp = df.drop(columns=['pickup_datetime'])
train_X = scaler.fit_transform(df_temp)

# Evaluate linear regression first, then gradient boosting (5-fold CV mean).
linear_cv = cross_val_score(Linear, train_X, train_Y, cv=5)
print(f'Linear Reg Score : {linear_cv.mean()}')
gdbt_cv = cross_val_score(GDBT, train_X, train_Y, cv=5)
print(f'Gradient Boosting Reg Score : {gdbt_cv.mean()}')

Linear Reg Score : 0.02823409406181778
Gradient Boosting Reg Score : 0.7114659152761554


# 作業2
* 對照範例的日週期效果，試著參考投影片完成年週期與周週期的特徵 (也可以用你自己想到的方式)，  
看看結果會比範例中的結果更好或更差?

## 依照結果，比原本和只用[日週期]好一點點，但沒差多少

In [15]:
# Add the "year cycle" feature (cyclical encoding of the position in the year).
#
# year cycle = cos((month/6 + day/180) * pi)
# where month is the calendar month and day is the day of the month. The sum
# advances by roughly 2 over a year (each month is treated as 30 days), so
# multiplying by pi gives about one cosine period per year.
#
# Day-of-year (strftime '%j') is deliberately NOT used for the day term here:
# it already includes the days of the elapsed months, so adding it to month/6
# would count the months twice.
year_phase = df['pickup_month'] / 6 + df['pickup_day'] / 180
df['Year_cycle'] = year_phase.map(lambda v: math.cos(v * math.pi))

'\n年週期 = cos[( 月/6 + 日/180) * pi]\n日 : day of year\n\n'

In [17]:
# Add the weekly cycle feature. The column is named 'Month_cycle', but the value
# is built from the day of the week and the hour, i.e. it encodes the position
# within the week.
#
# value = sin((day_of_week/3.5 + hour/84) * pi)
# day_of_week/3.5 + hour/84 equals (day_of_week + hour/24) / 3.5, which runs
# from 0 to just under 2 across a week, so the sine completes one period per week.
week_phase = df['day_of_week'] / 3.5 + df['pickup_hour'] / 84
df['Month_cycle'] = week_phase.map(lambda v: math.sin(v * math.pi))
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day_of_week,week_of_year,day_cycle,Year_cycle,Month_cycle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,5,42,-0.02545,-0.587785,-0.804598
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,2,5,0.333601,-0.069756,0.826239
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,0,11,-0.967083,-0.965926,0.62349
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,6,23,-0.888817,0.961262,-0.294755
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,4,23,0.782427,0.956305,-0.532032


In [18]:
# Final scoring pass with the yearly and weekly cycle columns added.
df_temp = df.drop(columns=['pickup_datetime'])  # drop the raw datetime column
train_X = scaler.fit_transform(df_temp)         # min-max scale every remaining column

# Mean 5-fold cross-validation score per model.
linear_mean = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {linear_mean}')
gdbt_mean = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gdbt_mean}')

Linear Reg Score : 0.02904012297923413
Gradient Boosting Reg Score : 0.7114440645910214
