In [1]:
import glob
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error , r2_score

car = pd.read_csv("car_speed_36912.csv")
print(car.columns)

Index(['date', 'days', 'road_name', 'link_id', 'start_point', 'end_point',
       'direction', 'road_length', 'road_num', 'road_type', 'urban_suburb',
       'district', 'time', 'value', 'temperature', 'wind', 'rain', 'snow',
       'SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM2.5', 'cctv', 'kids',
       'speed_limit', 'entrance', 'signal', 'special', 'vacation'],
      dtype='object')


In [5]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4089876 entries, 0 to 4089875
Data columns (total 31 columns):
date            int64
days            object
road_name       object
link_id         int64
start_point     object
end_point       object
direction       object
road_length     int64
road_num        float64
road_type       object
urban_suburb    object
district        object
time            int64
value           float64
temperature     float64
wind            float64
rain            float64
snow            float64
SO2             float64
CO              float64
O3              float64
NO2             float64
PM10            float64
PM2.5           float64
cctv            float64
kids            float64
speed_limit     float64
entrance        float64
signal          float64
special         int64
vacation        int64
dtypes: float64(17), int64(6), object(8)
memory usage: 967.3+ MB


False

In [7]:
from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()
car["days"] = lb_make.fit_transform(car["days"])
car["direction"] = lb_make.fit_transform(car["direction"])
car["road_type"] = lb_make.fit_transform(car["road_type"])
car["urban_suburb"] = lb_make.fit_transform(car["urban_suburb"])
car["district"] = lb_make.fit_transform(car["district"])
car["time"] = lb_make.fit_transform(car["time"])

print(car.shape)
car.head()

(4089876, 31)


Unnamed: 0,date,days,road_name,link_id,start_point,end_point,direction,road_length,road_num,road_type,...,NO2,PM10,PM2.5,cctv,kids,speed_limit,entrance,signal,special,vacation
0,20180313,6,4.19로,1080012200,아카데미하우스,국립4.19묘지,0,1179,1.0,2,...,0.072,108.0,69.0,0.0,0.0,40.0,25.0,8.0,0,0
1,20180313,6,4.19로,1080012200,아카데미하우스,국립4.19묘지,0,1179,1.0,2,...,0.07,111.0,66.0,0.0,0.0,40.0,25.0,8.0,0,0
2,20180313,6,4.19로,1080012200,아카데미하우스,국립4.19묘지,0,1179,1.0,2,...,0.056,79.0,50.0,0.0,0.0,40.0,25.0,8.0,0,0
3,20180313,6,4.19로,1080012200,아카데미하우스,국립4.19묘지,0,1179,1.0,2,...,0.065,82.0,49.0,0.0,0.0,40.0,25.0,8.0,0,0
4,20180313,6,4.19로,1080012200,아카데미하우스,국립4.19묘지,0,1179,1.0,2,...,0.024,33.0,15.0,0.0,0.0,40.0,25.0,8.0,0,0


In [32]:
X_data = car.drop(['date','road_name','link_id','value','start_point','end_point'],axis=1,inplace=False)
y_target = car['value']
X_train , X_test , y_train , y_test = train_test_split(X_data , y_target ,test_size=0.3, random_state=156)

In [14]:
X_train.shape

(2862913, 43)

In [9]:
def get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=False):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    if is_expm1 :
        y_test = np.expm1(y_test)
        pred = np.expm1(pred)
    mse = mean_squared_error(y_test, pred)    
    r2 = r2_score(y_test, pred)   
    
    print('###',model.__class__.__name__,'###')
    print('MSE: {0:.3f}, R2: {1:.3F}'.format(mse, r2*100))

In [33]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression(n_jobs=-1)
get_model_predict(lr,X_train, X_test, y_train, y_test) # 날씨만 merge했을 때 - MSE: 132.951, R2: 17.872

### LinearRegression ###
MSE: 117.622, R2: 28.801


In [46]:
X_data_ohe = pd.get_dummies(X_data, columns=['days','time','urban_suburb','road_type','direction'])
X_train , X_test , y_train , y_test = train_test_split(X_data_ohe, y_target, test_size=0.3, random_state=156)

lr = LinearRegression(n_jobs=-1)
get_model_predict(lr,X_train, X_test, y_train, y_test)

### LinearRegression ###
MSE: 85.439, R2: 48.282


In [47]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
ridge = Ridge()
lasso = Lasso()
get_model_predict(ridge,X_train, X_test, y_train, y_test)
get_model_predict(lasso,X_train, X_test, y_train, y_test)

### Ridge ###
MSE: 85.439, R2: 48.282
### Lasso ###
MSE: 131.043, R2: 20.677


In [30]:
pd.DataFrame({'X':X_train.columns,'coef':lr.coef_}).sort_values('coef',ascending=False).head(5)

Unnamed: 0,X,coef
7,SO2,81.395785
36,road_type_1,21.081082
25,time_0,7.094389
26,time_1,4.993473
22,days_4,3.184626


## log 취했을 때

In [48]:
y_target_log = np.log1p(y_target)
X_train , X_test , y_train , y_test = train_test_split(X_data , y_target_log ,test_size=0.3, random_state=156)

lr = LinearRegression(n_jobs=-1)
get_model_predict(lr,X_train, X_test, y_train, y_test,is_expm1=True)

### LinearRegression ###
MSE: 120.431, R2: 27.101


In [49]:
X_train , X_test , y_train , y_test = train_test_split(X_data_ohe , y_target,test_size=0.3, random_state=156)

lr = LinearRegression(n_jobs=-1)
get_model_predict(lr,X_train, X_test, y_train, y_test) # 날씨만 merge했을 때 - MSE: 82.157, R2: 49.250

### LinearRegression ###
MSE: 85.439, R2: 48.282
