In [1]:
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

# 노트북 안에 그래프를 그리기 위해
%matplotlib inline

# 그래프에서 격자로 숫자 범위가 눈에 잘 띄도록 ggplot 스타일을 사용
plt.style.use('ggplot')

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("data/train.csv", parse_dates=["datetime"])
train.shape

(10886, 12)

In [4]:
train['year'] = train['datetime'].dt.year
train['month'] = train['datetime'].dt.month
train['hour'] = train['datetime'].dt.hour
train['dayofweek'] = train['datetime'].dt.dayofweek
train.shape

(10886, 16)

In [6]:
from sklearn.utils import shuffle
train = shuffle(train)

train_70 = train[:int(len(train)*0.7)]
train_30 = train[int(len(train)*0.7):]

In [9]:
f_columns = ['season','holiday','workingday','weather','temp','atemp','casual','registered','year','month','hour','dayofweek']

X_train = train_70[f_columns]
Y_train = train_70['count']

X_test = train_30[f_columns]
Y_test = train_30['count']

# Score
## RMSLE
과대평가 된 항목보다는 과소평가 된 항목에 패널티를 준다.

오차(Error)를 제곱(Square)해서 평균(Mean)한 값의 제곱근(Root) 으로 값이 작을 수록 정밀도가 높다. 

0에 가까운 값이 나올 수록 정밀도가 높은 값이다.

Submissions are evaluated one the Root Mean Squared Logarithmic Error (RMSLE)

$$ \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 } $$

In [102]:
from sklearn.metrics import make_scorer

def rmsle(predicted_values, actual_values):
    # 넘파이로 배열 형태로 바꿔준다.
    predicted_values = np.array(predicted_values)
    actual_values = np.array(actual_values)
    
    # 예측값과 실제 값에 1을 더하고 로그를 씌워준다.
    log_predict = np.log(predicted_values + 1)
    log_actual = np.log(actual_values + 1)
    
    # 위에서 계산한 예측값에서 실제값을 빼주고 제곱을 해준다.
    difference = log_predict - log_actual
    # difference = (log_predict - log_actual) ** 2
    difference = np.square(difference)
    
    # 평균을 낸다.
    mean_difference = difference.mean()
    
    # 다시 루트를 씌운다.
    score = np.sqrt(mean_difference)
    
    return score

rmsle_scorer = make_scorer(rmsle)
rmsle_scorer

make_scorer(rmsle)

In [100]:
LinearRegression()

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [111]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train, Y_train)

X_test_pred = model.predict(X_test)

rmsle(X_test_pred, Y_test)

3.0205833252836935e-14

In [116]:
list(zip(map(int, model.predict(X_test).tolist()), Y_test))

[(213, 214),
 (283, 284),
 (118, 119),
 (219, 220),
 (153, 153),
 (102, 102),
 (139, 139),
 (664, 665),
 (252, 253),
 (756, 757),
 (149, 150),
 (3, 3),
 (331, 332),
 (59, 59),
 (231, 232),
 (30, 30),
 (498, 498),
 (639, 640),
 (15, 15),
 (4, 4),
 (52, 52),
 (116, 116),
 (113, 113),
 (54, 54),
 (215, 215),
 (504, 505),
 (12, 12),
 (134, 134),
 (90, 91),
 (153, 153),
 (6, 6),
 (398, 399),
 (95, 95),
 (411, 411),
 (18, 18),
 (7, 7),
 (36, 36),
 (130, 130),
 (156, 156),
 (206, 207),
 (125, 126),
 (6, 6),
 (210, 210),
 (3, 3),
 (277, 278),
 (2, 2),
 (373, 374),
 (97, 97),
 (270, 270),
 (63, 64),
 (429, 430),
 (5, 5),
 (52, 52),
 (37, 37),
 (7, 7),
 (134, 134),
 (7, 7),
 (108, 109),
 (380, 381),
 (194, 194),
 (267, 268),
 (51, 51),
 (29, 29),
 (709, 710),
 (120, 120),
 (63, 63),
 (190, 191),
 (10, 10),
 (223, 224),
 (117, 117),
 (136, 136),
 (4, 4),
 (333, 334),
 (72, 72),
 (37, 37),
 (1, 1),
 (4, 4),
 (55, 55),
 (5, 5),
 (167, 168),
 (227, 227),
 (314, 315),
 (554, 554),
 (154, 155),
 (247,

In [113]:
Y_test

4163     214
10655    284
3824     119
5239     220
2400     153
        ... 
10039    400
10513    207
6064      98
565      121
9912     382
Name: count, Length: 3266, dtype: int64

In [93]:
from sklearn.ensemble import RandomForestRegressor #랜덤포레스트

model = RandomForestRegressor() #모델 가져오기
model.fit(X_train,Y_train) #모델 실행

rmsle(model.predict(X_test), Y_test) #평가

0.01609243422888797

In [75]:
list(zip(list(map(int, pred.tolist())), Y_test.tolist()))[:50]

[(214, 214),
 (281, 284),
 (118, 119),
 (221, 220),
 (153, 153),
 (101, 102),
 (138, 139),
 (674, 665),
 (251, 253),
 (690, 757),
 (149, 150),
 (3, 3),
 (330, 332),
 (59, 59),
 (232, 232),
 (30, 30),
 (507, 498),
 (633, 640),
 (15, 15),
 (4, 4),
 (52, 52),
 (115, 116),
 (112, 113),
 (56, 54),
 (211, 215),
 (512, 505),
 (12, 12),
 (133, 134),
 (89, 91),
 (152, 153),
 (6, 6),
 (399, 399),
 (94, 95),
 (405, 411),
 (18, 18),
 (7, 7),
 (35, 36),
 (130, 130),
 (155, 156),
 (207, 207),
 (126, 126),
 (6, 6),
 (205, 210),
 (3, 3),
 (277, 278),
 (2, 2),
 (374, 374),
 (97, 97),
 (268, 270),
 (64, 64)]