In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

# 한글
import warnings
warnings.filterwarnings('ignore')
mpl.rcParams['axes.unicode_minus'] = False
plt.rc('font', family='Malgun Gothic')

### 전처리

In [2]:
df = pd.read_csv("final.csv", index_col = 0, encoding = "utf-8")

In [3]:
# pd.set_option('display.max_row',100)
# pd.set_option('display.max_column', 100)

In [4]:
df.drop(["시군구", "year_quarter", "addr_road", \
         "transaction_year_month", "transaction_date", \
         "서울_운영_영업_개월_평균", "서울_폐업_영업_개월_평균"], \
        axis = 1, inplace = True)
df["transaction_year"] = df["date"].str.split("-").str[0].astype(int)
df["transaction_month"] = df["date"].str.split("-").str[1].astype(int)
df["transaction_day"] = df["date"].str.split("-").str[2].astype(int)
df.drop("date", axis = 1, inplace = True)

encoder = LabelEncoder()
df["dong_encoded"] = encoder.fit_transform(df["dong"])
df["apt_encoded"] = encoder.fit_transform(df["apt"])
df.drop(["dong", "apt"], axis = 1, inplace = True)

df.rename(columns = {
    "exclusive_use_area": "전용면적",
    "year_of_completion": "건축년도",
    "floor": "층",
    "transaction_real_price": "실거래가",
    "transaction_year": "계약년도",
    "transaction_month": "계약월",
    "transaction_day": "계약일",
    "dong_encoded": "법정동",
    "apt_encoded": "아파트명"
}, inplace = True)
columns = ['소비자태도', '현재생활형편', '미래생활형편', '현재경기판단', '미래경기판단', '내구재구입태도', '주택구입태도', '현재소비지출', '미래소비지출', '순자산', '고용상황', '물가예상']
for column in columns:
    df.rename(columns = {
        column: column + "지수"
    }, inplace = True)
df.rename(columns = {
    "주택매매가격지수": "주택매매가격지수 전국"
}, inplace = True)
df.rename(columns = {
    "아파트실거래가격지수_서울": "아파트실거래가격지수",
    "아파트실거래가격지수_전국": "아파트실거래가격지수 전국",
    "주택매매가격지수_서울_아파트": "주택매매가격지수"
}, inplace = True)
df.rename(columns = {
    "시장금리(콜)": "시장금리"
}, inplace = True)
df.reset_index(inplace = True, drop = True)

### Modeling

In [5]:
X = df[['전용면적', '건축년도',
       '법정동별인구수',
       '한방전문의 인원수',
       '공원 개수', '한강 근접 여부',
       '계약년도']]
y = df['실거래가']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.02,
                                                    random_state = 0)

print('Train Data : ', X_train.shape, y_train.shape)
print('Test Data : ', X_test.shape, y_test.shape)

Train Data :  (595886, 7) (595886,)
Test Data :  (12161, 7) (12161,)


test_size=0.1<br> max_features vs %%time

In [6]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state = 0)
Model_rfr = RandomForestRegressor(n_estimators = 50,
                                  max_features = 7,
                                  random_state = 0,
                                  n_jobs = -1)
Model_rfr.fit(X_train, y_train)
y_hat = Model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_hat)
print(np.sqrt(mse))

7783.435979029184
CPU times: user 1min 24s, sys: 591 ms, total: 1min 24s
Wall time: 21.9 s


In [7]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.1,
                                                    random_state = 0)
Model_rfr = RandomForestRegressor(n_estimators = 50,
                                  max_features = 6,
                                  random_state = 0,
                                  n_jobs = -1)
Model_rfr.fit(X_train, y_train)
y_hat = Model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_hat)
print(np.sqrt(mse))

7299.937596707377
CPU times: user 1min 25s, sys: 432 ms, total: 1min 25s
Wall time: 22.5 s


In [8]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.1,
                                                    random_state = 0)
Model_rfr = RandomForestRegressor(n_estimators = 50,
                                  max_features = 5,
                                  random_state = 0,
                                  n_jobs = -1)
Model_rfr.fit(X_train, y_train)
y_hat = Model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_hat)
print(np.sqrt(mse))

7295.391027853777
CPU times: user 1min 17s, sys: 169 ms, total: 1min 17s
Wall time: 24.9 s


In [9]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.1,
                                                    random_state = 0)
Model_rfr = RandomForestRegressor(n_estimators = 50,
                                  max_features = 4,
                                  random_state = 0,
                                  n_jobs = -1)
Model_rfr.fit(X_train, y_train)
y_hat = Model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_hat)
print(np.sqrt(mse))

7372.8966064497245
CPU times: user 1min 6s, sys: 208 ms, total: 1min 6s
Wall time: 21.3 s


In [10]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.1,
                                                    random_state = 0)
Model_rfr = RandomForestRegressor(n_estimators = 50,
                                  max_features = 3,
                                  random_state = 0,
                                  n_jobs = -1)
Model_rfr.fit(X_train, y_train)
y_hat = Model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_hat)
print(np.sqrt(mse))

7404.427130574988
CPU times: user 54.9 s, sys: 99.7 ms, total: 55 s
Wall time: 18.2 s


In [11]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.1,
                                                    random_state = 0)
Model_rfr = RandomForestRegressor(n_estimators = 50,
                                  max_features = 2,
                                  random_state = 0,
                                  n_jobs = -1)
Model_rfr.fit(X_train, y_train)
y_hat = Model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_hat)
print(np.sqrt(mse))

7533.715027144501
CPU times: user 43.1 s, sys: 96.1 ms, total: 43.2 s
Wall time: 13.8 s


In [12]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.1,
                                                    random_state = 0)
Model_rfr = RandomForestRegressor(n_estimators = 50,
                                  max_features = 1,
                                  random_state = 0,
                                  n_jobs = -1)
Model_rfr.fit(X_train, y_train)
y_hat = Model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_hat)
print(np.sqrt(mse))

7753.407887201498
CPU times: user 33.4 s, sys: 136 ms, total: 33.6 s
Wall time: 10.9 s


In [13]:
# test_size=0.3

In [14]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.3,
                                                    random_state = 0)
Model_rfr = RandomForestRegressor(n_estimators = 50,
                                  max_features = 5,
                                  random_state = 0,
                                  n_jobs = -1)
Model_rfr.fit(X_train, y_train)
y_hat = Model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_hat)
print(np.sqrt(mse))

7997.250809947016
CPU times: user 59.6 s, sys: 39.8 ms, total: 59.7 s
Wall time: 17.4 s


In [15]:
# test_size=0.05

In [16]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.05,
                                                    random_state = 0)
Model_rfr = RandomForestRegressor(n_estimators = 50,
                                  max_features = 5,
                                  random_state = 0,
                                  n_jobs = -1)
Model_rfr.fit(X_train, y_train)
y_hat = Model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_hat)
print(np.sqrt(mse))

7141.6782728617545
CPU times: user 1min 22s, sys: 76.1 ms, total: 1min 22s
Wall time: 26.6 s


In [17]:
# test_size=0.02

In [18]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.02,
                                                    random_state = 0)
Model_rfr = RandomForestRegressor(n_estimators = 50,
                                  max_features = 5,
                                  random_state = 0,
                                  n_jobs = -1)
Model_rfr.fit(X_train, y_train)
y_hat = Model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_hat)
print(np.sqrt(mse))

6962.811121120585
CPU times: user 1min 24s, sys: 24 ms, total: 1min 24s
Wall time: 27.5 s


In [19]:
# test_size=0.02, n_estimators = 100

In [20]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.02,
                                                    random_state = 0)
Model_rfr = RandomForestRegressor(n_estimators = 100,
                                  max_features = 5,
                                  random_state = 0,
                                  n_jobs = -1)
Model_rfr.fit(X_train, y_train)
y_hat = Model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_hat)
print(np.sqrt(mse))

6988.728269409988
CPU times: user 2min 48s, sys: 624 ms, total: 2min 48s
Wall time: 53.9 s


In [21]:
# test_size가 작아도되는지?
# 60만개에
# 9:1이면
# test 6만개


In [None]:
#-------------------여기까지 일단 돌려놓음

In [None]:
# !cat /proc/cpuinfo | grep 'model name'