In [1]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_validate # 교차 검증 클래스
from sklearn.tree import DecisionTreeRegressor # 결정트리
import itertools
from sklearn.tree import plot_tree

# for dataloading
train = pd.read_csv('/content/drive/MyDrive/train.csv')
test = pd.read_csv('/content/drive/MyDrive/test.csv')

# 오류 데이터 삭제
error_data = ['C2085', 'C1397', 'C2431', 'C1649', 'C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988']
for error in error_data:
    error_index = train[train['단지코드'] == error].index
    train.drop(error_index, inplace=True)

# for 중복값 제거
train = train.drop_duplicates()
test = test.drop_duplicates()

# for 아파트이면서 공공분양이 아닌 데이터만 사용
train = train[(train.임대건물구분 == '아파트') & (train.공급유형 != '공공분양')]
test = test[(test.임대건물구분 == '아파트') & (test.공급유형 != '공공분양')]

# 임대보증금과 임대료 타입 float로 변환
train.loc[train.임대보증금=='-', '임대보증금'] = np.nan
test.loc[test.임대보증금=='-','임대보증금' ] = np.nan
train['임대보증금'] = train['임대보증금'].astype(float)
test['임대보증금'] = test['임대보증금'].astype(float)
train.loc[train.임대료=='-', '임대료'] = np.nan
test.loc[test.임대료=='-', '임대료'] = np.nan
train['임대료'] = train['임대료'].astype(float)
test['임대료'] = test['임대료'].astype(float)

# 공급유형이 장기전세면 임대료 = 0
train.loc[(train.공급유형=='장기전세')&(train.임대료.isnull()), '임대료']=0

# 지하철역과 버스 정류장의 NULL값의 경우, 0으로 판단, 밑에서 컬럼명 바꿈(지하철, 버스)
train['도보 10분거리 내 지하철역 수(환승노선 수 반영)'].fillna(0, inplace=True)
train['도보 10분거리 내 버스정류장 수'].fillna(0, inplace=True)

test['도보 10분거리 내 지하철역 수(환승노선 수 반영)'].fillna(0, inplace=True)
test['도보 10분거리 내 버스정류장 수'].fillna(0, inplace=True)

# test의 자격유형에 있는 2개 결측치 채우기
test.loc[(test.단지코드=='C2411')&(test.자격유형.isnull()), '자격유형'] = 'A'
test.loc[(test.단지코드=='C2253')&(test.자격유형.isnull()), '자격유형'] = 'C'

# train 강원도 행복주택

# 전용면적 16.91 
train.loc[(train['단지코드'] == 'C1786') & (train['전용면적'] == 16.91), '임대보증금'] = 13450000
train.loc[(train['단지코드'] == 'C1786') & (train['전용면적'] == 16.91), '임대료'] = 65500
# 전용면적 26.9
train.loc[(train['단지코드'] == 'C1786') & (train['전용면적'] == 26.9), '임대보증금'] = 19700000
train.loc[(train['단지코드'] == 'C1786') & (train['전용면적'] == 26.9), '임대료'] = 96000
# 전용면적 26.9
train.loc[(train['단지코드'] == 'C1786') & (train['전용면적'] == 26.9), '임대보증금'] = 19150000
train.loc[(train['단지코드'] == 'C1786') & (train['전용면적'] == 26.9), '임대료'] = 94000
# 전용면적 26.9
train.loc[(train['단지코드'] == 'C1786') & (train['전용면적'] == 26.9), '임대보증금'] = 21400000
train.loc[(train['단지코드'] == 'C1786') & (train['전용면적'] == 26.9), '임대료'] = 105000

# train 부산 국민임대

# 전용면적 24.72
train.loc[(train['단지코드'] == 'C1326') & (train['전용면적'] == 24.72), '임대보증금'] = 7000000
train.loc[(train['단지코드'] == 'C1326') & (train['전용면적'] == 24.72), '임대료'] = 135000
# 전용면적 24.79
train.loc[(train['단지코드'] == 'C1326') & (train['전용면적'] == 24.79), '임대보증금'] = 7000000
train.loc[(train['단지코드'] == 'C1326') & (train['전용면적'] == 24.79), '임대료'] = 135000
# 전용면적 26.83
train.loc[(train['단지코드'] == 'C1326') & (train['전용면적'] == 26.83), '임대보증금'] = 7600000
train.loc[(train['단지코드'] == 'C1326') & (train['전용면적'] == 26.83), '임대료'] = 142000
# 전용면적 37.7
train.loc[(train['단지코드'] == 'C1326') & (train['전용면적'] == 37.7), '임대보증금'] = 14800000
train.loc[(train['단지코드'] == 'C1326') & (train['전용면적'] == 37.7), '임대료'] = 198000
# 전용면적 46.94
train.loc[(train['단지코드'] == 'C1326') & (train['전용면적'] == 46.94), '임대보증금'] = 23100000
train.loc[(train['단지코드'] == 'C1326') & (train['전용면적'] == 46.94), '임대료'] = 259000

# train 대구 국민임대

# 전용면적 29.17
train.loc[(train['단지코드'] == 'C2186') & (train['전용면적'] == 29.17), '임대보증금'] = 10847000
train.loc[(train['단지코드'] == 'C2186') & (train['전용면적'] == 29.17), '임대료'] = 138600
# 전용면적 29.34
train.loc[(train['단지코드'] == 'C2186') & (train['전용면적'] == 29.34), '임대보증금'] = 10847000
train.loc[(train['단지코드'] == 'C2186') & (train['전용면적'] == 29.34), '임대료'] = 138600
# 전용면적 37.43
train.loc[(train['단지코드'] == 'C2186') & (train['전용면적'] == 37.43), '임대보증금'] = 17338000
train.loc[(train['단지코드'] == 'C2186') & (train['전용면적'] == 37.43), '임대료'] = 197500

# test 대전 임대상가

# 전용면적 26.37
test.loc[(test['단지코드'] == 'C1006') & (test['전용면적'] == 26.37), '임대보증금'] = 5787000
test.loc[(test['단지코드'] == 'C1006') & (test['전용면적'] == 26.37), '임대료'] = 79980
test.loc[(test['단지코드'] == 'C1006') & (test['전용면적'] == 26.37), '자격유형'] = 'C'
# 전용면적 52.74
test.loc[(test['단지코드'] == 'C1006') & (test['전용면적'] == 52.74), '임대보증금'] = 11574000
test.loc[(test['단지코드'] == 'C1006') & (test['전용면적'] == 52.74), '임대료'] = 159960
test.loc[(test['단지코드'] == 'C1006') & (test['전용면적'] == 52.74), '자격유형'] = 'C'

# test 강원도 영구임대

# 전용면적 24.83
test.loc[(test['단지코드'] == 'C2152') & (test['전용면적'] == 24.83), '임대보증금'] = 2129000
test.loc[(test['단지코드'] == 'C2152') & (test['전용면적'] == 24.83), '임대료'] = 42350
# 전용면적 33.84
test.loc[(test['단지코드'] == 'C2152') & (test['전용면적'] == 33.84), '임대보증금'] = 2902000
test.loc[(test['단지코드'] == 'C2152') & (test['전용면적'] == 33.84), '임대료'] = 57730

# test 경상남도 행복주택

# 전용면적 16.94
test.loc[(test['단지코드'] == 'C1267') & (test['전용면적'] == 16.94), '임대보증금'] = 11200000
test.loc[(test['단지코드'] == 'C1267') & (test['전용면적'] == 16.94), '임대료'] = 53200
# 전용면적 26.85
test.loc[(test['단지코드'] == 'C1267') & (test['전용면적'] == 26.85), '임대보증금'] = 16333330
test.loc[(test['단지코드'] == 'C1267') & (test['전용면적'] == 26.85), '임대료'] = 77580
# 전용면적 26.85
test.loc[(test['단지코드'] == 'C1267') & (test['전용면적'] == 26.85), '임대보증금'] = 18620000
test.loc[(test['단지코드'] == 'C1267') & (test['전용면적'] == 26.85), '임대료'] = 88440
# 전용면적 36.77
test.loc[(test['단지코드'] == 'C1267') & (test['전용면적'] == 36.77), '임대보증금'] = 23760000
test.loc[(test['단지코드'] == 'C1267') & (test['전용면적'] == 36.77), '임대료'] = 112860

# 데이터프레임 컬럼명 변경(버스,지하철 컬럼명이 너무 길어서 변경)
train.columns = ['단지코드', '총세대수', '임대건물구분', '지역', '공급유형',
    '전용면적', '전용면적별세대수', '공가수', '자격유형', '임대보증금',
    '임대료', '지하철', '버스', '단지내주차면수', '등록차량수']
test.columns = [
    '단지코드', '총세대수', '임대건물구분', '지역', '공급유형',
    '전용면적', '전용면적별세대수', '공가수', '자격유형', '임대보증금',
    '임대료', '지하철', '버스', '단지내주차면수']

# 지역별 예측값을 보기위한 코드
df = test.loc[:, ['지역', '단지코드', '단지내주차면수']]

# train 공급유형
train.loc[train['공급유형'].isin(['국민임대']),'공급유형그룹'] = '국민임대'
train.loc[train['공급유형'].isin(['공공임대(50년)']),'공급유형그룹'] = '장기공공임대'
train.loc[train['공급유형'].isin(['공공임대(5년)', '공공임대(10년)', '공공임대(분납)']),'공급유형그룹'] = '단기공공임대'
train.loc[train['공급유형'].isin(['행복주택', '영구임대', '장기전세']), '공급유형그룹'] = '저소득층'

# test 공급유형
test.loc[test['공급유형'].isin(['국민임대']),'공급유형그룹'] = '국민임대'
test.loc[test['공급유형'].isin(['공공임대(50년)']),'공급유형그룹'] = '장기공공임대'
test.loc[test['공급유형'].isin(['공공임대(5년)', '공공임대(10년)', '공공임대(분납)']),'공급유형그룹'] = '단기공공임대'
test.loc[test['공급유형'].isin(['행복주택', '영구임대', '장기전세']), '공급유형그룹'] = '저소득층'

train = train.drop(labels='공급유형', axis=1)
test = test.drop(labels='공급유형', axis=1)

# train 지역
train.insert(4, 'n지역','')
train.loc[(train['지역'] == '강원도') | (train['지역'] == '제주특별자치도') | (train['지역'] == '충청남도') | (train['지역'] == '울산광역시'), 'n지역'] = 0
train.loc[(train['지역'] == '전라남도') | (train['지역'] == '전라북도') | (train['지역'] == '경상남도'), 'n지역'] = 1
train.loc[(train['지역'] == '부산광역시') | (train['지역'] == '충청북도') | (train['지역'] == '경상북도'), 'n지역'] = 2
train.loc[(train['지역'] == '대전광역시') | (train['지역'] == '광주광역시') | (train['지역'] == '서울특별시'), 'n지역'] = 3
train.loc[(train['지역'] == '경기도') | (train['지역'] == '대구광역시') | (train['지역'] == '세종특별자치시'), 'n지역'] = 4

# test 지역
test.insert(4, 'n지역','')
test.loc[(test['지역'] == '강원도') | (test['지역'] == '제주특별자치도') | (test['지역'] == '충청남도') | (test['지역'] == '울산광역시'), 'n지역'] = 0
test.loc[(test['지역'] == '전라남도') | (test['지역'] == '전라북도') | (test['지역'] == '경상남도'), 'n지역'] = 1
test.loc[(test['지역'] == '부산광역시') | (test['지역'] == '충청북도') | (test['지역'] == '경상북도'), 'n지역'] = 2
test.loc[(test['지역'] == '대전광역시') | (test['지역'] == '광주광역시') | (test['지역'] == '서울특별시'), 'n지역'] = 3
test.loc[(test['지역'] == '경기도') | (test['지역'] == '대구광역시') | (test['지역'] == '세종특별자치시'), 'n지역'] = 4


train = train.drop(labels='지역', axis=1)
test = test.drop(labels='지역', axis=1)

# train 자격유형
train.loc[train['자격유형'].isin(['E','H','I','J']),'자격유형그룹'] = 'q1'
train.loc[train['자격유형'].isin(['B','L']),'자격유형그룹'] = 'q2'
train.loc[train['자격유형'].isin(['G','K','M','N','O']),'자격유형그룹'] = 'q3'
train.loc[train['자격유형'].isin(['D','C','F']),'자격유형그룹'] = 'q4'
train.loc[train['자격유형'].isin(['A']),'자격유형그룹'] = 'q5'

# test 자격유형
test.loc[test['자격유형'].isin(['E','H','I','J']),'자격유형그룹'] = 'q1'
test.loc[test['자격유형'].isin(['B','L']),'자격유형그룹'] = 'q2'
test.loc[test['자격유형'].isin(['G','K','M','N','O']),'자격유형그룹'] = 'q3'
test.loc[test['자격유형'].isin(['D','C','F']),'자격유형그룹'] = 'q4'
test.loc[test['자격유형'].isin(['A']),'자격유형그룹'] = 'q5'

train = train.drop(labels='자격유형', axis=1)
test = test.drop(labels='자격유형', axis=1)

# 데이터프레임 인덱스 재정의
train.reset_index(drop=False, inplace=True)
test.reset_index(drop=False, inplace=True)

# 원-핫 인코딩
train = pd.get_dummies(train, columns = ['n지역','공급유형그룹','자격유형그룹'])
test =  pd.get_dummies(test, columns = ['n지역','공급유형그룹','자격유형그룹'])

# train데이터 컬럼 정리
train = train[['총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '지하철', '버스',
       '단지내주차면수', 'n지역_0', 'n지역_1', 'n지역_2', 'n지역_3', 'n지역_4',
       '공급유형그룹_국민임대', '공급유형그룹_단기공공임대', '공급유형그룹_장기공공임대', '공급유형그룹_저소득층',
       '자격유형그룹_q1', '자격유형그룹_q2', '자격유형그룹_q3', '자격유형그룹_q4', '자격유형그룹_q5', '등록차량수']]
test = test[['총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '지하철', '버스',
       '단지내주차면수', 'n지역_0', 'n지역_1', 'n지역_2', 'n지역_3', 'n지역_4', '공급유형그룹_국민임대',
       '공급유형그룹_단기공공임대', '공급유형그룹_장기공공임대', '공급유형그룹_저소득층', '자격유형그룹_q1',
       '자격유형그룹_q2', '자격유형그룹_q3', '자격유형그룹_q4', '자격유형그룹_q5']]

############################################################
#######################전처리 완료##########################
############################################################

# train데이터에서  data, target 지정 
from sklearn.model_selection import train_test_split 

data = train.drop(columns=['등록차량수'])
target = train['등록차량수']

# train세트와 test세트로 나누기
train_input, test_input, train_target, test_target = train_test_split(data, target, test_size=0.2, random_state=42)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1
[['C1072'], ['C1072'], ['C1072'], ['C1072'], ['C1072']]


  uniques = Index(uniques)
  uniques = Index(uniques)


In [2]:
# CatBoost 모델 생성
catb = CatBoostRegressor(
    loss_function='MAE',
    n_estimators=500,
    learning_rate=0.05,
    random_state=42
)

catb.fit(train_input, train_target)

0:	learn: 292.4219204	total: 49.6ms	remaining: 24.8s
1:	learn: 281.7746909	total: 51.8ms	remaining: 12.9s
2:	learn: 272.0884542	total: 54ms	remaining: 8.94s
3:	learn: 262.2792305	total: 56.1ms	remaining: 6.96s
4:	learn: 255.2368211	total: 58.4ms	remaining: 5.78s
5:	learn: 246.2758425	total: 60.7ms	remaining: 5s
6:	learn: 238.9314453	total: 63.1ms	remaining: 4.44s
7:	learn: 231.7850913	total: 65.4ms	remaining: 4.02s
8:	learn: 226.9133121	total: 67.8ms	remaining: 3.7s
9:	learn: 220.3146546	total: 70.1ms	remaining: 3.44s
10:	learn: 214.5984647	total: 72.3ms	remaining: 3.21s
11:	learn: 208.4408342	total: 74.7ms	remaining: 3.04s
12:	learn: 203.8545769	total: 77ms	remaining: 2.88s
13:	learn: 198.9864800	total: 79.4ms	remaining: 2.76s
14:	learn: 194.8091050	total: 81.7ms	remaining: 2.64s
15:	learn: 190.4797220	total: 84.1ms	remaining: 2.54s
16:	learn: 186.6404604	total: 86.3ms	remaining: 2.45s
17:	learn: 182.7656188	total: 88.7ms	remaining: 2.38s
18:	learn: 178.4205612	total: 91ms	remaining: 

<catboost.core.CatBoostRegressor at 0x7fe2e054e6a0>

In [3]:
print(catb.score(train_input, train_target))
print(catb.score(test_input, test_target))

0.9641087274749525
0.9439693793089703


In [20]:
model_pred = list(catb.predict(test_input))

In [21]:
model_target = list(test_target)

In [25]:
data = sorted(list(zip(model_target, model_pred)))

In [27]:
print(data)

[(19.0, 31.42269763232912), (22.0, 131.40770795850443), (47.0, 37.69250549804951), (59.0, 61.882994386266205), (60.0, 67.4037412059127), (65.0, 65.73734548567808), (73.0, 267.4929605487388), (78.0, 69.21051811181832), (78.0, 75.17808906743284), (78.0, 75.8315573306802), (86.0, 92.00646719714103), (94.0, 115.50118644716639), (98.0, 87.50166941816161), (98.0, 98.08917289499846), (98.0, 125.25472441061953), (98.0, 126.97691619301327), (98.0, 231.24388186133223), (108.0, 103.89469021534632), (116.0, 177.161945022821), (120.0, 112.61454465762574), (124.0, 160.99001164761472), (125.0, 98.39841207994675), (132.0, 240.8382731614518), (133.0, 134.6342011176257), (133.0, 136.93857985992724), (133.0, 165.532529139468), (135.0, 230.73420138885103), (135.0, 242.43290064246565), (135.0, 259.75665509526794), (146.0, 164.1074983469872), (146.0, 172.60542212283508), (146.0, 187.10527570907516), (148.0, 200.89120799923307), (148.0, 213.70489799282905), (149.0, 132.8882764579148), (155.0, 248.95240907211

In [None]:
model_pred = list(catb.predict(test_input))
model_target = list(test_target)
data = sorted(list(zip(model_target, model_pred)))
print(data)

In [5]:
print(catb.predict(test_input[:10])) # 예측 잘 하는지 확인하는 용도

[ 919.26422717 1378.89829755  723.78270958  288.31013793  298.57526426
  955.98400572 1140.24461503  891.32492665  231.24388186   75.83155733]


In [6]:
print(test_target[:10]) # 예측 잘 하는지 확인하는 용도

1755     922.0
1281    1350.0
350      579.0
420      422.0
56       301.0
1556     957.0
845     1145.0
1657     982.0
111       98.0
2074      78.0
Name: 등록차량수, dtype: float64


In [7]:
# test파일 모든 예측 값 확인(837개)
print(catb.predict(test))

[ 683.7469429   705.92658338  703.84774504  703.3248176   709.45005453
  714.19773366  721.20245193  722.93735427 1194.80890624 1172.32196046
 1181.64159899 1198.72046095 1214.84730558 1231.85956296 1167.24669325
 1236.96494538 1231.46408468  623.93278313  632.66200518  634.90594817
  634.92461149  623.58400623  622.75196873  633.82250196  630.79532884
  519.4862154   525.78321793  550.992619    575.71419537 1074.73198936
 1075.82849076 1063.30902177 1078.43092963 1073.25729987 1078.70848325
 1087.26605182 1093.30534486 1790.49845918 1742.75685237 1899.27711907
 1738.26351733 1741.07160646 1773.65937482 1748.15932328 1741.11632345
 1963.27052384 1764.51424978 1755.65855663  906.42626225  881.08457543
  880.64536738  899.27027811  904.3101099  1002.92371213  419.73876877
  368.70491079  355.56900559  482.32173116  390.09027592  373.33549124
  387.19797255  367.87044355  339.63954995  297.23009394  296.32323393
  303.17731587  344.87533247  448.33801357  452.42874782  479.98799083
  483.

In [8]:
pred = catb.predict(test) # 예측값 pred 변수에 저장

code = list(itertools.chain.from_iterable(code)) 

df = pd.DataFrame({'code':code, 'pred':pred})
df_mean = df.groupby('code')['pred'].agg(**{'mean':'mean'}).reset_index()

In [9]:
# 최종 단지별 예측값
df_mean

Unnamed: 0,code,mean
0,C1003,288.714947
1,C1006,357.517492
2,C1016,650.828190
3,C1019,278.125314
4,C1030,64.758308
...,...,...
145,C2653,846.759072
146,C2675,1032.521665
147,C2676,451.245198
148,C2688,88.908242
