In [4]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

In [5]:
# Load the train and test CSV files from the mounted Google Drive.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/train.csv',
        '/content/drive/MyDrive/test.csv',
    )
)

In [6]:
# Summary statistics for the numeric columns of the raw training frame.
train.describe()

Unnamed: 0,총세대수,전용면적,전용면적별세대수,공가수,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수
count,2952.0,2952.0,2952.0,2952.0,2741.0,2948.0,2952.0,2952.0
mean,886.661247,44.757215,102.747967,12.92107,0.176578,3.695726,601.66836,559.768293
std,513.540168,31.87428,132.640159,10.778831,0.427408,2.644665,396.407072,433.375027
min,26.0,12.62,1.0,0.0,0.0,0.0,13.0,13.0
25%,513.5,32.1,14.0,4.0,0.0,2.0,279.25,220.0
50%,779.0,39.93,60.0,11.0,0.0,3.0,517.0,487.0
75%,1106.0,51.5625,144.0,20.0,0.0,4.0,823.0,770.0
max,2568.0,583.4,1865.0,55.0,3.0,20.0,1798.0,2550.0


In [7]:
# Remove the complexes (단지코드) listed as containing erroneous records from train.
error_data = ['C2085', 'C1397', 'C2431', 'C1649', 'C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988']
train = train[~train['단지코드'].isin(error_data)]

# Drop exact duplicate rows.
train = train.drop_duplicates()
test = test.drop_duplicates()

# Keep only apartment rows whose supply type is not 공공분양.
# .copy() turns the filtered result into an independent frame: the following
# cells assign into it with .loc, which on a boolean-filtered slice triggers
# SettingWithCopyWarning and has ambiguous view/copy semantics.
train = train[(train['임대건물구분'] == '아파트') & (train['공급유형'] != '공공분양')].copy()
test = test[(test['임대건물구분'] == '아파트') & (test['공급유형'] != '공공분양')].copy()

In [8]:
# Convert 임대보증금 and 임대료 to float; the '-' placeholder becomes NaN first.
for frame in (train, test):
    for price_col in ('임대보증금', '임대료'):
        is_placeholder = frame[price_col] == '-'
        frame.loc[is_placeholder, price_col] = np.nan
        frame[price_col] = frame[price_col].astype(float)

In [9]:
# Rows whose supply type is 장기전세 get 임대료 = 0 where the rent is missing.
jeonse_without_rent = (train['공급유형'] == '장기전세') & train['임대료'].isnull()
train.loc[jeonse_without_rent, '임대료'] = 0

In [10]:
# Missing subway-station / bus-stop counts are treated as 0.
# (These two columns are renamed to 지하철 / 버스 further down.)
#
# Assign the filled columns back instead of calling
# `frame[col].fillna(0, inplace=True)`: that form mutates a temporary column
# selection (chained assignment), emits a FutureWarning on recent pandas and
# leaves the frame unchanged once Copy-on-Write is enabled.
transit_cols = ['도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수']

train[transit_cols] = train[transit_cols].fillna(0)
test[transit_cols] = test[transit_cols].fillna(0)

In [11]:
# Fill the missing 자격유형 values in test, one fixed value per complex.
for complex_code, qualification in (('C2411', 'A'), ('C2253', 'C')):
    needs_fill = (test['단지코드'] == complex_code) & test['자격유형'].isnull()
    test.loc[needs_fill, '자격유형'] = qualification

In [12]:
# Hand-entered 임대보증금 / 임대료 values for specific (단지코드, 전용면적) pairs in train.
# Entries are applied in order, so a later entry for the same pair overwrites
# an earlier one.
#
# NOTE(review): C1786 / 26.9 appears three times with different values, so only
# the last pair (21400000 / 105000) survives on every matching row. Presumably
# the three entries were meant for three different rows — confirm which column
# should tell them apart.
train_price_fixes = [
    # (단지코드, 전용면적, 임대보증금, 임대료)
    # 강원도 행복주택
    ('C1786', 16.91, 13450000, 65500),
    ('C1786', 26.9, 19700000, 96000),
    ('C1786', 26.9, 19150000, 94000),
    ('C1786', 26.9, 21400000, 105000),
    # 부산 국민임대
    ('C1326', 24.72, 7000000, 135000),
    ('C1326', 24.79, 7000000, 135000),
    ('C1326', 26.83, 7600000, 142000),
    ('C1326', 37.7, 14800000, 198000),
    ('C1326', 46.94, 23100000, 259000),
    # 대구 국민임대
    ('C2186', 29.17, 10847000, 138600),
    ('C2186', 29.34, 10847000, 138600),
    ('C2186', 37.43, 17338000, 197500),
]

for complex_code, area, deposit, rent in train_price_fixes:
    row_mask = (train['단지코드'] == complex_code) & (train['전용면적'] == area)
    train.loc[row_mask, '임대보증금'] = deposit
    train.loc[row_mask, '임대료'] = rent

In [13]:
# Hand-entered values for specific (단지코드, 전용면적) pairs in test.
# Entries are applied in order, so a later entry for the same pair overwrites
# an earlier one.
#
# NOTE(review): C1267 / 26.85 appears twice with different values, so only the
# second pair (18620000 / 88440) survives on every matching row. Presumably the
# two entries were meant for two different rows — confirm which column should
# tell them apart.
test_manual_fixes = [
    # 대전 임대상가 (also sets 자격유형)
    ('C1006', 26.37, {'임대보증금': 5787000, '임대료': 79980, '자격유형': 'C'}),
    ('C1006', 52.74, {'임대보증금': 11574000, '임대료': 159960, '자격유형': 'C'}),
    # 강원도 영구임대
    ('C2152', 24.83, {'임대보증금': 2129000, '임대료': 42350}),
    ('C2152', 33.84, {'임대보증금': 2902000, '임대료': 57730}),
    # 경상남도 행복주택
    ('C1267', 16.94, {'임대보증금': 11200000, '임대료': 53200}),
    ('C1267', 26.85, {'임대보증금': 16333330, '임대료': 77580}),
    ('C1267', 26.85, {'임대보증금': 18620000, '임대료': 88440}),
    ('C1267', 36.77, {'임대보증금': 23760000, '임대료': 112860}),
]

for complex_code, area, new_values in test_manual_fixes:
    row_mask = (test['단지코드'] == complex_code) & (test['전용면적'] == area)
    for column, value in new_values.items():
        test.loc[row_mask, column] = value

In [14]:
# Per-column count of missing values in train.
train.isnull().sum()  # confirms that no column has missing values left

단지코드                            0
총세대수                            0
임대건물구분                          0
지역                              0
공급유형                            0
전용면적                            0
전용면적별세대수                        0
공가수                             0
자격유형                            0
임대보증금                           0
임대료                             0
도보 10분거리 내 지하철역 수(환승노선 수 반영)    0
도보 10분거리 내 버스정류장 수              0
단지내주차면수                         0
등록차량수                           0
dtype: int64

In [15]:
# Per-column count of missing values in test.
test.isnull().sum()  # confirms that no column has missing values left

단지코드                            0
총세대수                            0
임대건물구분                          0
지역                              0
공급유형                            0
전용면적                            0
전용면적별세대수                        0
공가수                             0
자격유형                            0
임대보증금                           0
임대료                             0
도보 10분거리 내 지하철역 수(환승노선 수 반영)    0
도보 10분거리 내 버스정류장 수              0
단지내주차면수                         0
dtype: int64

In [16]:
# Shorten the two long transit column names (subway -> 지하철, bus -> 버스).
# Renaming by name instead of overwriting `.columns` positionally means a
# change in column order or count cannot silently mislabel other columns.
# Re-running the cell is harmless because missing keys are ignored.
short_column_names = {
    '도보 10분거리 내 지하철역 수(환승노선 수 반영)': '지하철',
    '도보 10분거리 내 버스정류장 수': '버스',
}
train = train.rename(columns=short_column_names)
test = test.rename(columns=short_column_names)

In [17]:
# 필요없는 칼럼 삭제(임대건물구분, 단지코드)
# train = train.drop('임대건물구분',axis=1)
# test = test.drop('임대건물구분',axis=1)

# train = train.drop('단지코드',axis=1)
# test = test.drop('단지코드',axis=1)

In [18]:
# Print the categories of each categorical column for train, then test;
# the output shows that the two sets do not contain the same categories.
for column in ('지역', '공급유형', '자격유형'):
    print(train[column].unique())
    print(test[column].unique())

['경상남도' '대전광역시' '경기도' '전라북도' '강원도' '광주광역시' '충청남도' '부산광역시' '제주특별자치도'
 '울산광역시' '충청북도' '전라남도' '경상북도' '대구광역시' '서울특별시' '세종특별자치시']
['경기도' '부산광역시' '전라북도' '경상남도' '충청남도' '대전광역시' '제주특별자치도' '강원도' '울산광역시' '경상북도'
 '충청북도' '광주광역시' '전라남도' '대구광역시' '세종특별자치시']
['국민임대' '공공임대(50년)' '영구임대' '공공임대(10년)' '공공임대(분납)' '장기전세' '행복주택' '공공임대(5년)']
['국민임대' '영구임대' '공공임대(50년)' '공공임대(10년)' '공공임대(분납)' '행복주택']
['A' 'B' 'C' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O']
['H' 'A' 'E' 'C' 'G' 'I' 'J' 'K' 'L' 'M' 'N']


In [19]:
# Box plot of 등록차량수 per 공급유형; hovering a point shows its 단지코드.
px.box(train, x='공급유형', y='등록차량수', hover_name='단지코드')

In [20]:
# Collapse 공급유형 into the coarser 공급유형그룹 feature, then drop the original column.
# Supply types missing from this table end up as NaN.
supply_type_to_group = {
    '국민임대': '국민임대',
    '공공임대(50년)': '장기공공임대',
    '공공임대(5년)': '단기공공임대',
    '공공임대(10년)': '단기공공임대',
    '공공임대(분납)': '단기공공임대',
    '행복주택': '기타',
    '영구임대': '기타',
    '장기전세': '기타',
}

train['공급유형그룹'] = train['공급유형'].map(supply_type_to_group)
test['공급유형그룹'] = test['공급유형'].map(supply_type_to_group)

train = train.drop(columns='공급유형')
test = test.drop(columns='공급유형')

In [21]:
# plotly.express is the plotting library used here
# (it is already imported in the first cell; this import repeats it).
import plotly.express as px

# Box plot of 등록차량수 per 지역.
px.box(train, x='지역', y='등록차량수',hover_name='단지코드')
# hover_name is the tooltip title: hovering a point shows its 단지코드 as the name.

In [22]:
# Per region: number of rows (count) and median of 등록차량수, sorted by that median.
#
# Named aggregation replaces the previous `groupby.apply` that read the region
# via `x.iloc[0, 0]`. That depended on the grouping column being passed into
# `apply`, which is deprecated (pandas 2.2+), so position 0 would no longer be
# 지역 once grouping columns are excluded.
# Result columns: 지역, count, median. The index is each region's position in
# the alphabetically sorted group order, exactly as before.
area_groups = (
    train.groupby('지역')['등록차량수']
    .agg(count='size', median='median')
    .reset_index()
    .sort_values('median')
)

In [23]:
# Distinct regions present in train, and how many there are.
train_region_values = train['지역'].unique()
print(train_region_values)
print(len(train_region_values))

['경상남도' '대전광역시' '경기도' '전라북도' '강원도' '광주광역시' '충청남도' '부산광역시' '제주특별자치도'
 '울산광역시' '충청북도' '전라남도' '경상북도' '대구광역시' '서울특별시' '세종특별자치시']
16


In [24]:
# Distinct regions present in test, and how many there are.
test_region_values = test['지역'].unique()
print(test_region_values)
print(len(test_region_values))

['경기도' '부산광역시' '전라북도' '경상남도' '충청남도' '대전광역시' '제주특별자치도' '강원도' '울산광역시' '경상북도'
 '충청북도' '광주광역시' '전라남도' '대구광역시' '세종특별자치시']
15


In [25]:
# Arrays of the distinct region names in each set.
test_지역, train_지역 = test['지역'].unique(), train['지역'].unique()

In [26]:
# Turn both region arrays into Python sets so they can be compared with set operations.
test_지역, train_지역 = (set(regions.tolist()) for regions in (test_지역, train_지역))

In [27]:
# Number of regions common to both sets, then the regions found only in train.
print(len(test_지역 & train_지역))  # intersection
print(train_지역 - test_지역)  # train_지역 minus test_지역
# train has one more region than test because it contains '서울특별시'.

15
{'서울특별시'}


In [28]:
# area_groups

In [29]:
# Walking from the lowest-median region upward, accumulate the sample counts
# (cum_count) and cut that running total into 5 quantile bins labelled 0-4.
area_groups['cum_count'] = area_groups['count'].cumsum()
area_groups['n지역'] = pd.qcut(area_groups['cum_count'], q=5, labels=False)

In [30]:
# The region -> n지역 assignment shifted slightly compared with before,
# probably because the 상가 (shop) rows were excluded.
area_groups

Unnamed: 0,지역,count,median,cum_count,n지역
0,강원도,155,333.0,155,0
13,제주특별자치도,66,358.0,221,0
14,충청남도,81,359.0,302,0
10,울산광역시,30,396.0,332,0
11,전라남도,145,408.0,477,1
12,전라북도,107,415.0,584,1
2,경상남도,263,528.0,847,1
7,부산광역시,107,551.0,954,2
15,충청북도,161,553.0,1115,2
3,경상북도,110,562.0,1225,2


In [31]:
# Replace 지역 with its n지역 bucket (0-4). The table is hard-coded per region.
# NOTE(review): presumably transcribed by hand from area_groups['n지역'] —
# confirm it still matches whenever the upstream filtering changes.
region_to_group = {
    '강원도': 0, '제주특별자치도': 0, '충청남도': 0, '울산광역시': 0,
    '전라남도': 1, '전라북도': 1, '경상남도': 1,
    '부산광역시': 2, '충청북도': 2, '경상북도': 2,
    '대전광역시': 3, '광주광역시': 3, '서울특별시': 3,
    '경기도': 4, '대구광역시': 4, '세종특별자치시': 4,
}

for frame_name, frame in (('train', train), ('test', test)):
    region_group = frame['지역'].map(region_to_group)

    # Previously an unmapped region silently kept the '' placeholder, which
    # later became a stray one-hot column; fail loudly instead.
    unknown_regions = frame.loc[region_group.isna(), '지역'].unique()
    if len(unknown_regions):
        raise ValueError(f'{frame_name}: no n지역 mapping for regions {list(unknown_regions)}')

    # Insert at position 4 as a real integer column. The old '' placeholder
    # produced an object column holding ints, which is what makes get_dummies
    # emit the "Index constructor will not infer numeric dtypes" FutureWarning.
    frame.insert(4, 'n지역', region_group.astype(int))

train = train.drop(columns='지역')
test = test.drop(columns='지역')

In [32]:
# Box plot of 등록차량수 per 자격유형; hovering a point shows its 단지코드.
px.box(train, x='자격유형', y='등록차량수',hover_name='단지코드')

In [42]:
# Work on a copy with 주차가능자리 = 단지내주차면수 - 등록차량수 so train itself is untouched.
train1 = train.assign(주차가능자리=train['단지내주차면수'] - train['등록차량수'])

# 자격유형 -> group label used only for colouring this plot.
# (The feature column built in the next cell uses different labels, q1-q5.)
plot_group_labels = {
    'A': '1',
    'B': '2', 'L': '2',
    'D': '3', 'C': '3', 'F': '3',
    'E': '4', 'H': '4', 'I': '4', 'J': '4',
    'G': '5', 'K': '5', 'M': '5', 'N': '5', 'O': '5',
}
train1['자격유형그룹'] = train1['자격유형'].map(plot_group_labels)

# Scatter of 주차가능자리 per 자격유형, coloured by the group label.
fig = px.scatter(
    train1,
    x='자격유형',
    y='주차가능자리',
    hover_data=["등록차량수", "총세대수", '자격유형그룹'],
    color='자격유형그룹',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.update_layout(
    title='자격유형그룹화',
    title_x=0.5,
    width=1000,
    height=600,
    template="plotly_white",
)

fig.show()

In [None]:
# Collapse 자격유형 into the 자격유형그룹 feature (q1-q5), then drop the original column.
# Qualification codes missing from this table end up as NaN.
qualification_to_group = {
    'E': 'q1', 'H': 'q1', 'I': 'q1', 'J': 'q1',
    'B': 'q2', 'L': 'q2',
    'G': 'q3', 'K': 'q3', 'M': 'q3', 'N': 'q3', 'O': 'q3',
    'D': 'q4', 'C': 'q4', 'F': 'q4',
    'A': 'q5',
}

train['자격유형그룹'] = train['자격유형'].map(qualification_to_group)
test['자격유형그룹'] = test['자격유형'].map(qualification_to_group)

train = train.drop(columns='자격유형')
test = test.drop(columns='자격유형')

In [None]:
# Display train after the region / supply-type / qualification grouping.
train

Unnamed: 0,단지코드,총세대수,임대건물구분,n지역,전용면적,전용면적별세대수,공가수,임대보증금,임대료,지하철,버스,단지내주차면수,등록차량수,공급유형그룹,자격유형그룹
8,C2515,545,아파트,1,33.48,276,17.0,9216000.0,82940.0,0.0,3.0,624.0,205.0,국민임대,q5
9,C2515,545,아파트,1,39.60,60,17.0,12672000.0,107130.0,0.0,3.0,624.0,205.0,국민임대,q5
10,C2515,545,아파트,1,39.60,20,17.0,12672000.0,107130.0,0.0,3.0,624.0,205.0,국민임대,q5
11,C2515,545,아파트,1,46.90,38,17.0,18433000.0,149760.0,0.0,3.0,624.0,205.0,국민임대,q5
12,C2515,545,아파트,1,46.90,19,17.0,18433000.0,149760.0,0.0,3.0,624.0,205.0,국민임대,q5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2947,C2532,239,아파트,0,49.20,19,7.0,11346000.0,116090.0,0.0,1.0,166.0,146.0,국민임대,q5
2948,C2532,239,아파트,0,51.08,34,7.0,14005000.0,142310.0,0.0,1.0,166.0,146.0,국민임대,q5
2949,C2532,239,아파트,0,51.73,34,7.0,14005000.0,142310.0,0.0,1.0,166.0,146.0,국민임대,q5
2950,C2532,239,아파트,0,51.96,114,7.0,14005000.0,142310.0,0.0,1.0,166.0,146.0,국민임대,q5


In [None]:
# Print the categories of every column that will be one-hot encoded, train then test.
for column in ('공급유형그룹', 'n지역', '자격유형그룹'):
    print(train[column].unique())
    print(test[column].unique())

['국민임대' '장기공공임대' '기타' '단기공공임대']
['국민임대' '기타' '장기공공임대' '단기공공임대']
[1 3 4 0 2]
[4 2 1 0 3]
['q5' 'q2' 'q4' 'q1' 'q3']
['q1' 'q5' 'q4' 'q3' 'q2']


In [None]:
# Renumber the rows from 0; drop=False keeps the previous index as an 'index' column.
train = train.reset_index(drop=False)
test = test.reset_index(drop=False)

In [None]:
# One-hot encode the grouped categorical columns with pandas.
# (OneHotEncoder is imported here but not used in this cell.)
from sklearn.preprocessing import OneHotEncoder

one_hot_columns = ['n지역', '공급유형그룹', '자격유형그룹']
train = pd.get_dummies(train, columns=one_hot_columns)
test = pd.get_dummies(test, columns=one_hot_columns)


In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior)


In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior)



In [None]:
# Keep the test 단지코드 values for later use, as a list of one-element lists.
code = test[['단지코드']].values.tolist()
print(code[:5])

[['C1072'], ['C1072'], ['C1072'], ['C1072'], ['C1072']]


In [None]:
# Display train after one-hot encoding.
train

Unnamed: 0,index,단지코드,총세대수,임대건물구분,전용면적,전용면적별세대수,공가수,임대보증금,임대료,지하철,...,n지역_4,공급유형그룹_국민임대,공급유형그룹_기타,공급유형그룹_단기공공임대,공급유형그룹_장기공공임대,자격유형그룹_q1,자격유형그룹_q2,자격유형그룹_q3,자격유형그룹_q4,자격유형그룹_q5
0,8,C2515,545,아파트,33.48,276,17.0,9216000.0,82940.0,0.0,...,0,1,0,0,0,0,0,0,0,1
1,9,C2515,545,아파트,39.60,60,17.0,12672000.0,107130.0,0.0,...,0,1,0,0,0,0,0,0,0,1
2,10,C2515,545,아파트,39.60,20,17.0,12672000.0,107130.0,0.0,...,0,1,0,0,0,0,0,0,0,1
3,11,C2515,545,아파트,46.90,38,17.0,18433000.0,149760.0,0.0,...,0,1,0,0,0,0,0,0,0,1
4,12,C2515,545,아파트,46.90,19,17.0,18433000.0,149760.0,0.0,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2259,2947,C2532,239,아파트,49.20,19,7.0,11346000.0,116090.0,0.0,...,0,1,0,0,0,0,0,0,0,1
2260,2948,C2532,239,아파트,51.08,34,7.0,14005000.0,142310.0,0.0,...,0,1,0,0,0,0,0,0,0,1
2261,2949,C2532,239,아파트,51.73,34,7.0,14005000.0,142310.0,0.0,...,0,1,0,0,0,0,0,0,0,1
2262,2950,C2532,239,아파트,51.96,114,7.0,14005000.0,142310.0,0.0,...,0,1,0,0,0,0,0,0,0,1


In [None]:
# Feature columns shared by train and test, in the order the model will see them.
feature_columns = [
    '총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '지하철', '버스',
    '단지내주차면수',
    'n지역_0', 'n지역_1', 'n지역_2', 'n지역_3', 'n지역_4',
    '공급유형그룹_국민임대', '공급유형그룹_단기공공임대', '공급유형그룹_장기공공임대', '공급유형그룹_기타',
    '자격유형그룹_q1', '자격유형그룹_q2', '자격유형그룹_q3', '자격유형그룹_q4', '자격유형그룹_q5',
]

# train additionally keeps the target column 등록차량수 as its last column.
train = train[feature_columns + ['등록차량수']]
test = test[feature_columns]

In [None]:
# Column counts for train, then test; test is expected to have one fewer
# because it has no '등록차량수' column.
for frame in (train, test):
    print(len(frame.columns))

24
23


In [None]:
train  # fully preprocessed train

Unnamed: 0,총세대수,전용면적,전용면적별세대수,공가수,임대보증금,임대료,지하철,버스,단지내주차면수,n지역_0,...,공급유형그룹_국민임대,공급유형그룹_단기공공임대,공급유형그룹_장기공공임대,공급유형그룹_기타,자격유형그룹_q1,자격유형그룹_q2,자격유형그룹_q3,자격유형그룹_q4,자격유형그룹_q5,등록차량수
0,545,33.48,276,17.0,9216000.0,82940.0,0.0,3.0,624.0,0,...,1,0,0,0,0,0,0,0,1,205.0
1,545,39.60,60,17.0,12672000.0,107130.0,0.0,3.0,624.0,0,...,1,0,0,0,0,0,0,0,1,205.0
2,545,39.60,20,17.0,12672000.0,107130.0,0.0,3.0,624.0,0,...,1,0,0,0,0,0,0,0,1,205.0
3,545,46.90,38,17.0,18433000.0,149760.0,0.0,3.0,624.0,0,...,1,0,0,0,0,0,0,0,1,205.0
4,545,46.90,19,17.0,18433000.0,149760.0,0.0,3.0,624.0,0,...,1,0,0,0,0,0,0,0,1,205.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2259,239,49.20,19,7.0,11346000.0,116090.0,0.0,1.0,166.0,1,...,1,0,0,0,0,0,0,0,1,146.0
2260,239,51.08,34,7.0,14005000.0,142310.0,0.0,1.0,166.0,1,...,1,0,0,0,0,0,0,0,1,146.0
2261,239,51.73,34,7.0,14005000.0,142310.0,0.0,1.0,166.0,1,...,1,0,0,0,0,0,0,0,1,146.0
2262,239,51.96,114,7.0,14005000.0,142310.0,0.0,1.0,166.0,1,...,1,0,0,0,0,0,0,0,1,146.0


In [None]:
test  # fully preprocessed test

Unnamed: 0,총세대수,전용면적,전용면적별세대수,공가수,임대보증금,임대료,지하철,버스,단지내주차면수,n지역_0,...,n지역_4,공급유형그룹_국민임대,공급유형그룹_단기공공임대,공급유형그룹_장기공공임대,공급유형그룹_기타,자격유형그룹_q1,자격유형그룹_q2,자격유형그룹_q3,자격유형그룹_q4,자격유형그룹_q5
0,754,39.79,116,14.0,22830000.0,189840.0,0.0,2.0,683.0,0,...,1,1,0,0,0,1,0,0,0,0
1,754,46.81,30,14.0,36048000.0,249930.0,0.0,2.0,683.0,0,...,1,1,0,0,0,0,0,0,0,1
2,754,46.90,112,14.0,36048000.0,249930.0,0.0,2.0,683.0,0,...,1,1,0,0,0,1,0,0,0,0
3,754,46.90,120,14.0,36048000.0,249930.0,0.0,2.0,683.0,0,...,1,1,0,0,0,1,0,0,0,0
4,754,51.46,60,14.0,43497000.0,296780.0,0.0,2.0,683.0,0,...,1,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832,675,36.77,126,38.0,23760000.0,112860.0,0.0,1.0,467.0,0,...,0,0,0,0,1,0,1,0,0,0
833,382,29.19,96,45.0,6872000.0,106400.0,0.0,2.0,300.0,0,...,0,1,0,0,0,1,0,0,0,0
834,382,29.19,20,45.0,6872000.0,106400.0,0.0,2.0,300.0,0,...,0,1,0,0,0,1,0,0,0,0
835,382,39.45,202,45.0,13410000.0,144600.0,0.0,2.0,300.0,0,...,0,1,0,0,0,1,0,0,0,0


In [None]:
# Separate the features (data) from the target (등록차량수) in train.
from sklearn.model_selection import train_test_split

target = train['등록차량수']
data = train.drop('등록차량수', axis=1)

# Hold out 20% of the rows as a validation set (fixed seed for reproducibility).
# NOTE(review): the split is per row, and the displayed frame shows consecutive
# rows sharing the same 등록차량수. Presumably rows of one complex can land on
# both sides of the split — confirm whether a group-wise split is needed.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)