In [1]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

train_df = pd.read_csv('../dataset/train_data.csv')
train_df.head()

Unnamed: 0,시군구,단지명,전용면적(㎡),계약년월,거래금액(만원),층,건축년도,도로명
0,서울특별시 강남구 개포동,개포6차우성아파트1동~8동,79.97,201801,130000,4,1987,언주로 3
1,서울특별시 강남구 개포동,개포6차우성아파트1동~8동,79.97,201801,117000,2,1987,언주로 3
2,서울특별시 강남구 개포동,개포6차우성아파트1동~8동,79.97,201801,130000,1,1987,언주로 3
3,서울특별시 강남구 개포동,개포6차우성아파트1동~8동,79.97,201803,139500,2,1987,언주로 3
4,서울특별시 강남구 개포동,개포6차우성아파트1동~8동,54.98,201804,107500,5,1987,언주로 3


In [2]:
# 데이터프레임 거래금액(만원) 데이터타입 및 컬럼명 변경
train_df = train_df.rename(columns={'거래금액(만원)':'거래금액'})
train_df['거래금액'] = train_df['거래금액'].apply(lambda x: x.replace(',', '')).astype('int64')

# 시군구, 전용면적, 계약년월, 층, 건축년도, 도로명 컬럼 제거
train_df = train_df[['단지명', '거래금액']]
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240921 entries, 0 to 240920
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   단지명     240921 non-null  object
 1   거래금액    240921 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.7+ MB


In [3]:
from sklearn.preprocessing import minmax_scale

# 거래금액 컬럼 minmax scaling 수행
minmax_scaled_price = minmax_scale(train_df['거래금액'])

# train_df에 minmax scaled 거래금액 반영
train_df['scaled_거래금액'] = minmax_scaled_price

### 1. 브랜드 평판 top10 시공사 인코딩

In [4]:
# 1. 브랜드 평판 top10 시공사에 대한 mean 인코딩
# top 10 시공사 아파트 여부를 나타내는 컬럼 생성
# 2021.09.16 시공사 브랜드평판 순위 참고 링크: http://brikorea.com/bbs/board.php?bo_table=rep_1&wr_id=701

'''
    case1. 평균값 인코딩
        case1-1. 전체 거래금액 컬럼 값을 minmax scale 수행 한 후 이에 대한 top10별 스케일된 평균 거래금액 도출
        case1-2. top10별로 거래금액 컬럼 값을 minmax scale 수행 한 후 이에 대한 평균 거래금액 도출
        
    case2. 원-핫 인코딩

    case3. smoothing, cv loop, expanding mean
'''

train_df['top10'] = 'top10_nan'
top10_apt = ['힐스테이트', '자이', '롯데캐슬', '더샵', '푸르지오', 
            '래미안', '아이파크', 'sk|SK|에스케이', '이편한|e편한|e-편한', '더 플래티넘']

# top 10 시공사에 속해 있는 아파트면 해당 시공사 이름으로 top10 컬럼 값 할당
for brand in top10_apt:
    train_df.loc[train_df['단지명'].str.contains(brand), 'top10'] = brand

# top10 시공사에 속하는 아파트 단지 탐색
train_df['top10'].value_counts()

top10_nan       196333
래미안              14014
푸르지오              5775
힐스테이트             5547
이편한|e편한|e-편한      4479
자이                4211
롯데캐슬              3928
아이파크              3115
sk|SK|에스케이        2377
더샵                1142
Name: top10, dtype: int64

In [5]:
# top10 시공사 별 평균 거래금액 도출
top10_grouped_mean_price = train_df.groupby('top10').mean()['거래금액']
top10_mean_dict = top10_grouped_mean_price.to_dict()
top10_mean_dict

{'sk|SK|에스케이': 66391.12957509466,
 'top10_nan': 72015.6218109029,
 '더샵': 126580.55166374781,
 '래미안': 107767.14321392892,
 '롯데캐슬': 107670.10234215886,
 '아이파크': 106744.51524879615,
 '이편한|e편한|e-편한': 93578.87117660193,
 '자이': 128949.99501306104,
 '푸르지오': 84372.06060606061,
 '힐스테이트': 90688.18099873805}

In [6]:
# train_df에 top10별 평균 거래금액 반영
def get_mean(x):
    return top10_mean_dict.get(x)

train_df['top10_mean'] = train_df['top10'].apply(get_mean)
train_df.head()

Unnamed: 0,단지명,거래금액,scaled_거래금액,top10,top10_mean
0,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811
1,개포6차우성아파트1동~8동,117000,0.133717,top10_nan,72015.621811
2,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811
3,개포6차우성아파트1동~8동,139500,0.160676,top10_nan,72015.621811
4,개포6차우성아파트1동~8동,107500,0.122334,top10_nan,72015.621811


In [7]:
# case1-1
from sklearn.preprocessing import minmax_scale
import numpy as np

# top10 그룹별 평균 scaled_거래금액 집계
top10_grouped_scaled_mean_price = train_df.groupby('top10').mean()['scaled_거래금액']
top10_scaled_mean_dict = top10_grouped_scaled_mean_price.to_dict()

print(top10_scaled_mean_dict)

# train_df에 top10별 스케일된 평균 거래금액 반영
def get_scaled_mean1(x):
    return top10_scaled_mean_dict.get(x)

train_df['top10_encode1'] = train_df['top10'].apply(get_scaled_mean1)
train_df.head()

{'sk|SK|에스케이': 0.07307827650981867, 'top10_nan': 0.0798174236890761, '더샵': 0.14519596413101826, '래미안': 0.12265413756761195, '롯데캐슬': 0.12253786525540243, '아이파크': 0.12142884645194843, '이편한|e편한|e-편한': 0.10565405125401621, '자이': 0.14803498084478917, '푸르지오': 0.09462264630488931, '힐스테이트': 0.10219048765724667}


Unnamed: 0,단지명,거래금액,scaled_거래금액,top10,top10_mean,top10_encode1
0,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811,0.079817
1,개포6차우성아파트1동~8동,117000,0.133717,top10_nan,72015.621811,0.079817
2,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811,0.079817
3,개포6차우성아파트1동~8동,139500,0.160676,top10_nan,72015.621811,0.079817
4,개포6차우성아파트1동~8동,107500,0.122334,top10_nan,72015.621811,0.079817


In [8]:
# case1-2
from sklearn.preprocessing import minmax_scale
import numpy as np

# top10 카테고리에 속한 거래금액 탐색 후
# 각 카테고리 별 minmax 스케일링 수행하고
# 이에 대한 평균 값 도출
top10_minmax_scaled_mean_dict = {}
for top10_name in top10_mean_dict.keys():
    top10_price = train_df[train_df['top10'] == top10_name]['거래금액']
    top10_minmax_scaled_price = np.array(minmax_scale(top10_price))
    top10_minmax_scaled_mean_dict[top10_name] = np.mean(top10_minmax_scaled_price)

print(top10_minmax_scaled_mean_dict)

# train_df에 top10별 minmax scale된 평균 거래금액 반영
def get_scaled_mean2(x):
    return top10_minmax_scaled_mean_dict.get(x)

train_df['top10_encode2'] = train_df['top10'].apply(get_scaled_mean2)
train_df.head()

{'sk|SK|에스케이': 0.10876102884906268, 'top10_nan': 0.0798174236890761, '더샵': 0.24134448297377306, '래미안': 0.15944765832779134, '롯데캐슬': 0.22803240252251367, '아이파크': 0.12986875219829297, '이편한|e편한|e-편한': 0.21729540571254638, '자이': 0.24293003404860947, '푸르지오': 0.243647790901819, '힐스테이트': 0.20650051420096036}


Unnamed: 0,단지명,거래금액,scaled_거래금액,top10,top10_mean,top10_encode1,top10_encode2
0,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811,0.079817,0.079817
1,개포6차우성아파트1동~8동,117000,0.133717,top10_nan,72015.621811,0.079817,0.079817
2,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811,0.079817,0.079817
3,개포6차우성아파트1동~8동,139500,0.160676,top10_nan,72015.621811,0.079817,0.079817
4,개포6차우성아파트1동~8동,107500,0.122334,top10_nan,72015.621811,0.079817,0.079817


In [9]:
# case2. one-hot encoding
top10_onehot = pd.get_dummies(train_df['top10'])
top10_onehot

Unnamed: 0,sk|SK|에스케이,top10_nan,더샵,래미안,롯데캐슬,아이파크,이편한|e편한|e-편한,자이,푸르지오,힐스테이트
0,0,1,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
240916,0,1,0,0,0,0,0,0,0,0
240917,0,1,0,0,0,0,0,0,0,0
240918,0,1,0,0,0,0,0,0,0,0
240919,0,1,0,0,0,0,0,0,0,0


In [10]:
# case3. smoothing, cv loop, expanding mean

### 2. 본 데이터셋에 자주 등장하는 단지명 키워드 상위 20개 인코딩

In [11]:
# 데이터에 존재하는 상위 20개 아파트 리스트
top20 = train_df['단지명'].value_counts()[:20]
top20_list = list(top20.index)

# top20 컬럼 생성
train_df['top20'] = 'top20_nan'

for apt in top20_list:
    train_df.loc[train_df['단지명'].str.contains(apt), 'top20'] = apt

# 상위 20개 자주 나오는 아파트 키워드에 대한 각각의 속성 개수 탐색
train_df['top20'].value_counts()

top20_nan    167240
현대            18675
삼성             8106
동아             7153
우성             6103
한신             5354
두산             5181
벽산             4626
롯데캐슬           3928
대우             2584
쌍용             2412
극동             2169
주공2            1713
중앙하이츠          1373
대림e-편한세상       1019
에스케이북한산시티       939
파크리오            911
신동아아파트1         739
중계그린1단지         696
Name: top20, dtype: int64

In [12]:
# top20 시공사 별 평균 거래금액 도출
top20_grouped_mean_price = train_df.groupby('top20').mean()['거래금액']
top20_mean_dict = top20_grouped_mean_price.to_dict()

print(top20_mean_dict)

# train_df에 top10별 평균 거래금액 반영
def get_top20_mean(x):
    return top20_mean_dict.get(x)

train_df['top20_mean'] = train_df['top20'].apply(get_top20_mean)
train_df.head()

{'top20_nan': 78536.36375269074, '극동': 71921.6786537575, '대림e-편한세상': 83480.11776251227, '대우': 67783.4241486068, '동아': 73591.33314693136, '두산': 63158.10885929357, '롯데캐슬': 107670.10234215886, '벽산': 51715.34370946822, '삼성': 85070.95595854922, '신동아아파트1': 30162.182679296347, '쌍용': 76014.26533996683, '에스케이북한산시티': 52068.92438764643, '우성': 78820.86875307225, '주공2': 39859.61295971979, '중계그린1단지': 36170.16379310345, '중앙하이츠': 61087.51347414421, '파크리오': 157247.0911086718, '한신': 63342.38700037355, '현대': 80762.79866131191}


Unnamed: 0,단지명,거래금액,scaled_거래금액,top10,top10_mean,top10_encode1,top10_encode2,top20,top20_mean
0,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753
1,개포6차우성아파트1동~8동,117000,0.133717,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753
2,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753
3,개포6차우성아파트1동~8동,139500,0.160676,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753
4,개포6차우성아파트1동~8동,107500,0.122334,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753


In [13]:
# case1-1
from sklearn.preprocessing import minmax_scale
import numpy as np

# top20 그룹별 평균 scaled_거래금액 집계
top20_grouped_scaled_mean_price = train_df.groupby('top20').mean()['scaled_거래금액']
top20_scaled_mean_dict = top20_grouped_scaled_mean_price.to_dict()

print(top20_scaled_mean_dict)

# train_df에 top10별 스케일된 평균 거래금액 반영
def get_top20_scaled_mean1(x):
    return top20_scaled_mean_dict.get(x)

train_df['top20_encode1'] = train_df['top20'].apply(get_top20_scaled_mean1)
train_df.head()

{'top20_nan': 0.08763043823710849, '극동': 0.07970486299276, '대림e-편한세상': 0.09355393932723732, '대우': 0.07474649430698156, '동아': 0.08170540755683126, '두산': 0.06920453973076154, '롯데캐슬': 0.12253786525540243, '벽산': 0.05549406147791544, '삼성': 0.0954600478774853, '신동아아파트1': 0.02966952154241115, '쌍용': 0.08460851346748963, '에스케이북한산시티': 0.055917714339379865, '우성': 0.0879713260880329, '주공2': 0.0412887766112147, '중계그린1단지': 0.03686815695315534, '중앙하이츠': 0.06672359630259311, '파크리오': 0.1819399605903089, '한신': 0.06942533788686024, '현대': 0.0902981052735585}


Unnamed: 0,단지명,거래금액,scaled_거래금액,top10,top10_mean,top10_encode1,top10_encode2,top20,top20_mean,top20_encode1
0,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971
1,개포6차우성아파트1동~8동,117000,0.133717,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971
2,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971
3,개포6차우성아파트1동~8동,139500,0.160676,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971
4,개포6차우성아파트1동~8동,107500,0.122334,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971


In [14]:
# case1-2
from sklearn.preprocessing import minmax_scale
import numpy as np

# top20 카테고리에 속한 거래금액 탐색 후
# minmax 스케일링 수행하고
# 이에 대한 평균 값 도출
top20_minmax_scaled_mean_dict = {}
for top20_name in top20_mean_dict.keys():
    top20_price = train_df[train_df['top20'] == top20_name]['거래금액']
    top20_minmax_scaled_price = np.array(minmax_scale(top20_price))
    top20_minmax_scaled_mean_dict[top20_name] = np.mean(top20_minmax_scaled_price)

print(top20_minmax_scaled_mean_dict)

# train_df에 top10별 minmax scale된 평균 거래금액 반영
def get_top20_scaled_mean2(x):
    return top20_minmax_scaled_mean_dict.get(x)

train_df['top20_encode2'] = train_df['top20'].apply(get_top20_scaled_mean2)
train_df.head()

{'top20_nan': 0.0876304382371085, '극동': 0.2856725871370689, '대림e-편한세상': 0.24019474641996244, '대우': 0.11112424220606235, '동아': 0.17570928900633231, '두산': 0.26251062435283873, '롯데캐슬': 0.22803240252251367, '벽산': 0.22990293847329948, '삼성': 0.1631977303446722, '신동아아파트1': 0.2561417610072194, '쌍용': 0.240625728330222, '에스케이북한산시티': 0.4076247867909989, '우성': 0.15497240510811922, '주공2': 0.14024302429275945, '중계그린1단지': 0.3732644302848576, '중앙하이츠': 0.14820044153363504, '파크리오': 0.47934090422576886, '한신': 0.15365334219183216, '현대': 0.11098792797421834}


Unnamed: 0,단지명,거래금액,scaled_거래금액,top10,top10_mean,top10_encode1,top10_encode2,top20,top20_mean,top20_encode1,top20_encode2
0,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971,0.154972
1,개포6차우성아파트1동~8동,117000,0.133717,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971,0.154972
2,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971,0.154972
3,개포6차우성아파트1동~8동,139500,0.160676,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971,0.154972
4,개포6차우성아파트1동~8동,107500,0.122334,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971,0.154972


In [15]:
# case2. one-hot encoding
top20_onehot = pd.get_dummies(train_df['top20'])
top20_onehot

Unnamed: 0,top20_nan,극동,대림e-편한세상,대우,동아,두산,롯데캐슬,벽산,삼성,신동아아파트1,쌍용,에스케이북한산시티,우성,주공2,중계그린1단지,중앙하이츠,파크리오,한신,현대
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240916,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
240917,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
240918,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
240919,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [16]:
onehot_df = pd.concat([top10_onehot, top20_onehot], axis=1)
train_df = pd.concat([train_df, onehot_df], axis=1)

train_df.head()

Unnamed: 0,단지명,거래금액,scaled_거래금액,top10,top10_mean,top10_encode1,top10_encode2,top20,top20_mean,top20_encode1,...,신동아아파트1,쌍용,에스케이북한산시티,우성,주공2,중계그린1단지,중앙하이츠,파크리오,한신,현대
0,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971,...,0,0,0,1,0,0,0,0,0,0
1,개포6차우성아파트1동~8동,117000,0.133717,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971,...,0,0,0,1,0,0,0,0,0,0
2,개포6차우성아파트1동~8동,130000,0.149293,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971,...,0,0,0,1,0,0,0,0,0,0
3,개포6차우성아파트1동~8동,139500,0.160676,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971,...,0,0,0,1,0,0,0,0,0,0
4,개포6차우성아파트1동~8동,107500,0.122334,top10_nan,72015.621811,0.079817,0.079817,우성,78820.868753,0.087971,...,0,0,0,1,0,0,0,0,0,0


In [17]:
# case3. smoothing, cv loop, expanding mean

In [19]:
# train_df.to_csv('../dataset/danji_encoding_result.csv', index=False, encoding='utf-8-sig')