In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import polars as pl
import numpy as np
import re

In [2]:
# Load the apartment-deal CSV with polars. '거래금액' and '층' are read as strings
# here and converted to integers in later cells.
# (Earlier pandas variant: pd.read_csv('../data/Apart Deal.csv', low_memory=True))
df = pl.read_csv('../data/Apart Deal.csv', 
                 schema_overrides={
                     '거래금액': pl.Utf8,
                     '층': pl.Utf8
                 })

In [9]:
# Keep only the first 15,000 rows of the loaded frame.
# NOTE(review): the row limit is hard-coded; confirm it is intentional.
df = df.head(15000)

In [10]:
# Row count of the working frame.
len(df)

15000

In [11]:
# Preview the first rows together with the column dtypes.
df.head()

지역코드,법정동,거래일,아파트,지번,전용면적,층,건축년도,거래금액
f64,str,str,str,str,f64,str,f64,str
31110.0,"""학성동""","""5/30/2020 0:00""","""남운학성타운""","""379""",135.58,"""8""",1991.0,"""26700"""
31110.0,"""남외동""","""1/3/2020 0:00""","""남외푸르지오1차""","""506-1""",101.6,"""2""",2006.0,"""35500"""
31110.0,"""남외동""","""1/3/2020 0:00""","""에일린의뜰""","""500""",84.992,"""11""",2007.0,"""36500"""
31110.0,"""남외동""","""1/3/2020 0:00""","""남외푸르지오1차""","""506-1""",118.706,"""8""",2006.0,"""43000"""
31110.0,"""남외동""","""1/4/2020 0:00""","""남외푸르지오2차""","""501-1""",84.9636,"""7""",2007.0,"""38700"""


In [12]:
# Preview the first rows again (same expression as the previous cell).
df.head()

지역코드,법정동,거래일,아파트,지번,전용면적,층,건축년도,거래금액
f64,str,str,str,str,f64,str,f64,str
31110.0,"""학성동""","""5/30/2020 0:00""","""남운학성타운""","""379""",135.58,"""8""",1991.0,"""26700"""
31110.0,"""남외동""","""1/3/2020 0:00""","""남외푸르지오1차""","""506-1""",101.6,"""2""",2006.0,"""35500"""
31110.0,"""남외동""","""1/3/2020 0:00""","""에일린의뜰""","""500""",84.992,"""11""",2007.0,"""36500"""
31110.0,"""남외동""","""1/3/2020 0:00""","""남외푸르지오1차""","""506-1""",118.706,"""8""",2006.0,"""43000"""
31110.0,"""남외동""","""1/4/2020 0:00""","""남외푸르지오2차""","""501-1""",84.9636,"""7""",2007.0,"""38700"""


In [13]:
def extract_phase(apartment_name):
    """
    Extract the trailing phase suffix (digits followed by '차') from an apartment name.

    Leading/trailing whitespace is ignored, so a name such as '남외푸르지오1차 '
    still yields '1차'.

    Examples: '남외푸르지오1차' -> '1차', '남운학성타운' -> None

    Args:
        apartment_name: Apartment name. Any non-missing value is converted with str().

    Returns:
        The matched suffix (e.g. '1차', '2차', '10차'), or None when the value is
        missing (per pd.isna) or the name does not end with such a suffix.
    """
    if pd.isna(apartment_name):
        return None
    # Strip first so stray trailing whitespace cannot defeat the `$` anchor.
    match = re.search(r'(\d+차)$', str(apartment_name).strip())
    return match.group(1) if match else None

In [14]:
def remove_phase(apartment_name):
    """
    Remove the trailing phase suffix (digits followed by '차') from an apartment name.

    Whitespace is stripped both before matching and after removal, so a name
    such as '남외푸르지오 2차 ' becomes '남외푸르지오'.

    Examples: '남외푸르지오1차' -> '남외푸르지오', '남운학성타운' -> '남운학성타운'

    Args:
        apartment_name: Apartment name. Any non-missing value is converted with str().

    Returns:
        The name without its trailing phase suffix and without surrounding
        whitespace. Missing values (per pd.isna) are returned unchanged.
    """
    if pd.isna(apartment_name):
        return apartment_name
    # Strip before substituting; otherwise trailing whitespace hides the suffix
    # from the `$` anchor and the phase is never removed.
    cleaned = str(apartment_name).strip()
    return re.sub(r'\d+차$', '', cleaned).strip()

In [15]:
def split_lot_number(lot):
    """
    Split a lot number into its main and sub numbers.

    Examples:
        '506-1' -> (506, 1)
        '379'   -> (379, 0)

    Args:
        lot: Lot number. Any non-missing value is converted with str().

    Returns:
        A (main, sub) tuple of ints. A part that is absent or is not made up
        purely of decimal digits becomes 0. Missing input (per pd.isna)
        returns (0, 0).
    """
    if pd.isna(lot):
        return 0, 0

    def _to_int(text):
        # isdecimal() accepts exactly the characters int() can parse;
        # isdigit() also accepts characters such as '²', for which int() raises.
        text = text.strip()
        return int(text) if text.isdecimal() else 0

    # Split on the first '-' only; with no '-' present the sub part is ''.
    main_text, _, sub_text = str(lot).strip().partition('-')
    return _to_int(main_text), _to_int(sub_text)

In [16]:
# Derive the '차수' (phase) column from the apartment name; names for which
# extract_phase returns None become null.
df = df.with_columns(
    차수=pl.col('아파트').map_elements(extract_phase, return_dtype=pl.Utf8)
)

In [17]:
# Split '지번' into integer main ('지번_본번') and sub ('지번_부번') lot-number columns.
# map_elements skips null inputs by default and leaves null in the result, so
# fill_null(0) is applied to match split_lot_number's (0, 0) for missing values.
# Empty strings need no extra guard: split_lot_number already maps them to 0.
df = df.with_columns([
    pl.col('지번')
    .map_elements(lambda lot: split_lot_number(lot)[0], return_dtype=pl.Int64)
    .fill_null(0)
    .alias('지번_본번'),
    pl.col('지번')
    .map_elements(lambda lot: split_lot_number(lot)[1], return_dtype=pl.Int64)
    .fill_null(0)
    .alias('지번_부번'),
])

In [18]:
# Add '아파트_정제': the apartment name with its trailing phase suffix removed.
df = df.with_columns(
    아파트_정제=pl.col('아파트').map_elements(remove_phase, return_dtype=pl.Utf8)
)

In [19]:
# Replace null phases with the literal label '없음'.
df = df.with_columns(pl.col('차수').fill_null(pl.lit('없음')))

In [20]:
# Preview the frame with the derived phase, lot-number and cleaned-name columns.
df.head()

지역코드,법정동,거래일,아파트,지번,전용면적,층,건축년도,거래금액,차수,지번_본번,지번_부번,아파트_정제
f64,str,str,str,str,f64,str,f64,str,str,i64,i64,str
31110.0,"""학성동""","""5/30/2020 0:00""","""남운학성타운""","""379""",135.58,"""8""",1991.0,"""26700""","""없음""",379,0,"""남운학성타운"""
31110.0,"""남외동""","""1/3/2020 0:00""","""남외푸르지오1차""","""506-1""",101.6,"""2""",2006.0,"""35500""","""1차""",506,1,"""남외푸르지오"""
31110.0,"""남외동""","""1/3/2020 0:00""","""에일린의뜰""","""500""",84.992,"""11""",2007.0,"""36500""","""없음""",500,0,"""에일린의뜰"""
31110.0,"""남외동""","""1/3/2020 0:00""","""남외푸르지오1차""","""506-1""",118.706,"""8""",2006.0,"""43000""","""1차""",506,1,"""남외푸르지오"""
31110.0,"""남외동""","""1/4/2020 0:00""","""남외푸르지오2차""","""501-1""",84.9636,"""7""",2007.0,"""38700""","""2차""",501,1,"""남외푸르지오"""


In [21]:
# Keep only the modelling columns: the features plus the '거래금액' target.
n_df = df.select([
    '지역코드', '법정동', '아파트_정제', '차수',
    '지번_본번', '지번_부번', '전용면적', '층', '건축년도',
    '거래금액',
])

In [22]:
# Preview the reduced modelling frame.
n_df.head()

지역코드,법정동,아파트_정제,차수,지번_본번,지번_부번,전용면적,층,건축년도,거래금액
f64,str,str,str,i64,i64,f64,str,f64,str
31110.0,"""학성동""","""남운학성타운""","""없음""",379,0,135.58,"""8""",1991.0,"""26700"""
31110.0,"""남외동""","""남외푸르지오""","""1차""",506,1,101.6,"""2""",2006.0,"""35500"""
31110.0,"""남외동""","""에일린의뜰""","""없음""",500,0,84.992,"""11""",2007.0,"""36500"""
31110.0,"""남외동""","""남외푸르지오""","""1차""",506,1,118.706,"""8""",2006.0,"""43000"""
31110.0,"""남외동""","""남외푸르지오""","""2차""",501,1,84.9636,"""7""",2007.0,"""38700"""


In [23]:
# n_df.info()

In [24]:
# Convert to pandas for the scikit-learn steps that follow.
n_df_pd = n_df.to_pandas()

# One LabelEncoder per categorical column, kept as separate objects so each
# can be reused at inference time.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder;
# OrdinalEncoder may be the better fit for feature columns - confirm.
le_a, le_b, le_c = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Replace each categorical column with its integer codes.
for col_name, col_encoder in (('법정동', le_a), ('차수', le_b), ('아파트_정제', le_c)):
    n_df_pd[col_name] = col_encoder.fit_transform(n_df_pd[col_name])


In [25]:
# Count missing values in the main lot-number column.
n_df_pd['지번_본번'].isna().sum()

np.int64(0)

In [26]:
# Fill missing lot numbers with 0 and cast both lot columns to int.
n_df_pd = (
    n_df_pd
    .fillna({'지번_본번': 0, '지번_부번': 0})
    .astype({'지번_본번': int, '지번_부번': int})
)

In [27]:
# Cast the build year to int, using 0 as the placeholder for missing values.
n_df_pd = n_df_pd.fillna({'건축년도': 0}).astype({'건축년도': int})

In [28]:
# Cast the region code to int, using 0 as the placeholder for missing values.
n_df_pd = n_df_pd.fillna({'지역코드': 0}).astype({'지역코드': int})

In [29]:
# Parse the floor strings to int: blank and single-space entries are treated
# as missing, unparseable values are coerced to NaN, and missing becomes 0.
n_df_pd['층'] = (
    pd.to_numeric(n_df_pd['층'].replace([' ', ''], np.nan), errors='coerce')
    .fillna(0)
    .astype(int)
)


In [30]:
# Parse the target price strings (thousands separators removed) to int.
# Rows whose price cannot be parsed are dropped rather than imputed with 0:
# a price of 0 would otherwise be trained on as a real label.
n_df_pd = (
    n_df_pd
    .assign(거래금액=lambda frame: pd.to_numeric(
        # regex=False keeps the comma removal a literal replacement on every pandas version.
        frame['거래금액'].astype(str).str.replace(',', '', regex=False).str.strip(),
        errors='coerce',
    ))
    .dropna(subset=['거래금액'])
    .astype({'거래금액': int})
    # Restore a contiguous RangeIndex after any rows were dropped.
    .reset_index(drop=True)
)

In [31]:
# n_df_pd = n_df_pd.fillna(0)

In [32]:
# Preview the frame after encoding and numeric conversion.
n_df_pd.head()

Unnamed: 0,지역코드,법정동,아파트_정제,차수,지번_본번,지번_부번,전용면적,층,건축년도,거래금액
0,31110,96,156,9,379,0,135.58,8,1991,26700
1,31110,4,150,1,506,1,101.6,2,2006,35500
2,31110,4,713,9,500,0,84.992,11,2007,36500
3,31110,4,150,1,506,1,118.706,8,2006,43000
4,31110,4,150,3,501,1,84.9636,7,2007,38700


In [33]:
# Show dtypes and non-null counts for every column.
n_df_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   지역코드    15000 non-null  int64  
 1   법정동     15000 non-null  int64  
 2   아파트_정제  15000 non-null  int64  
 3   차수      15000 non-null  int64  
 4   지번_본번   15000 non-null  int64  
 5   지번_부번   15000 non-null  int64  
 6   전용면적    15000 non-null  float64
 7   층       15000 non-null  int64  
 8   건축년도    15000 non-null  int64  
 9   거래금액    15000 non-null  int64  
dtypes: float64(1), int64(9)
memory usage: 1.1 MB


In [34]:
# Feature matrix: every column except the target; target vector: '거래금액'.
X = n_df_pd.drop(columns='거래금액')
y = n_df_pd['거래금액']

In [35]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# Fit a 100-tree random forest regressor on the training split, predict the
# held-out rows, and report the test-set mean squared error.
rfc = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rfc.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 11045013.612611288


In [38]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error


In [40]:
# Error metrics of the baseline predictions on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same unit as the target, unlike MSE
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Summary statistics of the actual prices in the test split, used as a scale reference.
y_mean = y_test.mean()
y_std = y_test.std()

# Section 1: absolute error metrics.
print("=" * 50)
print("모델 성능 평가 지표")
print("=" * 50)
print(f"MSE (Mean Squared Error): {mse:,.2f}")
print(f"RMSE (Root Mean Squared Error): {rmse:,.2f} 만원")
print(f"MAE (Mean Absolute Error): {mae:,.2f} 만원")
print(f"R² Score: {r2:.4f}")
print()
# Section 2: distribution of the actual test-set prices.
print("=" * 50)
print("실제 거래금액 통계")
print("=" * 50)
print(f"평균: {y_mean:,.2f} 만원")
print(f"표준편차: {y_std:,.2f} 만원")
print(f"최소값: {y_test.min():,} 만원")
print(f"최대값: {y_test.max():,} 만원")
print()
# Section 3: errors expressed as a percentage of the mean actual price.
print("=" * 50)
print("상대적 성능")
print("=" * 50)
print(f"RMSE / 평균: {(rmse/y_mean)*100:.2f}%")
print(f"MAE / 평균: {(mae/y_mean)*100:.2f}%")
print(f"R² Score: {r2:.4f} ({r2*100:.2f}% 설명력)")

모델 성능 평가 지표
MSE (Mean Squared Error): 11,045,013.61
RMSE (Root Mean Squared Error): 3,323.40 만원
MAE (Mean Absolute Error): 2,013.53 만원
R² Score: 0.9532

실제 거래금액 통계
평균: 28,819.99 만원
표준편차: 15,367.66 만원
최소값: 3,100 만원
최대값: 125,000 만원

상대적 성능
RMSE / 평균: 11.53%
MAE / 평균: 6.99%
R² Score: 0.9532 (95.32% 설명력)


In [None]:
# 1. Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# 3 x 4 x 3 = 36 parameter combinations, each evaluated with 5-fold CV.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10]
}

rfc_tuned = RandomForestRegressor(random_state=42)
# Scoring is negative MSE, so a higher (closer to zero) score is better.
# n_jobs=-1 runs the fits on all available cores.
grid_search = GridSearchCV(rfc_tuned, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score (negative MSE):", grid_search.best_score_)

# 2. Inspect feature importances of the best estimator found by the search,
#    sorted from most to least important.
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': grid_search.best_estimator_.feature_importances_
}).sort_values('importance', ascending=False)

print("\n특성 중요도:")
print(feature_importance)

In [41]:
import matplotlib.pyplot as plt

# Predicted vs. actual scatter; the dashed red diagonal marks perfect predictions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
ax.set_xlabel('실제 거래금액 (만원)')
ax.set_ylabel('예측 거래금액 (만원)')
ax.set_title('실제 vs 예측 거래금액')
plt.show()

# Residual plot: actual minus predicted, against the predicted value.
residuals = y_test - y_pred
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals, alpha=0.5)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('예측 거래금액 (만원)')
ax.set_ylabel('잔차 (만원)')
ax.set_title('잔차 플롯')
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# 문제가 되는 컬럼 찾기
# for col in X.columns:
#     try:
#         pd.to_numeric(X[col], errors='raise')
#     except:
#         print(f"문제 컬럼: {col}")
#         print(f"고유값 샘플: {X[col].unique()[:10]}")
#         print(f"공백 포함 여부: {(X[col] == ' ').sum()}")