In [None]:
import warnings;warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register the NanumGothic TTF file with Matplotlib under the family name
# 'NanumBarunGothic' so figure text can be rendered with that font.
fe = fm.FontEntry(
    name='NanumBarunGothic',                                   # family name used to refer to this font
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf',  # path of the TTF file on disk
)
fm.fontManager.ttflist.insert(0, fe)  # place the entry at the front of Matplotlib's font list

# Make the registered family (at size 10) the default font for all figures.
plt.rc('font', family='NanumBarunGothic', size=10)

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer, LabelEncoder

In [None]:
# Paths of the train and test CSV files, both located under ./data/new.
data_path = './data'
train_path = f'{data_path}/new/new_train_ver3.csv'
test_path  = f'{data_path}/new/new_test_ver3.csv'

# Read both files with the same default settings and report their shapes.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
print('Train data shape : ', train_data.shape, 'Test data shape : ', test_data.shape)

In [None]:
# Show the summary that DataFrame.info() prints for the test frame.
test_data.info()

In [None]:
# Tag every row with its origin (0 = train, 1 = test) so the two sets can be
# separated again after the shared preprocessing.
train_data['is_test'] = 0
test_data['is_test'] = 1

# Stack both frames into a single one. ignore_index=True gives the combined
# frame a fresh 0..n-1 index; without it the train and test row labels overlap,
# and duplicate index labels break label-aligned assignments and .loc lookups.
data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# List the column names of the combined train + test frame.
print(data.columns)

In [None]:
# Names found in the '구' (district) column, in sorted order.
# The label-encoding step further down replaces each name with its position in
# the sorted list of unique values, and later cells look names up as gus[code].
# unique() alone returns first-appearance order, which does not match those
# codes, so the list is sorted here to keep gus[code] pointing at the right name.
gus = sorted(data['구'].unique())
print(gus)

In [None]:
# Columns removed from the combined frame before any feature processing.
columns_to_drop = [
    '아파트명',
    '계약년월',
    'k-건설사(시공사)',
    'k-시행사',
    '경비비관리형태',
    '세대전기계약방법',
    '청소비관리형태',
    '건축년도',
    'k-복도유형',
    'k-난방방식',
    'k-전체동수',
    'k-전체세대수',
    'k-주거전용면적',
    'k-관리비부과면적',
    '부동산유형',
    '분양형태',
    'k-관리방식',
]
data = data.drop(columns=columns_to_drop)

## 범주형 변수 처리

In [None]:
# Integer-encode the categorical columns. Only '구' and '동' are encoded; the
# other categorical columns that were once listed here are dropped earlier.
columns_to_encode = ['구', '동']
for column in columns_to_encode:
    # A fresh encoder per column: learn the column's labels, then map them to integers.
    le = LabelEncoder().fit(data[column])
    data[column] = le.transform(data[column])

## 수치형 변수 처리

In [None]:
# Replace '전용면적' with its natural log. The column is overwritten, so running
# this cell a second time would apply the log again.
data['전용면적'] = np.log(data['전용면적'])

# Min-max scale the logged values into a separate '전용면적_minmax' column.
private_area_scaler = MinMaxScaler()
logged_area_2d = data['전용면적'].to_numpy().reshape(-1, 1)  # scaler expects a 2-D array
data['전용면적_minmax'] = private_area_scaler.fit_transform(logged_area_2d)

# Histogram of the scaled column.
fig, ax = plt.subplots()
ax.hist(data['전용면적_minmax'], bins=10, edgecolor='k', alpha=0.7)
ax.set_title('Distribution of 전용면적_minmax')
ax.set_xlabel('Values')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Min-max scale '건축년도-계약년도' in place and print the resulting value range.
year_gap_scaler = MinMaxScaler()
data['건축년도-계약년도'] = year_gap_scaler.fit_transform(data[['건축년도-계약년도']])

scaled_year_gap = data['건축년도-계약년도']
print(scaled_year_gap.min(), scaled_year_gap.max())

# Histogram of the scaled column.
fig, ax = plt.subplots()
ax.hist(scaled_year_gap, bins=10, edgecolor='k', alpha=0.7)
ax.set_title('Distribution')
ax.set_xlabel('Values')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Map '층' (floor) onto a normal distribution with a quantile transform and
# store the result in a new '층_qt' column.
# random_state is fixed because QuantileTransformer estimates its quantiles from
# a random subsample when the input has more rows than its `subsample` limit;
# without a seed the transformed feature would differ from run to run.
qt = QuantileTransformer(output_distribution='normal', random_state=42)
data['층_qt'] = qt.fit_transform(data[['층']])
print(data['층_qt'].min(), data['층_qt'].max())

# Histogram of the transformed column.
plt.hist(data['층_qt'], bins=10, edgecolor='k', alpha=0.7)
plt.title('Distribution of 층 (Quantile Transformed)')
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [None]:
# Standardize the two coordinate columns in place, each with its own
# StandardScaler so the statistics of one column never affect the other.
for coord_column in ('좌표X', '좌표Y'):
    scaler = StandardScaler()
    data[coord_column] = scaler.fit_transform(data[[coord_column]])

In [None]:
# Split the processed frame back into its train and test parts using the
# 'is_test' flag, then drop the helper column. The test part also drops
# 'target', which is the value to be predicted.
train_df = data.loc[data['is_test'] == 0].drop(columns=['is_test'])
test_df = data.loc[data['is_test'] == 1].drop(columns=['is_test', 'target'])
print(train_df.shape, test_df.shape)

In [None]:
# List the columns of the training frame (features plus 'target').
print(train_df.columns)

In [None]:
# Train one LightGBM regressor per '구' (district) on log(target).
models = {}  # encoded '구' value -> trained Booster
gu_groups = train_df.groupby('구')

# Each setting appears once, under LightGBM's primary parameter name.
# The dict used to also contain the aliases 'subsample' (0.8, alias of
# bagging_fraction) and 'colsample_bytree' (alias of feature_fraction). When a
# primary name and its alias are both given, LightGBM keeps the primary value
# and ignores the alias, so the effective configuration is unchanged.
params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'num_leaves': 127,
    'min_child_samples': 30,
    'learning_rate': 0.1,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'verbose': -1
}

# Upper bound on boosting rounds; early stopping may end training sooner.
# This used to be 100000 while params carried 'n_estimators': 1000. lgb.train()
# lets that params alias override the num_boost_round argument, so 1000 was the
# limit actually in force. It is now stated in one place only.
num_boost_round = 1000

for gu, group_data in gu_groups:
    print(f"Training model for '구': {gus[gu]}")

    # '구' is constant within a group, so it is removed from the features.
    X = group_data.drop(columns=['target', '구'])
    # The model is fitted on the log of the target; predictions are exp()-ed later.
    y = np.log(group_data['target'])

    # Hold out 20% of the group for validation. The validation cell repeats this
    # split with the same test_size and random_state.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    train_dataset = lgb.Dataset(X_train, label=y_train)
    val_dataset = lgb.Dataset(X_val, label=y_val, reference=train_dataset)

    # Stop when the validation metric has not improved for 10 rounds; log every 10 rounds.
    model = lgb.train(params,
                      train_dataset,
                      valid_sets=[train_dataset, val_dataset],
                      num_boost_round=num_boost_round,
                      callbacks=[lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation(period=10)])

    models[gu] = model

print("All models trained successfully.")

In [None]:
# Validation stage: score every per-'구' model on its held-out 20% split,
# measured on the original (non-log) target scale.
total_val_rmse = 0
squared_error_sum = 0.0  # running sum of squared errors over all validation rows
n_val_rows = 0           # running count of validation rows
for gu, group_data in gu_groups:
    X = group_data.drop(columns=['target', '구'])
    y = group_data['target']

    # Same test_size and random_state as in the training cell, so this yields
    # the same validation rows the model was early-stopped on.
    _, X_val, _, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    model = models[gu]
    y_pred_log = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred = np.exp(y_pred_log)  # undo the log transform applied to the target

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
    # argument is deprecated and absent from recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    total_val_rmse += rmse
    squared_error_sum += float(np.sum((y_val.to_numpy() - y_pred) ** 2))
    n_val_rows += len(y_val)
    print(f"Validation RMSE for '구' {gus[gu]}: {rmse}")

# Unweighted mean of the per-group RMSE values.
average_val_rmse = total_val_rmse / len(gu_groups)
print(f"Average Validation RMSE: {average_val_rmse}")

# RMSE over all validation rows pooled together. This differs from the mean of
# per-group RMSEs whenever the groups differ in size or error level.
overall_val_rmse = np.sqrt(squared_error_sum / n_val_rows)
print(f"Overall Validation RMSE: {overall_val_rmse}")

In [None]:
# Test stage: predict each test row with the model trained for its '구'.
# Predictions are written back by row position, so test_preds stays in the same
# order as test_df. Appending group by group would instead emit rows in '구'
# order and misalign the submission with the test file.
test_preds = np.full(len(test_df), np.nan)
for gu, model in models.items():
    in_gu = (test_df['구'] == gu).to_numpy()  # positional mask of this group's rows
    if in_gu.any():
        test_group = test_df[in_gu]
        X_test = test_group.drop(columns=['구'])
        test_pred_log = model.predict(X_test, num_iteration=model.best_iteration)
        test_preds[in_gu] = np.exp(test_pred_log)  # undo the log transform applied to the target

# Rows still NaN were never predicted, e.g. their '구' has no trained model.
# Fail loudly rather than produce a submission with missing or shifted rows.
missing = np.isnan(test_preds)
if missing.any():
    raise ValueError(
        f"{int(missing.sum())} test rows received no prediction "
        "(their '구' has no trained model)."
    )

test_pred_df = pd.DataFrame({'target': test_preds})
test_pred_df['target'] = test_pred_df['target'].round().astype(int)

print(test_pred_df.shape)

In [None]:
# Save the predictions (if needed).
test_pred_df.to_csv('./my_submission.csv', index=True)
print('Predictions saved.')

# Load another prediction file to compare against.
output_path = './output.csv'
output_df = pd.read_csv(output_path)

# Predictions to compare with.
output_pred = output_df['target']

# RMSE between the two sets of predictions, compared row by row.
# sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
# argument is deprecated and absent from recent scikit-learn releases.
comparison_rmse = np.sqrt(mean_squared_error(output_pred, test_pred_df['target']))
print(f'Comparison RMSE: {comparison_rmse}')