In [1]:
import warnings;warnings.filterwarnings('ignore')

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # ttf 파일이 저장되어 있는 경로
    name='NanumBarunGothic')                        # 이 폰트의 원하는 이름 설정
fm.fontManager.ttflist.insert(0, fe)              # Matplotlib에 폰트 추가
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # 폰트 설정
plt.rc('font', family='NanumBarunGothic')

import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split

In [2]:
data_path = './data'
train_path = f'{data_path}/new/new_train_ver6.csv'
test_path  = f'{data_path}/new/new_test_ver6.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)
print('Train data shape : ', train_data.shape, 'Test data shape : ', test_data.shape)

Train data shape :  (1118822, 22) Test data shape :  (9272, 21)


In [3]:
train_data = train_data[train_data['아파트명'].isin(test_data['아파트명'])]
train_data = train_data[train_data['도로명'].isin(test_data['도로명'])]
train_data = train_data[train_data['번지'].isin(test_data['번지'])]
train_data = train_data[train_data['시군구'].isin(test_data['시군구'])]

In [4]:
for index, row in train_data.iterrows():
    if pd.isnull(row['아파트명']):
        corresponding_value = test_data[test_data['아파트명'] == row['아파트명']]['아파트명'].values
        if len(corresponding_value) > 0:
            train_data.at[index, '아파트명'] = corresponding_value[0]

In [5]:
train_data['등기신청일자'] = pd.to_numeric(train_data['등기신청일자'], errors='coerce').astype('Int64')
print(train_data['등기신청일자'])

train_data['등기신청일자'] = train_data['등기신청일자'].apply(lambda x: 1 if pd.notnull(x) else 0)
train_data['해제사유발생일'] = train_data['해제사유발생일'].apply(lambda x: 1 if pd.notnull(x) else 0)
train_data = train_data[(train_data['등기신청일자'] != 1) & (train_data['해제사유발생일'] != 1)]

0          <NA>
1          <NA>
2          <NA>
3          <NA>
4          <NA>
           ... 
1118817    <NA>
1118818    <NA>
1118819    <NA>
1118820    <NA>
1118821    <NA>
Name: 등기신청일자, Length: 821249, dtype: Int64


In [6]:
train_data['is_test'] = 0
test_data['is_test'] = 1
total_data = pd.concat([train_data, test_data])

In [7]:
total_data = total_data.assign(year=total_data['계약년월'].astype('str').str[:4].astype(int)).query('year > 2020')

total_data['아파트명'] = total_data['아파트명'].astype('category')

label_encoder = LabelEncoder()
total_data['아파트명_encoded'] = label_encoder.fit_transform(total_data['아파트명'])
total_data = total_data.drop(['아파트명'], axis=1)
total_data['아파트명_encoded'] = total_data['아파트명_encoded'].astype('category')

In [8]:
# 이제 다시 train과 test dataset을 분할해줍니다. 위에서 제작해 놓았던 is_test 칼럼을 이용합니다.
train_data = total_data.query('is_test==0')
test_data = total_data.query('is_test==1')

# 이제 is_test 칼럼은 drop해줍니다.
train_data.drop(['is_test'], axis = 1, inplace=True)
test_data.drop(['is_test'], axis = 1, inplace=True)
print(train_data.shape, test_data.shape)

(40234, 23) (9272, 23)


In [13]:
x, y = train_data[['전용면적','아파트명_encoded']], train_data[['target']],
test = test_data[['전용면적','아파트명_encoded']]

# LightGBM 모델 학습
lgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.01,
    'max_depth': 5,
    'num_leaves': 64,  # num_leaves 값을 2^max_depth보다 크게 설정
    'categorical_feature': 'auto',  # 카테고리 피처 자동 인식
    'force_row_wise': True  # 행 단위 처리 강제
}

model = lgb.LGBMRegressor(**lgb_params)
model.fit(x, y)
pred = model.predict(test)

test_pred = pd.DataFrame(pred.astype(int), columns=["target"])
test_pred.to_csv('predictions.csv', index=False)

[LightGBM] [Info] Total Bins 2026
[LightGBM] [Info] Number of data points in the train set: 40234, number of used features: 2
[LightGBM] [Info] Start training from score 110526.003678


In [10]:
# params = {
#     'objective': 'regression',
#     'boosting_type': 'gbdt',
#     'metric': 'rmse',
#     'subsample': 0.8, 
#     'num_leaves': 127, 
#     'n_estimators': 1000, 
#     'min_child_samples': 30, 
#     'learning_rate': 0.1, 
#     'feature_fraction': 0.7, 
#     'colsample_bytree': 0.7, 
#     'bagging_freq': 1, 
#     'bagging_fraction': 0.7,
#     'verbose': -1
# }


# # 모델 훈련
# num_boost_round = 100000
# print('Starting training...')
# model = lgb.train(params, 
#                  train_dataset, 
#                  valid_sets=[train_dataset, val_dataset],
#                  num_boost_round=num_boost_round,
#                  callbacks=[lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation(period=10)])

In [11]:
# # 테스트 데이터 예측
# test_pred = model.predict(test_data, num_iteration=model.best_iteration)
# test_pred = test_pred.round().astype(int)

# # 결과 저장 (필요 시)
# output = pd.DataFrame({'target': test_pred})
# output.to_csv('./my_submission.csv', index=False)
# print('Predictions saved.')

In [14]:
output_path = './output.csv'
output_df = pd.read_csv(output_path)

# 비교할 예측값 가져오기
output_pred = output_df['target']

# 두 예측값 간의 RMSE 계산
comparison_rmse = mean_squared_error(output_pred, test_pred, squared=False)
print(f'Comparison RMSE: {comparison_rmse}')

Comparison RMSE: 48500.522266022024
