<a href="https://colab.research.google.com/github/paranoidandroid2124/AIFFEL_quest_rs/blob/main/first_trial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import xgboost
import lightgbm
from os.path import join
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:

#%%
# Restore the pandas display options to their library defaults.
for option in ('display.max_rows', 'display.max_columns',
               'display.width', 'display.max_colwidth'):
    pd.reset_option(option)

#%%
# Working locally, so the data directory points at a local folder.
path_temp = "C:/Users/양자/Desktop/Hun_Works/practices/kagglekr_pricing/data"
train_path = join(path_temp, 'train.csv')
test_path = join(path_temp, 'test.csv')
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
#%%
# Print the column index explicitly: a bare `train.columns` expression is
# discarded when it is not the last statement of the cell.
print(train.columns)
train.info()

In [None]:
#%%
## Author's rationale: `id` is useless as a feature, and the other dropped
## columns hold a single value in 95%+ of the rows, so they are removed to
## avoid biasing the models.
drop_cols = ['id', 'view', 'waterfront', 'yr_renovated']

## Apply exactly the same column drops and date conversion to train and test so
## the two frames cannot drift apart.
for frame in (train, test):
    frame.drop(columns=drop_cols, inplace=True)
    # Keep the first six characters of the date string (presumably YYYYMM --
    # confirm against the raw data) and store them as an integer.
    frame['date'] = frame['date'].str[:6].astype(int)

#%%
# Separate the target from the features.
y = train.pop('price')
## The target is modelled in log1p scale; predictions must be mapped back with
## np.expm1 before they are submitted.
y = np.log1p(y)

In [None]:
#%%
# Distribution of `sqft_lot15`. A histogram bins the values; `countplot` would
# draw one bar per distinct value, which is slow and unreadable for a
# many-valued numeric column (presumably the case here -- confirm).
plt.figure()
sns.histplot(data=train, x='sqft_lot15')
plt.show()
#%%
# Density of the log1p-transformed target. (The original cell sequence drew
# this identical plot twice; one copy is enough.)
plt.figure()
sns.kdeplot(data=y)
plt.show()


In [None]:
#%%
## Root mean squared error helper.
def rmse(y_test, y_pred):
    """Return the square root of the mean squared error of `y_pred` vs `y_test`."""
    mse = mean_squared_error(y_test, y_pred)
    return np.sqrt(mse)

In [None]:
#%%
# Define the initial (untuned) models; every one shares the same seed.
random_state = 2025

# NOTE(review): the names `xgboost` and `lightgbm` rebind the modules imported
# at the top of the file to estimator instances. Later cells read these names,
# so they are kept as-is here.
gboost, xgboost, lightgbm, rdforest = (
    estimator(random_state=random_state)
    for estimator in (
        GradientBoostingRegressor,
        XGBRegressor,
        LGBMRegressor,
        RandomForestRegressor,
    )
)

models = [gboost, xgboost, lightgbm, rdforest]

In [None]:
# Re-definition of the helper used in the course material.
def get_scores(models, train, y):
    """Fit each model on one shared hold-out split and tabulate its RMSE.

    Args:
        models: iterable of estimators exposing `fit` and `predict`.
        train: feature table passed to `train_test_split`.
        y: target values aligned with `train`.

    Returns:
        A DataFrame indexed by estimator class name with a single `RMSE`
        column, sorted from the largest RMSE to the smallest. Models that
        share a class name overwrite each other's entry.
    """
    # The split depends only on the data and the module-level `random_state`,
    # so it is computed once instead of once per model.
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, random_state=random_state, test_size=0.2
    )

    scores = {}
    for model in models:
        model.fit(X_train, y_train)
        scores[model.__class__.__name__] = rmse(y_test, model.predict(X_test))

    score_df = pd.DataFrame(scores, index=['RMSE']).T.sort_values('RMSE', ascending=False)
    return score_df
#%%
# Hold-out RMSE of the untuned models, computed on the log1p-scaled target.
get_scores(models, train, y)

In [None]:
#%%
def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5):
    """Run a 5-fold grid search and return one row per parameter combination.

    Args:
        model: estimator handed to `GridSearchCV`.
        train: feature table.
        y: target values aligned with `train`.
        param_grid: parameter grid forwarded to `GridSearchCV`.
        verbose: verbosity level forwarded to `GridSearchCV`.
        n_jobs: number of parallel jobs forwarded to `GridSearchCV`.

    Returns:
        A DataFrame with one column per searched parameter, the mean
        cross-validated `score` (negative MSE) and `RMSLE` = sqrt(-score),
        sorted by `RMSLE` ascending. The column is only a true RMSLE when `y`
        is already log-transformed.
    """
    search = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=verbose,
        n_jobs=n_jobs,
    )
    search.fit(train, y)

    cv_results = search.cv_results_
    results = pd.DataFrame(cv_results['params'])
    results['score'] = cv_results['mean_test_score']
    # Scores are negated MSE values, so flip the sign before the square root.
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    return results.sort_values('RMSLE')

#%%
## Rough results
##                               RMSE
##XGBRegressor               0.025468
##LGBMRegressor              0.019407
##RandomForestRegressor      0.006283
##GradientBoostingRegressor  0.005782

In [None]:
model = gboost
# 2 * 2 * 3 * 2 * 2 = 48 candidates, each evaluated with 5-fold CV.
param_grid = dict(
    learning_rate=[0.01, 0.2],
    n_estimators=[50, 100],      # number of trees
    max_depth=[3, 5, 10],        # tree depth
    min_samples_split=[5, 10],   # minimum samples required to split a node
    min_samples_leaf=[5, 10],    # minimum samples required in a leaf
)
ans1 = my_GridSearch(model, train, y, param_grid)
## Originally a larger grid was used: 144 candidates x 5 folds = 720 fits,
## which took practically the whole day, so the grid was reduced.
## best:
## learning_rate         0.200000
## max_depth            10.000000
## min_samples_leaf      5.000000
## min_samples_split    10.000000
## n_estimators         50.000000
## score                -0.000014
## RMSLE                 0.003742: y was log-scaled from the very start, so applying the exponential at the end should undo it.
#%%
## For LightGBM, cut the number of candidates down sharply (2^4 = 16).
model = lightgbm
param_grid = dict(
    learning_rate=[0.01, 0.1],   # learning rate
    n_estimators=[50, 100],      # number of trees
    max_depth=[-1, 5],           # tree depth
    num_leaves=[15, 31],         # number of leaves per tree
)
ans2 = my_GridSearch(model, train, y, param_grid)
## best:
## learning_rate      0.100000
## max_depth          5.000000
## n_estimators     100.000000
## num_leaves        31.000000
## score             -0.000225
## RMSLE              0.015004
#%%
model = xgboost
# XGBRegressor has no `min_samples_split`, `min_samples_leaf` or `max_features`
# parameters (those are scikit-learn forest names), so searching over them only
# multiplies the number of fits without changing the fitted model. The grid
# uses the XGBoost parameters that play the closest role instead.
# `min_samples_split` has no direct XGBoost counterpart and is dropped.
param_grid = {
    'n_estimators': [100, 200],       # number of boosting rounds
    'max_depth': [None, 20],          # maximum tree depth (None = library default)
    'min_child_weight': [1, 4],       # stands in for min_samples_leaf
    'colsample_bytree': [0.75, 1.0],  # stands in for max_features
}
ans3 = my_GridSearch(model, train, y, param_grid)
## Previously recorded best, from a run that used the scikit-learn style names
## above (stale -- re-run this cell to refresh):
## max_depth                20.0
## max_features              1.0
## min_samples_leaf            4
## min_samples_split          10
## n_estimators              200
## score               -0.000357
## RMSLE                0.018891
#%%

In [None]:
## Plug in the hyperparameters found by the earlier grid search.
model = GradientBoostingRegressor(
    learning_rate=0.2,
    max_depth=10,
    min_samples_leaf=5,
    min_samples_split=10,
    n_estimators=50,
)
model.fit(train, y)
# Predictions come out in log1p scale; expm1 maps them back to prices.
y_pred1 = np.expm1(model.predict(test))
#%%
model = LGBMRegressor(
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
    num_leaves=31,
)
model.fit(train, y)
# Predictions come out in log1p scale; expm1 maps them back to prices.
y_pred2 = np.expm1(model.predict(test))
#%%
# NOTE(review): `gamma=10` was never part of the XGBoost parameter grid, so
# this value is untuned. It is a minimum loss reduction per split, not a sample
# count, and may be far too strict for a log-scaled target -- confirm.
model = XGBRegressor(
    max_depth=20,
    colsample_bytree=1.0,
    min_child_weight=4,
    gamma=10,
    n_estimators=200
)
# Fit and predict once; the original repeated this fit/predict/expm1 sequence a
# second time on the same model and data.
model.fit(train, y)
# Predictions come out in log1p scale; expm1 maps them back to prices.
y_pred3 = np.expm1(model.predict(test))

In [None]:
#%%
from sklearn.linear_model import Ridge  # linear model with L2 regularisation

# Stacking: base models are fitted on the training split and a Ridge
# meta-model is fitted on their predictions for the validation split.

# 1) train / validation split
X_train, X_val, y_train, y_val = train_test_split(
    train, y, test_size=0.2, random_state=2025
)

# 2) base models
model1 = GradientBoostingRegressor(
    learning_rate=0.2,
    max_depth=10,
    min_samples_leaf=5,
    min_samples_split=10,
    n_estimators=50,
    random_state=2025,
)
model2 = LGBMRegressor(
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
    num_leaves=31,
    random_state=2025,
)
# NOTE(review): gamma=10 was not covered by the grid search -- confirm it is
# intended.
model3 = XGBRegressor(
    max_depth=20,
    colsample_bytree=1.0,
    min_child_weight=4,
    gamma=10,
    n_estimators=200,
    random_state=2025,
)

models = [model1, model2, model3]

# Fit every base model on the training split only.
for m in models:
    m.fit(X_train, y_train)

# Base-model predictions on the validation split, then on the test set.
val_pred1, val_pred2, val_pred3 = [est.predict(X_val) for est in models]
test_pred1, test_pred2, test_pred3 = [est.predict(test) for est in models]

# 3) stacked feature matrices: one column per base model
meta_X_train = np.column_stack((val_pred1, val_pred2, val_pred3))
meta_X_test = np.column_stack((test_pred1, test_pred2, test_pred3))

# 4) meta-model fitted on the validation-split predictions
meta_model = Ridge()
meta_model.fit(meta_X_train, y_val)

# 5) final prediction: log scale -> original scale
meta_y_test_pred = meta_model.predict(meta_X_test)
final_prediction = np.expm1(meta_y_test_pred)

# The final prediction is held in `final_prediction`.

# Directory that holds the sample submission and receives the output file.
data_dir = 'C:/Users/양자/Desktop/Hun_Works/practices/kagglekr_pricing/data'

# Fill the sample submission's `price` column with the stacked predictions.
submission_path = join(data_dir, 'sample_submission.csv')
submission = pd.read_csv(submission_path)
submission['price'] = final_prediction

submission_csv_path = '{}/submission.csv'.format(data_dir)
submission.to_csv(submission_csv_path, index=False)
print(submission_csv_path)

# Resulting score: 139314 -- it did not go well.