In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import xgboost
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# import feature_pp as fp

# Load the regression dataset and keep only its first 1000 rows.
df = pd.read_csv(
    "/content/drive/MyDrive/1조(semi)_데이터분석엔지니어24회차/데이터/regression_df.csv"
).head(1000)

# Separate the feature columns from the regression target.
def get_xy(df):
    """Split ``df`` into a feature frame and the ``rent_adjusted`` target.

    Args:
        df: DataFrame containing a ``rent_adjusted`` column.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a copy of ``df`` without the
        ``rent_adjusted`` column and ``y`` is that column as a Series.

    Raises:
        KeyError: If ``df`` has no ``rent_adjusted`` column.
    """
    target = "rent_adjusted"
    features = df.drop(columns=[target])
    return features, df[target]


# Train/test split
def data_split(X, y):
    """Split features/target into train and test sets and one-hot encode them.

    The split is stratified on ``service_type`` with ``random_state=42``.
    Categorical columns are one-hot encoded with ``pd.get_dummies``; the test
    frame is then aligned to the training columns so both frames expose the
    same features in the same order.

    Args:
        X: Feature DataFrame (expected to contain ``service_type``).
        y: Target values aligned with ``X``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` with one-hot encoded features.
    """
    # Stratify on the frame actually being split. Fall back to the
    # module-level ``df`` only when ``X`` does not carry the column, which
    # preserves the original behaviour for such callers.
    if "service_type" in getattr(X, "columns", ()):
        strata = X["service_type"]
    else:
        strata = df["service_type"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=strata, random_state=42
    )

    X_train = pd.get_dummies(X_train)
    # Encoding the splits independently can yield different dummy columns when
    # a category is absent from one split. Align test to train: categories
    # missing from test become all-zero columns, categories unseen in training
    # are dropped.
    X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)
    return X_train, X_test, y_train, y_test


# Polynomial feature expansion
def poly_data(X_train, X_test):
    """Expand train and test features into degree-2 polynomial features.

    The transformer is fitted on the training data only and then applied to
    the test data, so both outputs come from the same fitted transformer.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix with the same columns as ``X_train``.

    Returns:
        ``(X_poly_2_train, X_poly_2_test)``: the expanded matrices, without a
        bias column.
    """
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_poly_2_train = poly.fit_transform(X_train)
    # Only transform the test set; never re-fit a preprocessing step on it.
    X_poly_2_test = poly.transform(X_test)
    return X_poly_2_train, X_poly_2_test


# ML models with fixed (previously tuned) hyperparameters
def get_model(random_state=None):
    """Build the regressors compared in this notebook.

    The hyperparameters are hard-coded; the original author's comment
    describes them as the "best parameter" values.

    Args:
        random_state: Optional seed forwarded to the stochastic models
            (random forest, gradient boosting, XGBoost, LightGBM) so that
            results can be reproduced. The default ``None`` leaves them
            unseeded, as before.

    Returns:
        A tuple ``(lr, lr_ridge, lr_lasso, elastic, rf, gb, xgb, lgbm)`` of
        unfitted estimators.
    """
    lr = LinearRegression()
    lr_ridge = Ridge(alpha=0.001)
    lr_lasso = Lasso(alpha=0.1)
    elastic = ElasticNet(alpha=0.0000155, l1_ratio=0.005, max_iter=500)
    rf = RandomForestRegressor(
        n_estimators=621,
        min_samples_leaf=1,
        min_samples_split=5,
        random_state=random_state,
    )
    gb = GradientBoostingRegressor(
        n_estimators=954,
        learning_rate=0.09,
        subsample=0.8,
        random_state=random_state,
    )
    xgb = xgboost.XGBRegressor(
        n_jobs=-1,
        n_estimators=2000,
        colsample_bytree=0.75,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.75,
        gamma=10,
        random_state=random_state,
    )
    lgbm = LGBMRegressor(
        n_estimators=448,
        learning_rate=0.1,
        max_depth=15,
        min_child_samples=40,
        num_leaves=23,
        random_state=random_state,
    )
    return lr, lr_ridge, lr_lasso, elastic, rf, gb, xgb, lgbm


# Fit a model and score it on both splits
def model_eval(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on train and test.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        ``(train_rmse, r2_train, test_rmse, r2_test)``.
    """
    model.fit(X_train, y_train)

    def _score(features, target):
        # RMSE is the square root of the MSE; the MSE is never negative, so
        # no absolute value is needed before taking the root.
        predicted = model.predict(features)
        mse = mean_squared_error(target, predicted)
        r2 = r2_score(target, predicted)
        return np.sqrt(mse), r2

    train_rmse, r2_train = _score(X_train, y_train)
    test_rmse, r2_test = _score(X_test, y_test)
    return train_rmse, r2_train, test_rmse, r2_test


# Apply the selected scaling / feature transformation
def apply_scaling(scaling_method, X_train, X_test):
    """Transform the train/test features with the requested method.

    Args:
        scaling_method: One of ``"standard"``, ``"minmax"``, ``"robust"``
            (the matching scikit-learn scaler) or ``"poly"`` (degree-2
            polynomial expansion via ``poly_data``).
        X_train: Training features; scalers are fitted on this split.
        X_test: Test features; transformed with the fitted scaler.

    Returns:
        ``(X_train_scaled, X_test_scaled)``.

    Raises:
        ValueError: If ``scaling_method`` is not one of the supported names.
    """
    # "poly" is not a scaler; it is delegated entirely to poly_data.
    if scaling_method == "poly":
        return poly_data(X_train, X_test)

    if scaling_method not in ("standard", "minmax", "robust"):
        raise ValueError("Unsupported scaling method")

    scaler_classes = {
        "standard": StandardScaler,
        "minmax": MinMaxScaler,
        "robust": RobustScaler,
    }
    scaler = scaler_classes[scaling_method]()

    # Fit on the training split only, then reuse the fitted scaler on test.
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)
    return train_scaled, test_scaled


def get_model_result_df(scaling_method):
    """Train and evaluate every model, then tabulate the scores.

    Uses the module-level ``df``: splits it into features/target, performs the
    train/test split, applies ``scaling_method`` and evaluates each model
    returned by ``get_model``.

    Args:
        scaling_method: Name understood by ``apply_scaling``
            (``"standard"``, ``"minmax"``, ``"robust"`` or ``"poly"``).

    Returns:
        A DataFrame with one row per model and the columns ``Model``,
        ``Train RMSE``, ``Train R^2``, ``Test RMSE`` and ``Test R^2``. The
        same table is also printed.

    Raises:
        ValueError: If ``scaling_method`` is not supported.
    """
    X, y = get_xy(df)

    # Train/test split
    X_train, X_test, y_train, y_test = data_split(X, y)

    # Scaling / feature transformation
    X_train_scaled, X_test_scaled = apply_scaling(scaling_method, X_train, X_test)

    # Display names, in the same order as the estimators from get_model().
    model_names = [
        "Linear Regression",
        "Ridge Regression",
        "Lasso Regression",
        "ElasticNet",
        "Random Forest",
        "Gradient Boosting",
        "XGBoost",
        "LightGBM",
    ]

    # Each row: (name, train_rmse, r2_train, test_rmse, r2_test)
    results = [
        (name, *model_eval(model, X_train_scaled, X_test_scaled, y_train, y_test))
        for name, model in zip(model_names, get_model())
    ]

    results_df = pd.DataFrame(
        results, columns=["Model", "Train RMSE", "Train R^2", "Test RMSE", "Test R^2"]
    )
    print(results_df)
    # Return the table as well so callers can use the scores programmatically.
    return results_df


# Evaluate all models on standard-scaled features and print the score table.
get_model_result_df("standard")


  model = cd_fast.enet_coordinate_descent(


               Model  Train RMSE  Train R^2  Test RMSE  Test R^2
0  Linear Regression   10.058940   0.614318  10.996622  0.506231
1   Ridge Regression   10.032291   0.616359  10.796114  0.524073
2   Lasso Regression   10.415383   0.586501  10.349157  0.562664
3         ElasticNet   10.068659   0.613573  10.853918  0.518963
4      Random Forest    4.262460   0.930746   9.412568  0.638239
5  Gradient Boosting    1.108863   0.995313   9.372124  0.641341
6            XGBoost    1.006409   0.996139   9.326375  0.644834
7           LightGBM    1.946590   0.985556  10.446396  0.554407


# 모델 파라미터 서치