In [94]:
import pandas as pd
import numpy as np

# 載入訓練資料與標籤
X_train = pd.read_csv('./data/X_train.csv')
y_train = pd.read_csv('./data/y_train.csv')

In [95]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

features = [
    "鄉鎮市區",
    "交易標的",
    "路名",
    "土地移轉總面積平方公尺",
    "都市土地使用分區",
    "土地數",
    "建物數",
    "車位數",
    "移轉層次",
    "移轉層次項目",
    "總樓層數",
    "建物型態",
    "主要用途",
    "主要建材",
    "建築完成年月",
    "建物移轉總面積平方公尺",
    "建物現況格局-房",
    "建物現況格局-廳",
    "建物現況格局-衛",
    "建物現況格局-隔間",
    "有無管理組織",
    "交易年",
    "交易日",
    "交易月",
    "地鐵站",
    "超商",
    "公園",
    "托兒所",
    "國小",
    "國中",
    "高中職",
    "大學",
    "金融機構",
    "醫院",
    "大賣場",
    "超市",
    "百貨公司",
    "警察局",
    "消防局",
    "縱坐標",
    "橫坐標",
]
target = "單價元平方公尺"

numeric_features = [
    "土地移轉總面積平方公尺",
    "土地數",
    "建物數",
    "車位數",
    "移轉層次",
    "總樓層數",
    "建物移轉總面積平方公尺",
    "建物現況格局-房",
    "建物現況格局-廳",
    "建物現況格局-衛",
    "交易年",
    "交易日",
    "交易月",
    "地鐵站",
    "超商",
    "公園",
    "托兒所",
    "國小",
    "國中",
    "高中職",
    "大學",
    "金融機構",
    "醫院",
    "大賣場",
    "超市",
    "百貨公司",
    "警察局",
    "消防局",
    "縱坐標",
    "橫坐標",
]
categorical_features = [i for i in features if i not in numeric_features]

In [96]:
import datetime


def generate_time(df):
    cols = ["交易年", "交易月", "交易日"]
    df["time"] = df[cols].apply(
        lambda row: (
            datetime.datetime(year=int(row[0]), month=int(row[1]), day=int(row[2]))
            - datetime.datetime(2012, 1, 1)
        ).total_seconds()
        // 86400,
        axis=1,
    )

    return df


X_train = generate_time(X_train)


numeric_features.remove("交易年")
numeric_features.remove("交易月")
numeric_features.remove("交易日")
numeric_features.append("time")

  datetime.datetime(year=int(row[0]), month=int(row[1]), day=int(row[2]))


In [97]:
# set up the preprocessing steps for each type of feature

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# create the model with grid search

model = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", Ridge())])

# set up the hyperparameter grid
param_grid = {
    "regressor__alpha": [0.1, 1, 10],
    "regressor__fit_intercept": [True, False],
}

# set up the grid search
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)

# normalize the target

y_train[target] = np.log(y_train[target])

# normalize the features

X_train[numeric_features] = StandardScaler().fit_transform(X_train[numeric_features])

# set the target and features for the model
y_train = y_train[target]
X_train = X_train[numeric_features + categorical_features]

# train the model

grid_search.fit(X_train, y_train)

In [98]:
# save the model
import joblib

model = grid_search.best_estimator_
joblib.dump(model, "model.pkl")

# run inference
import uuid

X_test = pd.read_csv("./data/X_test.csv")

X_test = generate_time(X_test)

X_test[numeric_features] = StandardScaler().fit_transform(X_test[numeric_features])

y_pred = model.predict(X_test)

# denormalize the target
y_pred = np.exp(y_pred)

# save the prediction with ID
y_pred_df = pd.DataFrame(y_pred, columns=[target])
y_pred_df.index.name = "ID"
y_pred_df.to_csv(f"y_pred-nyr-{uuid.uuid4()}.csv")

  datetime.datetime(year=int(row[0]), month=int(row[1]), day=int(row[2]))


In [99]:
# evaluate the model
from sklearn.metrics import mean_squared_error

y_train_pred = model.predict(X_train)
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
print(f"Train MSE: {train_rmse}")

Train MSE: 0.20054131358590285


