In [17]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
#
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
#
import warnings
warnings.filterwarnings("ignore")
# One colour per sampling trial, keyed by a 1-based label ("Trial 1" .. "Trial 10").
# zip() stops at the shorter input, so the number of entries is also capped by the
# length of the seaborn palette.
color_map = {f"Trial {k}": v for k, v in zip(range(1, 11), sns.color_palette())}

In [2]:
# Load the preprocessed AMES training table. The first CSV column is dropped —
# presumably a row index written when the file was saved (TODO confirm).
data = pd.read_csv("./Data/Prep_AMES/sig_train.csv").iloc[:, 1:]
print(data.shape)
data.head()

(1458, 35)


Unnamed: 0,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,BsmtQual,TotalBsmtSF,HeatingQC,...,Exterior2nd_VinylSd,MasVnrType_None,MasVnrType_Stone,Foundation_CBlock,Foundation_PConc,GarageType_Attchd,GarageType_Detchd,SaleType_New,SaleCondition_Partial,Y
0,5.831328,19.212182,2.440268,14.187527,14.187527,8.059126,1.194318,1.194318,11.692623,0.0,...,1,0,0,0,1,1,0,0,0,208500
1,6.221214,19.712205,2.259674,14.145138,14.145138,0.0,1.540963,1.194318,12.792276,0.0,...,0,1,0,1,0,1,0,0,0,181500
2,5.91494,20.347241,2.440268,14.184404,14.185966,7.646538,1.194318,1.194318,11.892039,0.0,...,1,0,0,0,1,1,0,0,0,223500
3,5.684507,19.691553,2.440268,14.047529,14.135652,0.0,1.540963,1.820334,11.354094,1.194318,...,0,1,0,0,0,0,1,0,0,140000
4,6.314735,21.32516,2.602594,14.182841,14.182841,9.391827,1.194318,1.194318,12.510588,0.0,...,1,0,0,0,1,1,0,0,0,250000


In [3]:
# Every column except the last one is treated as a feature; "Y" is the target.
ftrs = list(data.columns[:-1])
# Hold out 20% of the rows for validation (fixed seed so the split is repeatable).
x_train, x_val, y_train, y_val = train_test_split(
    data[ftrs],
    data["Y"],
    test_size=0.2,
    random_state=42,
)
# Re-attach the target to each split and renumber the rows from 0.
train = pd.concat([x_train, y_train], axis=1).reset_index(drop=True)
valid = pd.concat([x_val, y_val], axis=1).reset_index(drop=True)
# Report the resulting shapes.
print(f"Train: {train.shape}")
print(f"Valid: {valid.shape}")

Train: (1166, 35)
Valid: (292, 35)


# 1. Feature Selection

In [42]:
# Select n features with RFE
def select_n_ftrs(model, n, data=train, ftrs=ftrs, target="Y"):
    """Return the names of the ``n`` features kept by recursive feature elimination.

    Parameters
    ----------
    model : estimator handed to ``RFE`` as the ranking model.
    n : number of features to keep.
    data : frame holding both the feature columns and the target column.
    ftrs : candidate feature names; the result preserves their order.
    target : name of the target column in ``data``.

    Returns
    -------
    list
        The entries of ``ftrs`` whose ``support_`` flag is true.
    """
    # One feature is eliminated per iteration (step=1).
    rfe = RFE(model, n_features_to_select=n, step=1).fit(data[ftrs], data[target])
    return [name for name, keep in zip(ftrs, rfe.support_) if keep]

In [44]:
# Model training
class ModelTrain:
    """Fit-and-predict helpers applied to a sampled training subset.

    Both helpers evaluate against validation data. When no validation data is
    passed explicitly they fall back to the notebook-level ``x_val`` / ``y_val``.
    """

    @staticmethod
    def statics(model, sample_data, selected_ftrs, target="Y", x_valid=None):
        """Fit ``model`` on the selected features and predict the validation rows.

        Parameters
        ----------
        model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        sample_data : frame containing ``selected_ftrs`` and ``target``.
        selected_ftrs : feature columns used for both fitting and predicting.
        target : name of the target column in ``sample_data``.
        x_valid : validation features; defaults to the notebook-level ``x_val``.

        Returns
        -------
        Whatever ``model.predict`` returns for the validation rows.
        """
        if x_valid is None:
            x_valid = x_val
        model.fit(sample_data[selected_ftrs], sample_data[target])
        # Predict on the same columns the model was fitted on; passing the full
        # validation frame would give the model a different number of features.
        return model.predict(x_valid[selected_ftrs])

    @staticmethod
    def kmeans(model, sample_data, selected_ftrs, target="Y",
               x_valid=None, y_valid=None, ratio_ftr="GrLivArea"):
        """Pair every validation row with the training rows of its cluster.

        ``model`` is fitted on ``sample_data[selected_ftrs]``; each validation
        row is assigned to a cluster and inner-joined with all training rows
        carrying the same cluster label. For every resulting pair the cosine
        distance/similarity of the two feature vectors and the ratio of
        ``ratio_ftr`` between the two rows are computed.

        Parameters
        ----------
        model : clusterer exposing ``fit(X)``, ``labels_`` and ``predict(X)``.
        sample_data : frame containing ``selected_ftrs``, ``target`` and ``ratio_ftr``.
        selected_ftrs : feature columns used for clustering and cosine measures.
        target : name of the target column in ``sample_data``.
        x_valid, y_valid : validation features / targets; default to the
            notebook-level ``x_val`` / ``y_val``.
        ratio_ftr : column whose validation/training ratio is reported. It is
            read from the full frames, so it need not be in ``selected_ftrs``.

        Returns
        -------
        pandas.DataFrame
            Columns ``Pred_Y`` (target of the matched training row), ``Act_Y``
            (validation target), ``CosDist``, ``CosSim``, ``"<ratio_ftr> P/T"``
            and ``"<ratio_ftr> T/P"``.

        Raises
        ------
        KeyError
            If a selected feature, ``target`` or ``ratio_ftr`` is missing.
        """
        if x_valid is None:
            x_valid = x_val
        if y_valid is None:
            y_valid = y_val
        selected_ftrs = list(selected_ftrs)
        train_cols = [f"{name}_t" for name in selected_ftrs]
        pred_cols = [f"{name}_p" for name in selected_ftrs]
        # Helper columns carrying the ratio feature through the merge.
        ratio_t, ratio_p = "__ratio_t", "__ratio_p"

        # Restrict both sides to the selected features so fit and predict see
        # the same columns and the renamed frames have the expected width.
        train_x = sample_data[selected_ftrs].reset_index(drop=True)
        valid_x = x_valid[selected_ftrs].reset_index(drop=True)
        model.fit(train_x)

        train_result = train_x.copy()
        train_result.columns = train_cols
        train_result["Pred_Y"] = sample_data[target].to_numpy()
        train_result[ratio_t] = sample_data[ratio_ftr].to_numpy()
        train_result["Train_Cluster"] = model.labels_

        pred_result = valid_x.copy()
        pred_result.columns = pred_cols
        pred_result["Act_Y"] = np.asarray(y_valid)
        pred_result[ratio_p] = x_valid[ratio_ftr].to_numpy()
        pred_result["Pred_Cluster"] = model.predict(valid_x)
        #
        pred_train = pd.merge(
            left=pred_result, right=train_result,
            left_on="Pred_Cluster", right_on="Train_Cluster"
        )
        # Cosine distance & similarity, computed row-wise in one vectorised pass.
        t_mat = pred_train[train_cols].to_numpy(dtype=float)
        p_mat = pred_train[pred_cols].to_numpy(dtype=float)
        dots = np.einsum("ij,ij->i", t_mat, p_mat)
        norms = np.linalg.norm(t_mat, axis=1) * np.linalg.norm(p_mat, axis=1)
        # A zero-length vector yields similarity 0 instead of a division by zero.
        cos_sim = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)
        pred_train["CosSim"] = cos_sim
        pred_train["CosDist"] = np.clip(1.0 - cos_sim, 0.0, 2.0)

        pt_col, tp_col = f"{ratio_ftr} P/T", f"{ratio_ftr} T/P"
        pred_train[pt_col] = pred_train[ratio_p] / pred_train[ratio_t]
        pred_train[tp_col] = pred_train[ratio_t] / pred_train[ratio_p]
        return pred_train[["Pred_Y", "Act_Y", "CosDist", "CosSim", pt_col, tp_col]]
    

In [None]:
# Train on n-row samples, repeated data_n times
data_n = 10
def train_n(n, sn, data=train, ftrs=ftrs, target="Y", random_state=None):
    """Run ``data_n`` sampling trials and collect predictions and selected features.

    For each trial, ``n`` rows are drawn from ``data``; ``sn`` features are
    selected per model with RFE on that sample; a linear regression, a decision
    tree and a random forest are fitted on the sample via ``ModelTrain.statics``;
    and a KMeans model is run via ``ModelTrain.kmeans`` on up to ``sn`` features
    taken from the union of the three selections. Finally the target
    distribution of every sample is plotted.

    Parameters
    ----------
    n : number of rows drawn per trial (before exact-duplicate rows are dropped).
    sn : number of features to select per model.
    data : frame to sample from; must contain ``ftrs`` and ``target``.
    ftrs : candidate feature names.
    target : name of the target column.
    random_state : optional integer seed. Trial ``i`` uses ``random_state + i``
        for both sampling and KMeans; ``None`` keeps the runs unseeded.

    Returns
    -------
    dict
        ``"SampleData"`` (list of sampled frames), ``"LR"`` / ``"DT"`` / ``"RF"``
        (each ``{"P": predictions per trial, "F": selected features per trial}``)
        and ``"Kmeans"`` (list of ``ModelTrain.kmeans`` results).
    """
    sample_data, pred_lrs, pred_dts, pred_rfs, pred_kmeans = [], [], [], [], []
    sftrs_lrs, sftrs_dts, sftrs_rfs = [], [], []
    for trial in range(data_n):
        seed = None if random_state is None else random_state + trial
        sample = data.sample(n, random_state=seed).drop_duplicates()
        sample_data.append(sample)
        print(sample.shape[0], end=" ", sep=" ")
        #
        lr = LinearRegression()
        dt = DecisionTreeRegressor(random_state=42)
        rf = RandomForestRegressor(random_state=42)
        # drop_duplicates can leave fewer than n rows, and KMeans cannot use
        # more clusters than there are rows.
        km = KMeans(n_clusters=min(n, len(sample)), init="k-means++", random_state=seed)
        # Select features on this trial's sample with the caller's ftrs/target,
        # rather than on select_n_ftrs' defaults, so each trial's selection
        # reflects the data its models are trained on.
        sftr_lr = select_n_ftrs(lr, sn, data=sample, ftrs=ftrs, target=target)
        sftr_dt = select_n_ftrs(dt, sn, data=sample, ftrs=ftrs, target=target)
        sftr_rf = select_n_ftrs(rf, sn, data=sample, ftrs=ftrs, target=target)
        #
        pred_lrs.append(ModelTrain.statics(lr, sample, sftr_lr, target))
        pred_dts.append(ModelTrain.statics(dt, sample, sftr_dt, target))
        pred_rfs.append(ModelTrain.statics(rf, sample, sftr_rf, target))
        # Order-preserving de-duplication: a set would pick the first sn
        # features in hash order, which differs between kernel sessions.
        kftr = list(dict.fromkeys(sftr_lr + sftr_dt + sftr_rf))[:sn]
        pred_kmeans.append(ModelTrain.kmeans(km, sample, kftr))
        #
        sftrs_lrs.append(sftr_lr)
        sftrs_dts.append(sftr_dt)
        sftrs_rfs.append(sftr_rf)
    # Visualise the sampled target distributions. Colours are taken by position
    # (cycling if there are more trials than colours) so the plot does not
    # depend on how color_map's keys are numbered.
    palette = list(color_map.values())
    for trial, sample in enumerate(sample_data):
        sns.kdeplot(
            sample[target], fill=True,
            color=palette[trial % len(palette)], label=f"Trial {trial + 1}"
        )
    plt.legend()
    plt.title(f"Sampling {n}")
    plt.show()

    return {
        "SampleData":sample_data,
        "LR":{"P":pred_lrs, "F":sftrs_lrs},
        "DT":{"P":pred_dts, "F":sftrs_dts},
        "RF":{"P":pred_rfs, "F":sftrs_rfs},
        "Kmeans":pred_kmeans
    }