In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
%%time
# Read Data
# Load the version-21 training table from CSV. `basedir` is reused by later
# cells to read the categorical-column list and to save the trained models.
basedir = './Training/training_data_ver_21/'
training_data_df = pd.read_csv(basedir + 'training_data_ver_21.csv')
# Sanity check: print (rows, columns), then preview the first rows.
print(training_data_df.shape)
training_data_df.head()

In [None]:
# Count the missing values in each column.
training_data_df.isna().sum()

In [None]:
# Histogram (100 bins) of the '가격_면적' column, which the training loops
# further down use as the regression label.
training_data_df['가격_면적'].hist(bins=100)

In [None]:
def price_over_area_feature_engi(x, bin_width=15, max_value=3000):
    """Bucket a numeric value into a fixed-width range label.

    Parameters
    ----------
    x : number
        Value to bucket. Missing values (NaN/None/NA) are returned unchanged.
    bin_width : int, default 15
        Width of each bucket.
    max_value : int, default 3000
        Values greater than or equal to this go into the overflow bucket.
        Expected to be a multiple of ``bin_width`` so that the last regular
        bucket ends exactly at ``max_value``.

    Returns
    -------
    str or the original missing value
        ``'<low>~<high>'`` for the half-open bucket ``[low, high)`` that
        contains ``x``, or ``'<max_value>초과'`` for the overflow bucket.
        Values below ``bin_width`` (including negatives) fall into the first
        bucket ``'0~<bin_width>'``.
    """
    if pd.isna(x):
        return x

    # Overflow bucket. The boundary is max_value itself so that the label
    # ('3000초과') agrees with the threshold actually applied.
    if x >= max_value:
        return '{}초과'.format(max_value)

    # Everything below the first upper edge (negatives included) belongs to
    # bucket 0; otherwise floor division gives the bucket index directly.
    if x < bin_width:
        bucket = 0
    else:
        bucket = int(x // bin_width)

    return '{}~{}'.format(bucket * bin_width, (bucket + 1) * bin_width)

In [None]:
# Bucket every '가격_면적' value into its range label and store the labels
# in a new '가격_면적_classes' column.
training_data_df['가격_면적_classes'] = training_data_df['가격_면적'].apply(price_over_area_feature_engi)

In [None]:
# Integer-encode the range labels: each distinct label receives a code in
# order of first appearance, the column is replaced by those codes, and the
# label -> code table is written to disk for later decoding.
class_labels = training_data_df['가격_면적_classes'].unique()
col_map_df = pd.DataFrame({
    'cat': class_labels,
    'map': list(range(len(class_labels))),
})
col_map = dict(zip(col_map_df['cat'], col_map_df['map']))
training_data_df['가격_면적_classes'] = training_data_df['가격_면적_classes'].map(col_map)
col_map_df.to_csv('./Training/training_data_ver_21/feature_maps/가격_면적_classes_map.csv')

In [None]:
# Preview the frame after the class column has been added and encoded.
training_data_df.head()

In [None]:
# Number of distinct (non-missing) class codes present in the data.
training_data_df['가격_면적_classes'].nunique()

In [None]:
# Row count per class code, most frequent first.
training_data_df['가격_면적_classes'].value_counts()

In [None]:
# Columns removed from the feature matrices in the training loops. The list
# includes the regression label '가격_면적' and its bucketed form
# '가격_면적_classes', so neither is fed to the model as a feature.
cols_to_drop = ['지번주소', '도로명주소', '건물(단지)명', '전용면적(㎡)', '거래금액(만원)', '계약날짜', '가격_면적', '년', '가격_면적_classes']

# Training

In [None]:
from sklearn.model_selection import KFold, GroupKFold, GroupShuffleSplit

In [None]:
import lightgbm as lgb

In [None]:
from sklearn.metrics import r2_score

In [None]:
# 5-fold splitter that keeps all rows of a group in the same fold; the
# training loops pass the '지번주소' column as the grouping key.
gkf = GroupKFold(n_splits=5)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Name of the bucketed class column.
# NOTE(review): no cell visible in this notebook reads `target_cols`; the
# training loops use '가격_면적' as the label directly — confirm whether this
# is still needed.
target_cols = ['가격_면적_classes']

In [None]:
# Load the table of categorical column names stored next to the training
# data, and display it.
cat_cols_df = pd.read_csv(basedir + 'cat_cols.csv')
cat_cols_df

In [None]:
# Extract the 'colname' column as a plain Python list; it is later passed to
# LightGBM as `categorical_feature`.
cat_cols = cat_cols_df['colname'].tolist()
cat_cols

In [None]:
# '가격_면적_classes' is listed in `cols_to_drop`, so it never reaches the
# feature matrix and must not be declared as a categorical feature.
# Filtering (rather than list.remove) keeps this cell safe to re-run: it does
# not raise when the name has already been removed.
cat_cols = [col for col in cat_cols if col != '가격_면적_classes']

In [None]:
# Grouped 5-fold cross-validation of a LightGBM L2-regression model on
# '가격_면적'. For every fold: train with early stopping, save the model under
# `basedir`, and store the validation predictions in `oof` (out-of-fold).
count = 0
oof = np.zeros(training_data_df.shape[0])

# Hyperparameters and the round budget are the same for every fold.
params = {
    'objective': 'regression',
    #'num_class':200,
    #'alpha': 1.35,
    'learning_rate': 0.01,
    'seed': 42,
    'max_depth': 6,
    'num_leaves': 51,
    'min_data_in_leaf': 256,
    #'bagging_fraction':0.8,
    'feature_fraction':0.7,
    #'lambda_l2': 0,
    'metric': 'rmse',
    'num_threads': 6,
    #'is_unbalance': True
}
num_rounds = 50000

for train_idx, val_idx in gkf.split(training_data_df, groups=training_data_df['지번주소']):
    print('Fold {} start'.format(count))

    # split() yields positional indices, so select with .iloc; .loc would only
    # be correct while the frame still has its default RangeIndex.
    # The shuffle is seeded so that a re-run reproduces the same row order.
    train_data = (
        training_data_df.iloc[train_idx]
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )
    val_data = training_data_df.iloc[val_idx]

    x_train = train_data.drop(columns=cols_to_drop)
    y_train = train_data['가격_면적']

    print(x_train.shape)

    x_val = val_data.drop(columns=cols_to_drop)
    y_val = val_data['가격_면적']

    print('y dist plot')
    y_val.hist(bins=100)
    plt.show()
    # Close instead of clear so figures do not accumulate across folds.
    plt.close('all')

    train_dataset = lgb.Dataset(x_train, label=y_train, categorical_feature=cat_cols)
    val_dataset = lgb.Dataset(x_val, label=y_val, categorical_feature=cat_cols)

    # The early_stopping_rounds / verbose_eval keyword arguments were removed
    # from lgb.train in LightGBM 4.0; callbacks are the supported equivalent.
    bst = lgb.train(
        params,
        train_dataset,
        num_rounds,
        valid_sets=[train_dataset, val_dataset],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=50),
        ],
    )

    bst.save_model(basedir + 'lightgbm_regression_{}.txt'.format(count), num_iteration=bst.best_iteration)

    preds = bst.predict(x_val, num_iteration=bst.best_iteration)

    print('preds dist plot')
    # displot is figure-level and opens its own figure, hence close('all').
    sns.displot(preds)
    plt.show()
    plt.close('all')

    # val_idx is positional, matching the layout of the `oof` array.
    oof[val_idx] = preds

    # Fold-level RMSE.
    print(np.sqrt(mean_squared_error(y_val, preds)))

    print('Fold {} end'.format(count))
    count += 1
    

In [None]:
# Overall RMSE of the out-of-fold predictions against the true '가격_면적'.
np.sqrt(mean_squared_error(training_data_df['가격_면적'], oof))

In [None]:
# Grouped 5-fold cross-validation of a LightGBM model with the Huber
# objective on '가격_면적'. For every fold: train with early stopping, save the
# model under `basedir`, and store the validation predictions in `oof`
# (out-of-fold). `oof` and `count` are reset here, overwriting any values
# left by an earlier run.
count = 0
oof = np.zeros(training_data_df.shape[0])

# Hyperparameters and the round budget are the same for every fold.
# NOTE(review): learning_rate is 1 here versus 0.01 in the L2-regression run —
# confirm this is intentional.
params = {
    'objective': 'huber',
    #'num_class':200,
    'alpha': 1.35,
    'learning_rate': 1,
    'seed': 42,
    #'max_depth': 7,
    #'num_leaves': 41,
    #'min_data_in_leaf': 64,
    #'bagging_fraction':0.7,
    #'feature_fraction':0.7,
    #'lambda_l2': 0,
    'metric': 'rmse',
    'num_threads': 6,
    #'is_unbalance': True
}
num_rounds = 50000

for train_idx, val_idx in gkf.split(training_data_df, groups=training_data_df['지번주소']):
    print('Fold {} start'.format(count))

    # split() yields positional indices, so select with .iloc; .loc would only
    # be correct while the frame still has its default RangeIndex.
    # The shuffle is seeded so that a re-run reproduces the same row order.
    train_data = (
        training_data_df.iloc[train_idx]
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )
    val_data = training_data_df.iloc[val_idx]

    x_train = train_data.drop(columns=cols_to_drop)
    y_train = train_data['가격_면적']

    print(x_train.shape)

    x_val = val_data.drop(columns=cols_to_drop)
    y_val = val_data['가격_면적']

    print('y dist plot')
    y_val.hist(bins=100)
    plt.show()
    # Close instead of clear so figures do not accumulate across folds.
    plt.close('all')

    train_dataset = lgb.Dataset(x_train, label=y_train, categorical_feature=cat_cols)
    val_dataset = lgb.Dataset(x_val, label=y_val, categorical_feature=cat_cols)

    # The early_stopping_rounds / verbose_eval keyword arguments were removed
    # from lgb.train in LightGBM 4.0; callbacks are the supported equivalent.
    bst = lgb.train(
        params,
        train_dataset,
        num_rounds,
        valid_sets=[train_dataset, val_dataset],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=50),
        ],
    )

    bst.save_model(basedir + 'lightgbm_huber_{}.txt'.format(count), num_iteration=bst.best_iteration)

    preds = bst.predict(x_val, num_iteration=bst.best_iteration)

    print('preds dist plot')
    # displot is figure-level and opens its own figure, hence close('all').
    sns.displot(preds)
    plt.show()
    plt.close('all')

    # val_idx is positional, matching the layout of the `oof` array.
    oof[val_idx] = preds

    # Fold-level RMSE.
    print(np.sqrt(mean_squared_error(y_val, preds)))

    print('Fold {} end'.format(count))
    count += 1
    

In [None]:
# Overall RMSE of the out-of-fold predictions against the true '가격_면적'.
np.sqrt(mean_squared_error(training_data_df['가격_면적'], oof))