In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
import os
#import xgboost as xgb
#import lightgbm as lgb
from functools import reduce
import warnings
from sklearn.metrics import mean_squared_error
from scipy import stats
from math import sqrt
from lightgbm import plot_tree
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import STATUS_OK
from hyperopt.pyll.stochastic import sample
import gc
from sklearn.linear_model import LinearRegression
warnings.filterwarnings('ignore')


In [78]:
train = pd.read_csv('dataset-0510/train.csv')
test  = pd.read_csv('dataset-0510/test.csv')
data = pd.concat([train, test], axis=0)
gc.collect()

2552

In [98]:
def feature_process(df):
    
    #target_encoding
    temp_train = df[:60000]
    combine_rows = ['city', 'town', 'building_type']
    target_df = temp_train.groupby(combine_rows).agg({'building_area' : ['mean', 'median'], 'land_area' : ['mean', 'median'], 'total_price' : ['mean', 'median']}).reset_index()
    target_df.columns = [i[0] + '_' + i[1]  if i[1] != '' else i[0] for i in target_df.columns.tolist()]
    target_df['total_price_median'] = np.log1p(target_df['total_price_median'])
    target_df['total_price_mean'] = np.log1p(target_df['total_price_mean'])
    
    target_df['price_land_rate_median'] = target_df['total_price_median'] / target_df['land_area_median']
    target_df['price_building_rate_median'] = target_df['total_price_median'] / target_df['building_area_median']
    target_df['price_land_rate_mean'] = target_df['total_price_mean'] / target_df['land_area_mean']
    target_df['price_building_rate_mean'] = target_df['total_price_mean'] / target_df['building_area_mean']
    
    combine_cols = combine_rows + ['price_land_rate_median', 'price_building_rate_median', 'total_price_median']
    df = pd.merge(df, target_df[combine_cols], on = combine_rows, how='left')
    
    df['min_cat'] = 0
    for col in [i for i in df.columns if 'MIN' in i]:
        df['min_cat'] = df.apply(lambda x : col if x['min_cat'] == x[col] else x['min_cat'], axis=1)  
    
    #10, 50, 100, 250, 500, 1000, 5000, 10000
    for num in ['_10', '_50', '_100', '_250', '_500', '_1000', '_5000', '_10000']:
        df['Mean_' + num] = df[[i for i in df.columns if i.endswith(num) and 'index' not in i and 'N' not in i]].apply(lambda x : x.mean(), axis=1)
        df['Median_' + num] = df[[i for i in df.columns if i.endswith(num) and 'index' not in i and 'N' not in i]].apply(lambda x : x.median(), axis=1)
        df['Std_' + num] = df[[i for i in df.columns if i.endswith(num) and 'index' not in i and 'N' not in i]].apply(lambda x : x.std(), axis=1)
        df['Skew_' + num] = df[[i for i in df.columns if i.endswith(num) and 'index' not in i and 'N' not in i]].apply(lambda x : x.skew(), axis=1)

    
    #MIN
    MIN_cols = [i for i in df.columns if '_MIN' in i]
    df[MIN_cols].apply(lambda x : x.mean(), axis =1 )
    df[MIN_cols].apply(lambda x : x.median(), axis =1 )
    df[MIN_cols].apply(lambda x : x.std(), axis =1 )
    df[MIN_cols].apply(lambda x : x.skew(), axis =1 )
    
    rank_df = df[MIN_cols].rank(axis =1)
    rank_df = rank_df.add_prefix('Rank_')
    df = pd.concat([df, rank_df], axis=1)
    
    #parking
    df['price_area'] = 1
    df.loc[df['parking_price'].notna(), 'price_area'] = 2
    df.loc[df['parking_area'].notna(), 'price_area'] = 3
    
    df.loc[df['parking_way'] == 2, 'parking_area'] = 0
    df.loc[df['parking_way'] == 2, 'parking_price'] = 0
    
    df = pd.concat([df, pd.get_dummies(df['price_area'], prefix='price_area')], axis=1)
    #Impute missing value
    df['village_income_mean'] = df.groupby(['city', 'town'])['village_income_median'].transform(lambda x : x .fillna(x.mean()))
    
    #location

    df['location_2'] = df.apply(lambda x : int(str(x['city']) + str(x['town'])), axis=1)
    df['city'] = df['city'].astype('category')
    df['location_2'] = df['location_2'].astype('category')
    
    '''
    #degree rate
    df['diff_doc_master'] = df['doc_rate'] - df['master_rate']
    df['diff_master_bachelor'] = df['master_rate'] - df['bachelor_rate']
    df['diff_bachelor_highsch'] = df['bachelor_rate'] - df['highschool_rate']
    df['diff_highsch_jobschool'] = df['highschool_rate'] - df['jobschool_rate']
    df['diff_jobschool_elesch'] = df['jobschool_rate'] - df['elementary_rate']
    
    df['all_degree'] = df['doc_rate'] + df['master_rate'] + df['bachelor_rate'] + df['highschool_rate'] + df['jobschool_rate'] + df['junior_rate'] + df['elementary_rate']
    df['junior_above_rate'] = df['doc_rate'] + df['master_rate'] + df['bachelor_rate'] + df['highschool_rate'] + df['jobschool_rate'] + df['junior_rate'] 
    df['jobschool_above_rate'] = df['doc_rate'] + df['master_rate'] + df['bachelor_rate'] + df['highschool_rate'] +  df['jobschool_rate']
    df['highschool_above_rate'] = df['doc_rate'] + df['master_rate'] + df['bachelor_rate'] + df['highschool_rate']
    df['bachelor_above_rate'] = df['doc_rate'] + df['master_rate'] + df['bachelor_rate']
    df['master_above_rate'] = df['doc_rate'] + df['master_rate'] 
    '''
    
    #building
    df['building_use'] = df['building_use'].astype('category')
    df['building_material'] = df['building_material'].astype('category')
    
    #floor
    df.loc[df['txn_floor'].isna(), 'department'] = 1 
    df.loc[df['txn_floor'].notna(), 'department'] = 0
    df['avg_height_floor'] = df['txn_floor'] / df['total_floor'] 
    df['avg_height_floor'].fillna(0, inplace = True)
    
    #area
    df['land/bulid_area'] = df['land_area'] / df ['building_area']
    df['parking_rate'] = df['building_area'] / df['parking_area']
    
    #date
    df['day_between_txn_complete'] = df['txn_dt'] - df['building_complete_dt']
    df['year_between_txn_complete'] = round(df['day_between_txn_complete'] / 365)
    
    #soical rate
    '''
    df['natural_diff'] = df['born_rate'] - df['death_rate']
    df['natural_rate'] = df['born_rate'] / df['death_rate']
    
    df['marry_diff'] = df['marriage_rate'] - df['divorce_rate']
    df['marry_rate'] = df['marriage_rate'] / df['divorce_rate']
    
    df['total_diff_sum'] = df['natural_rate'] + df['marry_diff']
    df['total_diff_diff'] = df['natural_rate'] - df['marry_diff']
    df['total_rate_diff'] = df['born_rate'] + df['marry_rate']
    
    df['positive_grow_rate'] = df['born_rate'] + df['marriage_rate']
    df['negative_grow_rate'] = df['death_rate'] + df['divorce_rate']
    df['tatal_rate_sum'] = df['born_rate'] + df['death_rate'] + df['born_rate'] + df['death_rate']
    '''
    
    #village_income_median
    temp = df.groupby(['city', 'town', 'village'])['village_income_median'].first().reset_index()
    temp = df.groupby(['city', 'town']).agg({'village_income_median' : ['mean', 'sum', 'median']})
    temp = temp.rename({'village_income_median' : 'town_income_median'}, level = 0)
    temp.columns = [e[0] + '_' + e[1] if e[1] != '' else e[0] for e in temp.columns.tolist()]
    df = pd.merge(df, temp, on = ['city', 'town'], how = 'left')

    temp = df.groupby(['city', 'town', 'village'])['village_income_median'].first().reset_index()
    temp = df.groupby(['city']).agg({'village_income_median' : ['mean', 'sum', 'median']})
    temp = temp.rename({'village_income_median' : 'city_income_median'}, level = 0)
    temp.columns = [e[0] + '_' + e[1] if e[1] != '' else e[0] for e in temp.columns.tolist()]
    df = pd.merge(df, temp, on = ['city'], how = 'left')
    
    #Build_case
    cols = ['city', 'town', 'village', 'building_type', 'building_use', 'total_floor', 'XIV_MIN', 'building_complete_dt']
    df.loc[df[cols].duplicated(keep = False), 'Build_case'] = 0
    df.loc[~df[cols].duplicated(keep = False), 'Build_case'] = 1
    
    #interection 
    '''
    locations =['city', 'location2']
    inter_cols = ['building_type', 'building_use', 'building_material']
    for location in locations:
        for inter_col in inter_cols:
            df['inter_btw_' + location +'_' + inter_col] = df.apply(lambda x : str(x[location]) + str(x[inter_col]), axis=1)
            df['inter_btw_' + location +'_' + inter_col] = df['inter_btw_' + location +'_' + inter_col].astype('category')
    '''
    inter_cols = ['building_type', 'building_use', 'building_material', 'parking_way']
    for i in range(4):
        for j in range(4):
            if j > i:
                df['inter_btw_' + inter_cols[i] +'_' + inter_cols[j]] = df.apply(lambda x : str(x[inter_cols[i]]) + str(x[inter_cols[j]]), axis=1)
                df['inter_btw_' + inter_cols[i] +'_' + inter_cols[j]] = df['inter_btw_' + inter_cols[i] +'_' + inter_cols[j]].astype('category')
    #one-hot
    df = df.join(pd.get_dummies(df['parking_way'], prefix = 'parking_way'))
    df = df.join(pd.get_dummies(df['building_type'], prefix = 'building_type'))
    df = df.join(pd.get_dummies(df['building_material'], prefix = 'building_material'))
    df = df.join(pd.get_dummies(df['building_use'], prefix = 'building_use'))
    
    
    
    #useless cols
    df.drop(['village', 'town', 'building_type', 'parking_way','building_material', 'building_use'], axis = 1, inplace = True)
    df.drop([i for i in train.columns if np.sum(train[i]) == 60000 and 'index' in i], axis = 1, inplace = True)
    return df

In [99]:
final_data = feature_process(data)
FE_train = final_data[:60000]
FE_test = final_data[60000:]
FE_train.to_csv('FE_train.csv', index = False)
FE_test.to_csv('FE_test.csv', index = False)
len(FE_train.columns), len(FE_test.columns)

(309, 309)

In [100]:
FE_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60000 entries, 0 to 59999
Columns: 309 entries, III_10 to building_use_10
dtypes: category(7), float64(99), int64(172), object(1), uint8(30)
memory usage: 127.2+ MB


In [101]:
[i for i in FE_train.columns if FE_train[i].dtypes not in ['float64', 'int64', 'O']]

['price_area_1',
 'price_area_2',
 'price_area_3',
 'location_2',
 'inter_btw_building_type_building_use',
 'inter_btw_building_type_building_material',
 'inter_btw_building_type_parking_way',
 'inter_btw_building_use_building_material',
 'inter_btw_building_use_parking_way',
 'inter_btw_building_material_parking_way',
 'parking_way_0',
 'parking_way_1',
 'parking_way_2',
 'building_type_0',
 'building_type_1',
 'building_type_2',
 'building_type_3',
 'building_type_4',
 'building_material_1',
 'building_material_3',
 'building_material_4',
 'building_material_5',
 'building_material_7',
 'building_material_8',
 'building_material_9',
 'building_material_10',
 'building_material_11',
 'building_use_0',
 'building_use_1',
 'building_use_2',
 'building_use_3',
 'building_use_4',
 'building_use_5',
 'building_use_6',
 'building_use_7',
 'building_use_8',
 'building_use_10']

In [44]:
train['temp'] = ''
for col in [i for i in train.columns if i.endswith('index_500')]:
    train['temp'] = train['temp']+ train[col].astype(str)

In [45]:
train['temp'].value_counts()

11111111111111    17244
11101111111111    10113
11101011111111     5695
11111011111111     5180
11101011111101     2706
11101111111101     2491
11111111111101     2316
11111011111101     1558
01101011111111      532
01101011111101      506
11101011110101      498
01111111111111      335
01111011111111      308
11101111110101      297
11101011110111      292
01101111111111      280
01101111111101      235
11101111110111      233
11101011011101      229
11111011110101      211
11101011011111      210
11111111110101      189
11101011101101      181
01111011111101      177
11101011100101      128
11111111110111      123
11101011000101      122
11111011110111      120
10101011111101      117
00000000000101      108
                  ...  
11000011011111        1
10001010110111        1
00101001010101        1
01100001111101        1
00000000001100        1
01000010010001        1
10111111110111        1
00010000000100        1
11011011010111        1
01001001001111        1
11010110010101  

In [112]:
for i in ['I_10','I_50','I_100','I_250','I_500','I_1000','I_5000','I_10000']:
    train[i + '_average'] = train[i] / int(i.split('_')[1])
    

In [114]:
train['I_10_average'][:5]

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: I_10_average, dtype: float64

In [115]:
train['I_50_average'][:5]

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: I_50_average, dtype: float64

In [116]:
train['I_100_average'][:5]

0    0.01
1    0.01
2    0.01
3    0.00
4    0.00
Name: I_100_average, dtype: float64

In [117]:
train['I_250_average'][:5]

0    0.020
1    0.008
2    0.004
3    0.000
4    0.000
Name: I_250_average, dtype: float64

In [118]:
train['I_500_average'][:5]

0    0.040
1    0.010
2    0.016
3    0.000
4    0.000
Name: I_500_average, dtype: float64

In [119]:
train['I_1000_average'][:5]

0    0.059
1    0.013
2    0.039
3    0.009
4    0.001
Name: I_1000_average, dtype: float64