In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
import os
#import xgboost as xgb
#import lightgbm as lgb
from functools import reduce
import warnings
from sklearn.metrics import mean_squared_error
from scipy import stats
from math import sqrt
from lightgbm import plot_tree
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import STATUS_OK
from hyperopt.pyll.stochastic import sample
import gc
from sklearn.linear_model import LinearRegression
warnings.filterwarnings('ignore')


In [3]:
train = pd.read_csv('dataset-0510/train.csv')
test  = pd.read_csv('dataset-0510/test.csv')


In [4]:
def feature_process(df):
    
    #target_encoding
    temp_train = df[:60000]
    target_df = temp_train.groupby(['city', 'town']).agg({'building_area' : ['mean', 'median'], 'land_area' : ['mean', 'median'], 'total_price' : ['mean', 'median']}).reset_index()
    target_df.columns = [i[0] + '_' + i[1]  if i[1] != '' else i[0] for i in target_df.columns.tolist()]
    target_df['price_land_rate_median'] = target_df['total_price_median'] / target_df['land_area_median']
    target_df['price_building_rate_median'] = target_df['total_price_median'] / target_df['building_area_median']
    target_df['price_land_rate_mean'] = target_df['total_price_mean'] / target_df['land_area_mean']
    target_df['price_building_rate_mean'] = target_df['total_price_mean'] / target_df['building_area_mean']
    
    combine_cols = ['city', 'town', 'price_land_rate_median', 'price_building_rate_median', 'price_land_rate_mean', 'price_building_rate_mean']
    df = pd.merge(df, target_df[combine_cols], on =['city', 'town'], how='left')
    
    df['min_cat'] = 0
    for col in [i for i in df.columns if 'MIN' in i]:
        df['min_cat'] = df.apply(lambda x : col if x['min_cat'] == x[col] else x['min_cat'], axis=1)  
    
    #MIN_Rank
    Rand_df = df[[i for i in train.columns if 'MIN' in i]].rank(axis =1).add_prefix('Rank_')
    df = df.concat([df, Rand_df], axis =1)
    
    #Impute missing value
    df['village_income_mean'] = df.groupby(['city', 'town'])['village_income_median'].transform(lambda x : x .fillna(x.mean()))
    
    #location

    df['location_2'] = df.apply(lambda x : int(str(x['city']) + str(x['town'])), axis=1)
    df['city'] = df['city'].astype('category')
    df['location_2'] = df['location_2'].astype('category')
    
    #degree rate
    df['diff_doc_master'] = df['doc_rate'] - df['master_rate']
    df['diff_master_bachelor'] = df['master_rate'] - df['bachelor_rate']
    df['diff_bachelor_highsch'] = df['bachelor_rate'] - df['highschool_rate']
    df['diff_highsch_jobschool'] = df['highschool_rate'] - df['jobschool_rate']
    df['diff_jobschool_elesch'] = df['jobschool_rate'] - df['elementary_rate']
    
    df['all_degree'] = df['doc_rate'] + df['master_rate'] + df['bachelor_rate'] + df['highschool_rate'] + df['jobschool_rate'] + df['junior_rate'] + df['elementary_rate']
    df['junior_above_rate'] = df['doc_rate'] + df['master_rate'] + df['bachelor_rate'] + df['highschool_rate'] + df['jobschool_rate'] + df['junior_rate'] 
    df['jobschool_above_rate'] = df['doc_rate'] + df['master_rate'] + df['bachelor_rate'] + df['highschool_rate'] +  df['jobschool_rate']
    df['highschool_above_rate'] = df['doc_rate'] + df['master_rate'] + df['bachelor_rate'] + df['highschool_rate']
    df['bachelor_above_rate'] = df['doc_rate'] + df['master_rate'] + df['bachelor_rate']
    df['master_above_rate'] = df['doc_rate'] + df['master_rate'] 
    
    #building
    df['building_use'] = df['building_use'].astype('category')
    df['building_material'] = df['building_material'].astype('category')
    
    #floor
    df.loc[df['txn_floor'].isna(), 'department'] = 1 
    df.loc[df['txn_floor'].notna(), 'department'] = 0
    df['avg_height_floor'] = df['txn_floor'] / df['total_floor'] 
    df['avg_height_floor'].fillna(0, inplace = True)
    
    #area
    df['land/bulid_area'] = df['land_area'] / df ['building_area']
    df['parking_rate'] = df['building_area'] / df['parking_area']
    
    #date
    df['day_between_txn_complete'] = df['txn_dt'] - df['building_complete_dt']
    df['year_between_txn_complete'] = round(df['day_between_txn_complete'] / 365)
    
    #soical rate
    df['natural_diff'] = df['born_rate'] - df['death_rate']
    df['natural_rate'] = df['born_rate'] / df['death_rate']
    
    df['marry_diff'] = df['marriage_rate'] - df['divorce_rate']
    df['marry_rate'] = df['marriage_rate'] / df['divorce_rate']
    
    df['total_diff_sum'] = df['natural_rate'] + df['marry_diff']
    df['total_diff_diff'] = df['natural_rate'] - df['marry_diff']
    df['total_rate_diff'] = df['born_rate'] + df['marry_rate']
    
    df['positive_grow_rate'] = df['born_rate'] + df['marriage_rate']
    df['negative_grow_rate'] = df['death_rate'] + df['divorce_rate']
    df['tatal_rate_sum'] = df['born_rate'] + df['death_rate'] + df['born_rate'] + df['death_rate']
    
    #village_income_median
    temp = df.groupby(['city', 'town', 'village'])['village_income_median'].first().reset_index()
    temp = df.groupby(['city', 'town']).agg({'village_income_median' : ['mean', 'sum', 'median']})
    temp = temp.rename({'village_income_median' : 'town_income_median'}, level = 0)
    temp.columns = [e[0] + '_' + e[1] if e[1] != '' else e[0] for e in temp.columns.tolist()]
    df = pd.merge(df, temp, on = ['city', 'town'], how = 'left')

    temp = df.groupby(['city', 'town', 'village'])['village_income_median'].first().reset_index()
    temp = df.groupby(['city']).agg({'village_income_median' : ['mean', 'sum', 'median']})
    temp = temp.rename({'village_income_median' : 'city_income_median'}, level = 0)
    temp.columns = [e[0] + '_' + e[1] if e[1] != '' else e[0] for e in temp.columns.tolist()]
    df = pd.merge(df, temp, on = ['city'], how = 'left')
    
    #interection 
    '''
    locations =['city', 'location2']
    inter_cols = ['building_type', 'building_use', 'building_material']
    for location in locations:
        for inter_col in inter_cols:
            df['inter_btw_' + location +'_' + inter_col] = df.apply(lambda x : str(x[location]) + str(x[inter_col]), axis=1)
            df['inter_btw_' + location +'_' + inter_col] = df['inter_btw_' + location +'_' + inter_col].astype('category')
    '''
    inter_cols = ['building_type', 'building_use', 'building_material']
    for i in range(3):
        for j in range(3):
            if j > i:
                df['inter_btw_' + inter_cols[i] +'_' + inter_cols[j]] = df.apply(lambda x : str(x[inter_cols[i]]) + str(x[inter_cols[j]]), axis=1)
                df['inter_btw_' + inter_cols[i] +'_' + inter_cols[j]] = df['inter_btw_' + inter_cols[i] +'_' + inter_cols[j]].astype('category')
    #one-hot
    df = df.join(pd.get_dummies(df['parking_way'], prefix = 'parking_way'))
    df = df.join(pd.get_dummies(df['building_type'], prefix = 'building_type'))
    df = df.join(pd.get_dummies(df['building_material'], prefix = 'building_material'))
    df = df.join(pd.get_dummies(df['building_use'], prefix = 'building_use'))
    
    
    
    #useless cols
    df.drop(['village', 'town', 'building_type', 'parking_way','building_material', 'building_use', 'lat', 'lon'], axis = 1, inplace = True)
    df.drop([i for i in train.columns if np.sum(train[i]) == 60000 and 'index' in i], axis = 1, inplace = True)
    return df

In [101]:
final_data = feature_process(data)

In [102]:
final_data = feature_process(data)
FE_train = final_data[:60000]
FE_test = final_data[60000:]
FE_train.to_csv('FE_train.csv', index = False)
FE_test.to_csv('FE_test.csv', index = False)
len(FE_train.columns), len(FE_test.columns)

(271, 271)

In [103]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 235 entries, building_id to total_price
dtypes: float64(37), int64(197), object(1)
memory usage: 107.6+ MB


In [99]:
[i for i in FE_train.columns if FE_train[i].dtypes not in ['float', 'int', 'O']]

['building_material',
 'building_use',
 'location_2',
 'inter_btw_building_type_building_use',
 'inter_btw_building_type_building_material',
 'inter_btw_building_use_building_material',
 'parking_way_0',
 'parking_way_1',
 'parking_way_2',
 'building_type_0',
 'building_type_1',
 'building_type_2',
 'building_type_3',
 'building_type_4']

In [87]:
train.groupby(['city', 'town', 'village'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1a5e01a320>

In [5]:
train['min_cat'] = 0
for col in [i for i in train.columns if 'MIN' in i]:
    train['min_cat'] = train['min_cat'].apply(lambda x : col if x[col] == x[[i for i in train.columns if 'MIN' in i]].min() else x['min_cat'], axis=1)  

In [10]:
train.apply(lambda x : pd.nsmallest(1, x[[i for i in train.columns if 'MIN' in i]]), axis=1)

AttributeError: ("module 'pandas' has no attribute 'nsmallest'", 'occurred at index 0')

In [18]:
train[[i for i in train.columns if 'MIN' in i]].min(axis=1)

0         34.469803
1          6.864786
2         29.607781
3         42.380415
4         19.028439
5         42.124512
6         64.551199
7         20.279536
8         26.730699
9          6.465746
10       110.890233
11        65.178573
12        13.524526
13        21.150627
14        37.140021
15         0.491901
16        29.630055
17         0.467488
18        14.191786
19        12.738809
20        30.301126
21        26.787028
22        31.403497
23        26.855422
24         1.117494
25        14.921288
26         2.674526
27        26.356524
28       108.548303
29         5.571330
            ...    
59970      5.065689
59971     10.468654
59972     26.012803
59973     33.723873
59974      2.911546
59975     13.470984
59976     53.069820
59977     20.191813
59978      0.382863
59979     50.120386
59980      0.594536
59981     37.192560
59982     58.175387
59983      9.907731
59984     23.244026
59985     17.231346
59986     32.576263
59987     16.614777
59988     57.006357


In [31]:
train[[i for i in train.columns if 'MIN' in i]].rank(axis =1).add_prefix('Rank_').head()

Unnamed: 0,Rank_I_MIN,Rank_II_MIN,Rank_III_MIN,Rank_IV_MIN,Rank_V_MIN,Rank_VI_MIN,Rank_VII_MIN,Rank_VIII_MIN,Rank_IX_MIN,Rank_X_MIN,Rank_XI_MIN,Rank_XII_MIN,Rank_XIII_MIN,Rank_XIV_MIN
0,7.0,8.0,2.0,14.0,12.0,13.0,10.0,6.0,5.0,4.0,11.0,3.0,9.0,1.0
1,9.0,8.0,2.0,14.0,6.5,13.0,1.0,6.5,10.0,12.0,11.0,3.0,4.0,5.0
2,9.0,12.0,10.0,7.0,6.0,1.0,4.0,2.0,8.0,11.0,13.0,3.0,14.0,5.0
3,12.0,8.0,3.0,13.0,11.0,10.0,7.0,5.0,4.0,1.0,9.0,2.0,14.0,6.0
4,6.0,1.0,5.0,12.0,9.0,14.0,8.0,4.0,11.0,10.0,7.0,3.0,13.0,2.0


In [27]:
train[[i for i in train.columns if 'MIN' in i]].add_prefix('')

Unnamed: 0,1I_MIN,1II_MIN,1III_MIN,1IV_MIN,1V_MIN,1VI_MIN,1VII_MIN,1VIII_MIN,1IX_MIN,1X_MIN,1XI_MIN,1XII_MIN,1XIII_MIN,1XIV_MIN
0,84.745794,102.554396,42.635964,287.735804,205.295496,270.838262,116.075571,69.868801,68.178741,64.774668,132.498164,63.478618,112.582703,34.469803
1,85.529684,71.994648,7.157464,805.037288,65.829031,719.071571,6.864786,65.829031,102.299486,272.553558,125.670173,23.717447,32.370655,40.073573
2,82.676961,110.761328,86.589412,62.428191,58.042563,29.607781,40.439890,32.523085,75.268914,108.270812,164.052330,35.796546,174.954657,54.462081
3,590.458945,279.365544,45.508654,982.352700,423.658555,421.260244,157.922588,73.978254,47.969413,42.380415,283.620244,43.654182,1247.796459,99.628966
4,603.643541,19.028439,546.047664,1435.928054,689.539284,2727.723917,669.730617,332.171857,1354.750025,1165.273099,607.948642,124.956094,1766.374491,124.131236
5,126.982721,282.828789,142.919491,540.733939,245.140015,394.198101,108.901308,116.831391,204.107139,155.023232,183.585223,42.124512,529.787756,61.109908
6,213.091735,205.935244,87.008685,907.720034,118.176289,397.858990,105.563252,118.176289,87.364785,121.375810,134.376535,64.551199,516.835172,196.690224
7,78.841254,20.279536,20.279536,545.938435,126.771923,177.539019,20.453449,80.446424,20.371632,205.092085,36.681791,40.439723,193.193388,66.780308
8,93.676727,385.310768,46.817986,123.210405,26.730699,181.411330,26.730699,69.670506,165.471949,99.316066,128.545040,70.849715,445.639906,40.349486
9,124.443839,159.757485,36.215680,443.618507,144.214039,703.374874,17.805175,144.214039,120.091363,124.950692,6.465746,47.436622,207.218133,102.955013
