##  線形モデル用にデータを整形

In [356]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_columns = 200

import japanize_matplotlib
import seaborn as sns
import pandas_profiling as pdp
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from IPython.display import display
import collections
import re
import feather
import codecs

from sklearn.preprocessing import LabelEncoder,StandardScaler,RobustScaler,MinMaxScaler
from sklearn.model_selection import KFold

In [357]:
# Load the raw train/test CSVs and stack them (train rows first) under a fresh 0..n-1 index.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
df_all = pd.concat([train, test], axis=0, sort=False, ignore_index=True)

# Overwrite the column labels positionally with 17 short names; the CSV column
# order therefore has to match this list.
df_all.columns = [
    'id', 'rent', 'address', 'access', 'floor_info', 'old', 'direction',
    'square', 'floor', 'bath', 'kitchen', 'broadcast', 'facility',
    'parking', 'neighbors', 'structure', 'period',
]

# Load the previously engineered "feat4" features. The test file carries an
# extra 'index' column that is dropped so both frames can be stacked.
train_feat4 = pd.read_feather('feature4/data/train_feat4_all.ftr')
test_feat4 = pd.read_feather('feature4/data/test_feat4_all.ftr').drop(columns='index')
df_all_feat4 = pd.concat([train_feat4, test_feat4], axis=0, sort=False, ignore_index=True)

# Row counts of each part; train_index is the position used later to split
# the combined frame back into train and test.
train_index = len(train_feat4)
test_index = len(test_feat4)  # not referenced in the visible cells

## 欠損値補完

In [358]:
# Print, for every feature column, how many rows hold the -999 placeholder
# that the imputation cell below treats as "missing".
# NOTE(review): `null_count` is created empty and never filled in the visible
# cells; it is kept only so that any later reference to the name still resolves.
null_count = pd.DataFrame()
for col in df_all_feat4.columns:
    # Series.sum() counts the True values in C instead of iterating the
    # boolean Series element by element with the builtin sum().
    print(col, (df_all_feat4[col] == -999).sum())

id 0
rent 0
square_num 0
access_1_line 0
access_1_station 0
access_1_distance 0
access_2_line 0
access_2_station 0
access_2_distance 2985
room_floor 1450
building_floor 34
underground 34
total_floor 34
is_parking_car 20232
is_house_parking_car 20232
is_other_parking_car 20232
parking_number_car 20232
parking_price_car 38240
is_parking_bicycle 20830
is_house_parking_bicycle 20830
is_other_parking_bicycle 20830
parking_number_bicycle 20830
parking_price_bicycle 59557
is_parking_bike 26948
is_house_parking_bike 26948
is_other_parking_bike 26948
parking_number_bike 26948
parking_price_bike 61479
convenience_count 18669
supermarket_count 18669
neighbor_count 18669
デパート 0
幼稚園・保育園 0
図書館 0
ドラッグストア 0
郵便局 0
大学 0
公園 0
スーパー 0
銀行 0
コインパーキング 0
レンタルビデオ 0
飲食店 0
小学校 0
コンビニ 0
病院 0
総合病院 0
月極駐車場 0
クリーニング 0
学校 0
direction_enc 0
macro_structure 0
resistance_period 0
old_num 0
available_period 0
available_is_over 0
is_rent 0
period_num 0
future_available 0
future_available_is_over 0
floor_info_macro_structur

In [359]:
# Replace the -999 missing-value marker column by column. The fill value is
# computed from the rows of the same column that are not -999 (train and test
# rows together, since df_all_feat4 holds both).
#   'mean' -> arithmetic mean of the observed values
#   'mode' -> most frequent observed value (first one if several tie)
# NOTE(review): the count listing printed above shows 34 rows of -999 in
# `underground`, but that column is not imputed in this cell - confirm
# whether leaving the marker in place is intended.
_imputation_plan = [
    ('access_2_distance', 'mean'),      # distance for the second access entry
    ('room_floor', 'mode'),             # floor the room is on
    ('building_floor', 'mode'),         # floors of the building
    ('total_floor', 'mode'),            # total floors of the building
    ('parking_price_car', 'mean'),      # car parking fee
    ('parking_price_bicycle', 'mean'),  # bicycle parking fee
    ('parking_price_bike', 'mean'),     # motorbike parking fee
]

for _col, _how in _imputation_plan:
    _observed = df_all_feat4.loc[df_all_feat4[_col] != -999, _col]
    _fill = _observed.mean() if _how == 'mean' else _observed.mode()[0]
    df_all_feat4[_col] = df_all_feat4[_col].replace(-999, _fill)

def is_null(x):
    """Return 1 when ``x`` equals the -999 missing-value marker, otherwise 0."""
    return 1 if x == -999 else 0

# Parking-space counts: record whether the value was missing (-999) in a new
# 0/1 '<col>_isnull' column, then fill the missing count with 0.
for col in ['parking_number_car', 'parking_number_bicycle', 'parking_number_bike']:
    # Vectorised equivalent of applying is_null() row by row.
    df_all_feat4[col + '_isnull'] = (df_all_feat4[col] == -999).astype('int64')
    df_all_feat4[col] = df_all_feat4[col].replace(-999, 0)

# One-hot encode the parking availability flags. The -999 marker is not
# replaced beforehand, so it becomes its own category after the cast to str.
dummy_cols = ['is_parking_car', 'is_house_parking_car', 'is_other_parking_car',
              'is_parking_bicycle', 'is_house_parking_bicycle', 'is_other_parking_bicycle',
              'is_parking_bike', 'is_house_parking_bike', 'is_other_parking_bike']

# A single get_dummies call yields the same columns in the same order as
# encoding one column per iteration (untouched columns first, then the dummy
# groups in dummy_cols order) without re-copying the whole frame each time.
df_all_feat4 = df_all_feat4.astype({col: str for col in dummy_cols})
df_all_feat4 = pd.get_dummies(df_all_feat4, columns=dummy_cols)

# Convenience-store, supermarket and neighbourhood-facility counts: fill the
# missing marker with the most frequent observed value of the same column.
for col in ['convenience_count', 'supermarket_count', 'neighbor_count']:
    observed = df_all_feat4.loc[df_all_feat4[col] != -999, col]
    df_all_feat4[col] = df_all_feat4[col].replace(-999, observed.mode()[0])



## カテゴリー変数をワンホットエンコード

In [360]:
# Categorical features to one-hot encode for the linear models.
category_cols = ['access_1_line', 'access_2_line', 'access_1_station', 'access_2_station',
                 'direction_enc', 'macro_structure', 'floor_info_macro_structure',
                 'address_city_enc', 'address_town_enc', 'mesh_category_enc']

# Cast to str first so every distinct value is treated as a label, then encode
# all columns in one get_dummies call. The result has the same columns in the
# same order as encoding them one at a time in a loop (remaining columns
# first, then each dummy group in category_cols order), but the steadily
# widening frame is copied once instead of once per column.
df_all_feat4 = df_all_feat4.astype({col: str for col in category_cols})
df_all_feat4 = pd.get_dummies(df_all_feat4, columns=category_cols)

## スケーリング前の特徴を保存

In [361]:
# Split the combined frame back into its train and test parts at the train
# row count. The test part is re-indexed from 0 before it is written.
train_feat5 = df_all_feat4.iloc[:train_index]
test_feat5 = df_all_feat4.iloc[train_index:].reset_index(drop=True)

# Persist the unscaled features.
# NOTE(review): the two filenames are spelled differently ("nonscaling" vs
# "nonscalling"); both are kept as-is because other code may read these paths.
train_feat5.to_feather('feature5/data/train_feat5_nonscaling.ftr')
test_feat5.to_feather('feature5/data/test_feat5_nonscalling.ftr')

## 連続値をスケーリング
線形回帰に対しては、ロバストスケーラが最も良かった

In [362]:
# Columns that the scaling helpers below transform, in output order.
scaler_cols = [
    # area / access distance / floors
    'square_num', 'access_1_distance', 'access_2_distance',
    'room_floor', 'building_floor', 'total_floor',
    # parking counts and prices
    'parking_number_car', 'parking_price_car',
    'parking_number_bicycle', 'parking_price_bicycle',
    'parking_number_bike', 'parking_price_bike',
    # neighbourhood counts
    'convenience_count', 'supermarket_count', 'neighbor_count',
    # per-facility columns (column names are Japanese in the data)
    'デパート', '幼稚園・保育園', '図書館',
    'ドラッグストア', '郵便局', '大学', '公園', 'スーパー', '銀行',
    'コインパーキング', 'レンタルビデオ',
    '飲食店', '小学校', 'コンビニ', '病院', '総合病院', '月極駐車場',
    'クリーニング', '学校',
    # building age / period features
    'resistance_period', 'old_num', 'available_period',
    'period_num', 'future_available',
]


def _scale_columns(scaler_factory, df_train, df_test, cols=None):
    """Scale selected columns of a train/test pair, fitting on train only.

    A fresh scaler is created per column, fitted on the train values of that
    column, and then used to transform both the train and the test values.
    The inputs are copied first, so the caller's frames are left untouched.

    Parameters
    ----------
    scaler_factory : callable
        Zero-argument callable returning an object with ``fit`` and
        ``transform`` (e.g. the ``StandardScaler`` class).
    df_train, df_test : pandas.DataFrame
        Frames that both contain every column in ``cols``.
    cols : list of str, optional
        Columns to scale. Defaults to the module-level ``scaler_cols`` as it
        is at call time.

    Returns
    -------
    (train_scale, test_scale) : tuple of pandas.DataFrame
        Copies of the inputs with the selected columns replaced by their
        scaled values.
    """
    if cols is None:
        cols = scaler_cols
    train_scale = df_train.copy()
    test_scale = df_test.copy()

    for col in cols:
        scaler = scaler_factory()
        # The scalers expect 2-D input, hence the (n_rows, 1) reshape.
        train_values = train_scale[col].values.reshape(-1, 1)
        test_values = test_scale[col].values.reshape(-1, 1)
        scaler.fit(train_values)
        train_scale[col] = scaler.transform(train_values)
        test_scale[col] = scaler.transform(test_values)
    return train_scale, test_scale


def standard_scale(df_train, df_test, cols=None):
    """Scale ``cols`` (default: ``scaler_cols``) with ``StandardScaler`` fitted on train.

    Returns scaled copies ``(train_scale, test_scale)``.
    """
    return _scale_columns(StandardScaler, df_train, df_test, cols)


def robust_scale(df_train, df_test, cols=None):
    """Scale ``cols`` (default: ``scaler_cols``) with ``RobustScaler`` fitted on train.

    Returns scaled copies ``(train_scale, test_scale)``.
    """
    return _scale_columns(RobustScaler, df_train, df_test, cols)


def minmax_scale(df_train, df_test, cols=None):
    """Scale ``cols`` (default: ``scaler_cols``) with ``MinMaxScaler`` fitted on train.

    Returns scaled copies ``(train_scale, test_scale)``.
    """
    return _scale_columns(MinMaxScaler, df_train, df_test, cols)


In [363]:
# Build the three scaled variants of the feature frames.
train_feat5_stand, test_feat5_stand = standard_scale(train_feat5, test_feat5)
train_feat5_robust, test_feat5_robust = robust_scale(train_feat5, test_feat5)
train_feat5_minmax, test_feat5_minmax = minmax_scale(train_feat5, test_feat5)

# Give every test frame a fresh 0..n-1 index before it is written.
test_feat5_stand = test_feat5_stand.reset_index(drop=True)
test_feat5_robust = test_feat5_robust.reset_index(drop=True)
test_feat5_minmax = test_feat5_minmax.reset_index(drop=True)

# Write each variant; the order (train then test, per scaler) is unchanged.
for scaled_frame, out_path in [
    (train_feat5_stand, 'feature5/data/train_feat5_scalling_stand.ftr'),
    (test_feat5_stand, 'feature5/data/test_feat5_scalling_stand.ftr'),
    (train_feat5_robust, 'feature5/data/train_feat5_scalling_robust.ftr'),
    (test_feat5_robust, 'feature5/data/test_feat5_scalling_robust.ftr'),
    (train_feat5_minmax, 'feature5/data/train_feat5_scalling_minmax.ftr'),
    (test_feat5_minmax, 'feature5/data/test_feat5_scalling_minmax.ftr'),
]:
    scaled_frame.to_feather(out_path)

## 連続値のみの特徴を保存
線形モデルでは精度の向上は見られなかった

In [364]:
# Columns written to the "continuous only" files: the scaled columns plus the
# 'id' and 'rent' columns.
# This is built as a NEW list instead of extending scaler_cols in place:
# scaler_cols is read by the scaling functions, so mutating it here would make
# a re-run of the scaling cell also scale 'id' and 'rent', and a re-run of
# this cell would append the two names a second time. The membership check
# keeps the list duplicate-free even if scaler_cols was already extended
# earlier in the same kernel session.
continuous_cols = scaler_cols + [c for c in ['id', 'rent'] if c not in scaler_cols]

for scale, df_train, df_test in zip(
        ['stand', 'robust', 'minmax'],
        [train_feat5_stand, train_feat5_robust, train_feat5_minmax],
        [test_feat5_stand, test_feat5_robust, test_feat5_minmax]):
    df_train[continuous_cols].to_feather('feature5/data/train_feat5_scalling_{}_continuous.ftr'.format(scale))
    # Fresh 0..n-1 index on the test frame before writing.
    df_test = df_test.reset_index(drop=True)
    df_test[continuous_cols].to_feather('feature5/data/test_feat5_scalling_{}_continuous.ftr'.format(scale))