In [483]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_columns = 100

import seaborn as sns
import pandas_profiling as pdp
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from IPython.display import display
import collections
import re
import feather
import codecs

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [484]:
# Load the raw train/test CSVs (paths are relative to the notebook's working directory).
train=pd.read_csv('../input/train.csv')
test=pd.read_csv('../input/test.csv')
# Stack train on top of test with a fresh 0..n-1 index.
# NOTE(review): this df_all is rebuilt from the outlier-cleaned frames in a later cell.
df_all=pd.concat([train,test],axis=0,sort=False).reset_index(drop=True)
# Replace the column labels with short ASCII names.
# Assumes the CSVs have exactly these 17 columns in this order — TODO confirm.
df_all.columns=['id','rent','address','access','floor_info','old','direction','square','floor','bath','kitchen','broadcast','facility','parking','neighbors','structure','period']

# Precomputed "feature1" tables; the outlier helpers below re-read the same files themselves.
train_feat1=pd.read_feather('feature1/data/train_feat1.ftr')
test_feat1=pd.read_feather('feature1/data/test_feat1.ftr')

In [485]:
# 外れ値、異常データを削除

def drop_train_outlier(df_train):
    """Return the training frame with known-bad listings and outliers removed.

    The outlier rules are evaluated on the precomputed feature table read from
    'feature1/data/train_feat1.ftr' (columns used: id, rent, square_num,
    old_num). The index labels of the offending feature rows are then dropped
    from ``df_train``.

    Assumes ``df_train`` and the feature table share the same index labels
    (row i of one describes row i of the other) — TODO confirm.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Raw training data containing an 'id' column.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``df_train`` with a fresh 0..n-1 index.
    """
    train_feat1=pd.read_feather('feature1/data/train_feat1.ftr')

    # Listings excluded outright by id. A single isin() mask replaces the
    # original chain of three masks, each of which was built from the
    # unfiltered frame and only worked because pandas reindexes boolean keys.
    bad_ids=[20428,20232,20927]
    train=df_train[~df_train['id'].isin(bad_ids)]
    train_feat1=train_feat1[~train_feat1['id'].isin(bad_ids)]

    outlier_queries=[
        # rent / area outliers
        "rent>1000000 & square_num<100",
        "rent>1500000",
        "square_num>400",
        'rent>100000 & square_num<10',
        # building-age outliers
        "old_num>1000",
    ]
    # A row may match several rules; drop() tolerates the repeated labels.
    drop_index=np.concatenate([train_feat1.query(q).index for q in outlier_queries])

    return train.drop(drop_index).reset_index(drop=True)

# テストデータ中の外れ値を補正
def modify_test_outlier(df_test):
    """Patch implausible building-age values in the test data.

    Rows whose ``old_num`` in 'feature1/data/test_feat1.ftr' exceeds 1000 get
    their '築年数' column overwritten with the fixed value '11年4ヶ月'.

    The frame is modified in place and the same object is returned.
    Assumes ``df_test`` shares index labels with the feature table — TODO confirm.
    """
    feat=pd.read_feather('feature1/data/test_feat1.ftr')
    broken_rows=feat.query('old_num>1000').index
    df_test.loc[broken_rows,'築年数']='11年4ヶ月'
    return df_test
    
# Apply the outlier handling: drop bad rows from train, patch bad values in test.
train_drop=drop_train_outlier(train)
# NOTE: modify_test_outlier writes into `test` in place and returns it,
# so test_mod and test refer to the same frame.
test_mod=modify_test_outlier(test)

In [486]:
# Row counts of the cleaned train frame and the patched test frame.
train_index=len(train_drop)
test_index=len(test_mod)

# Rebuild the combined frame from the cleaned inputs: train rows first, then test rows,
# with a fresh 0..n-1 index. This replaces the df_all built right after loading.
df_all=pd.concat([train_drop,test_mod],axis=0,sort=False).reset_index(drop=True)
# Replace the column labels with short ASCII names (must list every column, in order).
df_all.columns=['id','rent','address','access','floor_info','old','direction','square','floor','bath','kitchen','broadcast','facility','parking','neighbors','structure','period']

In [487]:
# Load the precomputed "feature3" tables for train and test.
# NOTE(review): neither frame is referenced again in the visible part of this notebook.
train_feat3=pd.read_feather('feature3/data/train_feat3.ftr')
test_feat3=pd.read_feather('feature3/data/test_feat3.ftr')

## 最寄り駅までのアクセスを距離に換算する
 バスの移動速度は11km/h≒183m/分とする  
 https://www.mlit.go.jp/road/ir/ir-data/jroad04/01-06.html  
 徒歩80m/分,車400m/分  
 https://www.id-home.net/knowledge/%E4%B8%8D%E5%8B%95%E7%94%A3%E3%81%AE%E5%BE%92%E6%AD%A9%E6%99%82%E9%96%93%E3%81%A8%E8%A1%A8%E7%A4%BA%E5%9F%BA%E6%BA%96%E3%82%92%E8%A7%A3%E8%AA%AC%EF%BC%81/

In [488]:
# 最寄りの駅情報を４つに分ける
def split_access(x):
    """Split a raw access string into exactly four per-station entries.

    Entries are separated by a double tab. At most the first four are kept;
    missing slots are filled with the string sentinel 'nan'.

    Returns a 4-tuple of strings.
    """
    entries=x.split('\t\t')
    # Truncate to four entries, then pad on the right up to four.
    padded=entries[:4]+['nan']*(4-len(entries))
    return tuple(padded)

# 各最寄り駅情報を路線、駅、移動方法に分ける
def access_enc(x,func):
    """Extract one component from a single tab-separated station entry.

    Parameters
    ----------
    x : str or 0
        One station entry; the value 0 is treated as missing.
    func : str
        'line'    -> first field, if it contains '線'
        'station' -> second field, if it exists and contains '駅'
        any other -> third field, only when the entry has exactly three fields

    Returns the selected field, or the string 'nan' when the rule is not met.
    """
    if x==0:
        return 'nan'

    fields=x.split('\t')

    # Railway line name
    if func=='line':
        first=fields[0]
        return first if '線' in first else 'nan'

    # Station name
    if func=='station':
        has_station=len(fields)>=2 and '駅' in fields[1]
        return fields[1] if has_station else 'nan'

    # Travel description (walk / bus / car)
    return fields[2] if len(fields)==3 else 'nan'
        
#  距離を数値型に変更
def enc_num(x):
    """Convert a travel description into an approximate distance.

    Three patterns are looked for independently and their contributions summed:
      * '徒歩<t>分'   -> 80 * t    (walking)
      * 'バス(<t>分)' -> 183 * t   (bus)
      * '車<d>km'     -> d * 1000  (car)
    Only the first match of each pattern is used.

    Returns the summed value (int, or float when a car distance is present),
    or -999 when the total is 0, which is how missing values are marked.
    """
    total=0

    walk_hits=re.findall(r'徒歩.+?分', x)
    if walk_hits:
        # strip the leading '徒歩' and the trailing '分'
        total+=80*int(walk_hits[0][2:-1])

    bus_hits=re.findall(r'バス\(.+?\)', x)
    if bus_hits:
        # strip the leading 'バス(' and the trailing '分)'
        total+=183*int(bus_hits[0][3:-2])

    car_hits=re.findall(r'車.+?km', x)
    if car_hits:
        # strip the leading '車' and the trailing 'km'
        total+=float(car_hits[0][1:-2])*1000

    # Nothing matched (or everything summed to zero): flag as missing.
    return -999 if total==0 else total


# Split every access string into a 4-tuple of per-station entries.
tmp=df_all['access'].apply(lambda x:split_access(x))

# Keep only the first two stations of each listing.
# For station k this creates access_k (raw entry), access_k_line, access_k_station
# and access_k_distance, in that column order.
for i in range(2):
    df_all['access_'+str(i+1)]=tmp.apply(lambda x:x[i])
    df_all['access_'+str(i+1)+'_line']=df_all['access_'+str(i+1)].apply(lambda x:access_enc(x,'line'))
    df_all['access_'+str(i+1)+'_station']=df_all['access_'+str(i+1)].apply(lambda x:access_enc(x,'station'))
    df_all['access_'+str(i+1)+'_distance']=df_all['access_'+str(i+1)].apply(lambda x:access_enc(x,'distance'))
    # Turn the travel description into a number (-999 marks missing).
    df_all['access_'+str(i+1)+'_distance']=df_all['access_'+str(i+1)+'_distance'].apply(lambda x:enc_num(x))

# Encode line / station names as integers.

for col in ['line','station']:
    le=LabelEncoder()
    # Fit on the union of both stations' values so access_1_* and access_2_* share one code space.
    encoder=list(set(df_all['access_1_'+col].unique()) | set(df_all['access_2_'+col].unique())) 
    
    le.fit(encoder)
    for i in range(2):
        df_all['access_'+str(i+1)+'_'+col]=le.transform(df_all['access_'+str(i+1)+'_'+col])
        # Store the integer codes with pandas 'category' dtype.
        df_all['access_'+str(i+1)+'_'+col]=df_all['access_'+str(i+1)+'_'+col].astype('category')


## 地下室含めた、階の総数を加える

In [489]:
def room_floor(src):
    """Return the room-floor part of a '<room floor>／<building floors>' string.

    * two parts             -> the first part
    * one part with '階建'  -> 'nan' (only the building height is given)
    * one part without it   -> that part unchanged
    * anything else         -> 'nan'

    The last case (more than one '／') previously fell off the end of the
    function and returned None, which crashed the numeric encoding step.
    """
    parts=src.split('／')
    # The string normally splits into at most two parts; with two, the first is the room's floor.
    if len(parts)==2:
        return parts[0]
    if len(parts)==1 and '階建' not in parts[0]:
        return parts[0]
    return 'nan'

def room_floor_enc(x):
    """Convert a room-floor string to a signed integer.

    The first run of digits in ``x`` is taken as the floor number; it is
    negated when ``x`` contains '地下' (basement).

    Returns the string 'nan' for missing input: 'nan', '', non-string values,
    or text without any digits. The last two cases used to raise
    TypeError / IndexError.
    """
    if not isinstance(x,str) or x=='nan' or x=='':
        return 'nan'

    # Pull out the numeric part.
    digits=re.findall(r'\d+',x)
    if not digits:
        return 'nan'

    floor=int(digits[0])
    # Basement floors are encoded as negative numbers.
    return -1*floor if '地下' in x else floor
    
 #  部屋の階を数値化
# Mark missing floor strings with the 'nan' sentinel so the helpers can call str methods.
df_all['floor']=df_all['floor'].fillna('nan')        
# Extract the room-floor text, then convert it to a signed integer (or 'nan').
df_all['room_floor']=df_all['floor'].apply(lambda x:room_floor(x))
df_all['room_floor']=df_all['room_floor'].apply(lambda x:room_floor_enc(x))

In [490]:
def building_floor(src):
    """Return the building-height part of a '<room floor>／<building floors>' string.

    * two parts             -> the second part
    * one part with '階建'  -> that part unchanged
    * one part without it   -> 'nan' (only the room floor is given)
    * anything else         -> 'nan'

    The last case (more than one '／') previously fell off the end of the
    function and returned None, which crashed the downstream parsers.
    """
    parts=src.split('／')
    # The string normally splits into at most two parts; with two, the second describes the building.
    if len(parts)==2:
        return parts[1]
    if len(parts)==1 and '階建' in parts[0]:
        return parts[0]
    return 'nan'

# 何階建かを取得
def building_floor_enc(x):
    """Convert a building-height string to its number of storeys.

    The text before the first '階建' is parsed as an integer.
    The sentinel 'nan' maps to -999.
    """
    if x!='nan':
        storeys,_,_=x.partition('階建')
        return int(storeys)
    return -999
    
# 　地下の階数を取得
def check_underground(x):
    """Return the number of basement levels mentioned in a building-height string.

    * 'nan' (missing)    -> -999
    * no '地下<digits>'  -> 0
    * otherwise          -> the integer following '地下'

    The pattern is now a raw string that captures the whole digit run, so
    multi-digit counts are no longer truncated to one digit, and the regex is
    evaluated once instead of twice.
    """
    # Missing values are marked with -999.
    if x=='nan':
        return -999

    hit=re.search(r'地下(\d+)',x)
    return int(hit.group(1)) if hit else 0
    
# Mark missing floor strings with the 'nan' sentinel (idempotent if already done).
df_all['floor']=df_all['floor'].fillna('nan')
df_all['building_floor']=df_all['floor'].apply(lambda x:building_floor(x))

# Basement-level feature (-999 when the building height is missing).
df_all['underground']=df_all['building_floor'].apply(lambda x:check_underground(x))

# Above-ground storey count (-999 when missing).
df_all['building_floor']=df_all['building_floor'].apply(lambda x:building_floor_enc(x))

# Total levels = above-ground storeys + basement levels.
# Keep the -999 sentinel when either input is missing, rather than adding the
# two sentinels together (which produced -1998).
floor_known=(df_all['building_floor']!=-999)&(df_all['underground']!=-999)
df_all['total_floor']=(df_all['building_floor']+df_all['underground']).where(floor_known,-999)

## 駐車場、駐輪場、バイク置き場の情報を追加する。

In [621]:
def add_parking_info(x,park_type):
    """Extract availability and price text for one kind of parking.

    ``x`` is a tab-separated string in which a label field (e.g. '駐車場',
    '駐輪場', 'バイク置き場') is followed by an availability field and,
    optionally, a price field.

    Parameters
    ----------
    x : str
        Raw parking text, or the sentinel 'nan' for missing.
    park_type : str
        The label to look up; it must match a whole field.

    Returns
    -------
    (info, price) : tuple of str
        ``info`` is the field right after the label. ``price`` is the field
        after that when it contains '円', otherwise 'nan'. Both are 'nan'
        when ``x`` is missing, the label is absent, or nothing follows the
        label.

    Notes
    -----
    One lookup replaces three copy-pasted branches. A label that is the last
    field used to raise IndexError, and an unrecognised ``park_type`` used to
    return None; both now yield ('nan', 'nan') or a normal lookup.
    """
    if x=='nan':
        return 'nan','nan'

    fields=x.split('\t')
    # The label must be a field of its own; a substring match is not enough.
    if park_type not in fields:
        return 'nan','nan'

    idx=fields.index(park_type)
    if idx+1>=len(fields):
        return 'nan','nan'
    info=fields[idx+1]

    price='nan'
    # The price field is optional; accept it only if it mentions yen.
    if idx+2<len(fields) and '円' in fields[idx+2]:
        price=fields[idx+2]

    return info,price
    
# 駐車場があるかどうか０、１分類
def add_is_park(x):
    """Flag whether any parking of this kind is listed.

    Returns 'nan' for the missing sentinel, 0 when the text is exactly '無',
    and 1 for every other value.
    """
    if x=='nan':
        return 'nan'
    return 0 if x=='無' else 1

# 住居に備え付けの駐車場があるかどうか、0,1分類
def add_house_park(x):
    """Flag parking that belongs to the property itself.

    Returns 'nan' for the missing sentinel, 1 when the text contains '空有'
    or '空無', and 0 otherwise.
    """
    if x=='nan':
        return 'nan'
    on_site=any(marker in x for marker in ('空有','空無'))
    return 1 if on_site else 0
    
def add_other_park(x):
    """Flag parking that is available nearby rather than on site.

    Returns 'nan' for the missing sentinel, 1 when the text contains '近隣',
    and 0 otherwise.
    """
    if x=='nan':
        return 'nan'
    return 1 if '近隣' in x else 0
    
def add_park_num(x):
    """Return the number of parking slots described by an availability string.

    Rules, checked in this order:
      * exactly '空有' or '近隣' -> 1 (no count given, assume one slot)
      * exactly '空無'           -> 0
      * text containing digits   -> all digits concatenated, as an int
      * the sentinel 'nan'       -> 'nan'
      * anything else            -> 0
    """
    if x in ('空有','近隣'):
        return 1
    if x=='空無':
        return 0

    digits=re.sub(r'\D', '', x)
    if digits:
        return int(digits)

    return 'nan' if x=='nan' else 0
        
def add_park_price(x,tax_rate=1.08):
    """Convert a price string to a number, applying tax to tax-exclusive prices.

    Every digit in ``x`` is concatenated and parsed as an integer. When ``x``
    contains '税抜' (tax excluded) the amount is multiplied by ``tax_rate``,
    so the result is a float in that case.

    Parameters
    ----------
    x : str
        Price text, or the sentinel 'nan'.
    tax_rate : float, optional
        Multiplier applied to tax-exclusive prices (default 1.08).

    Returns
    -------
    int, float or 'nan'
        'nan' for the missing sentinel and for text without any digit. The
        latter used to raise ValueError from int('').
    """
    if x=='nan':
        return 'nan'

    digits=re.sub(r'\D', '',x )
    if not digits:
        return 'nan'

    price=int(digits)
    if '税抜' in x:
        price=price*tax_rate

    return price
        
# 欠損を埋める
# Mark missing parking text with the 'nan' sentinel.
df_all['parking']=df_all['parking'].fillna('nan')

# For each kind of parking, pull out the availability text and the price text.
for v,p in zip(['car','bicycle','bike'],['駐車場','駐輪場','バイク置き場']):
    parking=df_all['parking'].apply(lambda x :add_parking_info(x,p))
    df_all[v+'_info']=parking.apply(lambda x:x[0])
    df_all[v+'_price']=parking.apply(lambda x:x[1])

# One listing carries a free-text note about bicycle parking in the car price
# slot (it contains '円' but is not a price), so treat it as missing.
# This used to sit inside the loop above and was re-applied on every pass.
df_all.loc[df_all['car_price']=='(自転車駐輪相談可能（空き要確認）6，000円を1年分一括徴収（非課税）/原付駐輪は現在満車)','car_price']='nan'

# Derive flags, slot counts and numeric prices from the extracted text.
for v in ['car','bicycle','bike']:
    df_all['is_parking_'+v]=df_all[v+'_info'].apply(lambda x:add_is_park(x))
    df_all['is_house_parking_'+v]=df_all[v+'_info'].apply(lambda x:add_house_park(x))
    df_all['is_other_parking_'+v]=df_all[v+'_info'].apply(lambda x:add_other_park(x))
    df_all['parking_number_'+v]=df_all[v+'_info'].apply(lambda x:add_park_num(x))
    df_all['parking_price_'+v]=df_all[v+'_price'].apply(lambda x:add_park_price(x))

In [628]:
# Distribution of the parsed car-parking prices. The previous cell source
# (df_all['by']) named a column that is never created and did not match the
# recorded output, which is labelled parking_price_car.
df_all['parking_price_car'].value_counts()

nan      38237
32400     1764
27000     1536
30000     1502
25000     1433
         ...  
27600        1
31752        1
30500        1
22500        1
45150        1
Name: parking_price_car, Length: 347, dtype: int64

In [581]:
# Scratch check: run the walking-time pattern against a single car_price value.
# The recorded output is an empty list, i.e. no match.
text=df_all['car_price'][5]
re.findall(r'徒歩.+?分', text)

[]

In [528]:
# List the distinct bicycle-parking availability strings.
df_all['bicycle_info'].unique()

array(['空有', '無', 'nan', '空無', '空有(1台)', '空有(2台)', '近隣', '空有(68台)',
       '空有(3台)', '空有(5台)', '空有(16台)', '空有(27台)', '空有(86台)', '空有(24台)',
       '近隣(1台)'], dtype=object)

In [615]:
# The 100 most frequent raw car-parking price strings.
df_all['car_price'].value_counts()[:100]

nan            38237
32,400円(税込)     1325
43,200円(税込)     1026
27,000円(税込)      994
30,000円(税込)      873
               ...  
10,000円(税込)       37
33,480円           36
15,120円(税込)       35
34,560円(税込)       34
39,960円           33
Name: car_price, Length: 100, dtype: int64

In [619]:
# Scratch reproduction of a price-parsing failure: '無' contains no digits,
# so re.sub leaves an empty string and int('') raises ValueError.
# NOTE(review): this cell raises by design; remove or guard it before Run All.
text='無'
price=int(re.sub(r'\D', '',text))
if '税抜' in text:
    price=price*1.08
print(price)

ValueError: invalid literal for int() with base 10: ''

In [600]:
# Print every distinct car_price string next to its digits-only form
# to check what the numeric parser will see.
for i in df_all['car_price'].unique():
    print(i,re.sub(r'\D', '', i))

nan 
30,000円(税込) 30000
17,000円(税込) 17000
27,000円(税込) 27000
16,000円(税込) 16000
32,400円(税込) 32400
23,000円 23000
15,120円 15120
0円 0
25,000円(税込) 25000
10,000円 10000
37,800円(税込) 37800
28,000円(税込) 28000
15,000円 15000
21,600円(税込) 21600
17,280円(税込) 17280
12,960円 12960
32,000円(税込) 32000
12,000円(税込) 12000
30,000円 30000
18,000円 18000
40,000円(税込) 40000
18,000円(税込) 18000
14,040円(税込) 14040
28,000円 28000
45,000円(税込) 45000
17,000円 17000
50,000円(税込) 50000
15,000円(税込) 15000
31,320円 31320
27,000円 27000
35,000円 35000
20,000円(税込) 20000
13,000円 13000
38,000円(税込) 38000
43,200円(税込) 43200
28,080円(税込) 28080
16,200円 16200
33,200円(税込) 33200
25,725円 25725
22,680円(税込) 22680
21,000円 21000
20,000円 20000
46,440円 46440
19,440円 19440
19,440円(税込) 19440
16,200円(税込) 16200
7,000円 7000
32,400円 32400
34,000円 34000
29,160円 29160
15,660円 15660
18,360円 18360
35,000円(税込) 35000
14,000円 14000
22,000円 22000
25,000円 25000
36,720円 36720
45,000円 45000
28,100円 28100
30,240円 30240
12,000円 12000
12,343円 12343
15,120円(税込) 15120
33,480円(税込) 

In [599]:
# Show the rows whose car_price is the free-text bicycle-parking note rather than a price.
df_all[df_all['car_price']=='(自転車駐輪相談可能（空き要確認）6，000円を1年分一括徴収（非課税）/原付駐輪は現在満車)']

Unnamed: 0,id,rent,address,access,floor_info,old,direction,square,floor,bath,kitchen,broadcast,facility,parking,neighbors,structure,period,access_1,access_1_line,access_1_station,access_1_distance,access_2,access_2_line,access_2_station,access_2_distance,room_floor,building_floor,underground,total_floor,car_info,car_price,bicycle_info,bicycle_price,bike_info,bike_price,Is_parking_car,Is_parking_bicycle,Is_parking_bike,is_house_parking_car,is_house_parking_bicycle,is_house_parking_bike,is_parking_car,is_other_parking_car,is_parking_bicycle,is_other_parking_bicycle,is_parking_bike,is_other_parking_bike,parking_number_car,parking_number_bicycle,parking_number_bike


In [596]:
# Replace the free-text note in car_price with the 'nan' sentinel.
# NOTE(review): the parking feature-extraction cell applies the same correction,
# so this scratch cell is redundant on a clean run.
df_all.loc[df_all[df_all['car_price']=='(自転車駐輪相談可能（空き要確認）6，000円を1年分一括徴収（非課税）/原付駐輪は現在満車)'].index,'car_price']='nan'

In [597]:
# Inspect listing id 49113, whose parking text contains the free-text note handled above.
df_all[df_all['id']==49113]

Unnamed: 0,id,rent,address,access,floor_info,old,direction,square,floor,bath,kitchen,broadcast,facility,parking,neighbors,structure,period,access_1,access_1_line,access_1_station,access_1_distance,access_2,access_2_line,access_2_station,access_2_distance,room_floor,building_floor,underground,total_floor,car_info,car_price,bicycle_info,bicycle_price,bike_info,bike_price,Is_parking_car,Is_parking_bicycle,Is_parking_bike,is_house_parking_car,is_house_parking_bicycle,is_house_parking_bike,is_parking_car,is_other_parking_car,is_parking_bicycle,is_other_parking_bicycle,is_parking_bike,is_other_parking_bike,parking_number_car,parking_number_bicycle,parking_number_bike
49103,49113,,東京都大田区南馬込６丁目12-1,都営浅草線\t西馬込駅\t徒歩8分\t\t都営浅草線\t馬込駅\t徒歩16分\t\t京浜東北...,1K,28年10ヶ月,南,16.41m2,2階／5階建,,IHコンロ\t／\tシステムキッチン\t／\t給湯,光ファイバー／\tCATV,エアコン付\tシューズボックス／\tバルコニー／\t室外洗濯機置場／\t敷地内ごみ置き場\t...,駐輪場\t空有\t駐車場\t無\t(自転車駐輪相談可能（空き要確認）6，000円を1年分一括...,【コンビニ】 35m\t【スーパー】 288m\t【コンビニ】 285m\t【ドラッグストア...,RC（鉄筋コンクリート）,2年間,都営浅草線\t西馬込駅\t徒歩8分,62,445,640.0,都営浅草線\t馬込駅\t徒歩16分,62,492,1280.0,2,5,0,5,無,,空有,,無,,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0
