In [166]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_columns = 100

import japanize_matplotlib
import seaborn as sns
import pandas_profiling as pdp
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from IPython.display import display
import collections
import re
import feather
import codecs

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [167]:
# Load the raw train/test CSVs and stack them (train rows first) into one frame.
train=pd.read_csv('../input/train.csv')
test=pd.read_csv('../input/test.csv')
df_all=pd.concat([train,test],axis=0,sort=False).reset_index(drop=True)
# Positional rename to short English column names (17 names, so 17 columns are expected).
# NOTE(review): df_all is rebuilt from the outlier-cleaned frames in a later cell.
df_all.columns=['id','rent','address','access','floor_info','old','direction','square','floor','bath','kitchen','broadcast','facility','parking','neighbors','structure','period']

# Previously exported "feature1" tables.
train_feat1=pd.read_feather('feature1/data/train_feat1.ftr')
test_feat1=pd.read_feather('feature1/data/test_feat1.ftr')

In [168]:
# 外れ値、異常データを削除

def drop_train_outlier(df_train, train_feat1=None):
    """Remove known-bad listings and numeric outliers from the training rows.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows with an ``id`` column. Its row index must line up with
        ``train_feat1`` because outliers are located in ``train_feat1`` and
        dropped from ``df_train`` by index label.
    train_feat1 : pandas.DataFrame, optional
        Frame with ``id``, ``rent``, ``square_num`` and ``old_num`` columns.
        When omitted it is read from 'feature1/data/train_feat1.ftr'.

    Returns
    -------
    pandas.DataFrame
        The remaining training rows with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If an outlier label found in ``train_feat1`` is absent from ``df_train``
        (i.e. the two frames are not aligned).
    """
    if train_feat1 is None:
        train_feat1=pd.read_feather('feature1/data/train_feat1.ftr')

    # Listings excluded by id; one isin() mask replaces three chained filters
    # whose later masks no longer matched the already-filtered frame.
    bad_ids=[20428,20232,20927]
    train=df_train[~df_train['id'].isin(bad_ids)]
    train_feat1=train_feat1[~train_feat1['id'].isin(bad_ids)]

    outlier_queries=[
        # rent / floor-area outliers
        "rent>1000000 & square_num<100",
        "rent>1500000",
        "square_num>400",
        'rent>100000 & square_num<10',
        # building-age outliers
        "old_num>1000",
    ]

    drop_index=set()
    for condition in outlier_queries:
        drop_index.update(train_feat1.query(condition).index)

    return train.drop(index=list(drop_index)).reset_index(drop=True)

# テストデータ中の外れ値を補正
def modify_test_outlier(df_test, test_feat1=None):
    """Overwrite implausible building ages in the test rows.

    Rows whose ``old_num`` in ``test_feat1`` exceeds 1000 get their '築年数'
    value replaced with the fixed string '11年4ヶ月'.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Raw test rows; its row index must line up with ``test_feat1``.
    test_feat1 : pandas.DataFrame, optional
        Frame with an ``old_num`` column. When omitted it is read from
        'feature1/data/test_feat1.ftr'.

    Returns
    -------
    pandas.DataFrame
        A corrected copy; ``df_test`` itself is left untouched so the cell can
        be re-run safely.
    """
    if test_feat1 is None:
        test_feat1=pd.read_feather('feature1/data/test_feat1.ftr')

    corrected=df_test.copy()
    corrected.loc[test_feat1.query('old_num>1000').index,'築年数']='11年4ヶ月'
    return corrected
    
# Run both outlier steps on the raw frames loaded earlier in the notebook.
train_drop=drop_train_outlier(train)
test_mod=modify_test_outlier(test)

In [169]:
# Row counts after outlier handling; train_index is the train/test boundary
# used later when df_all is sliced back apart.
train_index=len(train_drop)
test_index=len(test_mod)

# Rebuild the combined frame (train rows first) and rename columns positionally.
df_all=pd.concat([train_drop,test_mod],axis=0,sort=False).reset_index(drop=True)
df_all.columns=['id','rent','address','access','floor_info','old','direction','square','floor','bath','kitchen','broadcast','facility','parking','neighbors','structure','period']

###  処理に必要な特徴を生成

In [170]:
# Parse the text before the first 'm' of each `square` string as a float.
df_all['square_num']=df_all['square'].apply(lambda x:float(x.split('m')[0]))

## 最寄り駅までアクセスを距離に換算する
 バスの移動速度は11km/h（≒183m/分）とする。  
 https://www.mlit.go.jp/road/ir/ir-data/jroad04/01-06.html  
 徒歩80m/分,車400m/分  
 https://www.id-home.net/knowledge/%E4%B8%8D%E5%8B%95%E7%94%A3%E3%81%AE%E5%BE%92%E6%AD%A9%E6%99%82%E9%96%93%E3%81%A8%E8%A1%A8%E7%A4%BA%E5%9F%BA%E6%BA%96%E3%82%92%E8%A7%A3%E8%AA%AC%EF%BC%81/

In [171]:
# 最寄りの駅情報を４つに分ける
def split_access(x):
    """Split an access string on double tabs into exactly four entries.

    Missing entries are filled with the string 'nan'; entries beyond the
    fourth are discarded. Returns a 4-tuple of strings.
    """
    stations=x.split('\t\t')
    # Pad so that indexing the first four slots is always valid.
    padded=stations+['nan']*3
    return tuple(padded[:4])

# 各最寄り駅情報を路線、駅、移動方法に分ける
def access_enc(x,func):
    """Extract one field from a single tab-separated station entry.

    Parameters
    ----------
    x : str
        One station entry (tab-separated parts), or the placeholder 'nan'.
        Non-string values (including the legacy 0 sentinel) count as missing.
    func : str
        'line' returns the first part when it contains '線';
        'station' returns the second part when it contains '駅';
        any other value returns the third part, but only when the entry has
        exactly three parts.

    Returns
    -------
    str
        The requested part, or 'nan' when it is unavailable.
    """
    # The original guard compared against 0, but the missing marker used by
    # this notebook is the string 'nan'; handle both and avoid .split on non-strings.
    if not isinstance(x,str) or x=='nan':
        return 'nan'

    parts=x.split('\t')

    # Line name
    if func=='line':
        return parts[0] if '線' in parts[0] else 'nan'

    # Station name
    if func=='station':
        if len(parts)>=2 and '駅' in parts[1]:
            return parts[1]
        return 'nan'

    # Distance text
    return parts[2] if len(parts)==3 else 'nan'
        
#  距離を数値型に変更
def enc_num(x, walk_speed=80, bus_speed=183):
    """Convert an access description into an approximate distance in metres.

    Adds up, when present in ``x``:
    - walking minutes ('徒歩N分') times ``walk_speed`` (m/min),
    - bus minutes ('バス(N分)') times ``bus_speed`` (m/min),
    - a car distance ('車Nkm') converted to metres.

    Only the first occurrence of each pattern is used. The numbers are captured
    explicitly, so unrelated text between a keyword and a later '分' no longer
    ends up in ``int()`` and raises ValueError.

    Returns
    -------
    int or float
        The summed distance, or -999 when nothing was found (or the sum is 0).
    """
    dist=0

    walk=re.search(r'徒歩\s*(\d+)\s*分', x)
    if walk:
        dist+=walk_speed*int(walk.group(1))

    bus=re.search(r'バス\(\s*(\d+)\s*分\)', x)
    if bus:
        dist+=bus_speed*int(bus.group(1))

    car=re.search(r'車\s*(\d+(?:\.\d+)?)\s*km', x)
    if car:
        dist+=float(car.group(1))*1000

    # Missing information is encoded as -999.
    return -999 if dist==0 else dist


# Split every raw access string into a 4-tuple of per-station entries.
tmp=df_all['access'].apply(lambda x:split_access(x))

# Keep at most the first two station entries and derive line / station / distance for each.
for i in range(2):
    df_all['access_'+str(i+1)]=tmp.apply(lambda x:x[i])
    df_all['access_'+str(i+1)+'_line']=df_all['access_'+str(i+1)].apply(lambda x:access_enc(x,'line'))
    df_all['access_'+str(i+1)+'_station']=df_all['access_'+str(i+1)].apply(lambda x:access_enc(x,'station'))
    df_all['access_'+str(i+1)+'_distance']=df_all['access_'+str(i+1)].apply(lambda x:access_enc(x,'distance'))
    # Replace the distance text with a number (-999 when nothing could be parsed).
    df_all['access_'+str(i+1)+'_distance']=df_all['access_'+str(i+1)+'_distance'].apply(lambda x:enc_num(x))

# Label-encode the text columns.

for col in ['line','station']:
    le=LabelEncoder()
    # Fit on the union of both entries' values so access_1_* and access_2_* share one code space.
    encoder=list(set(df_all['access_1_'+col].unique()) | set(df_all['access_2_'+col].unique())) 
    
    le.fit(encoder)
    for i in range(2):
        df_all['access_'+str(i+1)+'_'+col]=le.transform(df_all['access_'+str(i+1)+'_'+col])
        df_all['access_'+str(i+1)+'_'+col]=df_all['access_'+str(i+1)+'_'+col].astype('category')


In [172]:
# Display the columns that currently have the pandas 'category' dtype.
df_all.select_dtypes(include='category')

Unnamed: 0,access_1_line,access_1_station,access_2_line,access_2_station
0,59,433,25,289
1,60,97,32,251
2,17,383,17,80
3,54,497,6,235
4,13,69,29,470
...,...,...,...,...
62724,25,289,59,228
62725,61,181,28,376
62726,60,232,57,313
62727,32,509,6,130


## 地下室含めた、階の総数を加える

In [173]:
def room_floor(src):
    """Return the room-floor part of a floor string, or 'nan' if absent.

    ``src`` is split on '／'. When there are two or more parts the first one
    is the room floor. A single part is the room floor unless it contains
    '階建' (then it is the building height and the room floor is unknown).
    Strings with more than two parts used to fall through and return None;
    they now also yield their first part.
    """
    parts=src.split('／')
    if len(parts)>=2:
        return parts[0]

    return 'nan' if '階建' in parts[0] else parts[0]

def room_floor_enc(x):
    """Turn a room-floor string into a signed integer floor number.

    Returns 'nan' for the placeholder 'nan', for an empty string, and for
    text without any digits (previously an IndexError). Otherwise the first
    run of digits is used, negated when the text contains '地下'.
    """
    if x=='nan' or x=='':
        return 'nan'

    digits=re.search(r'\d+',x)
    if digits is None:
        return 'nan'

    level=int(digits.group())
    # Basement floors are represented as negative numbers.
    return -1*level if '地下' in x else level
    
 #  部屋の階を数値化
# Mark missing floor strings with the 'nan' placeholder, then extract and
# numerically encode the room's floor.
df_all['floor']=df_all['floor'].fillna('nan')        
df_all['room_floor']=df_all['floor'].apply(lambda x:room_floor(x))
df_all['room_floor']=df_all['room_floor'].apply(lambda x:room_floor_enc(x))

In [None]:
# Display the columns that currently have the pandas 'category' dtype.
df_all.select_dtypes(include='category')

In [141]:
def building_floor(src):
    """Return the building-height part of a floor string, or 'nan' if absent.

    ``src`` is split on '／'. When there are two or more parts the second one
    is the building description. A single part is returned only if it contains
    '階建'. Strings with more than two parts used to fall through and return
    None; they now also yield their second part.
    """
    parts=src.split('／')
    if len(parts)>=2:
        return parts[1]

    return parts[0] if '階建' in parts[0] else 'nan'

# 何階建かを取得
def building_floor_enc(x):
    """Return the number written before '階建' as an int; 'nan' passes through."""
    if x!='nan':
        storeys,_,_=x.partition('階建')
        return int(storeys)
    return 'nan'
    
# 　地下の階数を取得
def check_underground(x):
    """Return the number of basement floors mentioned in a building string.

    The placeholder 'nan' is passed through unchanged. Otherwise the digits
    following the first '地下' are returned as an int, or 0 when there is no
    such mention. The whole digit run is captured, so a two-digit basement
    count is no longer reduced to a single digit.
    """
    if x=='nan':
        return 'nan'

    under_info=re.search(r'地下(\d+)',x)
    if under_info is None:
        return 0
    return int(under_info.group(1))
    
df_all['floor']=df_all['floor'].fillna('nan')        
df_all['building_floor']=df_all['floor'].apply(lambda x:building_floor(x))

# Add the basement-floor feature (the 0 assignment is immediately overwritten).
df_all['underground']=0
df_all['underground']=df_all['building_floor'].apply(lambda x:check_underground(x))

# Encode the number of storeys of the building.
df_all['building_floor']=df_all['building_floor'].apply(lambda x:building_floor_enc(x))

# Storeys plus basement floors. Where both are the string 'nan', `+` concatenates
# them into 'nannan', which is why that value is replaced below as well.
df_all['total_floor']=df_all['building_floor']+df_all['underground']

# Replace the string placeholders with -999 across the whole frame.
df_all=df_all.replace('nan',-999)
df_all=df_all.replace('nannan',-999)

## 駐車場、駐輪場、バイク置き場の情報を追加する。

In [142]:
def add_parking_info(x,park_type):
    """Extract the availability text and price text for one kind of parking.

    Parameters
    ----------
    x : str
        Tab-separated parking description, or the placeholder 'nan'.
    park_type : str
        Label to look for as an exact token, e.g. '駐車場', '駐輪場' or
        'バイク置き場'.

    Returns
    -------
    tuple of (str, str)
        ``(info, price)``: ``info`` is the token right after the label and
        ``price`` is the token after that if it contains '円'. Either is 'nan'
        when unavailable. If the label is absent, or is the last token with
        nothing after it (previously an uncaught IndexError), both are 'nan'.
    """
    if x=='nan':
        return 'nan','nan'

    tokens=x.split('\t')

    # The label must appear as a whole token, not merely as a substring.
    if park_type not in tokens:
        return 'nan','nan'

    label_idx=tokens.index(park_type)
    if label_idx+1>=len(tokens):
        return 'nan','nan'

    info=tokens[label_idx+1]

    price='nan'
    if label_idx+2<len(tokens) and '円' in tokens[label_idx+2]:
        price=tokens[label_idx+2]

    return info,price
    
# 駐車場があるかどうか０、１分類
def add_is_park(x):
    """Return 0 when the info text is exactly '無', 1 for any other text; 'nan' passes through."""
    if x=='nan':
        return 'nan'
    return 0 if x=='無' else 1

# 住居に備え付けの駐車場があるかどうか、0,1分類
def add_house_park(x):
    """Return 1 when the info text contains '空有' or '空無', else 0; 'nan' passes through."""
    if x=='nan':
        return 'nan'
    on_site_markers=('空有','空無')
    return int(any(marker in x for marker in on_site_markers))
    
def add_other_park(x):
    """Return 1 when the info text contains '近隣', else 0; 'nan' passes through."""
    if x=='nan':
        return 'nan'
    return 1 if '近隣' in x else 0
    
def add_park_num(x):
    """Derive a parking-space count from the info text.

    Exactly '空有' or '近隣' counts as 1 and exactly '空無' as 0. Otherwise all
    digits in the text are joined and read as one integer. Text without digits
    gives 'nan' for the placeholder 'nan' and 0 for anything else.
    """
    if x in ('空有','近隣'):
        # No count written: assume a single space.
        return 1
    if x=='空無':
        return 0

    digits=re.sub(r'\D', '', x)
    if digits:
        return int(digits)

    return 'nan' if x=='nan' else 0
        
def add_park_price(x, tax_rate=1.08):
    """Convert a parking price text into a number.

    All digits in ``x`` are joined and read as one integer. When the text
    contains '税抜' the amount is multiplied by ``tax_rate`` (default 1.08),
    which makes the result a float.

    Returns 'nan' for the placeholder 'nan' and for text without any digit
    (previously a ValueError from ``int('')``).
    """
    if x=='nan':
        return 'nan'

    digits=re.sub(r'\D', '',x )
    if not digits:
        return 'nan'

    price=int(digits)
    if '税抜' in x:
        price=price*tax_rate

    return price
        
    
def modify_outlier_price(df):
    """Replace implausibly high parking prices with the 'nan' placeholder.

    For each price column, rows that are not 'nan' and exceed that column's
    limit are overwritten in place. Returns the same ``df`` object.
    """
    rules=(
        ('parking_price_car','parking_price_car!="nan"','parking_price_car>100000'),
        ('parking_price_bicycle','parking_price_bicycle!="nan"','parking_price_bicycle>20000'),
        ('parking_price_bike','parking_price_bike!="nan"','parking_price_bike>30000'),
    )
    for column,has_price,too_high in rules:
        outlier_rows=df.query(has_price).query(too_high).index
        df.loc[outlier_rows,column]='nan'
    return df


# Fill missing parking text with the 'nan' placeholder.
df_all['parking']=df_all['parking'].fillna('nan')

# For each parking kind, pull out the (info, price) text pair.
for v,p in zip(['car','bicycle','bike'],['駐車場','駐輪場','バイク置き場']):
    parking=df_all['parking'].apply(lambda x :add_parking_info(x,p)) 
    df_all[v+'_info']=parking.apply(lambda x:x[0])
    df_all[v+'_price']=parking.apply(lambda x:x[1])
    # Blank out one specific free-text car_price value. This line only touches car_price,
    # so repeating it on every loop iteration is redundant.
    df_all.loc[df_all[df_all['car_price']=='(自転車駐輪相談可能（空き要確認）6，000円を1年分一括徴収（非課税）/原付駐輪は現在満車)'].index,'car_price']='nan'

# Derive flag / count / price features from the extracted text.
for v in ['car','bicycle','bike']:
    df_all['is_parking_'+v]=df_all[v+'_info'].apply(lambda x:add_is_park(x))
    df_all['is_house_parking_'+v]=df_all[v+'_info'].apply(lambda x:add_house_park(x))
    df_all['is_other_parking_'+v]=df_all[v+'_info'].apply(lambda x:add_other_park(x))
    df_all['parking_number_'+v]=df_all[v+'_info'].apply(lambda x:add_park_num(x))
    df_all['parking_price_'+v]=df_all[v+'_price'].apply(lambda x:add_park_price(x))

    
# Blank out extreme prices, then turn every remaining 'nan' placeholder into -999.
df_all=modify_outlier_price(df_all)
df_all=df_all.replace('nan',-999)

## 掲載している近隣情報の数を追加

In [143]:
df_all['neighbors']=df_all['neighbors'].fillna('nan')

def add_neighbor_info(df):
    """Parse the `neighbors` text of every row into summary columns.

    Adds these columns to ``df`` (in place) and returns it:
    - neighbor_dict: {place name: smallest listed distance in metres}
    - convenience_count / supermarket_count: number of 'コンビニ' / 'スーパー' entries
    - neighbor_count: number of tab-separated entries
    - neighbor_split: the list of raw entries
    Rows whose text is the placeholder 'nan' get 'nan' in all five columns.
    Rows are looked up by label ``0..len(df)-1``.
    """
    nearest_by_place=[]
    n_convenience=[]
    n_supermarket=[]
    n_entries=[]
    raw_entries=[]
    outputs=(nearest_by_place,n_convenience,n_supermarket,n_entries,raw_entries)

    for row in tqdm(range(len(df))):
        info=df['neighbors'][row]

        # No neighbourhood text: record the placeholder everywhere.
        if info=='nan':
            for column_values in outputs:
                column_values.append('nan')
            continue

        entries=info.split('\t')
        seen_places=[]
        nearest={}
        for entry in entries:
            place = re.findall(r'\【.+?\】', entry)[0][1:-1]  # facility name
            distance=int(re.findall(r'\d+m', entry)[0][:-1])  # distance in metres
            seen_places.append(place)
            # Keep the closest distance when a facility type is listed twice.
            nearest[place]=distance if place not in nearest else min(nearest[place],distance)

        nearest_by_place.append(nearest)
        n_convenience.append(seen_places.count('コンビニ'))
        n_supermarket.append(seen_places.count('スーパー'))
        n_entries.append(len(entries))
        raw_entries.append(entries)

    df['neighbor_dict']=nearest_by_place
    df['convenience_count']=n_convenience
    df['supermarket_count']=n_supermarket
    df['neighbor_count']=n_entries
    df['neighbor_split']=raw_entries

    return df

# 辞書から指定された場所の距離を取得
def neighbor_distance(x,place):
    """Look up ``place`` in a distance dict; 0 when ``x`` is 'nan' or lacks the key."""
    if x=='nan':
        return 0
    return x.get(place,0)

def neighbor_encoder(df):
    """Add one distance column per facility name found in `neighbor_dict`.

    Every key that occurs in any row's dict becomes a column holding that
    row's distance (0 when the row has no such facility or is 'nan').
    Columns are added in sorted name order: iterating a bare ``set`` of
    strings gave a column order that could change between interpreter runs.
    The frame is modified in place and returned.
    """
    # Collect every facility name; iterate values directly rather than by label.
    places=set()
    for neighbor in df['neighbor_dict']:
        if neighbor=='nan':
            continue
        places.update(neighbor.keys())

    for place in sorted(places):
        df[place]=df['neighbor_dict'].apply(lambda x:neighbor_distance(x,place))

    return df

# Parse the neighbourhood text, expand it into per-facility distance columns,
# then turn the remaining 'nan' placeholders into -999.
df_all=add_neighbor_info(df_all)
df_all=neighbor_encoder(df_all)

df_all=df_all.replace('nan',-999)

100%|██████████| 62723/62723 [00:01<00:00, 39751.77it/s]


## 方角の欠損を欠損としてカテゴリーにする

In [144]:
def fillna_direction(df):
    """Fill missing `direction` values with the string 'nan' in place; return ``df``."""
    direction_filled=df['direction'].fillna('nan')
    df['direction']=direction_filled
    return df

def direction_enc(df):
    """Label-encode the `direction` column with a fresh LabelEncoder and return the codes."""
    return LabelEncoder().fit_transform(df['direction'])

# Treat a missing direction as its own category, then store the encoded
# direction as a categorical column.
df_all=fillna_direction(df_all)

df_all['direction_enc']=direction_enc(df_all)
df_all['direction_enc']=df_all['direction_enc'].astype('category')

## 建物の構造の特徴量を加える
木造、軽量鉄骨は安価などの性質がある。木造、鉄骨、RC、SRC、ブロック、その他（木造モルタル？）のどの構造かで分ける。  
https://sell.yeay.jp/reading/knowledge/10067/  
耐用年数：木造２２年、鉄骨造27年、軽量鉄骨造１９年、RC、SRC、ALC、PC４７年、ブロック３８年、その他（木造モルタル）20年  
http://www.fudosantoshi.jp/glossary/%E6%B3%95%E5%AE%9A%E8%80%90%E7%94%A8%E5%B9%B4%E6%95%B0/  

ALCは外壁に使われるコンクリートの種類、PCはコンクリート部材を工場で製作する工法のこと。ここではどちらもRC造に属するものとして扱う。

In [145]:
# 木造、鉄骨、鉄筋、その他の構造かで分ける
def macro_structure(x):
    """Map a structure description to a coarse category code.

    1: contains '木造'
    2: contains '鉄骨造' or '軽量鉄骨'
    3: contains 'SRC'
    4: contains 'RC', 'ALC' or 'PC' ('PC' also covers 'HPC')
    5: contains 'ブロック'
    6: anything else

    Checks run in this order, so 'SRC' is matched before the 'RC' substring it
    contains. Plain 'PC' used to fall through to 6 although the accompanying
    notes group PC with RC; it now maps to 4.
    """
    if '木造' in x:
        return 1

    if '鉄骨造' in x or '軽量鉄骨' in x:
        return 2

    if 'SRC' in x:
        return 3

    if 'RC' in x or 'ALC' in x or 'PC' in x:
        return 4

    if 'ブロック' in x:
        return 5

    return 6

# 耐用年数を追加
def resistance_period(x):
    """Return the assumed useful life of a structure type, in months.

    The first matching rule wins:
    '木造' 22 years, '鉄骨造' 27, '軽量鉄骨' 19, 'RC'/'ALC'/'PC' 47
    ('RC' also matches 'SRC', 'PC' also matches 'HPC'), 'ブロック' 38,
    anything else 20. Plain 'PC' used to fall through to the 20-year default
    although the accompanying notes list PC at 47 years.
    """
    years_by_marker=(
        (('木造',),22),
        (('鉄骨造',),27),
        (('軽量鉄骨',),19),
        (('RC','ALC','PC'),47),
        (('ブロック',),38),
    )
    for markers,years in years_by_marker:
        if any(marker in x for marker in markers):
            return years*12

    return 20*12

        
# Coarse structure category, stored as a categorical column.
df_all['macro_structure']=df_all['structure'].apply(lambda x:macro_structure(x))
df_all['macro_structure']=df_all['macro_structure'].astype('category')

# Assumed useful life of the structure, in months.
df_all['resistance_period']=df_all['structure'].apply(lambda x:resistance_period(x))

# Building age in months: years*12 + months + 1, or 0 when the text is exactly '新築'.
# NOTE(review): the reason for the extra +1 is not evident from this notebook - confirm.
df_all['old_num']=df_all['old'].apply(lambda x:int(x.split('年')[0])*12+int(x.split('年')[1].split('ヶ月')[0])+1 if x!='新築' else 0)

# Months left until the useful life runs out: useful life minus building age.
df_all['available_period']=df_all['resistance_period']-df_all['old_num']

# 1 when the useful life has already been exceeded, else 0.
df_all['available_is_over']=df_all['available_period'].apply(lambda x:np.where(x<0,1,0))

## 契約期間の間に耐用年数をすぎるかをチェック

In [146]:
# 欠損を最頻値で補完
def fillna_period(df):
    """Return the `period` column with missing values replaced by '2年間'."""
    period=df['period']
    return period.fillna('2年間')

# 定期借家かどうか
def is_rent(df):
    """Return a Series of 1/0 flags: 1 where the `period` text contains '定期借家'."""
    return df['period'].apply(lambda text:int('定期借家' in text))

# 期間を月単位に変換,2019年8月を起点
def rent_period(x, base_year=2019, base_month=8):
    """Convert a contract-period text into a number of months.

    Parameters
    ----------
    x : str
        Either an end date containing 'まで' (with 'N年' and 'N月'), or a
        duration with an optional 'N年' and optional 'Nヶ...' part.
    base_year, base_month : int
        Reference month from which end dates are measured. Defaults to
        August 2019, the value that was previously hard-coded.

    Returns
    -------
    int
        Months from the reference month to the end date, or the duration in
        months (0 when neither years nor months are present).

    Raises
    ------
    IndexError
        If ``x`` contains 'まで' but no 'N年' or no 'N月'.
    """
    if 'まで' in x:
        end_year=int(re.findall(r'\d+年',x)[0][:-1])
        end_month=int(re.findall(r'\d+月',x)[0][:-1])
        return end_year*12+end_month-base_year*12-base_month

    years=re.search(r'(\d+)年',x)
    months=re.search(r'(\d+)ヶ', x)

    total=0
    if years:
        total+=int(years.group(1))*12
    if months:
        total+=int(months.group(1))
    return total
        
# Fill missing contract periods, flag fixed-term leases, and convert the first
# tab-separated part of the period text into months.
df_all['period']=fillna_period(df_all)
df_all['is_rent']=is_rent(df_all)
df_all['period_num']=df_all['period'].apply(lambda x:rent_period(x.split('\t')[0]))


# Useful life remaining (months) when the contract period ends.
df_all['future_available']=df_all['available_period']-df_all['period_num']

# 1 when the useful life runs out by the end of the contract period, else 0.
df_all['future_available_is_over']=df_all['future_available'].apply(lambda x:np.where(x<0,1,0))

## 部屋の間取りと、建物の構造の組み合わせのカテゴリー変数を追加

In [147]:
def add_floor_detail(df):
    """Rewrite 'LK' layouts as 'DK' in `floor_info` (exact-value match) and return ``df``.

    'LK' is treated as a typo for 'DK'. The column is replaced in place.
    """
    typo_fixes={
        '1LK+S(納戸)':'1DK+S(納戸)',
        '1LK':'1DK',
    }
    df['floor_info']=df['floor_info'].replace(typo_fixes)
    return df

# 間取りと建物構造の組みわせでカテゴリー変数を作る
def floor_info_macro_structure(df):
    """Label-encode the combination of room layout and coarse structure category.

    `floor_info` is label-encoded first; each code is joined with the row's
    `macro_structure` as '<floor code>_<structure>' and the joined strings are
    label-encoded again. Returns the resulting array of codes.
    """
    floor_codes=LabelEncoder().fit_transform(df['floor_info'])
    combined=[
        str(floor_code)+'_'+str(structure_code)
        for floor_code,structure_code in zip(floor_codes,df['macro_structure'])
    ]
    return LabelEncoder().fit_transform(combined)

# Normalise the layout typos, then add the layout x structure category as a categorical column.
df_all=add_floor_detail(df_all)

df_all['floor_info_macro_structure']=floor_info_macro_structure(df_all)    
df_all['floor_info_macro_structure']=df_all['floor_info_macro_structure'].astype('category')

## =====保存＝＝＝＝＝
ちゃんと分けて管理しないと訳がわからないことになり始めた。。。
### 特徴量４にユニークな特徴を保存

In [148]:
# Columns produced by this notebook ("feature 4"). The facility-name columns are
# listed by hand, so every one of them must exist in df_all for the selection below to work.
feat4_cols=['id', 'rent','square_num','access_1',
       'access_1_line', 'access_1_station', 'access_1_distance', 'access_2',
       'access_2_line', 'access_2_station', 'access_2_distance', 'room_floor',
       'building_floor', 'underground', 'total_floor', 'car_info', 'car_price',
       'bicycle_info', 'bicycle_price', 'bike_info', 'bike_price',
       'is_parking_car', 'is_house_parking_car', 'is_other_parking_car',
       'parking_number_car', 'parking_price_car', 'is_parking_bicycle',
       'is_house_parking_bicycle', 'is_other_parking_bicycle',
       'parking_number_bicycle', 'parking_price_bicycle', 'is_parking_bike',
       'is_house_parking_bike', 'is_other_parking_bike', 'parking_number_bike',
       'parking_price_bike', 'neighbor_dict', 'convenience_count',
       'supermarket_count', 'neighbor_count', 'neighbor_split', 'デパート',
       '幼稚園・保育園', '図書館', 'ドラッグストア', '郵便局', '大学', '公園', 'スーパー', '銀行',
       'コインパーキング', 'レンタルビデオ', '飲食店', '小学校', 'コンビニ', '病院', '総合病院', '月極駐車場',
       'クリーニング', '学校', 'direction_enc','macro_structure', 'resistance_period', 'old_num',
       'available_period', 'available_is_over', 'is_rent', 'period_num',
       'future_available', 'future_available_is_over','floor_info_macro_structure']

# Keep only the non-object columns; the raw text / dict / list columns are dropped from the list.
feat4_cols=list(df_all[feat4_cols].select_dtypes(exclude='object').columns)



In [149]:
# Split the combined frame back into train / test by row position.
train_feat4=df_all[feat4_cols][:train_index]
test_feat4=df_all[feat4_cols][train_index:]

train_feat4.to_feather('feature4/data/train_feat4_unique.ftr')

# The test slice does not start at index 0, so its index is reset before writing;
# the old index is kept as an 'index' column, exactly as before. reset_index()
# returns a new frame instead of mutating a slice of df_all in place.
test_feat4=test_feat4.reset_index()
test_feat4.to_feather('feature4/data/test_feat4_unique.ftr')

In [150]:
# Display the selected feature columns for all rows (train followed by test).
df_all[feat4_cols]

Unnamed: 0,id,rent,square_num,access_1_line,access_1_station,access_1_distance,access_2_line,access_2_station,access_2_distance,room_floor,building_floor,underground,total_floor,is_parking_car,is_house_parking_car,is_other_parking_car,parking_number_car,parking_price_car,is_parking_bicycle,is_house_parking_bicycle,is_other_parking_bicycle,parking_number_bicycle,parking_price_bicycle,is_parking_bike,is_house_parking_bike,is_other_parking_bike,parking_number_bike,parking_price_bike,convenience_count,supermarket_count,neighbor_count,デパート,幼稚園・保育園,図書館,ドラッグストア,郵便局,大学,公園,スーパー,銀行,コインパーキング,レンタルビデオ,飲食店,小学校,コンビニ,病院,総合病院,月極駐車場,クリーニング,学校,direction_enc,macro_structure,resistance_period,old_num,available_period,available_is_over,is_rent,period_num,future_available,future_available_is_over,floor_info_macro_structure
0,1,75000.0,20.01,59,433,320.0,25,289,1120.0,1,12,0,12,-999,-999,-999,-999,-999.0,1,1,0,1,-999.0,-999,-999,-999,-999,-999.0,1,2,10,0,0,0,956,1246,461,1103,311,0,0,0,378,495,588,0,0,0,0,0,5,4,564,118,446,0,0,24,422,0,96
1,2,76000.0,16.50,60,97,400.0,32,251,720.0,5,10,0,10,0,0,0,0,-999.0,1,1,0,1,-999.0,0,0,0,0,-999.0,0,1,1,0,0,0,0,0,0,0,1283,0,0,0,0,0,0,0,0,0,0,0,0,2,324,539,-215,1,0,24,-239,1,114
2,3,110000.0,22.05,17,383,480.0,17,80,560.0,12,15,0,15,1,0,1,1,30000.0,1,1,0,1,-999.0,1,1,0,1,-999.0,2,2,4,0,0,0,0,0,0,0,89,0,0,0,0,0,184,0,0,0,0,0,4,4,564,103,461,0,0,24,437,0,96
3,4,150000.0,60.48,54,497,720.0,6,235,240.0,3,4,0,4,0,0,0,0,-999.0,0,0,0,0,-999.0,0,0,0,0,-999.0,2,4,6,0,0,0,0,0,0,0,225,0,0,0,0,0,326,0,0,0,0,0,4,4,564,353,211,0,1,24,187,0,13
4,5,74000.0,39.66,13,69,400.0,29,470,560.0,1,2,0,2,1,0,1,1,17000.0,0,0,0,0,-999.0,0,0,0,0,-999.0,1,4,5,0,0,0,0,0,0,0,193,0,0,0,0,0,351,0,0,0,0,0,4,1,264,380,-116,1,0,24,-140,1,119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62718,62728,,25.66,25,289,240.0,59,228,720.0,6,8,0,8,0,0,0,0,-999.0,0,0,0,0,-999.0,0,0,0,0,-999.0,2,3,5,0,0,0,0,0,0,0,284,0,0,0,0,0,204,0,0,0,0,0,7,4,564,2,562,0,0,24,538,0,96
62719,62729,,22.71,61,181,160.0,28,376,320.0,8,15,0,15,1,1,0,1,43200.0,1,1,0,1,-999.0,-999,-999,-999,-999,-999.0,-999,-999,-999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4,564,187,377,0,0,24,353,0,116
62720,62730,,45.76,60,232,800.0,57,313,1600.0,10,14,0,14,1,1,0,1,32400.0,1,1,0,1,-999.0,0,0,0,0,-999.0,-999,-999,-999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,4,564,13,551,0,1,24,527,0,106
62721,62731,,55.20,32,509,240.0,6,130,320.0,14,14,0,14,-999,-999,-999,-999,-999.0,1,1,0,1,-999.0,-999,-999,-999,-999,-999.0,-999,-999,-999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,564,185,379,0,0,24,355,0,95


## 今までの特徴量と合わせる

In [155]:
# Reload every saved feature set and stack train on top of test for each.
# For feature sets 2-4 the test files carry an extra 'index' column, which is dropped here.
train_feat1=pd.read_feather('feature1/data/train_feat1_onehot.ftr')
test_feat1=pd.read_feather('feature1/data/test_feat1_onehot.ftr')
df_all_feat1=pd.concat([train_feat1,test_feat1],axis=0,sort=False).reset_index(drop=True)

train_feat2=pd.read_feather('feature2/data/train_feat2_unique.ftr')
test_feat2=pd.read_feather('feature2/data/test_feat2_unique.ftr').drop('index',axis=1)
df_all_feat2=pd.concat([train_feat2,test_feat2],axis=0,sort=False).reset_index(drop=True)

train_feat3=pd.read_feather('feature3/data/train_feat3_unique.ftr')
test_feat3=pd.read_feather('feature3/data/test_feat3_unique.ftr').drop('index',axis=1)
df_all_feat3=pd.concat([train_feat3,test_feat3],axis=0,sort=False).reset_index(drop=True)

train_feat4=pd.read_feather('feature4/data/train_feat4_unique.ftr')
test_feat4=pd.read_feather('feature4/data/test_feat4_unique.ftr').drop('index',axis=1)
df_all_feat4=pd.concat([train_feat4,test_feat4],axis=0,sort=False).reset_index(drop=True)


In [156]:
# Show which columns feature sets 2 and 3 contribute.
print(train_feat2.columns)
print(train_feat3.columns)

Index(['id', 'longitude', 'latitude', 'address_city_enc', 'address_town_enc'], dtype='object')
Index(['id', 'R', 'K', 'DK', 'LDK', 'S', 'room_number', 'Square/Room',
       'mesh_category_enc', 'others_rent'],
      dtype='object')


In [157]:
# Left-join the other feature sets onto feature set 4 by listing id.
# NOTE(review): the positional train/test split done afterwards assumes `id` is unique
# in each right-hand frame, otherwise the join adds rows - confirm.
df=pd.merge(df_all_feat4,df_all_feat1,on='id',how='left')
df=pd.merge(df,df_all_feat2,on='id',how='left')
df=pd.merge(df,df_all_feat3,on='id',how='left')

In [160]:
# Split the merged frame back into train / test by row position.
train_feat4_all=df[:train_index]
test_feat4_all=df[train_index:]

In [162]:
train_feat4_all.to_feather('feature4/data/train_feat4_all.ftr')

# The test slice does not start at index 0, so its index is reset before writing;
# the old index is kept as an 'index' column, exactly as before. reset_index()
# returns a new frame instead of mutating a slice of df in place.
test_feat4_all=test_feat4_all.reset_index()
test_feat4_all.to_feather('feature4/data/test_feat4_all.ftr')

Unnamed: 0,direction_enc,macro_structure,floor_info_macro_structure
0,5,4,96
1,0,2,114
2,4,4,96
3,4,4,13
4,4,1,119
...,...,...,...
31456,4,2,120
31457,8,4,116
31458,4,2,22
31459,4,1,42
