In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_columns = 100

import seaborn as sns
import pandas_profiling as pdp
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from IPython.display import display
import collections
import re
import feather
import codecs

from sklearn.preprocessing import LabelEncoder

In [34]:
# Load the raw train / test tables.
train=pd.read_csv('../input/train.csv')
test=pd.read_csv('../input/test.csv')

# Stack train on top of test with a fresh 0..n-1 index, then assign short
# English column names positionally (17 names, so 17 columns are expected).
df_all=pd.concat([train,test],ignore_index=True,sort=False)
df_all.columns=[
    'id','rent','address','access','floor_info','old','direction','square','floor',
    'bath','kitchen','broadcast','facility','parking','neighbors','structure','period',
]

# Feature tables written by the "feature1" step.
train_feat1=pd.read_feather('feature1/data/train_feat1.ftr')
test_feat1=pd.read_feather('feature1/data/test_feat1.ftr')

## 外れ値・入力ミス（タイプミス）を除去

In [35]:
# 外れ値、異常データを削除

def drop_train_outlier(df_train, drop_ids=(20428, 20232, 20927)):
    """Return ``df_train`` without known-bad and outlier rows.

    The numeric features needed for the outlier rules (``rent``,
    ``square_num``, ``old_num``) are read from
    ``feature1/data/train_feat1.ftr``. Rows are matched between the two
    frames by their ``id`` column rather than by index label, so the result
    does not depend on both frames sharing the same index.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training table; must contain an ``id`` column. It is not modified.
    drop_ids : iterable of int, optional
        Ids that are always removed (manually identified bad records).

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``df_train`` with a fresh 0..n-1 index.
    """
    drop_ids = list(drop_ids)
    train_feat1 = pd.read_feather('feature1/data/train_feat1.ftr')
    feat = train_feat1[~train_feat1['id'].isin(drop_ids)]

    # Outliers relative to floor area.
    area_outlier = (
        ((feat['rent'] > 1000000) & (feat['square_num'] < 100))
        | (feat['rent'] > 1500000)
        | (feat['square_num'] > 400)
        | ((feat['rent'] > 100000) & (feat['square_num'] < 10))
    )
    # Outliers relative to building age.
    age_outlier = feat['old_num'] > 1000

    outlier_ids = set(drop_ids) | set(feat.loc[area_outlier | age_outlier, 'id'])
    return df_train[~df_train['id'].isin(outlier_ids)].reset_index(drop=True)

# テストデータ中の外れ値を補正
def modify_test_outlier(df_test):
    """Overwrite the building-age text of outlier rows, in place.

    Rows are selected by index label from ``feature1/data/test_feat1.ftr``
    wherever ``old_num`` exceeds 1000; for those labels the ``'築年数'``
    column of ``df_test`` is set to ``'11年4ヶ月'``. The same (mutated)
    frame is returned.
    """
    feat=pd.read_feather('feature1/data/test_feat1.ftr')
    bad_rows=feat.index[feat['old_num']>1000]
    df_test.loc[bad_rows,'築年数']='11年4ヶ月'
    return df_test
    
# Clean the raw tables.
train_drop=drop_train_outlier(train)
test_mod=modify_test_outlier(test)

# Row counts; train_index is the train/test boundary inside the stacked frames.
train_index,test_index=len(train_drop),len(test_mod)

# Rebuild the stacked raw frame from the cleaned tables and rename positionally.
df_all=pd.concat([train_drop,test_mod],ignore_index=True,sort=False)
df_all.columns=[
    'id','rent','address','access','floor_info','old','direction','square','floor',
    'bath','kitchen','broadcast','facility','parking','neighbors','structure','period',
]

# Keep only the feature-1 rows whose id survived the train clean-up.
train_feat1_drop=train_drop[['id']].merge(train_feat1,on='id',how='left')
# NOTE(review): this writes the text column '築年数' on the feature frame and
# leaves `old_num` itself untouched - confirm that is the intended correction.
test_feat1_mod=modify_test_outlier(test_feat1)

df_all_feat1=pd.concat([train_feat1_drop,test_feat1_mod],ignore_index=True,sort=False)

## 経度・緯度を加える

In [36]:
# Load the two address -> coordinate lookup tables (Shift-JIS encoded CSV;
# undecodable bytes are ignored) and preview the first rows of each.
with codecs.open("../input/13000-17.0a/13_2018.csv", "r", "Shift-JIS", "ignore") as file:
    df_cordA = pd.read_csv(file)
    display(df_cordA[:3])
with codecs.open("../input/13000-12.0b/13_2018.csv", "r", "Shift-JIS", "ignore") as file:
    df_cordB = pd.read_csv(file)
    display(df_cordB[:3])

# One coordinate per (ward, town), keeping the last occurrence.
# '緯度' is latitude and '経度' is longitude; renaming by name (instead of
# assigning a positional list) keeps the two from being swapped. The final
# column order is unchanged: address_city, address_town, longitude, latitude.
cordinate = (
    df_cordA[['市区町村名','大字・丁目名','緯度','経度']]
    .rename(columns={
        '市区町村名': 'address_city',
        '大字・丁目名': 'address_town',
        '緯度': 'latitude',
        '経度': 'longitude',
    })
    [['address_city','address_town','longitude','latitude']]
    .drop_duplicates(subset=['address_city','address_town'],keep='last')
)

Unnamed: 0,都道府県名,市区町村名,大字・丁目名,小字・通称名,街区符号・地番,座標系番号,Ｘ座標,Ｙ座標,緯度,経度,住居表示フラグ,代表フラグ,更新前履歴フラグ,更新後履歴フラグ
0,東京都,千代田区,麹町六丁目,,1,9,-34981.8,-9228.7,35.684649,139.731373,0,1,0,0
1,東京都,千代田区,麹町六丁目,,5,9,-34981.8,-9228.7,35.684649,139.731373,0,1,0,0
2,東京都,千代田区,六番町,,6,9,-34653.0,-9100.3,35.687614,139.732787,0,1,0,0


Unnamed: 0,都道府県コード,都道府県名,市区町村コード,市区町村名,大字町丁目コード,大字町丁目名,緯度,経度,原典資料コード,大字・字・丁目区分コード
0,13,東京都,13101,千代田区,131010001002,内幸町二丁目,35.670812,139.754182,0,3
1,13,東京都,13101,千代田区,131010001001,内幸町一丁目,35.670839,139.758119,0,3
2,13,東京都,13101,千代田区,131010002003,霞が関三丁目,35.671825,139.746988,0,3


In [37]:
# Extract the ward: the text after the first '都', up to and including the first '区'.
city_tmp=df_all['address'].map(lambda addr:addr.split('都')[1])
df_all['address_city']=city_tmp.map(lambda rest:rest.split('区')[0]+'区')

# Raw town part: the text between the first and second '区'.
town_tmp=city_tmp.map(lambda rest:rest.split('区')[1])

def town_enc(x):
    """Reduce the town part of an address to a town (and chome) name.

    The text is first cut at the lot-number separator. If exactly one
    '丁目' marker remains, everything up to and including it is returned;
    otherwise the text before the first digit run is returned.
    """
    # Drop the lot number and everything after it.
    head=re.split(r'\d+-|ー|－+\d+',x)[0]
    pieces=head.split('丁目')
    if len(pieces)!=2:
        # Chome not identifiable: strip any trailing digits.
        return re.split(r'\d+',pieces[0])[0]
    return pieces[0]+'丁目'
    
def int2kanji(x):
    """Replace the first digit run in ``x`` with its kanji numeral.

    Single digits 1-9, in ASCII or full-width form, are mapped to
    一..九; every occurrence of that digit string in ``x`` is replaced.
    Strings with no digits, or whose first digit run has no mapping
    (zero or multi-digit numbers), are returned unchanged instead of
    raising ``KeyError``.
    """
    kanji_digits='一二三四五六七八九'
    # ASCII and full-width digits share one table; 7-9 are included for both.
    kanji_nums=dict(zip('123456789',kanji_digits))
    kanji_nums.update(zip('１２３４５６７８９',kanji_digits))

    num=re.findall(r'\d+',x)
    if not num or num[0] not in kanji_nums:
        return x
    return x.replace(num[0],kanji_nums[num[0]])
    
def address_enc(df):
    """Add label-encoded ward and town columns to ``df`` and return it.

    ``address_city`` and ``address_town`` are each encoded with their own
    ``LabelEncoder``; the results are stored as ``address_city_enc`` and
    ``address_town_enc``. ``df`` is modified in place.
    """
    for source_col in ('address_city','address_town'):
        df[source_col+'_enc']=LabelEncoder().fit_transform(df[source_col])
    return df
# Normalise the town text (strip lot numbers, then digits -> kanji numerals).
df_all['address_town']=town_tmp.map(town_enc).map(int2kanji)

# Add the label-encoded ward / town columns.
df_all=address_enc(df_all)

In [38]:
# Attach coordinates by (ward, town); unmatched rows get NaN.
df_all=df_all.merge(cordinate,on=['address_city','address_town'],how='left')

# Fill missing coordinates with the column mean (NaN values are skipped).
df_all['longitude']=df_all['longitude'].fillna(df_all['longitude'].mean())
df_all['latitude']=df_all['latitude'].fillna(df_all['latitude'].mean())

## アクセスから、最寄り４駅の路線・駅名・距離を特徴量に入れる。
## 特徴量４でより正確な値を求めたので、このセクションの特徴量は不要。

In [599]:
# Take the line and station name from the first entry of the access string
# (tab-separated fields 0 and 1).
df_all['train_line']=df_all['access'].map(lambda access:access.split('\t')[0])
df_all['train_station']=df_all['access'].map(lambda access:access.split('\t')[1])

# Field 2 is how the station is reached; anything mentioning a bus or car is
# collapsed into the single label '車移動'.
df_all['station_access']=df_all['access'].map(lambda access:access.split('\t')[2]).map(
    lambda how:'車移動' if ('バス' in how or '車' in how) else how
)

#  数値型に変更
def enc_num(x):
    """Convert a station-access label to an integer.

    ``'車移動'`` maps to the sentinel 999; any other string yields the
    first run of digits it contains. Raises ``IndexError`` if the string
    holds no digits.
    """
    if x=='車移動':
        return 999
    # Raw string: '\d' inside a non-raw (f-)string is an invalid escape sequence.
    return int(re.findall(r'\d+',x)[0])
    
# Turn the access label into a number (999 marks bus / car access).
df_all['station_access']=df_all['station_access'].map(enc_num)


In [600]:
# 最寄りの駅情報を４つに分ける
def split_access(x):
    """Split an access string into exactly four per-station entries.

    Entries are separated by a double tab. Only the first four are kept;
    if there are fewer, the remaining slots are filled with the integer 0.
    Returns a 4-tuple.
    """
    entries=x.split('\t\t')[:4]
    padding=(0,)*(4-len(entries))
    return tuple(entries)+padding

# 各最寄り駅情報を路線、駅、移動方法に分ける
def access_enc(x,func):
    """Extract one component of a single station-access entry.

    ``x`` is either a tab-separated entry or the integer 0 (an empty slot).
    ``func`` selects the component:

    * ``'line'``    - first field, if it contains '線'
    * ``'station'`` - second field, if present and containing '駅'
    * anything else - third field (only when there are exactly three
      fields), or ``'車移動'`` when the entry mentions a bus or the third
      field mentions a car

    The string ``'nan'`` is returned whenever the component is unavailable.
    """
    if x==0:
        return 'nan'

    fields=x.split('\t')

    # Line name.
    if func=='line':
        return fields[0] if '線' in fields[0] else 'nan'

    # Station name.
    if func=='station':
        has_station=len(fields)>=2 and '駅' in fields[1]
        return fields[1] if has_station else 'nan'

    # Distance / means of access.
    if len(fields)!=3:
        return 'nan'
    if 'バス' in x or '車' in fields[2]:
        return '車移動'
    return fields[2]
        
#  距離を数値型に変更
def enc_num(x):
    """Convert an access-distance label to an integer.

    ``'車移動'`` maps to 999 and the missing marker ``'nan'`` to -999; any
    other string yields the first run of digits it contains. Raises
    ``IndexError`` if such a string holds no digits.
    """
    if x=='車移動':
        return 999
    if x=='nan':
        return -999
    # Raw string: '\d' inside a non-raw (f-)string is an invalid escape sequence.
    return int(re.findall(r'\d+',x)[0])

# Split every access string into four per-station slots (0 marks an empty slot).
tmp=df_all['access'].apply(split_access)

for i in range(4):
    # Raw entry, then its line / station / distance components.
    # The lambda is consumed immediately, so capturing the loop variable is safe.
    df_all[f'access_{i+1}']=tmp.apply(lambda x:x[i])
    df_all[f'access_{i+1}_line']=df_all[f'access_{i+1}'].apply(lambda x:access_enc(x,'line'))
    df_all[f'access_{i+1}_station']=df_all[f'access_{i+1}'].apply(lambda x:access_enc(x,'station'))
    df_all[f'access_{i+1}_distance']=df_all[f'access_{i+1}'].apply(lambda x:enc_num(access_enc(x,'distance')))

# Label-encode line and station names with one shared vocabulary across the
# four slots, so the same name gets the same code in every slot.
for col in ['line','station']:
    le=LabelEncoder()
    encoder=list(set().union(*(df_all[f'access_{slot}_{col}'].unique() for slot in range(1,5))))

    le.fit(encoder)
    for i in range(4):
        df_all[f'access_{i+1}_{col}']=le.transform(df_all[f'access_{i+1}_{col}'])
        df_all[f'access_{i+1}_{col}']=df_all[f'access_{i+1}_{col}'].astype('category')

## 面積から統計特徴量を加える
1 単位面積あたりの価格の区ごとの平均、最大値、最小値、中央値、標準偏差を加える

In [606]:
# Training rows only (the first train_index rows), with square_num attached.
tmp=pd.concat([df_all,df_all_feat1['square_num']],axis=1).iloc[:train_index]
tmp['rent_per_square']=tmp['rent'].div(tmp['square_num'])

# Per-ward statistics of rent per unit area.
rent_per_square_static=(
    tmp.groupby('address_city')['rent_per_square']
    .agg(['mean','max','min','median','std'])
    .reset_index()
)
rent_per_square_static.columns=['address_city','rent/S_mean','rent/S_max','rent/S_min','rent/S_median','rent/S_std']

# Broadcast the ward statistics onto every row (train and test).
df_all=df_all.merge(rent_per_square_static,on='address_city',how='left')

2 住宅価格の区ごとの統計特徴量を加える

In [610]:
# Training rows only (the first train_index rows).
tmp=pd.concat([df_all,df_all_feat1['square_num']],axis=1).iloc[:train_index]

# Per-ward statistics of the rent itself.
rent_static=(
    tmp.groupby('address_city')['rent']
    .agg(['mean','max','min','median','std'])
    .reset_index()
)
rent_static.columns=['address_city','rent_mean','rent_max','rent_min','rent_median','rent_std']

# Broadcast the ward statistics onto every row (train and test).
df_all=df_all.merge(rent_static,on='address_city',how='left')

## ======保存=======

## 特徴量２にユニークな特徴を保存

In [43]:
# Columns introduced by this notebook that are saved on their own.
feat2_cols=['id','longitude','latitude','address_city_enc','address_town_enc']

# Rows before train_index are train, the rest are test.
train_feat2=df_all.loc[:,feat2_cols].iloc[:train_index]
test_feat2=df_all.loc[:,feat2_cols].iloc[train_index:]

train_feat2.to_feather('feature2/data/train_feat2_unique.ftr')

# The test slice does not start at 0, so its index is reset before saving.
# NOTE(review): without drop=True the old labels are written out as an extra
# 'index' column - confirm downstream readers expect it.
test_feat2=test_feat2.reset_index()
test_feat2.to_feather('feature2/data/test_feat2_unique.ftr')

In [2]:
# Feature-1 columns, minus the three access columns this notebook drops.
feature1=df_all_feat1.drop(columns=['train_line','train_station','station_access'])
# Every non-object column of df_all except the key and the target.
new_feature=df_all.select_dtypes(exclude='object').drop(columns=['id','rent'])
# Side-by-side join; both frames are expected to share the same row index.
df_all_feat2=pd.concat([feature1,new_feature],axis=1)

NameError: name 'df_all_feat1' is not defined

In [621]:
# Split the combined feature table back at the train/test boundary.
train=df_all_feat2.iloc[:train_index]
test=df_all_feat2.iloc[train_index:]

In [627]:
train.to_feather('train_feat2.ftr')

# The test slice does not start at 0, so its index is reset before saving.
# NOTE(review): without drop=True the old labels are written out as an extra
# 'index' column - confirm downstream readers expect it.
test=test.reset_index()
test.to_feather('test_feat2.ftr')

In [1]:
# Display the feature-1 training table. The recorded run of this cell raised
# NameError; presumably the kernel had been restarted and the loading cell
# had not been re-run - TODO confirm, or remove this inspection cell.
train_feat1

NameError: name 'train_feat1' is not defined