In [1]:
import pandas as pd
import numpy as np

In [2]:
# columns discarded straight after loading (reason noted where one was recorded)
dropped_columns = [
    'MunicipalityCode',
    'DistrictCode',
    'PriceCategory',
    'PricePerUnit',  # blank
    'UnitPrice',  # blank
    'Prefecture',  # all Tokyo
]

# load the raw extract, trim the unused columns and give two columns clearer names
data = (
    pd.read_parquet('../data/tokyo.parquet')
    .drop(columns=dropped_columns)
    .rename(columns={'TradePrice': 'TradePriceYen', 'Direction': 'RoadDirection'})
)

In [3]:
# preview the loaded frame after the column drops/renames (the notebook renders an abbreviated view)
data

Unnamed: 0,Type,Region,Municipality,DistrictName,TradePriceYen,FloorPlan,Area,LandShape,Frontage,TotalFloorArea,...,Purpose,RoadDirection,Classification,Breadth,CityPlanning,CoverageRatio,FloorAreaRatio,Period,Renovation,Remarks
0,"Pre-owned Condominiums, etc.",,Chiyoda Ward,Kandasudacho,17000000,1K,25,,,,...,,,,,Commercial Zone,80,800,2nd quarter 2010,Done,
1,"Pre-owned Condominiums, etc.",,Chiyoda Ward,Yombancho,83000000,2LDK,65,,,,...,,,,,Category I Residential Zone,80,500,2nd quarter 2010,Not yet,
2,Residential Land(Land and Building),Commercial Area,Chiyoda Ward,,170000000,,135,Rectangular Shaped,8,630,...,,Northwest,Ward Road,8,Commercial Zone,80,500,2nd quarter 2010,,
3,Residential Land(Land and Building),Commercial Area,Chiyoda Ward,Fujimi,150000000,,50,Trapezoidal Shaped,6,220,...,,Southwest,Ward Road,11,Commercial Zone,80,500,2nd quarter 2010,,
4,"Pre-owned Condominiums, etc.",,Chiyoda Ward,Hirakawacho,130000000,1LDK,80,,,,...,,,,,Category II Residential Zone,60,400,2nd quarter 2010,Not yet,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607751,Residential Land(Land and Building),Residential Area,Hachijo Town,Mitsune,4000000,,890,Semi-shaped,27,140,...,,South,Town Road,5.5,Non-divided City Planning Area,70,200,1st quarter 2025,,
607752,Residential Land(Land Only),Residential Area,Hachijo Town,Mitsune,550000,,155,Irregular Shaped,15,,...,Other,Southwest,Town Road,6.6,Non-divided City Planning Area,70,200,1st quarter 2025,,
607753,Residential Land(Land and Building),Residential Area,Hachijo Town,Mitsune,10000000,,200,Semi-rectangular Shaped,10,95,...,House,East,Town Road,3,Non-divided City Planning Area,70,200,2nd quarter 2025,,
607754,Residential Land(Land and Building),Commercial Area,Hachijo Town,Mitsune,5000000,,310,Irregular Shaped,18,130,...,Office,Southeast,Tokyo Metropolitan Road,13,Non-divided City Planning Area,70,200,2nd quarter 2025,,


In [4]:
# distinct real estate categories present in the Type column

data['Type'].unique()

array(['Pre-owned Condominiums, etc.',
       'Residential Land(Land and Building)',
       'Residential Land(Land Only)', 'Forest Land', 'Agricultural Land'],
      dtype=object)

In [5]:
# keep only residential real estate (land with a building) and pre-owned condominiums

residential_types = [
    'Residential Land(Land and Building)',
    'Pre-owned Condominiums, etc.',
]
data = data[data.Type.isin(residential_types)].copy()
data = data.reset_index(drop=True)

In [6]:
# original Japanese names of Wards/Cities
#
# The lookup is built from a compact table of (English name as it appears in the
# data, Japanese name) pairs; every value has the form '<Japanese> (<English>)'.

_municipality_japanese = [
    # 23 special wards
    ('Chiyoda Ward', '千代田区'),
    ('Chuo Ward', '中央区'),
    ('Minato Ward', '港区'),
    ('Shinjuku Ward', '新宿区'),
    ('Bunkyo Ward', '文京区'),
    ('Taito Ward', '台東区'),
    ('Sumida Ward', '墨田区'),
    ('Koto Ward', '江東区'),
    ('Shinagawa Ward', '品川区'),
    ('Meguro Ward', '目黒区'),
    ('Ota Ward', '大田区'),
    ('Setagaya Ward', '世田谷区'),
    ('Shibuya Ward', '渋谷区'),
    ('Nakano Ward', '中野区'),
    ('Suginami Ward', '杉並区'),
    ('Toshima Ward', '豊島区'),
    ('Kita Ward', '北区'),
    ('Arakawa Ward', '荒川区'),
    ('Itabashi Ward', '板橋区'),
    ('Nerima Ward', '練馬区'),
    ('Adachi Ward', '足立区'),
    ('Katsushika Ward', '葛飾区'),
    ('Edogawa Ward', '江戸川区'),
    # cities
    ('Hachioji City', '八王子市'),
    ('Tachikawa City', '立川市'),
    ('Musashino City', '武蔵野市'),
    ('Mitaka City', '三鷹市'),
    ('Oume City', '青梅市'),
    ('Fuchu City', '府中市'),
    ('Akishima City', '昭島市'),
    ('Chofu City', '調布市'),
    ('Machida City', '町田市'),
    ('Koganei City', '小金井市'),
    ('Kodaira City', '小平市'),
    ('Hino City', '日野市'),
    ('Higashimurayama City', '東村山市'),
    ('Kokubunji City', '国分寺市'),
    ('Kunitachi City', '国立市'),
    ('Fussa City', '福生市'),
    ('Komae City', '狛江市'),
    ('Higashiyamato City', '東大和市'),
    ('Kiyose City', '清瀬市'),
    ('Higashikurume City', '東久留米市'),
    ('Musashimurayama City', '武蔵村山市'),
    ('Tama City', '多摩市'),
    ('Inagi City', '稲城市'),
    ('Hamura City', '羽村市'),
    ('Akiruno City', 'あきる野市'),
    ('Nishitokyo City', '西東京市'),
    # towns and villages
    ('Mizuho Town, Nishitama County', '瑞穂町'),
    ('Hinode Town, Nishitama County', '日の出町'),
    ('Hinohara Village, Nishitama County', '檜原村'),
    ('Okutama Town, Nishitama County', '奥多摩町'),
    ('Oshima Town', '大島町'),
    ('Niijima Village', '新島村'),
    ('Miyake Village', '三宅村'),
    ('Hachijo Town', '八丈町'),
    ('Ogasawara Village', '小笠原村'),
    ('Kozushima Village', '神津島村'),
]

municipality_mapping = {
    english: f'{japanese} ({english})'
    for english, japanese in _municipality_japanese
}

In [7]:
# add original Japanese
#
# Series.map(dict) yields NaN for every value that is not a key of the dict.
# A bare map would therefore blank the whole column if this cell is run a second
# time (the values are then already in '<Japanese> (<English>)' form), and would
# silently null any municipality missing from the lookup. Falling back to the
# current value keeps the cell safe to re-run and never discards a name.

municipality_labeled = data['Municipality'].map(municipality_mapping)
data['Municipality'] = municipality_labeled.fillna(data['Municipality'])

In [8]:
# data type conversions

# blank strings mark missing values in the raw extract; turn them into NaN first
data = data.replace('', np.nan)

# Widths are spelled out explicitly: a bare `int` is platform-dependent (32-bit on
# Windows with NumPy < 2), which is too narrow for large yen amounts.
# Nullable 'Int64' is used for integer columns that contain missing values.
data = data.astype({
    'TradePriceYen': 'int64',
    'Area': 'int64',
    'Frontage': 'float64',
    'TotalFloorArea': 'Int64',
    'CoverageRatio': 'Int64',  # these are all integers
    'FloorAreaRatio': 'Int64',  # these are all integers
    'Breadth': 'float64',
})

In [9]:
# large lots seem to be capped at 9999. remove these to narrow the scope to "typical" lot sizes

area_cap = 9999  # sentinel value observed at the top of the sorted Area values
capped_count = len(data[data.Area == area_cap])
capped_pct = np.round(capped_count / len(data) * 100, 2)

print(np.sort(data.Area.unique()))
print()
print('outliers with 9999 area size (count):', capped_count)
print('outliers with 9999 area size (%): ' + str(capped_pct) + '%')

# keep only rows strictly below the cap and renumber the index from zero
data = data[data.Area < area_cap].copy()
data = data.reset_index(drop=True)

[  10   15   20   25   30   35   40   45   50   55   60   65   70   75
   80   85   90   95  100  105  110  115  120  125  130  135  140  145
  150  155  160  165  170  175  180  185  190  195  200  205  210  215
  220  225  230  235  240  245  250  255  260  265  270  275  280  285
  290  295  300  305  310  315  320  325  330  335  340  345  350  355
  360  365  370  375  380  385  390  395  400  405  410  415  420  425
  430  435  440  445  450  455  460  470  475  480  485  490  495  500
  505  510  515  520  525  530  540  545  550  555  560  570  580  590
  600  610  620  630  640  645  650  660  670  680  690  700  705  710
  715  720  730  740  750  755  760  765  770  780  790  800  810  815
  820  830  840  850  855  860  870  880  890  900  910  920  930  940
  950  960  970  980  990 1000 1020 1100 1200 1300 1400 1500 1600 1700
 1800 1900 9999]

outliers with 9999 area size (count): 452
outliers with 9999 area size (%): 0.09%


In [10]:
# flag columns which have floored/capped values
#
# NOTE: BuildingYearFloored is derived from the raw 'before the war' marker, which
# this cell then overwrites with 1945 - run it once per fresh load of the data.

# BuildingYear has a value 'before the war' which acts as a floor year
# we will use the 'flag and floor method' before training the XGBoost model:
# set the value as 1945 but flag it as floored in a separate column
# (assign the result back: an inplace replace on `data.BuildingYear` acts on an
# intermediate object, raises a FutureWarning and stops working in pandas 3.0)
data['BuildingYear'] = data['BuildingYear'].replace('', np.nan)
building_year_floored = data['BuildingYear'].eq('before the war')
data['BuildingYearFloored'] = building_year_floored
data['BuildingYear'] = (
    data['BuildingYear']
    .mask(building_year_floored, 1945)
    .astype(float)
    .astype('Int64')
)

# Frontage has a value 9999.9 that appears for capped values
# we will leave the value alone but flag it for XGBoost
# (missing frontage compares as False)
data['FrontageCapped'] = data['Frontage'].eq(9999.9)

# TotalFloorArea has a value 9999 that appears for capped values
# we will leave the value alone but flag it for XGBoost
# (the column is nullable Int64, so the comparison yields <NA> for missing rows;
# those are treated as "not capped" and the flag is stored as a plain bool)
data['TotalFloorAreaCapped'] = data['TotalFloorArea'].eq(9999).fillna(False).astype(bool)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data.BuildingYear.replace('', np.nan, inplace=True)


In [11]:
# Period has string format eg '2nd quarter 2010'

# vectorised parsing: the year is the last four characters, the quarter the first one
# (astype raises if any value is not numeric, so malformed periods fail loudly)
data['TransactionYear'] = data['Period'].str[-4:].astype('int64')
data['TransactionQuarter'] = data['Period'].str[0].astype('int64')

# create a separate datetime column to replace Period, using the end date of each quarter
quarter_end = {
    1: (3, 31),
    2: (6, 30),
    3: (9, 30),
    4: (12, 31)
}

unknown_quarters = ~data['TransactionQuarter'].isin(list(quarter_end))
if unknown_quarters.any():
    raise ValueError(
        'unexpected quarter values in Period: '
        + str(sorted(data.loc[unknown_quarters, 'TransactionQuarter'].unique()))
    )

# assemble all dates in one call instead of building one Timestamp per row
data['TransactionQuarterEndDate'] = pd.to_datetime(pd.DataFrame({
    'year': data['TransactionYear'],
    'month': data['TransactionQuarter'].map({q: month for q, (month, _) in quarter_end.items()}),
    'day': data['TransactionQuarter'].map({q: day for q, (_, day) in quarter_end.items()}),
}))

data = data.drop(columns='Period')

In [12]:
# report the size of the cleaned frame
row_count, column_count = data.shape
print(f'columns: {column_count}')
print(f'rows: {row_count}')

columns: 28
rows: 521867


In [13]:
# sanity check: list any column that still contains an empty string (no output means none do)
columns_with_blanks = [name for name in data.columns if '' in data[name].unique()]
for name in columns_with_blanks:
    print(name)

In [14]:
# summarise each column's dtype and non-null count after cleaning
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 521867 entries, 0 to 521866
Data columns (total 28 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   Type                       521867 non-null  object        
 1   Region                     169305 non-null  object        
 2   Municipality               521867 non-null  object        
 3   DistrictName               521828 non-null  object        
 4   TradePriceYen              521867 non-null  int64         
 5   FloorPlan                  342105 non-null  object        
 6   Area                       521867 non-null  int64         
 7   LandShape                  169247 non-null  object        
 8   Frontage                   154219 non-null  float64       
 9   TotalFloorArea             185942 non-null  Int64         
 10  BuildingYear               503569 non-null  Int64         
 11  Structure                  506866 non-null  object  

In [15]:
# preview the first rows of the cleaned frame, including the new flag and transaction-date columns
data.head()

Unnamed: 0,Type,Region,Municipality,DistrictName,TradePriceYen,FloorPlan,Area,LandShape,Frontage,TotalFloorArea,...,CoverageRatio,FloorAreaRatio,Renovation,Remarks,BuildingYearFloored,FrontageCapped,TotalFloorAreaCapped,TransactionYear,TransactionQuarter,TransactionQuarterEndDate
0,"Pre-owned Condominiums, etc.",,千代田区 (Chiyoda Ward),Kandasudacho,17000000,1K,25,,,,...,80,800,Done,,False,False,False,2010,2,2010-06-30
1,"Pre-owned Condominiums, etc.",,千代田区 (Chiyoda Ward),Yombancho,83000000,2LDK,65,,,,...,80,500,Not yet,,False,False,False,2010,2,2010-06-30
2,Residential Land(Land and Building),Commercial Area,千代田区 (Chiyoda Ward),,170000000,,135,Rectangular Shaped,8.0,630.0,...,80,500,,,False,False,False,2010,2,2010-06-30
3,Residential Land(Land and Building),Commercial Area,千代田区 (Chiyoda Ward),Fujimi,150000000,,50,Trapezoidal Shaped,6.0,220.0,...,80,500,,,False,False,False,2010,2,2010-06-30
4,"Pre-owned Condominiums, etc.",,千代田区 (Chiyoda Ward),Hirakawacho,130000000,1LDK,80,,,,...,60,400,Not yet,,False,False,False,2010,2,2010-06-30


In [16]:
# data write to parquet

output_path = '../data/tokyo-clean.parquet'
try:
    data.to_parquet(output_path, index=False)
except Exception as e:
    # report the failure, then re-raise: swallowing it would let the notebook
    # finish without the cleaned file having been written
    print(f"error writing to parquet: {e}")
    raise
else:
    print(f"successfuly wrote {data.shape[0]} rows and {data.shape[1]} columns to parquet file: {output_path}")


successfuly wrote 521867 rows and 28 columns to parquet file: ../data/tokyo-clean.parquet
