In [1]:
import pandas as pd

In [2]:
# Load the cleaned Tokyo dataset; the path is relative to the notebook's working directory.
data = pd.read_parquet(path='../data/tokyo-clean.parquet')

In [3]:
# New feature: building age at the time of the transaction,
# i.e. TransactionYear minus BuildingYear.
data['BuildingAge'] = data['TransactionYear'].sub(data['BuildingYear'])

In [4]:
def parse_floor_plan(df: pd.DataFrame) -> pd.DataFrame:
    """
    Parse Japanese real-estate 'LDK' floor-plan strings into numeric features.

    Reads the 'FloorPlan' column and returns a copy of ``df`` with five integer
    columns appended, in this order:

    - ``RoomCount``: the number at the very start of the plan string, or 0 when
      the plan is missing or does not start with digits.
    - ``Has_L``, ``Has_D``, ``Has_K``, ``Has_S``: 1 if the normalised plan
      contains that letter (case-insensitive), otherwise 0.
      L = Living, D = Dining, K = Kitchen, S = Service Room (storage, "Nando").

    Example: '2LDK+S' -> RoomCount=2, Has_L=1, Has_D=1, Has_K=1, Has_S=1

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'FloorPlan' column of strings; missing values are
        allowed. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the five feature columns added. 'FloorPlan' is
        intentionally kept so the caller can inspect the mapping.
        TODO: drop 'FloorPlan' inside this function when it moves to the script.

    Raises
    ------
    KeyError
        If ``df`` has no 'FloorPlan' column.
    """
    # Work on a copy so the caller's frame is never mutated.
    result = df.copy()

    # 1. Standardise the special text cases into LDK-style codes.
    # 'Studio Apartment' and 'Open Floor' are effectively 1-room layouts, so
    # both become '1R' and the leading-digit regex picks up the '1'. With all
    # of Has_L/Has_D/Has_K equal to 0 the model can still recognise them, and
    # the area feature can separate the two.
    # The normalised plan lives in a local Series rather than a scratch column,
    # so an existing column in ``df`` can never be overwritten or dropped.
    plan = result['FloorPlan'].replace({
        'Studio Apartment': '1R',
        'Open Floor': '1R',
        'Duplex': '2LDK',  # Assumption: Treat Duplex as at least 2 rooms
        'None': '0R'  # Handle missing strings if any
    })

    # 2. Room count: the digits at the very start of the string.
    # expand=False yields a Series; to_numeric turns the captured text into
    # numbers (NaN where nothing matched) before the 0-fill, so no integer is
    # ever written into a text-typed column.
    leading_digits = plan.str.extract(r'^(\d+)', expand=False)
    result['RoomCount'] = pd.to_numeric(leading_digits).fillna(0).astype(int)

    # 3./4. One 0/1 flag per layout letter. The service room is usually
    # written "+S" or just "S", so a plain containment check covers it too.
    # na=False makes a missing plan count as "letter absent".
    for letter in ('L', 'D', 'K', 'S'):
        result[f'Has_{letter}'] = (
            plan.str.contains(letter, case=False, na=False).astype(int)
        )

    return result

# List every distinct raw floor-plan value the parser has to cope with.
data['FloorPlan'].unique()

array(['1K', '2LDK', None, '1LDK', '1DK', '1R', '3LDK', '3DK', '4LDK',
       '2DK', 'Open Floor', '2K', 'Studio Apartment', '1LDK+S', '3LK',
       'Duplex', '1K+S', '4LDK+S', '3LDK+S', '2LDK+S', '3K', '5LDK',
       '4DK', '3DK+S', '2DK+S', '2LK', '6LDK', '7LDK', '1DK+S', '1LK',
       '3LD', '1R+S', '4K', '4DK+S', '2LK+S', '2LD+S', '3LD+S', '2K+S',
       '5LDK+S', '2LD', '5DK', '1L+S', '6LDK+S', '3LDK+K', '1L', '6DK',
       '1LK+S', '8LDK', '5LK', '5K', '6DK+S', '7LDK+S', '3K+S', '7DK',
       '6K', '1LDK+K', '5K+S', '5DK+S', '3LK+S', '4K+S', '8LDK+S', '2L+S',
       '4LK', '6K+S', '1LD+S', '6LK', '4L+K'], dtype=object)

In [5]:
# Append the floor-plan features, preview them next to the raw string, then
# discard the raw 'FloorPlan' column now that it has been encoded.
data = parse_floor_plan(data)
print(data.loc[:, ['FloorPlan', 'RoomCount', 'Has_L', 'Has_D', 'Has_K', 'Has_S']].head())
data = data.drop(columns='FloorPlan')

  FloorPlan  RoomCount  Has_L  Has_D  Has_K  Has_S
0        1K          1      0      0      1      0
1      2LDK          2      1      1      1      0
2      None          0      0      0      0      0
3      None          0      0      0      0      0
4      1LDK          1      1      1      1      0
