# Data Preparation

## Importing useful libraries

In [5]:
import pandas as pd
import numpy as np
import scipy as sp
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.iolib.summary2 import summary_col
import seaborn as sn
pd.options.mode.chained_assignment = None  # default='warn'

## Importing raw data

In [6]:
# Load the raw resale-transaction CSV. The path is relative, so it is resolved
# against the notebook's working directory.
raw_csv_path = 'resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv'
df = pd.read_csv(raw_csv_path)

## Data filtering

In [7]:
# Keep only the 4-room flats. The explicit .copy() gives `resale_data` its own
# data, so the column assignments made in the following cells write to an
# independent frame rather than to a slice of `df`.
resale_data = df[df['flat_type'] == '4 ROOM'].copy()
resale_data.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
33,2017-01,ANG MO KIO,4 ROOM,472,ANG MO KIO AVE 10,10 TO 12,92.0,New Generation,1979,61 years 06 months,400000.0
34,2017-01,ANG MO KIO,4 ROOM,475,ANG MO KIO AVE 10,07 TO 09,91.0,New Generation,1979,61 years 06 months,400000.0
35,2017-01,ANG MO KIO,4 ROOM,629,ANG MO KIO AVE 4,01 TO 03,94.0,New Generation,1981,63 years 09 months,403000.0
36,2017-01,ANG MO KIO,4 ROOM,546,ANG MO KIO AVE 10,01 TO 03,92.0,New Generation,1981,63 years,410000.0
37,2017-01,ANG MO KIO,4 ROOM,131,ANG MO KIO AVE 3,01 TO 03,98.0,New Generation,1979,61 years 01 month,425888.0


## Data transformation

### Separating `month` into `year` and `separate_month`

In [8]:
# `month` values look like '2017-01': characters 0-3 hold the year and
# characters 5-6 hold the month number.
def _year_part(stamp):
    """Return the numeric year taken from the first four characters."""
    return pd.to_numeric(stamp[:4])


def _month_part(stamp):
    """Return the numeric month taken from characters 5 and 6."""
    return pd.to_numeric(stamp[5:7])


resale_data['year'] = resale_data['month'].apply(_year_part)
resale_data['separate_month'] = resale_data['month'].apply(_month_part)
# Display the distinct month numbers as a quick check of the split.
np.unique(resale_data['separate_month'])

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int64)

### Merging 26 towns into 5 bigger regions

In [9]:
# Town groups. A town's group position inside `region` (0-4) is the code that
# `region_finder` returns for it.
central = ['CENTRAL AREA', 'BUKIT TIMAH']
rest_of_central = ['BISHAN', 'GEYLANG', 'KALLANG/WHAMPOA', 'MARINE PARADE', 'QUEENSTOWN', 'TOA PAYOH', 'BUKIT MERAH']
near_central = ['CLEMENTI', 'BUKIT BATOK', 'BUKIT PANJANG', 'ANG MO KIO', 'SERANGOON', 'HOUGANG', 'BEDOK']
further_central = ['JURONG EAST', 'CHOA CHU KANG', 'SENGKANG', 'YISHUN', 'TAMPINES']
far_central = ['JURONG WEST', 'PASIR RIS', 'PUNGGOL', 'SEMBAWANG', 'WOODLANDS']
region = [central, rest_of_central, near_central, further_central, far_central]


def region_finder(x):
    """Return the index in `region` of the group that contains town `x`.

    Parameters
    ----------
    x : str
        Town name, spelled exactly as in the group lists.

    Returns
    -------
    int
        0 for `central`, 1 for `rest_of_central`, 2 for `near_central`,
        3 for `further_central` and 4 for `far_central`.

    Raises
    ------
    ValueError
        If `x` does not appear in any group, so an unexpected town name is
        reported instead of being silently given a region code.
    """
    # Iterate over every group, including the last one: stopping one group
    # short would give the `far_central` towns the `further_central` code.
    for idx, towns in enumerate(region):
        if x in towns:
            return idx
    raise ValueError("town %r is not assigned to any region" % (x,))
# Look up each row's town and store the resulting region code as `proximity`.
town_names = resale_data['town']
resale_data['proximity'] = town_names.map(region_finder)

### Creating new variable `average_level` from the existing variable `storey_range`

In [10]:
def range_to_lvl(x):
    """Return the midpoint of a storey range such as '10 TO 12'.

    Parameters
    ----------
    x : str
        Storey range written as '<low> TO <high>'.

    Returns
    -------
    float
        (low + high) / 2, e.g. 11.0 for '10 TO 12'.

    Raises
    ------
    ValueError
        If `x` does not consist of two integers separated by 'TO'.
    """
    # Split on the separator instead of slicing at fixed offsets: taking the
    # upper bound from character 7 onwards keeps only its last digit, which
    # turns '10 TO 12' into (10 + 2) / 2. int() tolerates the surrounding
    # spaces left by the split.
    low, high = (int(part) for part in x.split('TO'))
    return (low + high) / 2
# Convert every `storey_range` label to its midpoint level.
storey_labels = resale_data['storey_range']
resale_data['average_level'] = storey_labels.map(range_to_lvl)

### Changing the unit of `remaining_lease` from *years-months* to *months*

In [11]:
# `remaining_lease` is text such as '61 years 06 months', '61 years 01 month'
# or '63 years'. Extract the numbers by pattern rather than by fixed character
# positions, so the parse does not depend on zero-padding or exact spacing.
# The months part is optional and counts as 0 when it is absent.
lease_parts = resale_data['remaining_lease'].str.extract(
    r'^\s*(?P<years>\d+)\s*years?(?:\s*(?P<months>\d+)\s*months?)?\s*$'
)
# A value that does not match the pattern leaves NaN in `years`, which makes
# the integer cast fail loudly instead of producing a wrong lease length.
resale_data['remain_y'] = pd.to_numeric(lease_parts['years']).astype('int64')
resale_data['remain_m'] = pd.to_numeric(lease_parts['months']).fillna(0).astype('int64')

# Check the parsed months instead of computing np.unique and discarding it.
assert resale_data['remain_m'].between(0, 11).all(), 'unexpected month count in remaining_lease'

# Total remaining lease expressed in months.
resale_data['remain_lease_m'] = (resale_data['remain_y'] * 12) + resale_data['remain_m']

### Dividing `year` into 2 periods (up to and including 2019 vs. 2020 onwards)

In [12]:
def year_cat(x):
    """Return 0 when `x` is at most 2019, and 1 otherwise."""
    return 0 if x <= 2019 else 1
# Tag every row with the period code of its transaction year.
sale_years = resale_data['year']
resale_data['year_cat'] = sale_years.map(year_cat)

### Creating new variable `central` from the existing variable `proximity`
`central = 1` if the flat's `proximity` code is below 2 (i.e. its town is in the `central` or `rest_of_central` group), otherwise `central = 0`.

In [13]:
# Flag rows whose `proximity` code is below 2: compare the whole column at
# once, then store the True/False result as 1/0 integers.
is_central = resale_data['proximity'] < 2
resale_data['central'] = is_central.astype('int64')

## Creating new variables

### Number of Hospitals

In [14]:
# (town, hospital count) pairs; the order of the pairs is the order in which
# the conditions and their values are handed to np.select.
hospital_table = [
    ('ANG MO KIO', 2), ('BEDOK', 0), ('BISHAN', 1), ('BUKIT BATOK', 1),
    ('BUKIT MERAH', 3), ('BUKIT PANJANG', 0), ('BUKIT TIMAH', 7),
    ('CENTRAL AREA', 3), ('CHOA CHU KANG', 0), ('CLEMENTI', 1),
    ('GEYLANG', 0), ('HOUGANG', 1), ('JURONG EAST', 2), ('JURONG WEST', 0),
    ('KALLANG/WHAMPOA', 0), ('MARINE PARADE', 1), ('PASIR RIS', 0),
    ('PUNGGOL', 3), ('QUEENSTOWN', 1), ('SEMBAWANG', 0), ('SENGKANG', 3),
    ('SERANGOON', 1), ('TAMPINES', 2), ('TOA PAYOH', 0), ('WOODLANDS', 0),
    ('YISHUN', 2),
]
# One boolean mask per town, paired position-by-position with its count.
cond = [resale_data['town'] == town for town, _ in hospital_table]
val = [count for _, count in hospital_table]

# Hospital count as a number; np.select falls back to its default of 0 for a
# row whose town matches none of the masks.
resale_data['n_hospitals'] = np.select(cond, val)
# The same count stored as a categorical column.
resale_data['hospitals'] = resale_data['n_hospitals'].astype("category")

### Number of MRT stations

In [15]:
# (town, MRT station count) pairs; the order of the pairs is the order in
# which the conditions and their values are handed to np.select.
# NOTE(review): SENGKANG is recorded with 0 stations - confirm against the
# source of these counts.
mrt_table = [
    ('ANG MO KIO', 3), ('BEDOK', 5), ('BISHAN', 4), ('BUKIT BATOK', 2),
    ('BUKIT MERAH', 7), ('BUKIT PANJANG', 2), ('BUKIT TIMAH', 8),
    ('CENTRAL AREA', 23), ('CHOA CHU KANG', 2), ('CLEMENTI', 1),
    ('GEYLANG', 8), ('HOUGANG', 3), ('JURONG EAST', 4), ('JURONG WEST', 1),
    ('KALLANG/WHAMPOA', 4), ('MARINE PARADE', 3), ('PASIR RIS', 1),
    ('PUNGGOL', 1), ('QUEENSTOWN', 9), ('SEMBAWANG', 2), ('SENGKANG', 0),
    ('SERANGOON', 2), ('TAMPINES', 6), ('TOA PAYOH', 4), ('WOODLANDS', 4),
    ('YISHUN', 3),
]
# One boolean mask per town, paired position-by-position with its count.
cond2 = [resale_data['town'] == town for town, _ in mrt_table]
val2 = [count for _, count in mrt_table]

# Station count as a number; np.select falls back to its default of 0 for a
# row whose town matches none of the masks.
resale_data['n_mrt'] = np.select(cond2, val2)
# The same count stored as a categorical column.
resale_data['mrt'] = resale_data['n_mrt'].astype('category')

## Export updated data

In [16]:
# Write the prepared frame to a CSV in the working directory. No `index`
# argument is passed, so to_csv keeps its default of writing the row index.
export_path = 'resale-flat-prices-updated.csv'
resale_data.to_csv(export_path)