In [1]:
# Imports
import pandas as pd
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


In [2]:
df_cpi = pd.read_csv("data/cpi.csv")
df_interest = pd.read_csv("data/interest.csv")
df_rent_index = pd.read_csv("data/rentIndex.csv")
df_vacant = pd.read_csv("data/vacant.csv")
df_properties = pd.read_csv("data/properties.csv")
df_geo_attributes = pd.read_csv("data/geo_attributes.csv")
df_test = pd.read_csv("data/test.csv")
df_train = pd.read_csv("data/train.csv")


In [3]:
df_cpi.head()


Unnamed: 0,Data Series,CPI
0,2022 Dec,111.186
1,2022 Nov,110.959
2,2022 Oct,109.893
3,2022 Sep,110.339
4,2022 Aug,109.863


In [4]:
df_interest.head()


Unnamed: 0,Data Series,InterestRate
0,2022 Dec,1.48
1,2022 Nov,1.47
2,2022 Oct,1.17
3,2022 Sep,1.03
4,2022 Aug,0.95


In [5]:
df_rent_index.head()


Unnamed: 0,Data Series,RentIndex
0,2022 4Q,148.1
1,2022 3Q,137.9
2,2022 2Q,127.0
3,2022 1Q,119.0
4,2021 4Q,114.2


In [6]:
df_vacant.head()


Unnamed: 0,Data Series,Available,Vacant
0,2022 4Q,34084,524
1,2022 3Q,34084,514
2,2022 2Q,34084,452
3,2022 1Q,34084,517
4,2021 4Q,34084,576


In [7]:
df_properties.head()


Unnamed: 0,area,floorRange,propertyType,district,typeOfArea,tenure,street,project,marketSegment,property_key
0,226.0,01-05,Condominium,4,Strata,99 yrs lease commencing from 2007,COVE DRIVE,TURQUOISE,CCR,p-7b6c69000
1,194.0,01-05,Condominium,4,Strata,99 yrs lease commencing from 2007,COVE DRIVE,TURQUOISE,CCR,p-320ed2726
2,348.0,06-10,Condominium,4,Strata,99 yrs lease commencing from 2007,COVE DRIVE,TURQUOISE,CCR,p-2bc5c4951
3,223.0,01-05,Condominium,4,Strata,99 yrs lease commencing from 2007,COVE DRIVE,TURQUOISE,CCR,p-f060c5be0
4,195.0,01-05,Condominium,4,Strata,99 yrs lease commencing from 2007,COVE DRIVE,TURQUOISE,CCR,p-a8d1004a9


In [8]:
df_geo_attributes.head()


Unnamed: 0,street,project,district,lat,lng,num_schools_1km,num_supermarkets_500m,num_mrt_stations_500m
0,LIM AH WOO ROAD,SUITES @ GUILLEMARD,15,1.31252,103.89176,7,2,3
1,HOLLAND ROAD,LOFT@HOLLAND,10,1.312364,103.797169,0,3,4
2,SOUTH BUONA VISTA ROAD,VIVA VISTA,5,1.279803,103.78591,0,0,2
3,UPPER SERANGOON ROAD,PARK RESIDENCES KOVAN,19,1.357934,103.881932,7,2,1
4,NATHAN ROAD,LOFT @ NATHAN,10,1.294461,103.827621,2,4,4


In [9]:
df_train.head()


Unnamed: 0,property_key,contractDate,price
0,p-c1ce31019,2018-03-01,1850000.0
1,p-a7fd7deb5,2018-03-01,1030000.0
2,p-2a981615e,2018-03-01,4300000.0
3,p-2bb570b5d,2018-03-01,1400888.0
4,p-4ccd6e1db,2018-03-01,725000.0


## Process macro and micro features


### Macro

Make all monthly to join on date

In [10]:
def process_date(df):
    df["date"] = pd.to_datetime(df["Data Series"], format="%Y %b ").dt.to_period("M")
    df = df.drop(columns="Data Series")
    df["date"] = df["date"].dt.to_timestamp()
    return df


In [11]:
df_cpi = process_date(df_cpi)
df_interest = process_date(df_interest)


In [12]:
def process_quarter(df):
    df["date"] = pd.to_datetime(
        df["Data Series"].str.replace(r"(\d+) (\d)Q ", r"\1-Q\2", regex=True)
    ).dt.to_period("M")
    df = df.set_index("date").resample("M", convention="end").interpolate("linear")
    df = df.reset_index()
    df = df.drop(columns="Data Series")
    df["date"] = df["date"].dt.to_timestamp()
    return df


In [13]:
df_rent_index = process_quarter(df_rent_index)
df_vacant = process_quarter(df_vacant)


  df["date"] = pd.to_datetime(
  df["date"] = pd.to_datetime(


### Micro

#### properties.csv

In [14]:
df_properties.head()


Unnamed: 0,area,floorRange,propertyType,district,typeOfArea,tenure,street,project,marketSegment,property_key
0,226.0,01-05,Condominium,4,Strata,99 yrs lease commencing from 2007,COVE DRIVE,TURQUOISE,CCR,p-7b6c69000
1,194.0,01-05,Condominium,4,Strata,99 yrs lease commencing from 2007,COVE DRIVE,TURQUOISE,CCR,p-320ed2726
2,348.0,06-10,Condominium,4,Strata,99 yrs lease commencing from 2007,COVE DRIVE,TURQUOISE,CCR,p-2bc5c4951
3,223.0,01-05,Condominium,4,Strata,99 yrs lease commencing from 2007,COVE DRIVE,TURQUOISE,CCR,p-f060c5be0
4,195.0,01-05,Condominium,4,Strata,99 yrs lease commencing from 2007,COVE DRIVE,TURQUOISE,CCR,p-a8d1004a9


In [15]:
df_properties.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31530 entries, 0 to 31529
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   area           31530 non-null  float64
 1   floorRange     31530 non-null  object 
 2   propertyType   31530 non-null  object 
 3   district       31530 non-null  int64  
 4   typeOfArea     31530 non-null  object 
 5   tenure         31530 non-null  object 
 6   street         31530 non-null  object 
 7   project        31530 non-null  object 
 8   marketSegment  31530 non-null  object 
 9   property_key   31530 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 2.4+ MB


In [16]:
df_properties.groupby("floorRange")["property_key"].agg("count")


floorRange
-         4957
01-05    12037
06-10     5693
11-15     3912
16-20     2375
21-25     1052
26-30      662
31-35      411
36-40      200
41-45       94
46-50       45
51-55       32
56-60       22
61-65       18
66-70        5
71-75        1
B1-B5       14
Name: property_key, dtype: int64

In [17]:
df_properties.groupby("propertyType")["property_key"].agg("count")


propertyType
Apartment                 9782
Condominium              14881
Detached                   438
Executive Condominium     1946
Semi-detached             1425
Strata Detached             88
Strata Semi-detached       183
Strata Terrace             470
Terrace                   2317
Name: property_key, dtype: int64

In [18]:
df_properties.groupby("district")["property_key"].agg("count")


district
1      529
2      354
3      852
4      728
5     1228
6        8
7      299
8      544
9     1984
10    2548
11    1394
12    1081
13     723
14    1524
15    3123
16    1565
17     587
18    1348
19    3627
20    1221
21    1126
22     765
23    1675
25     427
26     425
27     866
28     979
Name: property_key, dtype: int64

In [19]:
df_properties.groupby("typeOfArea")["property_key"].agg("count")


typeOfArea
Land       4191
Strata    27339
Name: property_key, dtype: int64

In [20]:
df_properties.groupby("tenure")["property_key"].agg("count")


tenure
100 yrs lease commencing from 1986          11
102 yrs lease commencing from 1978          20
102 yrs lease commencing from 1996          19
103 yrs lease commencing from 1974           4
103 yrs lease commencing from 1975           6
                                         ...  
999999 yrs lease commencing from 1958        1
999999 yrs lease commencing from 1963        1
999999 yrs lease commencing from 1990        1
999999 yrs lease commencing from 1993        1
Freehold                                 14723
Name: property_key, Length: 122, dtype: int64

In [21]:
df_properties.groupby("street")["property_key"].agg("count")


street
ADAM ROAD           21
ADIS ROAD           15
AH HOOD ROAD        28
AH SOO GARDEN        3
AIDA STREET        125
                  ... 
YUK TONG AVENUE      5
YUNNAN CRESCENT     16
ZEHNDER ROAD         4
ZION CLOSE          12
ZION ROAD           15
Name: property_key, Length: 1171, dtype: int64

In [22]:
df_properties.groupby("project")["property_key"].agg("count")


project
# 1 LOFT               8
# 1 SUITES            12
1 CANBERRA            56
1 KING ALBERT PARK    12
1 MOULMEIN RISE        7
                      ..
YUEN SING MANSION      1
YUNNAN GARDENS        16
ZEDGE                 20
ZENITH                15
ZEPHYR PARK            2
Name: property_key, Length: 2753, dtype: int64

In [23]:
df_properties.groupby("marketSegment")["property_key"].agg("count")


marketSegment
CCR     7002
OCR    15737
RCR     8791
Name: property_key, dtype: int64

- floorRange: one hot encode
- propertyType: one hot encode
- district: to join with df_geo_attributes
- typeOfArea: one hot encode
- tenure: set to Freehold or non-Freehold (Lease)
- street: to join with df_geo_attributes
- project: to join with df_geo_attributes
- marketSegment: one hot encode

In [24]:
def encode_tenure(row):
    # Change to either Freehold or Lease
    tenure = row["tenure"]
    if tenure == "Freehold":
        row["tenure"] = 1
    else:
        row["tenure"] = 0
    return row


df_properties_encode = df_properties.apply(encode_tenure, axis=1)


In [25]:
df_properties_encode.groupby("tenure")["property_key"].agg("count")


tenure
0    16807
1    14723
Name: property_key, dtype: int64

In [26]:
# one hot encode for df_properties
def one_hot(col, df):
    ohe = OneHotEncoder()
    transformed = ohe.fit_transform(df[[col]])
    df[ohe.categories_[0]] = transformed.toarray()
    df = df.drop(columns=col)

    return df


col_names = ["floorRange", "propertyType", "typeOfArea", "marketSegment"]
count = 0

for col in col_names:
    if count == 0:
        df_properties_one_hot_encoded = one_hot(col, df_properties_encode)
        count += 1
    else:
        df_properties_one_hot_encoded = one_hot(col, df_properties_one_hot_encoded)



In [27]:
df_properties_one_hot_encoded.head()


Unnamed: 0,area,district,tenure,street,project,property_key,-,01-05,06-10,11-15,...,Semi-detached,Strata Detached,Strata Semi-detached,Strata Terrace,Terrace,Land,Strata,CCR,OCR,RCR
0,226.0,4,0,COVE DRIVE,TURQUOISE,p-7b6c69000,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,194.0,4,0,COVE DRIVE,TURQUOISE,p-320ed2726,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,348.0,4,0,COVE DRIVE,TURQUOISE,p-2bc5c4951,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,223.0,4,0,COVE DRIVE,TURQUOISE,p-f060c5be0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,195.0,4,0,COVE DRIVE,TURQUOISE,p-a8d1004a9,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


#### geo_attributes.csv

In [28]:
df_geo_attributes.head()


Unnamed: 0,street,project,district,lat,lng,num_schools_1km,num_supermarkets_500m,num_mrt_stations_500m
0,LIM AH WOO ROAD,SUITES @ GUILLEMARD,15,1.31252,103.89176,7,2,3
1,HOLLAND ROAD,LOFT@HOLLAND,10,1.312364,103.797169,0,3,4
2,SOUTH BUONA VISTA ROAD,VIVA VISTA,5,1.279803,103.78591,0,0,2
3,UPPER SERANGOON ROAD,PARK RESIDENCES KOVAN,19,1.357934,103.881932,7,2,1
4,NATHAN ROAD,LOFT @ NATHAN,10,1.294461,103.827621,2,4,4


- street: to join with df_geo_attributes
- project: to join with df_geo_attributes
- district: to join with df_geo_attributes
- lat: do not use
- lng: do not use
- num_schools_1km: no change
- num_supermarkets_500m: no change
- num_mrt_stations_500m: no change

## Model Training

### Combine dataframes

In [29]:
# Join df_properties and df_geo_attributes
df_micro_encoded = pd.merge(
    df_properties_one_hot_encoded,
    df_geo_attributes,
    on=["street", "project", "district"],
    how="outer",
)
df_micro_encoded = df_micro_encoded.drop(columns=["street", "project", "district"])

# Drop lat and long
df_micro_encoded = df_micro_encoded.drop(columns=["lat", "lng"])


In [30]:
df_micro_encoded.head()


Unnamed: 0,area,tenure,property_key,-,01-05,06-10,11-15,16-20,21-25,26-30,...,Strata Terrace,Terrace,Land,Strata,CCR,OCR,RCR,num_schools_1km,num_supermarkets_500m,num_mrt_stations_500m
0,226.0,0,p-7b6c69000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,194.0,0,p-320ed2726,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,348.0,0,p-2bc5c4951,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,223.0,0,p-f060c5be0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,195.0,0,p-a8d1004a9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [31]:
df_macro = pd.merge(df_cpi, df_interest, on="date", how="outer")
df_macro = pd.merge(df_macro, df_rent_index, on="date", how="outer")
df_macro = pd.merge(df_macro, df_vacant, on="date", how="outer")


In [32]:
# Fill empty with latest data
df_macro = df_macro.fillna(method="bfill")


In [33]:
df_macro.head()


Unnamed: 0,CPI,date,InterestRate,RentIndex,Available,Vacant
0,111.186,2022-12-01,1.48,148.1,34084.0,524.0
1,110.959,2022-11-01,1.47,148.1,34084.0,524.0
2,109.893,2022-10-01,1.17,148.1,34084.0,524.0
3,110.339,2022-09-01,1.03,144.7,34084.0,520.666667
4,109.863,2022-08-01,0.95,141.3,34084.0,517.333333


In [34]:
df_train = df_train.rename(columns={"contractDate": "date"})
df_train["date"] = pd.to_datetime(df_train["date"])


In [35]:
df_train = pd.merge(df_train, df_micro_encoded, on="property_key", how="left")
df_train = pd.merge(df_train, df_macro, on="date", how="left")


In [36]:
df_train = df_train.set_index("property_key")


In [37]:
df_train.head()


Unnamed: 0_level_0,date,price,area,tenure,-,01-05,06-10,11-15,16-20,21-25,...,OCR,RCR,num_schools_1km,num_supermarkets_500m,num_mrt_stations_500m,CPI,InterestRate,RentIndex,Available,Vacant
property_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p-c1ce31019,2018-03-01,1850000.0,102.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,13.0,99.299,0.5,103.466667,31095.0,4320.0
p-a7fd7deb5,2018-03-01,1030000.0,121.0,1,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,1.0,0.0,99.299,0.5,103.466667,31095.0,4320.0
p-2a981615e,2018-03-01,4300000.0,300.0,1,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,1.0,99.299,0.5,103.466667,31095.0,4320.0
p-2bb570b5d,2018-03-01,1400888.0,107.0,1,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,6.0,0.0,2.0,99.299,0.5,103.466667,31095.0,4320.0
p-4ccd6e1db,2018-03-01,725000.0,54.0,0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,1.0,99.299,0.5,103.466667,31095.0,4320.0


In [38]:
df_train.loc[df_train.isnull().any(axis=1)]


Unnamed: 0_level_0,date,price,area,tenure,-,01-05,06-10,11-15,16-20,21-25,...,OCR,RCR,num_schools_1km,num_supermarkets_500m,num_mrt_stations_500m,CPI,InterestRate,RentIndex,Available,Vacant
property_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p-53b215525,2020-09-01,8880000.0,923.1,1,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,,,,100.139,0.25,103.866667,33874.666667,1026.333333


In [39]:
df_train = df_train.drop(index="p-53b215525")


In [40]:
df_train.loc[df_train.isnull().any(axis=1)]


Unnamed: 0_level_0,date,price,area,tenure,-,01-05,06-10,11-15,16-20,21-25,...,OCR,RCR,num_schools_1km,num_supermarkets_500m,num_mrt_stations_500m,CPI,InterestRate,RentIndex,Available,Vacant
property_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [41]:
y = df_train["price"]
X = df_train.drop(columns="price")


# Drop date for now, as linear regression model cannot use datetime
X = X.drop(columns="date")


### Linear Regression Model

In [42]:
lin_reg = LinearRegression()
lin_reg.fit(X, y)


### Submission

In [43]:
df_macro.head()


Unnamed: 0,CPI,date,InterestRate,RentIndex,Available,Vacant
0,111.186,2022-12-01,1.48,148.1,34084.0,524.0
1,110.959,2022-11-01,1.47,148.1,34084.0,524.0
2,109.893,2022-10-01,1.17,148.1,34084.0,524.0
3,110.339,2022-09-01,1.03,144.7,34084.0,520.666667
4,109.863,2022-08-01,0.95,141.3,34084.0,517.333333


In [44]:
df_new_months = pd.DataFrame({"date": ["2023-03-01", "2023-02-01", "2023-01-01"]})
df_new_months["date"] = pd.to_datetime(df_new_months["date"])


In [45]:
df_new_months.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    3 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 152.0 bytes


In [46]:
df_macro = pd.concat([df_new_months, df_macro])


In [47]:
df_macro = df_macro.fillna(method="bfill")


In [48]:
df_macro.head()


Unnamed: 0,date,CPI,InterestRate,RentIndex,Available,Vacant
0,2023-03-01,111.186,1.48,148.1,34084.0,524.0
1,2023-02-01,111.186,1.48,148.1,34084.0,524.0
2,2023-01-01,111.186,1.48,148.1,34084.0,524.0
0,2022-12-01,111.186,1.48,148.1,34084.0,524.0
1,2022-11-01,110.959,1.47,148.1,34084.0,524.0


In [49]:
df_val = df_test


In [50]:
df_val = df_val.rename(columns={"contractDate": "date"})
df_val["date"] = pd.to_datetime(df_val["date"])


In [51]:
df_val = pd.merge(df_val, df_micro_encoded, on="property_key", how="left")
df_val = pd.merge(df_val, df_macro, on="date", how="left")


In [52]:
df_val = df_val.set_index("property_key")


In [53]:
df_val.head()


Unnamed: 0_level_0,date,area,tenure,-,01-05,06-10,11-15,16-20,21-25,26-30,...,OCR,RCR,num_schools_1km,num_supermarkets_500m,num_mrt_stations_500m,CPI,InterestRate,RentIndex,Available,Vacant
property_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p-ff93e87ab,2023-01-01,147.0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,1.0,111.186,1.48,148.1,34084.0,524.0
p-38fe6afe9,2023-01-01,99.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,0.0,1.0,111.186,1.48,148.1,34084.0,524.0
p-fc9650179,2023-01-01,99.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,6.0,0.0,1.0,111.186,1.48,148.1,34084.0,524.0
p-59a09ad08,2023-01-01,108.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,111.186,1.48,148.1,34084.0,524.0
p-808332e5c,2023-01-01,72.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,6.0,1.0,111.186,1.48,148.1,34084.0,524.0


In [54]:
df_val.loc[df_val.isnull().any(axis=1)]


Unnamed: 0_level_0,date,area,tenure,-,01-05,06-10,11-15,16-20,21-25,26-30,...,OCR,RCR,num_schools_1km,num_supermarkets_500m,num_mrt_stations_500m,CPI,InterestRate,RentIndex,Available,Vacant
property_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p-f4822a16f,2023-03-01,939.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,,,,111.186,1.48,148.1,34084.0,524.0


In [55]:
df_val["num_schools_1km"] = df_val["num_schools_1km"].fillna(df_val["num_schools_1km"].median())
df_val["num_supermarkets_500m"] = df_val["num_supermarkets_500m"].fillna(
    df_val["num_supermarkets_500m"].median()
)
df_val["num_mrt_stations_500m"] = df_val["num_mrt_stations_500m"].fillna(
    df_val["num_mrt_stations_500m"].median()
)


In [56]:
# df_val = df_val.drop(index="p-f4822a16f")


In [57]:
df_val.loc[df_val.isnull().any(axis=1)]


Unnamed: 0_level_0,date,area,tenure,-,01-05,06-10,11-15,16-20,21-25,26-30,...,OCR,RCR,num_schools_1km,num_supermarkets_500m,num_mrt_stations_500m,CPI,InterestRate,RentIndex,Available,Vacant
property_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [58]:
# Drop date for now, as linear regression model cannot use datetime
X_val = df_val.drop(columns="date")


In [59]:
X_val


Unnamed: 0_level_0,area,tenure,-,01-05,06-10,11-15,16-20,21-25,26-30,31-35,...,OCR,RCR,num_schools_1km,num_supermarkets_500m,num_mrt_stations_500m,CPI,InterestRate,RentIndex,Available,Vacant
property_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p-ff93e87ab,147.0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,1.0,111.186,1.48,148.1,34084.0,524.0
p-38fe6afe9,99.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,0.0,1.0,111.186,1.48,148.1,34084.0,524.0
p-fc9650179,99.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,6.0,0.0,1.0,111.186,1.48,148.1,34084.0,524.0
p-59a09ad08,108.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,111.186,1.48,148.1,34084.0,524.0
p-808332e5c,72.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,6.0,1.0,111.186,1.48,148.1,34084.0,524.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
p-56681c564,84.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,2.0,1.0,111.186,1.48,148.1,34084.0,524.0
p-89a6ff5de,290.5,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,6.0,4.0,7.0,111.186,1.48,148.1,34084.0,524.0
p-920d750e9,115.0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,6.0,5.0,1.0,111.186,1.48,148.1,34084.0,524.0
p-e7146b43c,96.0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,7.0,4.0,3.0,111.186,1.48,148.1,34084.0,524.0


In [60]:
results = lin_reg.predict(X_val)


In [61]:
df_results = pd.Series(results, name="prediction")


In [62]:
df_results


0       3.537815e+06
1       1.695814e+06
2       1.323341e+06
3       1.465147e+06
4       1.060480e+06
            ...     
2326    1.264091e+06
2327    5.998765e+06
2328    3.232820e+06
2329    2.750498e+06
2330    9.624198e+05
Name: prediction, Length: 2331, dtype: float64

In [63]:
df_test


Unnamed: 0,property_key,contractDate
0,p-ff93e87ab,2023-01-01
1,p-38fe6afe9,2023-01-01
2,p-fc9650179,2023-01-01
3,p-59a09ad08,2023-01-01
4,p-808332e5c,2023-01-01
...,...,...
2326,p-56681c564,2023-03-01
2327,p-89a6ff5de,2023-03-01
2328,p-920d750e9,2023-03-01
2329,p-e7146b43c,2023-03-01


In [64]:
df_submission = pd.concat([df_test, df_results], axis=1)


In [65]:
df_submission.head()


Unnamed: 0,property_key,contractDate,prediction
0,p-ff93e87ab,2023-01-01,3537815.0
1,p-38fe6afe9,2023-01-01,1695814.0
2,p-fc9650179,2023-01-01,1323341.0
3,p-59a09ad08,2023-01-01,1465147.0
4,p-808332e5c,2023-01-01,1060480.0


In [66]:
df_submission.to_csv("data/linear_reg_submission.csv", index=False)


### XGBoost