# Data Preparation

In [88]:
# Necessary Imports
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Will add more imports here later

In [92]:

#pre-set file path
file_path = "../dataset/rental_prediction/Non-Landed Housing/"

#combining all into one file for non-landed
df1 = pd.read_csv(file_path+"1-5.csv")
df2 = pd.read_csv(file_path+"6-10.csv")
df3 = pd.read_csv(file_path+"11-15.csv")
df4 = pd.read_csv(file_path+"16-20.csv")
df5 = pd.read_csv(file_path+"21-23,25-26.csv")
df6 = pd.read_csv(file_path+"27-28.csv")

df_nonLanded = pd.concat([df1,df2,df3,df4,df5,df6], axis=0, join='inner')
df_nonLanded = df_nonLanded.drop(["S/N"],axis=1)
df_nonLanded = df_nonLanded.drop(["Building/Project Name"],axis=1)
df_nonLanded = df_nonLanded.drop(["Street Name"],axis=1)
df_nonLanded = df_nonLanded.drop(["No. of Bedroom(for Non-Landed Only)"],axis=1)
df_nonLanded.to_csv(file_path+"Non-Landed.csv")

#df_nonLanded.head(50)


In [None]:
#pre-set file path
file_path2 = "../dataset/rental_prediction/Landed Properties/"
file_name2 = "/Landed Properties/"

#combining all into one file for landed
df1 = pd.read_csv(file_path2+"1-5.csv")
df2 = pd.read_csv(file_path2+"6-10.csv")
df3 = pd.read_csv(file_path2+"11-15.csv")
df4 = pd.read_csv(file_path2+"16-20.csv")
df5 = pd.read_csv(file_path2+"21-23,25-26.csv")
df6 = pd.read_csv(file_path2+"27-28.csv")

df_Landed = pd.concat([df1,df2,df3,df4,df5,df6], axis=0, join='inner')
df_Landed = df_Landed.drop(["No. of Bedroom(for Non-Landed Only)"],axis=1)
df_Landed.to_csv(file_path2+"Landed.csv")

#dropped the no of bedrooms as there was no data recorded


In [None]:
file_path3 = "../dataset/rental_prediction/Executive Condominiums/"
file_name3 = "/Executive Condominiums/"


df1 = pd.read_csv(file_path3+"16-20.csv")
df2 = pd.read_csv(file_path3+"21-23,25-26.csv")
df3 = pd.read_csv(file_path3+"27-28.csv")

df_EC = pd.concat([df1,df2,df3], axis=0, join='inner')
#df_EC = df_EC.drop(["No. of Bedroom(for Non-Landed Only)"],axis=1)
df_EC.to_csv(file_path3+"Executive Condominiums.csv")

In [93]:

# Convert month column to 'month' type
df_nonLanded['Lease Commencement Date'] = pd.to_datetime(df_nonLanded['Lease Commencement Date'],format='%Y-%m', errors='coerce')
# Convert month to ordinal type
df_nonLanded['Lease Commencement Date'] = df_nonLanded['Lease Commencement Date'].map(datetime.datetime.toordinal)
df_nonLanded.head()

# Check data types of columns
df_nonLanded.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 411059 entries, 0 to 10101
Data columns (total 5 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Postal District          411059 non-null  int64 
 1   Type                     411059 non-null  object
 2   Monthly Gross Rent($)    411059 non-null  int64 
 3   Floor Area (sq ft)       411059 non-null  object
 4   Lease Commencement Date  411059 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 18.8+ MB


In [85]:
df_Landed.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 28919 entries, 0 to 1809
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   S/N                      28919 non-null  int64 
 1   Building/Project Name    28919 non-null  object
 2   Street Name              28919 non-null  object
 3   Postal District          28919 non-null  int64 
 4   Type                     28919 non-null  object
 5   Monthly Gross Rent($)    28919 non-null  int64 
 6   Floor Area (sq ft)       28919 non-null  object
 7   Lease Commencement Date  28919 non-null  object
dtypes: int64(3), object(5)
memory usage: 2.0+ MB


In [None]:
df_EC.info()

In [None]:
# Find out mean, median, standard deviation, etc
df_Landed.describe()

In [None]:
# Checks how many rows/columns are there
df_Landed.shape

## Check for null/empty values

In [None]:
df.isnull().sum()

In [None]:
# Change which columns to drop later
# And change the type of the column

df = df.drop(['block','street_name', 'month', 'remaining_lease'], axis=1)
df.head()

In [None]:
df.keys()

In [None]:
col = "resale_price"
df[col].hist()
plt.suptitle(col)
plt.show()

In [None]:
# Remove month, storey_range column later
df = pd.get_dummies(df, columns=['town','flat_type', 'storey_range', 'flat_model'])
df.head()

In [None]:
df.shape

# Training Model - Regression
- We will test the training of model first.

In [94]:
X = df_nonLanded.drop(['Monthly Gross Rent($)'], axis=1)
Y = df_nonLanded['Monthly Gross Rent($)']

print(X)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

       Postal District                   Type Floor Area (sq ft)  \
0                    4  Non-landed Properties         400 to 500   
1                    4  Non-landed Properties       1900 to 2000   
2                    3  Non-landed Properties       1000 to 1100   
3                    2  Non-landed Properties         800 to 900   
4                    1  Non-landed Properties       2200 to 2300   
...                ...                    ...                ...   
10097               27  Non-landed Properties         700 to 800   
10098               28  Non-landed Properties       1100 to 1200   
10099               27  Non-landed Properties       1400 to 1500   
10100               27  Non-landed Properties       1000 to 1100   
10101               28  Non-landed Properties       1300 to 1400   

       Lease Commencement Date  
0                            1  
1                            1  
2                            1  
3                            1  
4                 

In [95]:
reg = LinearRegression()

In [96]:
reg.fit(X_train,y_train)

ValueError: could not convert string to float: 'Non-landed Properties'

In [None]:
test = reg.predict(X_test)
print(test)
