In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

In [2]:
raw_data = pd.read_csv('Datasets/train.csv')
raw_data_test = pd.read_csv('Datasets/test.csv')

In [3]:
data = pd.concat([raw_data, raw_data_test], axis=0, ignore_index=True)
data

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent
0,2021-09,jurong east,257,Jurong East Street 24,3 room,new generation,67.0,yes,1983,1.344518,103.738630,0.0,yuhua east,jurong east,west region,1600.0
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250.0
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900.0
3,2021-08,pasir ris,250,Pasir Ris Street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850.0
4,2022-11,kallang/whampoa,34,Whampoa West,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,2022-08,punggol,615b,edgefield plains,4-room,model a,92.0,yes,2012,1.403746,103.909883,0.0,waterway east,punggol,north-east region,
89996,2023-02,sembawang,491,admiralty link,4-room,model a,90.0,yes,2004,1.456113,103.817717,0.0,sembawang north,sembawang,north region,
89997,2023-05,ang mo kio,524,ang mo kio avenue 5,3-room,new generation,68.0,yes,1980,1.373378,103.853043,0.0,cheng san,ang mo kio,north-east region,
89998,2023-02,woodlands,656,woodlands ring road,executive,maisonette,144.0,yes,1998,1.436986,103.799118,0.0,woodlands east,woodlands,north region,


### Encode each column

In [4]:
t, enc = sorted(data['rent_approval_date'].unique()), {}
print(t)
for i, date in enumerate(t):
    enc[date] = i
def rent_date_encode(x):
    return enc[x]

['2021-01', '2021-02', '2021-03', '2021-04', '2021-05', '2021-06', '2021-07', '2021-08', '2021-09', '2021-10', '2021-11', '2021-12', '2022-01', '2022-02', '2022-03', '2022-04', '2022-05', '2022-06', '2022-07', '2022-08', '2022-09', '2022-10', '2022-11', '2022-12', '2023-01', '2023-02', '2023-03', '2023-04', '2023-05', '2023-06', '2023-07']


In [5]:
def flat_type_encode(x):
    if len(x) > 2 and x[1] == '-':
        return int(x[0])
    else:
        return 5

In [6]:
def one_hot(data, *cols):
    for c in cols:
        one_hot_df = pd.get_dummies(data[c], prefix=c)
        data = data.drop(c, axis=1)
        data = pd.concat([data, one_hot_df], axis=1)
    return data

In [7]:
def drop_columns(data, *cols):
    for c in cols:
        data = data.drop(c, axis=1)
    return data

In [8]:
data = pd.concat([raw_data, raw_data_test], axis=0, ignore_index=True)
data['rent_approval_date'] = data['rent_approval_date'].apply(rent_date_encode)
data['flat_type'] = data['flat_type'].apply(flat_type_encode)
data = one_hot(data, 'town', 'region')
data = drop_columns(data, 'block', 'street_name', 'furnished', 'flat_model', 'elevation', 'subzone', 'planning_area')
data

Unnamed: 0,rent_approval_date,flat_type,floor_area_sqm,lease_commence_date,latitude,longitude,monthly_rent,town_ang mo kio,town_bedok,town_bishan,...,town_serangoon,town_tampines,town_toa payoh,town_woodlands,town_yishun,region_central region,region_east region,region_north region,region_north-east region,region_west region
0,8,5,67.0,1983,1.344518,103.738630,1600.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,16,4,92.0,1978,1.330186,103.938717,2250.0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,21,3,67.0,1971,1.332242,103.845643,1900.0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,7,5,149.0,1993,1.370239,103.962894,2850.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,22,3,68.0,1972,1.320502,103.863341,2100.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,19,4,92.0,2012,1.403746,103.909883,,0,0,0,...,0,0,0,0,0,0,0,0,1,0
89996,25,4,90.0,2004,1.456113,103.817717,,0,0,0,...,0,0,0,0,0,0,0,1,0,0
89997,28,3,68.0,1980,1.373378,103.853043,,1,0,0,...,0,0,0,0,0,0,0,0,1,0
89998,25,5,144.0,1998,1.436986,103.799118,,0,0,0,...,0,0,0,1,0,0,0,1,0,0


### Models

In [9]:
print(len(raw_data))
train_data, test_data = data[:len(raw_data)], data[len(raw_data):].reset_index(drop=True)
X_train, y_train = train_data.drop('monthly_rent', axis=1), train_data['monthly_rent']
X_test = test_data.drop('monthly_rent', axis=1)

60000


In [10]:
def run_model(model_name, model, parameters, scorer):
    print(f'\nMODEL: {model_name}')

    grid_search = GridSearchCV(model, parameters, cv=5, scoring=scorer)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print("Best Parameters:", best_params)
    print("Best Score:", best_score)
    
    best_model = type(model)(**best_params)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    
    print(y_pred)


In [11]:
all_models = {'Linear Regression': (LinearRegression(), {'fit_intercept': [True, False]}), 
              'Ridge Regression': (Ridge(), {'alpha': [0.01, 0.1, 1, 10, 100]}), 
              'Lasso Regression': (Lasso(), {'alpha': [0.1, 1, 10]})
             }
scorer = make_scorer(mean_squared_error, squared=False)
for model_name, (model, params) in all_models.items():
    run_model(model_name, model, params, scorer)


MODEL: Linear Regression
Best Parameters: {'fit_intercept': False}
Best Score: 509.7813898874463
[3164.03754803 2629.04726876 3439.2765602  ... 2769.79812713 3344.53489854
 3578.85992291]

MODEL: Ridge Regression
Best Parameters: {'alpha': 100}
Best Score: 510.48133689550576
[3123.81369343 2625.05335473 3390.16108168 ... 2782.05069203 3346.25706741
 3583.91575931]

MODEL: Lasso Regression
Best Parameters: {'alpha': 10}
Best Score: 528.1979331629535
[3157.19453148 2769.55991657 3127.31615055 ... 2719.86510455 3417.54282465
 3438.37395453]


In [12]:
model = LinearRegression(**{'fit_intercept': False})

In [13]:
model.fit(X_train, y_train)

In [14]:
pred = model.predict(X_test)

In [15]:
pred

array([3164.03754803, 2629.04726876, 3439.2765602 , ..., 2769.79812713,
       3344.53489854, 3578.85992291])

In [20]:
ids = range(0, 30000)
df = pd.DataFrame({'id': ids, 'predicted': pred})
df.to_csv('predictions.csv', index=False)

In [21]:
df

Unnamed: 0,id,predicted
0,0,3164.037548
1,1,2629.047269
2,2,3439.276560
3,3,2079.725385
4,4,2750.396030
...,...,...
29995,29995,2754.750408
29996,29996,2814.366164
29997,29997,2769.798127
29998,29998,3344.534899
