In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.decomposition import PCA

from sklearn.svm import SVR
import time

In [2]:
train_df = pd.read_csv('./data/train_svm.csv')
test_df = pd.read_csv('./data/test_svm.csv')
train_df.head()

Unnamed: 0,floor_area_sqm,planning_area_jurong east,planning_area_bedok,planning_area_toa payoh,planning_area_pasir ris,planning_area_kallang,planning_area_bukit panjang,planning_area_sengkang,planning_area_ang mo kio,planning_area_bishan,...,flat_model_3gen,lat_rads,long_rads,sti,coe,age,shortest_dist_mall,shortest_ex_mrt,shortest_pl_mrt,monthly_rent
0,67.0,1,0,0,0,0,0,0,0,0,...,0,0.023466,1.810581,0.405905,0.258731,38.667461,1.202674,0.699127,0.699127,1600
1,92.0,0,1,0,0,0,0,0,0,0,...,0,0.023216,1.814073,0.853443,0.597983,44.329452,1.114338,0.898991,0.898991,2250
2,67.0,0,0,1,0,0,0,0,0,0,...,0,0.023252,1.812448,0.47643,0.766542,51.74918,0.468297,0.218603,0.218603,1900
3,149.0,0,0,0,1,0,0,0,0,0,...,0,0.023915,1.814495,0.537475,0.192568,28.581011,0.402359,1.54604,1.54604,2850
4,68.0,0,0,0,0,1,0,0,0,0,...,0,0.023047,1.812757,0.476863,0.880567,50.834719,1.073354,0.187856,0.187856,2100


In [3]:
print(train_df.shape)
print(test_df.shape)

(59727, 63)
(30000, 62)


In [4]:
X = train_df.loc[:,train_df.columns != 'monthly_rent']
y = train_df['monthly_rent']

In [5]:
X.shape

(59727, 62)

# With PCA

In [9]:
pca = PCA(n_components=30)
X = pca.fit(X).transform(X)

svr_rbf = SVR(kernel="rbf", C = 1000, gamma=0.01, epsilon = 10)

start_time = time.time()
print(cross_val_score(svr_rbf, X, y, cv = 5, n_jobs = 4, scoring = 'neg_root_mean_squared_error'))
elapsed_time = time.time() - start_time

print("elapsed time: {:.2}".format(elapsed_time))

[-529.43048037 -528.79017328 -535.48744001 -527.55580737 -531.60330544]
elapsed time: 4.1e+02


# Without PCA

In [9]:
X = train_df.loc[:,train_df.columns != 'monthly_rent']

svr_rbf = SVR(kernel="rbf", C = 2000, gamma = 0.05, epsilon = 0.1)

start_time = time.time()
print(cross_val_score(svr_rbf, X, y, cv = 5, n_jobs = 4, scoring = 'neg_root_mean_squared_error'))
elapsed_time = time.time() - start_time

print("elapsed time: {:.2}".format(elapsed_time))

[-517.08366448 -515.61105827 -523.85444329 -514.63422245 -519.52438383]
elapsed time: 6.2e+02


# Final SVM Model

In [11]:
final_svr_rbf = SVR(kernel="rbf", C = 2000, gamma=0.01, epsilon = 10)
start_time = time.time()
final_svr_rbf.fit(X, y)
elapsed_time = time.time() - start_time
print("elapsed time: {:.2}".format(elapsed_time))

elapsed time: 3.2e+02


In [12]:
preds = final_svr_rbf.predict(test_df)
submission_df = pd.DataFrame({'Predicted': preds})
submission_df.rename_axis('Id', inplace=True)
submission_df.to_csv('./data/submission_svm_test.csv')

# Grid Search to tune the hyperparameter

In [13]:
svr = SVR()

parameters = {
    'kernel':['rbf'],
    'C': [1, 10, 100, 500, 1000, 2000],
    'gamma': [0.1, 0.05, 0.01,0.005],
    'epsilon': [0.1],
}

grid = GridSearchCV(svr, parameters, cv = 5, n_jobs = 4, scoring= 'neg_root_mean_squared_error')

start_time = time.time()
grid.fit(X, y)
elapsed_time = time.time() - start_time

print(grid.best_estimator_)

SVR(C=2000, gamma=0.01)


In [9]:
print("elapsed time: {:.2}".format(elapsed_time))

elapsed time: 4.2e+03
