# CS5228 Team Not Found
## Tree-based models on various dataset

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
import category_encoders as ce
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from geopy.distance import geodesic
import geopy

from sklearn.model_selection import GridSearchCV
from sklearn import tree

In [2]:
# load pre-processed data
train_df = pd.read_csv('./data/final-train.csv', index_col=0)
test_df = pd.read_csv('./data/final-test.csv', index_col=0)
train_df.columns

Index(['type', 'bedrooms', 'bathrooms', 'district', 'planning_area', 'lat',
       'lng', 'freehold', 'since_built_year', 'no_of_units', 'area_size',
       'since_listing_month', 'additional_rooms', 'price', 'model_0',
       'model_1', 'model_2', 'model_3', 'region_0', 'region_1', 'region_2',
       'region_3', 'region_4', 'betweenness_closest_mrt', 'closest_comercial',
       'closest_hawker', 'closest_primary', 'closest_secondary',
       'closest_mall'],
      dtype='object')

## Random Forest

In [3]:
best_param = {'bootstrap': True, 'max_depth': 50, 'max_features': None, 'min_samples_split': 5, 'n_estimators': 100}
rfr = RandomForestRegressor()
rfr.set_params(**best_param)

RandomForestRegressor(max_depth=50, max_features=None, min_samples_split=5)

### numeric only

In [4]:
train_df =  pd.read_csv('./data/train_numeric.csv', index_col = 0)
X_train = train_df.drop(columns = ['price'])
y_train = train_df['price']
X_train.columns

Index(['bedrooms', 'bathrooms', 'lat', 'lng', 'since_built_year',
       'no_of_units', 'area_size', 'since_listing_month'],
      dtype='object')

In [5]:
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)
tuned_r2 = np.mean(cross_val_score(rfr, X_train, y_train, scoring='r2', cv=folds))
tuned_rmse =(-1) * np.mean(cross_val_score(rfr, X_train, y_train, scoring='neg_root_mean_squared_error', cv=folds))

display(tuned_r2)
display(tuned_rmse)

9.557071526290075e-01

832075.2254396887

### numerical + categorical (with encoding)

In [6]:
train_df =  pd.read_csv('./data/train_numeric_bin_target.csv')
X_train = train_df.drop(columns = ['price'])
y_train = train_df['price']
X_train.columns

Index(['model', 'bedrooms', 'bathrooms', 'district', 'region', 'planning_area',
       'lat', 'lng', 'freehold', 'since_built_year', 'no_of_units',
       'area_size', 'since_listing_month', 'additional_rooms'],
      dtype='object')

In [7]:
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)
tuned_r2 = np.mean(cross_val_score(rfr, X_train, y_train, scoring='r2', cv=folds))
tuned_rmse =(-1) * np.mean(cross_val_score(rfr, X_train, y_train, scoring='neg_root_mean_squared_error', cv=folds))

display(tuned_r2 )
display(tuned_rmse)

0.9574610863940249

820206.4389728281

### numerical + categorical (with encoding) + amenties (betweenness weighted)

In [8]:
# load pre-processed data
train_df = pd.read_csv('./data/final-train.csv', index_col=0)
test_df = pd.read_csv('./data/final-test.csv', index_col=0)
X_train = train_df.drop(columns = ['price'])
y_train = train_df['price']

In [9]:
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)
tuned_r2 = np.mean(cross_val_score(rfr, X_train, y_train, scoring='r2', cv=folds))
tuned_rmse =(-1) * np.mean(cross_val_score(rfr, X_train, y_train, scoring='neg_root_mean_squared_error', cv=folds))

display(tuned_r2 )
display(tuned_rmse)

0.966070853641849

759536.8037606042

## Gradient Boosting

In [10]:
best_param ={'learning_rate': 0.1,
 'max_depth': 10,
 'max_features': None,
 'min_samples_split': 10,
 'n_estimators': 200,
 'subsample': 0.8}
clf =  GradientBoostingRegressor(n_estimators=100, random_state=0)
clf.set_params(**best_param)

GradientBoostingRegressor(max_depth=10, min_samples_split=10, n_estimators=200,
                          random_state=0, subsample=0.8)

### numerical

In [11]:
train_df =  pd.read_csv('./data/train_numeric.csv', index_col = 0)
X_train = train_df.drop(columns = ['price'])
y_train = train_df['price']

In [12]:
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)
r2_scores = np.mean(cross_val_score(clf, X_train, y_train, scoring='r2', cv=folds))
rmse = np.mean(cross_val_score(clf, X_train, y_train, scoring='neg_root_mean_squared_error', cv=folds))
clf.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=10, min_samples_split=10, n_estimators=200,
                          random_state=0, subsample=0.8)

In [13]:
display(r2_scores)
display(rmse)

0.9583081492927272

-811672.340079638

### numerical + categorical (with encoding)

In [14]:
train_df =  pd.read_csv('./data/train_numeric_bin_target.csv')
X_train = train_df.drop(columns = ['price'])
y_train = train_df['price']
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)
r2_scores = np.mean(cross_val_score(clf, X_train, y_train, scoring='r2', cv=folds))
rmse = np.mean(cross_val_score(clf, X_train, y_train, scoring='neg_root_mean_squared_error', cv=folds))
clf.fit(X_train, y_train)
display(r2_scores)
display(rmse)

0.9580959145664586

-810457.0338227935

### numerical + categorical (with encoding) + amenties (betweenness weighted)

In [15]:
# load pre-processed data
train_df = pd.read_csv('./data/final-train.csv', index_col=0)
X_train = train_df.drop(columns = ['price'])
y_train = train_df['price']
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)
r2_scores = np.mean(cross_val_score(clf, X_train, y_train, scoring='r2', cv=folds))
rmse = np.mean(cross_val_score(clf, X_train, y_train, scoring='neg_root_mean_squared_error', cv=folds))
clf.fit(X_train, y_train)
display(r2_scores)
display(rmse)

0.9647611578135787

-753471.4184826509