# Setup

In [1]:
!pip install flaml
# !pip install h2o

Collecting flaml
  Downloading FLAML-2.1.1-py3-none-any.whl (295 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/295.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/295.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.2/295.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: flaml
Successfully installed flaml-2.1.1


In [2]:
# Mount Google Drive at /content/drive so the CSV files read later in the
# notebook (all located under /content/drive/MyDrive) are reachable.
# NOTE(review): this relies on the google.colab package, so the notebook is
# presumably meant to run on Colab only - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBRegressor

# Data Loading & EDA

In [34]:
# Root folder of the competition files on the mounted Drive. Every input path
# is derived from this single constant, so pointing the notebook at another
# location only requires changing this line.
DATA_DIR = '/content/drive/MyDrive/ME /SinoPac'

# Labelled training data plus the public and private test sets.
train = pd.read_csv(f'{DATA_DIR}/30_Training Dataset_V2/training_data_v1104v2.csv')
testpub = pd.read_csv(f'{DATA_DIR}/public test/public_dataset_v1104v3.csv')
testprv = pd.read_csv(f'{DATA_DIR}/private test/private_dataset_v3.csv')

# Submission templates: public only, private only, and both combined.
subpub = pd.read_csv(f'{DATA_DIR}/public_submission_template.csv')
subprv = pd.read_csv(f'{DATA_DIR}/private_submission_template.csv')
subpubprv = pd.read_csv(f'{DATA_DIR}/public_private_submission_template.csv')

In [35]:
# Sanity check: row/column counts of every loaded frame.
print('train:', train.shape)
print('public test :', testpub.shape)
print('private test:', testprv.shape)
print('public sub  :', subpub.shape)
# Previously mislabelled as "public sub"; this is the private template.
print('private sub :', subprv.shape)
print('public_private sub :', subpubprv.shape)

train: (11751, 39)
public test : (5876, 38)
private test: (5875, 38)
public sub  : (5876, 2)
public sub  : (5875, 2)
pubic_private sub : (11751, 2)


In [36]:
# Translate the Chinese column headers to English.
# rename() without inplace returns a new frame, so the raw `train`,
# `testpub` and `testprv` frames are left untouched.
column_name_mapping = {
    '縣市': 'County',
    '鄉鎮市區': 'District',
    '路名': 'Road',
    '土地面積': 'Land Area',
    '使用分區': 'Use Partitions',
    '移轉層次': 'Floor Level',
    '總樓層數': 'Total Floors',
    '主要用途': 'Main Purpose',
    '主要建材': 'Main Materials',
    '建物型態': 'Building Type',
    '屋齡': 'House Age',
    '建物面積': 'Building Area',
    '車位面積': 'Parking Area',
    '車位個數': 'Num of Parking',
    '橫坐標': 'x-axis',
    '縱坐標': 'y-axis',
    '主建物面積': 'Main Building Area',
    '陽台面積': 'Balcony Area',
    '附屬建物面積': 'Ancillary Building Area',
    '單價': 'Price'
}

df_train = train.rename(columns=column_name_mapping)
df_testpub = testpub.rename(columns=column_name_mapping)
df_testprv = testprv.rename(columns=column_name_mapping)

# Data Preprocessing

In [37]:
# Remove the '備註' column from all three frames, then report the new shapes.
df_train = df_train.drop(columns=['備註'])
df_testpub = df_testpub.drop(columns=['備註'])
df_testprv = df_testprv.drop(columns=['備註'])

for label, frame in (
    ('train:', df_train),
    ('public test :', df_testpub),
    ('private test:', df_testprv),
):
    print(label, frame.shape)

train: (11751, 38)
public test : (5876, 37)
private test: (5875, 37)


In [38]:
def _strip_id_prefix(value, prefix):
    """Convert an ID such as 'TR-123' to the integer 123.

    Args:
        value: A single cell of the 'ID' column.
        prefix: The textual prefix to remove (e.g. 'TR-').

    Returns:
        int(value with every occurrence of `prefix` removed) when `value` is a
        string starting with `prefix`; otherwise `value` unchanged, so cells
        that are already numeric (e.g. on a re-run of this cell) pass through.
    """
    if isinstance(value, str) and value.startswith(prefix):
        return int(value.replace(prefix, ''))
    return value


# Each data set uses its own ID prefix: TR- (train), PU- (public), PR- (private).
df_train['ID'] = df_train['ID'].apply(_strip_id_prefix, args=('TR-',))
df_testpub['ID'] = df_testpub['ID'].apply(_strip_id_prefix, args=('PU-',))
df_testprv['ID'] = df_testprv['ID'].apply(_strip_id_prefix, args=('PR-',))

In [40]:
# Columns standardised with StandardScaler.
numeric_features = ['Land Area', 'Floor Level', 'Total Floors', 'House Age', 'Building Area', 'Parking Area', 'Num of Parking', 'Main Building Area',
                    'Balcony Area', 'Ancillary Building Area']
train_numeric = df_train[numeric_features]
testpub_numeric = df_testpub[numeric_features]
testprv_numeric = df_testprv[numeric_features]

# Fit the scaler on the training rows only and reuse those statistics for both
# test sets, so no test-set information leaks into the scaling parameters.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_numeric)
testpub_scaled = scaler.transform(testpub_numeric)
# The private test set must go through the same transform; previously it was
# left in raw units, making it inconsistent with the frames a model is fit on.
testprv_scaled = scaler.transform(testprv_numeric)

# NOTE: this overwrites the columns in place, so re-running the cell without
# re-running the cells above would scale already-scaled values.
df_train[numeric_features] = train_scaled
df_testpub[numeric_features] = testpub_scaled
df_testprv[numeric_features] = testprv_scaled

In [41]:
# Integer-encode every object-dtype column found in the training frame.
label_encoder = LabelEncoder()
object_columns = df_train.select_dtypes(include=['object']).columns
frames = (df_train, df_testpub, df_testprv)

for column in object_columns:
    # Fit on the values of all three frames together so that a category that
    # appears only in a test set still receives a code.
    label_encoder.fit(pd.concat([frame[column] for frame in frames], axis=0))

    for frame in frames:
        frame[column] = label_encoder.transform(frame[column])

In [42]:
# Correlation of every column with the target ('Price').
# The full pairwise correlation matrix of df_train is computed first and the
# 'Price' column is then selected from it.
correlation_matrix = df_train.corr()
correlation_with_target = correlation_matrix['Price']
print(correlation_with_target)

ID                                -0.002611
County                            -0.459016
District                          -0.236772
Road                              -0.035864
Land Area                          0.067088
Use Partitions                     0.008904
Floor Level                        0.054038
Total Floors                       0.037874
Main Purpose                      -0.057103
Main Materials                     0.011903
Building Type                      0.016075
House Age                          0.049196
Building Area                      0.061327
Parking Area                       0.099986
Num of Parking                     0.013064
x-axis                             0.455633
y-axis                             0.394402
Main Building Area                 0.056150
Balcony Area                       0.009866
Ancillary Building Area            0.064905
Price                              1.000000
lng                                0.456070
lat                             

In [43]:
# Columns excluded from modelling. drop() returns a new frame, so df_train,
# df_testpub and df_testprv themselves keep these columns.
dropped_columns = ['ID', 'Use Partitions', 'Balcony Area']

df2_train = df_train.drop(columns=dropped_columns)
df2_testpub = df_testpub.drop(columns=dropped_columns)
df2_testprv = df_testprv.drop(columns=dropped_columns)

for label, frame in (
    ('train:', df2_train),
    ('public test :', df2_testpub),
    ('private test:', df2_testprv),
):
    print(label, frame.shape)

train: (11751, 35)
public test : (5876, 34)
private test: (5875, 34)


In [45]:
# Data splitting: separate the target from the features, then hold out 5% of
# the labelled rows (fixed seed) for evaluation.
y = df2_train['Price']
X = df2_train.drop(columns=['Price'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)

# Model Training

In [None]:
# Linear Regression baseline
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

# MAPE on the rows used for fitting and on the 5% hold-out.
train_mape_lr = mean_absolute_percentage_error(y_train, y_pred_train)
test_mape_lr = mean_absolute_percentage_error(y_test, y_pred)

# Scores are fractions; multiply by 100 to report percentages.
for template, score in (
    ('train mape: {:.4f}', train_mape_lr),
    ('test mape: {:.4f}', test_mape_lr),
):
    print(template.format(score * 100))

train mape: 18.2111
test mape: 18.1659


In [None]:
# XGBoost tuned with an exhaustive grid search
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Candidate values: 3 x 3 x 3 = 27 combinations, each evaluated with 10-fold CV.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)

# MAPE is an error, so greater_is_better=False tells the search to minimise it.
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
model = XGBRegressor()

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=mape_scorer, cv=10)
grid_search.fit(X_train, y_train)

# Estimator refitted with the best parameter combination found.
best_model = grid_search.best_estimator_

y_pred_train_best = best_model.predict(X_train)
y_pred_best = best_model.predict(X_test)

train_mape_xgb = mean_absolute_percentage_error(y_train, y_pred_train_best)
test_mape_xgb = mean_absolute_percentage_error(y_test, y_pred_best)

for template, score in (
    ('train mape: {:.4f}', train_mape_xgb),
    ('test mape: {:.4f}', test_mape_xgb),
):
    print(template.format(score * 100))

train mape: 4.5596
test mape: 9.4585


## H2O

In [None]:
# H2O: import the package and its AutoML class, then connect to a local H2O
# server (h2o.init() starts one if none is running).
import h2o
from h2o.automl import H2OAutoML

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.20.1" 2023-08-24; OpenJDK Runtime Environment (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpcawq7vgr
  JVM stdout: /tmp/tmpcawq7vgr/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpcawq7vgr/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,05 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,3 days
H2O_cluster_name:,H2O_from_python_unknownUser_68qvok
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [None]:
# Upload the modelling frame (features + 'Price') to the H2O cluster.
trainframe = h2o.H2OFrame(df2_train)
# NOTE(review): `testframe` is not referenced anywhere else in the visible
# notebook; the upload looks unnecessary - confirm before removing.
testframe = h2o.H2OFrame(df_testpub)
# Split the labelled data into three parts with a fixed seed.
# NOTE(review): presumably the ratios yield roughly 60% / 20% / remaining 20%
# in that order - confirm against the H2O split_frame documentation.
# `validf` is not used by any later visible cell.
trainf, testf, validf = trainframe.split_frame([0.6, 0.2], seed=42)
# NOTE: `y` and `X` are rebound here (target column name and list of predictor
# column names). They replace the pandas `y` / `X` objects created in the
# data-splitting cell, so that cell must be re-run before X / y are used as
# pandas objects again.
y = 'Price'
X = list(trainframe.columns)
X.remove(y)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [None]:
# Run H2O AutoML, capped at 100 models or 3600 seconds, with a fixed seed.
# Only `trainf` is passed in; `testf` is kept aside and used for the MAPE
# evaluation in a later cell.
aml = H2OAutoML(max_models=100, max_runtime_secs=3600, seed=42)
aml.train(x=X, y=y, training_frame=trainf)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,61.0,61.0,163760.0,9.0,9.0,9.0,57.0,340.0,209.08197

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,0.1842823,0.0055304,0.178384,0.1788923,0.1912819,0.1860344,0.1868189
mean_residual_deviance,0.0963063,0.0565354,0.0682675,0.0618734,0.0869792,0.0683753,0.1960362
mse,0.0963063,0.0565354,0.0682675,0.0618734,0.0869792,0.0683753,0.1960362
r2,0.9045802,0.0462143,0.921582,0.9329649,0.919916,0.9260337,0.8224044
residual_deviance,0.0963063,0.0565354,0.0682675,0.0618734,0.0869792,0.0683753,0.1960362
rmse,0.3018386,0.0806209,0.2612805,0.2487437,0.2949224,0.2614867,0.4427598
rmsle,0.0801725,0.0047783,0.0756773,0.0757299,0.0805111,0.0817984,0.0871457

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2023-11-11 10:38:33,1 min 49.788 sec,0.0,0.9910064,0.7300045,0.9820937
,2023-11-11 10:38:33,1 min 49.975 sec,5.0,0.6334918,0.4553143,0.4013118
,2023-11-11 10:38:33,1 min 50.153 sec,10.0,0.4279385,0.2983402,0.1831314
,2023-11-11 10:38:33,1 min 50.331 sec,15.0,0.3149069,0.2101289,0.0991664
,2023-11-11 10:38:34,1 min 50.507 sec,20.0,0.2484098,0.1610321,0.0617074
,2023-11-11 10:38:34,1 min 50.700 sec,25.0,0.2103529,0.1335108,0.0442483
,2023-11-11 10:38:34,1 min 50.878 sec,30.0,0.1868765,0.1176388,0.0349228
,2023-11-11 10:38:34,1 min 51.039 sec,35.0,0.1705854,0.1073661,0.0290994
,2023-11-11 10:38:34,1 min 51.201 sec,40.0,0.1583876,0.100647,0.0250866
,2023-11-11 10:38:34,1 min 51.350 sec,45.0,0.1490425,0.0962129,0.0222137

variable,relative_importance,scaled_importance,percentage
Snhigh school Count within 10km,10381.3066406,1.0,0.3036048
Bank Count within 10km,4483.0434570,0.4318381,0.1311081
County,2690.7814941,0.2591949,0.0786928
House Age,2357.3869629,0.2270800,0.0689426
x-axis,1705.9460449,0.1643286,0.0498910
Jnhighschool Count within 10km,1372.9154053,0.1322488,0.0401514
lng,1371.3763428,0.1321006,0.0401064
ATM Count within 10km,1192.0511475,0.1148267,0.0348619
Primschool Count within 10km,908.9412231,0.0875556,0.0265823
Nearest Metro Distance (km),802.0043335,0.0772547,0.0234549


In [None]:
def MAPE(actual, predict):
    """Mean absolute percentage error, returned as a fraction (0.1 == 10%).

    Args:
        actual: Single-column frame of true values.
        predict: Single-column frame of predictions, aligned with `actual`.

    Returns:
        The first element of the column-wise mean of |actual - predict| / |actual|.
        The frame's ``mean()`` result must therefore support ``[0]`` indexing.
    """
    relative_error = (actual - predict) / actual
    return abs(relative_error).mean()[0]

# MAPE of the AutoML model on the rows it was trained on and on the
# held-out `testf` split.
train_mape_h2o = MAPE(trainf[y], aml.predict(trainf))
test_mape_h2o = MAPE(testf[y], aml.predict(testf))

# Scale to percent first and format afterwards. Rounding before multiplying
# by 100 printed floating-point noise (e.g. 4.859999999999999); this also
# matches the format used by the other model cells.
print('train mape: {:.4f}'.format(train_mape_h2o * 100))
print('test mape: {:.4f}'.format(test_mape_h2o * 100))

gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
train mape: 4.859999999999999
test mape: 10.05


## FLAML

In [46]:
from flaml import AutoML
# NOTE(review): get_output_from_log is not used in any visible cell.
from flaml.automl.data import get_output_from_log
# NOTE(review): the variable name `flaml` is the same as the package name, so
# within the notebook namespace `flaml` refers to this AutoML instance from
# here on. Later cells call flaml.fit / flaml.predict on it.
flaml = AutoML()

In [47]:
# Search settings handed to AutoML.fit
settings = dict(
    time_budget=3600,  # total running time in seconds
    metric='mape',
    task='regression',
    log_file_name='airlines_experiment.log',
)

# Fit on the training part of the split only; X_test / y_test stay held out.
flaml.fit(X_train=X_train, y_train=y_train, **settings)

[flaml.automl.logger: 11-13 12:13:40] {1679} INFO - task = regression
[flaml.automl.logger: 11-13 12:13:40] {1690} INFO - Evaluation method: cv
[flaml.automl.logger: 11-13 12:13:40] {1788} INFO - Minimizing error metric: mape
[flaml.automl.logger: 11-13 12:13:40] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 11-13 12:13:40] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 11-13 12:13:41] {2344} INFO - Estimated sufficient time budget=3770s. Estimated necessary time budget=27s.
[flaml.automl.logger: 11-13 12:13:41] {2391} INFO -  at 0.5s,	estimator lgbm's best error=0.3404,	best estimator lgbm's best error=0.3404
[flaml.automl.logger: 11-13 12:13:41] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 11-13 12:13:41] {2391} INFO -  at 0.9s,	estimator lgbm's best error=0.3404,	best estimator lgbm's best error=0.3404
[flaml.automl.logger: 11-13 12:13:41] {2218} INFO - 

In [48]:
from flaml.ml import sklearn_metric_loss_score

# Predictions of the fitted AutoML model on the training rows and the hold-out.
flaml_pred_train = flaml.predict(X_train)
flaml_pred_test = flaml.predict(X_test)

# The scorer is called as (metric name, predictions, true values).
train_mape = sklearn_metric_loss_score('mape', flaml_pred_train, y_train)
test_mape = sklearn_metric_loss_score('mape', flaml_pred_test, y_test)

for template, score in (
    ('train mape: {:.4f}', train_mape),
    ('test mape: {:.4f}', test_mape),
):
    print(template.format(score * 100))

train mape: 0.7354
test mape: 8.5191


# Test

In [32]:
# Predict the public test set with the fitted FLAML model and store the result
# in the public submission template.
# df2_testpub is used because it has the same feature columns the model was
# trained on; df_testpub still contains 'ID', 'Use Partitions' and
# 'Balcony Area', which were dropped from the training matrix.
pred = flaml.predict(df2_testpub)
subpub['predicted_price'] = pred

In [None]:
# Alternative submission: predict the public test set with the H2O AutoML
# leader model.
# NOTE: this writes to the same 'predicted_price' column as the FLAML cell, so
# whichever of the two cells ran last determines what is saved to the CSV.
datatp = h2o.H2OFrame(df_testpub)
pred = aml.leader.predict(datatp)
# Bring the predictions back from the H2O cluster as a pandas DataFrame.
predd = pred.as_data_frame()
# NOTE(review): a whole DataFrame is assigned to one column; this presumably
# relies on `predd` having exactly one column - confirm.
subpub['predicted_price'] = predd

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [49]:
# Write the filled-in public submission template to Drive, without the
# DataFrame index column.
subpub.to_csv('/content/drive/MyDrive/ME /SinoPac/submission_20.csv', index=False)