In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_openml

In [3]:
import scipy.io # for directly downloading .mat files
import pyreadr  # for directly downloading .rda files

This notebook shows how to obtain the public datasets used in the CPoE paper. The datasets are downloaded, their inputs and outputs are selected and standardized, and finally each one is stored as a .csv file in a common format, which is then used as input for the algorithms.

## i) get datasets from LIBSVM dataset repository
#### (which is based on other data repositories such as UCI,...)

In [4]:
# Download locations of the LIBSVM regression datasets, keyed by dataset name.
# All four files live in the same folder, so each URL is the shared base plus the name.
url_dict = {
    dataset: 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/' + dataset
    for dataset in ('abalone', 'cadata', 'mg', 'space_ga')
}

In [5]:
# make sure the output folder exists before the first .csv is written
os.makedirs('datasets', exist_ok=True)

for name in ['abalone', 'cadata', 'mg', 'space_ga']:

    # read the space-separated text file from url; the first column is the
    # response and every further entry has the form "<index>:<value>"
    data = pd.read_csv(url_dict[name], sep=' ', header=None)

    # response/output
    yy = np.array(data.iloc[:, 0])

    # select inputs: drop the "<index>:" prefix of each entry and convert the
    # rest to float (splitting on ':' also works for indices with several digits;
    # the builtin float replaces the np.float alias that newer NumPy removed)
    XX = np.hstack([
        np.array([float(it.split(':', 1)[1]) for it in data.iloc[:, col]])[:, None]
        for col in range(1, data.shape[1])
    ])

    # scale and store as csv with same format (inputs first, response as last column)
    GG = np.hstack([XX, yy[:, None]])
    df = StandardScaler().fit_transform(np.array(GG))
    pd.DataFrame(df).to_csv('datasets/DAT'+name+'.csv')

## ii) get datasets from OpenML repository
#### (which is based on other data sources such as UCI,...)

In [6]:
name = 'kin8nm'
# fetch version 1 of the dataset from OpenML and append its target as column 'y',
# so that the response ends up as the last column
data0 = fetch_openml(name, version=1)
DF = data0.data
DF['y'] = data0.target

# standardize every column (inputs and response) and save as unified .csv
df = StandardScaler().fit(DF).transform(DF)
pd.DataFrame(df).to_csv(f'datasets/DAT{name}.csv')

In [7]:
name = 'concrete'
# fetch the OpenML dataset 'Concrete_Data' and use its data table unchanged
data0 = fetch_openml('Concrete_Data')
DF = data0.data
# NOTE(review): unlike the other OpenML cells, no target column is appended here,
# so the last column of data0['data'] is what the loading cell reads as y
# (it reports D = 8 for concrete) -- confirm that this column is the response.

# standardize every column and save as unified .csv
df = StandardScaler().fit(DF).transform(DF)
pd.DataFrame(df).to_csv(f'datasets/DAT{name}.csv')

In [8]:
name = 'casp'
# download data and select columns for input (F1..F9) and output (RMSD)
data0 = fetch_openml('physicochemical-protein')
feature_cols = ['F'+str(i) for i in range(1,10)]
# .assign builds a new frame with the response as last column 'y'; assigning
# DF['y'] onto the column subset instead triggers pandas' SettingWithCopyWarning
DF = data0['data'][feature_cols].assign(y=data0['data']['RMSD'])

# standardize every column (inputs and response) and save as unified .csv
df = StandardScaler().fit_transform(DF)
pd.DataFrame(df).to_csv('datasets/DAT'+name+'.csv')

## iii) get sarcos dataset from gaussianprocess.org

In [9]:
# dataset identifier; it becomes part of the output file name 'datasets/DAT<name>.csv'
name = 'sarcos'

In [10]:
# download the MATLAB file into the working directory; wget keeps the '?raw=true'
# query string in the saved file name, which is the name the following cells use
!wget http://gaussianprocess.org/gpml/data/sarcos_inv.mat?raw=true

--2022-11-22 13:40:42--  http://gaussianprocess.org/gpml/data/sarcos_inv.mat?raw=true
Resolving gaussianprocess.org (gaussianprocess.org)... 185.199.108.153
Connecting to gaussianprocess.org (gaussianprocess.org)|185.199.108.153|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9964616 (9.5M) [application/octet-stream]
Saving to: ‘sarcos_inv.mat?raw=true’


2022-11-22 13:40:53 (962 KB/s) - ‘sarcos_inv.mat?raw=true’ saved [9964616/9964616]



In [11]:
# load the downloaded MATLAB file and keep only the first 22 columns of
# 'sarcos_inv' (as done in several GP papers); the last kept column is the one
# the loading cell reads as the response
mat = scipy.io.loadmat('sarcos_inv.mat?raw=true')
df = StandardScaler().fit_transform(mat['sarcos_inv'][:, :22])

# create unified .csv file
pd.DataFrame(df).to_csv(f'datasets/DAT{name}.csv')

In [12]:
# remove the downloaded MATLAB file again (saved under a name ending in '?raw=true');
# its content has already been written to the unified .csv
!rm sarcos_inv.mat?raw=true

## iv) get elecdemand dataset from fpp2 package

In [13]:
# dataset identifier; it becomes part of the output file name 'datasets/DAT<name>.csv'
name = 'elecdemand'

In [14]:
# download the .rda dataset from the fpp2 package repository into the working directory
!wget https://github.com/robjhyndman/fpp2-package/raw/master/data/elecdemand.rda

--2022-11-22 13:40:55--  https://github.com/robjhyndman/fpp2-package/raw/master/data/elecdemand.rda
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/robjhyndman/fpp2-package/master/data/elecdemand.rda [following]
--2022-11-22 13:40:55--  https://raw.githubusercontent.com/robjhyndman/fpp2-package/master/data/elecdemand.rda
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 143201 (140K) [application/octet-stream]
Saving to: ‘elecdemand.rda’


2022-11-22 13:40:56 (1.36 MB/s) - ‘elecdemand.rda’ saved [143201/143201]



In [15]:
# read the .rda file with pyreadr and take the 'elecdemand' object out of the
# result; it is used as a pandas DataFrame in the next cell
result = pyreadr.read_r('elecdemand.rda') 
DF = result['elecdemand']
# remove the downloaded .rda file again; DF is already held in memory
!rm elecdemand.rda

In [16]:
# put the target 'Demand' into the last column, then standardize all three columns
DF = DF.loc[:, ['Temperature', 'WorkDay', 'Demand']]
df = StandardScaler().fit(DF).transform(DF)

In [17]:
# time variable in units of one year: sample i (i = 1..N) gets i/(2*24*365),
# i.e. it runs from 1/17520 up to N/17520 (presumably half-hourly samples -- confirm)
time = np.arange(1, len(DF) + 1) / (2 * 24 * 365)
# prepend it as first column; note that it is added after scaling and so is not standardized
mat = np.hstack([time.reshape(-1, 1), df])

In [18]:
# write time, scaled inputs and scaled target in the unified .csv layout
pd.DataFrame(mat).to_csv(f'datasets/DAT{name}.csv')

## load created .csv data

In [19]:
# sanity check: read every generated file back and report its size
for name in ['concrete', 'mg', 'space_ga', 'abalone', 'kin8nm', 'cadata', 'sarcos', 'casp', 'elecdemand']:
    # first csv column is the index written by to_csv, so it is not part of the data
    data = np.array(pd.read_csv(f'datasets/DAT{name}.csv', index_col=0))
    # inputs are all columns but the last, the response is the last column
    X, y = data[:, :-1], data[:, -1]

    print(name, "\nN =",len(y), "\nD =",X.shape[1],'\n')

concrete 
N = 1030 
D = 8 

mg 
N = 1385 
D = 6 

space_ga 
N = 3107 
D = 6 

abalone 
N = 4177 
D = 8 

kin8nm 
N = 8192 
D = 8 

cadata 
N = 20640 
D = 8 

sarcos 
N = 44484 
D = 21 

casp 
N = 45730 
D = 9 

elecdemand 
N = 17520 
D = 3 

