In [20]:
import numpy as np
import pandas as pd
import RQP_test_data_creator
from scipy.io import arff


def sample_interval_from_data(data_train):
    n = data_train.shape[0]
    num_features = data_train.shape[1]
    x1 = data_train[np.random.choice(n)]
    x2 = data_train[np.random.choice(n)]
    feature_intervals = []
    for feature in range(num_features):
        lower_bound = np.min([x1[feature], x2[feature]])
        upper_bound = np.max([x1[feature], x2[feature]])
        feature_intervals.append((lower_bound, upper_bound))
    return feature_intervals


file_path_train = "RQPdata/boston.arff_RQPtrain.arff"
file_path_test = "RQPdata/boston.arff_RQPtest.arff"
data_train_raw = pd.DataFrame(arff.loadarff(file_path_train)[0])
data_test_raw = pd.DataFrame(arff.loadarff(file_path_test)[0])
X_train_raw = data_train_raw.iloc[:, :-1]

aug_sample_numbers = [200, 500, 1000, 2000]
num_features = X_train_raw.shape[1]
data_train_aug = []
for i in range(aug_sample_numbers[3]):
    feature_intervals = sample_interval_from_data(X_train_raw.values)
    y_hat_min = np.inf
    y_hat_max = -np.inf
    for index, data_point in data_train_raw.iterrows():
        in_interval = True
        for feature in range(num_features):
            feature_interval = feature_intervals[feature]
            if feature_interval[0] > data_point[feature] or feature_interval[1] < data_point[feature]:
                in_interval = False
        if in_interval:
            y_hat_min = np.min([y_hat_min, data_point[-1]])
            y_hat_max = np.max([y_hat_max, data_point[-1]])
    #if not y_hat_min == np.inf and not y_hat_max == -np.inf:
    aug_point = []
    for interval in feature_intervals:
        aug_point.append(interval[0])
        aug_point.append(interval[1])
    aug_point.append(y_hat_min)
    aug_point.append(y_hat_max)
    data_train_aug.append(aug_point)
data_train_aug = pd.DataFrame(data_train_aug)

#print(data_train_aug.describe())

X_test = data_test_raw.iloc[:, :-2]
Y_test_min = data_test_raw.iloc[:, -2:]
Y_test_max = data_test_raw.iloc[:, :-1]

In [21]:
from sklearn.ensemble import RandomForestRegressor

X_train_aug = data_train_aug.iloc[:, :-2]
Y_min_aug = np.ravel(data_train_aug.iloc[:, -2:-1])
Y_max_aug = np.ravel(data_train_aug.iloc[:, -1:])

reps = 100
min_r2s = []
max_r2s = []
l1_mins = []
l1_maxs = []
for rep in range(reps):
    min_classifier = RandomForestRegressor()
    min_classifier = min_classifier.fit(X_train_aug, Y_min_aug)

    max_classifier = RandomForestRegressor()
    max_classifier = max_classifier.fit(X_train_aug, Y_max_aug)

    X_test = data_test_raw.iloc[:, :-2]
    Y_test_min = data_test_raw.iloc[:, -2:-1]
    Y_test_max = data_test_raw.iloc[:, -1:]

    l1_min = np.sum(np.absolute(np.subtract(min_classifier.predict(X_test), np.ravel(Y_test_min)))) / X_test.shape[0]
    l1_max = np.sum(np.absolute(np.subtract(max_classifier.predict(X_test), np.ravel(Y_test_max)))) / X_test.shape[0]
    
    min_r2s.append(min_classifier.score(X_test, Y_test_min))
    max_r2s.append(max_classifier.score(X_test, Y_test_max))
    l1_mins.append(l1_min)
    l1_maxs.append(l1_max)
    
print("Min R^2:", np.mean(min_r2s))
print("Max R^2:", np.mean(max_r2s))
print("Min l1:", np.mean(l1_mins))
print("Max l1:", np.mean(l1_max))

Min R^2: 0.4109252883285175
Max R^2: 0.8159998543560129
Min l1: 1.8270046224686112
Max l1: 1.1867882911937726


In [22]:
import numpy as np
import pandas as pd
import RQP_test_data_creator
from scipy.io import arff


def sample_interval_from_data(data_train):
    n = data_train.shape[0]
    num_features = data_train.shape[1]
    x1 = data_train[np.random.choice(n)]
    x2 = data_train[np.random.choice(n)]
    feature_intervals = []
    for feature in range(num_features):
        lower_bound = np.min([x1[feature], x2[feature]])
        upper_bound = np.max([x1[feature], x2[feature]])
        feature_intervals.append((lower_bound, upper_bound))
    return feature_intervals


file_path_train = "RQPdata/cpu.small.arff_RQPtrain.arff"
file_path_test = "RQPdata/cpu.small.arff_RQPtest.arff"
data_train_raw = pd.DataFrame(arff.loadarff(file_path_train)[0])
data_test_raw = pd.DataFrame(arff.loadarff(file_path_test)[0])
X_train_raw = data_train_raw.iloc[:, :-1]

#aug_sample_numbers = [200, 500, 1000, 2000]
num_features = X_train_raw.shape[1]
data_train_aug = []
for i in range(8000):
    feature_intervals = sample_interval_from_data(X_train_raw.values)
    y_hat_min = np.inf
    y_hat_max = -np.inf
    for index, data_point in data_train_raw.iterrows():
        in_interval = True
        for feature in range(num_features):
            feature_interval = feature_intervals[feature]
            if feature_interval[0] > data_point[feature] or feature_interval[1] < data_point[feature]:
                in_interval = False
        if in_interval:
            y_hat_min = np.min([y_hat_min, data_point[-1]])
            y_hat_max = np.max([y_hat_max, data_point[-1]])
    #if not y_hat_min == np.inf and not y_hat_max == -np.inf:
    aug_point = []
    for interval in feature_intervals:
        aug_point.append(interval[0])
        aug_point.append(interval[1])
    aug_point.append(y_hat_min)
    aug_point.append(y_hat_max)
    data_train_aug.append(aug_point)
data_train_aug = pd.DataFrame(data_train_aug)

#print(data_train_aug.describe())

X_test = data_test_raw.iloc[:, :-2]
Y_test_min = data_test_raw.iloc[:, -2:]
Y_test_max = data_test_raw.iloc[:, :-1]

In [23]:
from sklearn.ensemble import RandomForestRegressor

X_train_aug = data_train_aug.iloc[:, :-2]
Y_min_aug = np.ravel(data_train_aug.iloc[:, -2:-1])
Y_max_aug = np.ravel(data_train_aug.iloc[:, -1:])

reps = 100
min_r2s = []
max_r2s = []
l1_mins = []
l1_maxs = []
for rep in range(reps):
    min_classifier = RandomForestRegressor()
    min_classifier = min_classifier.fit(X_train_aug, Y_min_aug)

    max_classifier = RandomForestRegressor()
    max_classifier = max_classifier.fit(X_train_aug, Y_max_aug)

    X_test = data_test_raw.iloc[:, :-2]
    Y_test_min = data_test_raw.iloc[:, -2:-1]
    Y_test_max = data_test_raw.iloc[:, -1:]

    l1_min = np.sum(np.absolute(np.subtract(min_classifier.predict(X_test), np.ravel(Y_test_min)))) / X_test.shape[0]
    l1_max = np.sum(np.absolute(np.subtract(max_classifier.predict(X_test), np.ravel(Y_test_max)))) / X_test.shape[0]
    
    min_r2s.append(min_classifier.score(X_test, Y_test_min))
    max_r2s.append(max_classifier.score(X_test, Y_test_max))
    l1_mins.append(l1_min)
    l1_maxs.append(l1_max)
    
print("Min R^2:", np.mean(min_r2s))
print("Max R^2:", np.mean(max_r2s))
print("Min l1:", np.mean(l1_mins))
print("Max l1:", np.mean(l1_max))

Min R^2: 0.6191485325068501
Max R^2: 0.75157965172968
Min l1: 0.04640308741278421
Max l1: 0.04534470975012554
