In [None]:

from helpers import *

In [1]:
MINIMIZER_SIMULTANEOUS = 0
MINIMIZER_GRADIENT_DESCENT = 1
LOSS_SQUARED_ERROR = 0
LOSS_ABSOLUTE_ERROR = 1

In [2]:
# Number of rules
M = 100
# Shrinkage
nu = 1
# Resampling
resample = True
# Percentage (fraction) fo the training set used in each subsample
percentage = 0.5

# Resampling with replacement
withReplacement = False
	
# Replace missing values with means and modes
replaceMissingValues = False

# Minimization technique
minimizationTechnique = MINIMIZER_SIMULTANEOUS

# Loss function
lossFunction = LOSS_SQUARED_ERROR

loss = LossClass() # Depends on absolute or squared

In [None]:
if lossFunction == LOSS_ABSOLUTE_ERROR:
    loss = AbsoluteErrorLossFunction()
else:
    loss = SquaredLossFunction()

In [None]:
if minimizationTechnique == MINIMIZER_GRADIENT_DESCENT:
	empiricalRiskMinimizer = LeastAngleEmpiricalRiskMinimizer();
else:
    if lossFunction == LOSS_ABSOLUTE_ERROR:
        empiricalRiskMinimizer = AbsoluteErrorRiskMinimizer()
    else:
        empiricalRiskMinimizer = GradientEmpiricalRiskMinimizer()


In [24]:
import numpy as np
import random
from sklearn.base import ClassifierMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils import Bunch
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_X_y
from sklearn.utils import check_random_state

class Rule:
    # Define the Rule class if needed
    pass

class NominalToBinary:
    # Define the NominalToBinary class if needed
    #implemented in WEKA
    #TODO
    pass

class ReplaceMissingValues:
    # Define the ReplaceMissingValues class if needed
    # It is implemented in Weka
    #TODO
    pass

class SingleRuleBuilder():
    pass

class Regender():
    def __init__(self, M, rule_builder, resample, percentage, with_replacement):
        self.M = M
        self.rule_builder = rule_builder
        self.resample = resample
        self.percentage = percentage
        self.with_replacement = with_replacement

    def fit(self, X, y):
        self._validate_params()
        X, y = self._validate_input(X, y)
        self._initialize(X, y)

        self.rules = [None] * self.M

        # create default rule
        self.covered_instances = np.ones(self.number_of_instances, dtype=np.int16)
        self.default_rule = self.rule_builder.create_default_rule(self.value_of_F, self.covered_instances)
        self._update_function(self.default_rule)

        i = 0
        while i < self.M:
            # resampling
            if self.resample:
                self.covered_instances = self._resample(self.number_of_instances, self.percentage, self.with_replacement)
            else:
                self.covered_instances.fill(1)

            # create rule
            self.rules[i] = self.rule_builder.create_rule(self.value_of_F, self.covered_instances)

            if self.rules[i] is not None:
                # update function F
                self._update_function(self.rules[i].decision)
                i += 1
            else:
                self.M = i
                break

    def _validate_params(self):
        # Add validation for parameters if needed
        pass

    def _validate_input(self, X, y):
        # Add validation for input data if needed
        #TODO
        return X, y

    def _initialize(self, X, y):
        self.X_ = X
        self.y_ = y
        self.number_of_condition_attributes = X.shape[1]
        self.number_of_instances = X.shape[0]
        self.value_of_F = np.zeros(self.number_of_instances)
        self.decision_attribute = X.shape[1]
        print(y)
        print(np.unique(y))
        self.number_of_decision_classes = len(np.unique(y))

        if self.number_of_decision_classes != 1:
            raise ValueError("This is a regression method: wrong number of decision classes")

        self.auxiliary_decision_attribute = self.number_of_condition_attributes + 1
        self.instance_index_attribute = self.number_of_condition_attributes + 2

        # Initialize covered instances array
        self.covered_instances = np.zeros(self.number_of_instances, dtype=np.int16)

        self.rule_builder.initialize(self.X_)

        self.random_state = check_random_state(None)

    def _resample(self, number_of_instances, percentage, with_replacement):
        sub_sample = np.zeros(number_of_instances, dtype=np.int16)
        sub_sample_size = int(number_of_instances * percentage)

        if sub_sample_size > 0:
            if not with_replacement:
                indices = np.arange(number_of_instances)
                self.random_state.shuffle(indices)
                sub_sample[indices[:sub_sample_size]] = 1
            else:
                sub_sample[self.random_state.choice(number_of_instances, sub_sample_size)] += 1

        return sub_sample

    def _update_function(self, decision):
        self.value_of_F[self.covered_instances >= 0] += decision

    def predict(self, X):
        check_is_fitted(self, "X_")

        X = check_array(X)
        X = self._preprocess_input(X)

        predictions = np.zeros(X.shape[0])
        for i in range(self.M):
            if self.rules[i] is not None:
                predictions += self.rules[i].classify_instance(X)

        return predictions + self.default_rule

    def _preprocess_input(self, X):
        # Add preprocessing steps if needed
        pass

    def score(self, X, y):
        check_is_fitted(self, "X_")
        y = self._validate_targets(y)
        predictions = self.predict(X)
        return np.mean((predictions - y) ** 2)

    def _validate_targets(self, y):
        check_classification_targets(y)
        return LabelEncoder().fit_transform(y)

    def get_params(self, deep=True):
        # Add implementation to get parameters if needed
        pass

    def set_params(self, **params):
        # Add implementation to set parameters if needed
        pass

    def __repr__(self):
        # Add implementation for representation if needed
        pass



In [25]:
# Example usage
data = from_csv('WEKA/apple_quality.csv')
X_train, y_train = X_y_split(data, 'Quality')
print(data)

my_rule_builder = SingleRuleBuilder()
regender = Regender(M=10, rule_builder=my_rule_builder, resample=True, percentage=0.8, with_replacement=False)
regender.fit(X_train, y_train)
# predictions = regender.predict(X_test)
# score = regender.score(X_test, y_test)
# print(predictions)
# print(score)

        A_id      Size    Weight  Sweetness  Crunchiness  Juiciness  Ripeness   
0        0.0 -3.970049 -2.512336   5.346330    -1.012009   1.844900  0.329840  \
1        1.0 -1.195217 -2.839257   3.664059     1.588232   0.853286  0.867530   
2        2.0 -0.292024 -1.351282  -1.738429    -0.342616   2.838636 -0.038033   
3        3.0 -0.657196 -2.271627   1.324874    -0.097875   3.637970 -3.413761   
4        4.0  1.364217 -1.296612  -0.384658    -0.553006   3.030874 -1.303849   
...      ...       ...       ...        ...          ...        ...       ...   
3996  3996.0 -0.293118  1.949253  -0.204020    -0.640196   0.024523 -1.087900   
3997  3997.0 -2.634515 -2.138247  -2.440461     0.657223   2.199709  4.763859   
3998  3998.0 -4.008004 -1.779337   2.366397    -0.200329   2.161435  0.214488   
3999  3999.0  0.278540 -1.715505   0.121217    -1.154075   1.266677 -0.776571   
4000     NaN       NaN       NaN        NaN          NaN        NaN       NaN   

                           

ValueError: This is a regression method: wrong number of decision classes

In [18]:
np.unique(['2', 2.,  '1', '1'])

array(['1', '2', '2.0'], dtype='<U32')