In [1]:
import numpy as np
from scipy.special import expit
from scipy.optimize import minimize
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

from rule_builder import RuleBuilder

class MLRules(BaseEstimator, ClassifierMixin):
    """Rule-ensemble classifier with a scikit-learn style interface.

    A default rule is created first, then up to ``n_rules`` further rules are
    requested one by one from a ``RuleBuilder``. Every accepted rule adds its
    decision vector to the per-class scores ``f`` of the training instances it
    covers. Predictions are the softmax of the summed rule outputs.

    Parameters
    ----------
    n_rules : int, default=100
        Number of rules to build in addition to the default rule.
    shrinkage : float, default=0.5
        Forwarded unchanged to ``RuleBuilder``.
    resample : bool, default=True
        If True, a new subsample of the training instances is drawn before
        each rule is built; otherwise every instance is offered to the builder.
    subsample_size : float, default=0.5
        Fraction of the training instances placed in each subsample.
    minimization_technique : int, default=0
        ``0`` makes the rule builder run with ``use_gradient=True``; any other
        value runs it with ``use_gradient=False``.
    use_line_search : bool, default=False
        Forwarded to ``RuleBuilder`` and also selects which default-rule
        constructor is called.
    pre_chosen_k : bool, default=False
        Forwarded unchanged to ``RuleBuilder``.
    random_state : int or None, default=None
        Seed for NumPy's global generator and for ``main_random_generator``.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique class labels seen in ``fit``.
    n_features_in_ : int
        Number of columns of the training matrix.
    rules : list
        Rules accepted during ``fit`` (may be shorter than ``n_rules`` when
        the builder repeatedly fails to produce a rule).
    default_rule : array-like
        Per-class scores applied to every instance.
    model_built : bool
        True once ``fit`` has finished successfully.
    """

    # Initial value of the counter that drives the deterministic shuffle in
    # `_resample` (kept for compatibility with the Java random generator).
    _INITIAL_BIG_NUMBER = 862897923

    # Number of back-to-back failed rule constructions tolerated before `fit`
    # gives up instead of retrying forever.
    _MAX_CONSECUTIVE_FAILED_RULES = 100

    def __init__(self, n_rules=100, shrinkage=0.5, resample=True, subsample_size=0.5,
                 minimization_technique=0, use_line_search=False, pre_chosen_k=False, random_state=None):
        self.n_rules = n_rules
        self.shrinkage = shrinkage
        self.resample = resample
        self.subsample_size = subsample_size
        self.minimization_technique = minimization_technique
        self.use_line_search = use_line_search
        self.pre_chosen_k = pre_chosen_k
        self.random_state = random_state
        self.model_built = False

        self.bigNumber = self._INITIAL_BIG_NUMBER

    def _resample(self, percentage):
        """Return a 0/1 list marking the instances chosen for a subsample.

        The permutation is produced by a deterministic swap sequence driven by
        ``self.bigNumber`` rather than by a random generator, for compatibility
        with the Java random generator. ``self.bigNumber`` is advanced by 2
        after every call so that consecutive calls select different instances.

        Parameters
        ----------
        percentage : float
            Fraction of the ``self.N`` instances to select.

        Returns
        -------
        list of int
            ``self.N`` entries; 1 for selected instances, 0 otherwise.
        """
        # Clamp so that an out-of-range fraction cannot index past the list.
        subsample_size = max(0, min(self.N, int(self.N * percentage)))
        indices = list(range(self.N))

        for i in range(self.N - 1, 0, -1):
            swap_with = self.bigNumber % (i + 1)
            indices[i], indices[swap_with] = indices[swap_with], indices[i]

        sub_sample = [0] * self.N
        for chosen in indices[:subsample_size]:
            sub_sample[chosen] = 1
        self.bigNumber += 2

        return sub_sample

    def _update_function_when_removal(self, rule):
        """Subtract ``rule``'s decision from ``f`` for every training instance it classifies."""
        for i in range(self.N):
            if rule.classify_instance(self.instances[i]) is not None:
                self.f[i] -= rule.get_decision()

    def _update_function(self, decision):
        """Add ``decision`` to the score rows of all currently covered instances.

        An instance counts as covered when its entry in
        ``self.covered_instances`` is non-negative.
        NOTE(review): `_resample` only produces 0/1, so presumably the rule
        builder marks uncovered instances with a negative value in place --
        confirm against RuleBuilder.
        """
        covered = np.asarray(self.covered_instances) >= 0
        self.f[covered] += decision

    def _evaluate_f(self, instance):
        """Return the per-class score vector for one instance.

        Starts from the default rule and adds the output of every rule whose
        ``classify_instance`` does not return None for ``instance``.
        """
        # Float copy so that in-place accumulation never fails on an integer default rule.
        eval_f = np.array(self.default_rule, dtype=float)
        for rule in self.rules:
            if rule is None:
                continue
            current_values = rule.classify_instance(instance)
            if current_values is not None:
                eval_f += current_values
        return eval_f

    def _distribution_for_instance(self, instance):
        """Return the softmax class distribution for one instance."""
        eval_f = self._evaluate_f(instance)
        # Subtracting the maximum leaves the softmax unchanged but prevents overflow in exp().
        exp_scores = np.exp(eval_f - np.max(eval_f))
        return exp_scores / np.sum(exp_scores)

    def _compute_empirical_risk(self):
        """Return the weighted mean negative log-likelihood of the training labels under ``f``."""
        # Row-wise log-softmax computed in a numerically stable way.
        shifted = self.f - self.f.max(axis=1, keepdims=True)
        log_norm = np.log(np.exp(shifted).sum(axis=1))
        log_prob_true = shifted[np.arange(self.N), self._y_indices] - log_norm
        return float(-np.sum(self.weights * log_prob_true) / self.N)

    def fit(self, X, y, weights=None):
        """Build the default rule and up to ``n_rules`` further rules.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Class labels.
        weights : array-like of shape (n_samples,), optional
            Per-instance weights; every instance gets weight 1.0 when omitted.

        Returns
        -------
        self : MLRules
            The fitted estimator.

        Raises
        ------
        ValueError
            If ``weights`` does not contain exactly one value per sample.
        """
        import warnings

        X, y = check_X_y(X, y)
        # Columns of `f` are indexed by class position, so work with encoded labels.
        classes, y_indices = np.unique(y, return_inverse=True)

        # Start every fit from the same state so that refitting is reproducible.
        self.model_built = False
        self.bigNumber = self._INITIAL_BIG_NUMBER

        self.instances = X
        # NOTE(review): X holds features only, so the "- 1" looks like a leftover from a
        # representation that stored the class as an extra column -- confirm before relying on D.
        self.D = X.shape[1] - 1
        self.N = X.shape[0]
        self.K = len(classes)
        self._y_indices = y_indices

        if weights is None:
            self.weights = np.ones(self.N, dtype=float)
        else:
            weights = np.asarray(weights, dtype=float)
            if weights.shape != (self.N,):
                raise ValueError(
                    f"weights must have shape ({self.N},), got {weights.shape}"
                )
            self.weights = weights

        self.f = np.zeros((self.N, self.K))
        self.covered_instances = np.ones(self.N, dtype=np.short)

        # The global seed is kept in case RuleBuilder draws from np.random -- TODO confirm.
        np.random.seed(self.random_state)
        self.main_random_generator = np.random.default_rng(self.random_state)

        # Initialize rule builder and other parameters
        self.rule_builder = RuleBuilder(
            shrinkage=self.shrinkage,
            use_line_search=self.use_line_search,
            use_gradient=self.minimization_technique == 0,
            pre_chosen_k=self.pre_chosen_k,
            R=5.0,
            Rp=1e-5,
        )
        self.rule_builder.initialize(X, y_indices)

        self.rules = []

        if self.use_line_search:
            self.default_rule = self.rule_builder.create_default_rule_line_search()
        else:
            self.default_rule = self.rule_builder.create_default_rule_no_line_search(self.f, self.covered_instances)

        self._update_function(self.default_rule)

        consecutive_failures = 0
        while len(self.rules) < self.n_rules:
            if self.resample:
                self.covered_instances = self._resample(self.subsample_size)
            else:
                # Offer every instance to the builder again.
                self.covered_instances = np.ones(self.N, dtype=np.short)

            rule = self.rule_builder.create_rule(self.f, self.covered_instances)

            if rule is None:
                # Retry with the next subsample, but never loop forever.
                consecutive_failures += 1
                if consecutive_failures >= self._MAX_CONSECUTIVE_FAILED_RULES:
                    warnings.warn(
                        f"Stopped after {len(self.rules)} of {self.n_rules} rules: "
                        "the rule builder repeatedly failed to create a rule."
                    )
                    break
                continue

            consecutive_failures = 0
            self.rules.append(rule)
            self._update_function(rule.get_decision())

        # Trailing-underscore attributes are what check_is_fitted() looks for;
        # set them only after training has succeeded.
        self.classes_ = classes
        self.n_features_in_ = X.shape[1]
        self.model_built = True
        return self

    def predict_proba(self, X):
        """Return class probabilities for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            Column ``k`` holds the probability of ``classes_[k]``.
        """
        check_is_fitted(self)

        X = check_array(X)
        probabilities = np.zeros((X.shape[0], self.K))

        for i, instance in enumerate(X):
            probabilities[i] = self._distribution_for_instance(instance)

        return probabilities

    def predict(self, X):
        """Return the most probable class label for each row of ``X``."""
        check_is_fitted(self)

        probabilities = self.predict_proba(X)
        # Map column positions back to the labels seen during fit.
        return self.classes_[np.argmax(probabilities, axis=1)]




In [4]:
from helpers import from_csv, X_y_split
from sklearn.preprocessing import LabelEncoder
import os
# Load the dataset; the CSV is expected in the notebook's working directory.
data = from_csv('apple_quality.csv')

# Separate the feature columns from the 'Quality' target column.
X_train, y_train = X_y_split(data, 'Quality')

# Encode the class labels as integers.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

# Swap the encoded labels: 0 becomes 1 and every other value becomes 0.
y_train = [1 if x == 0 else 0 for x in y_train]

# Usage example:
ml_rules_classifier = MLRules(n_rules=100, shrinkage=0.5, resample=True, subsample_size=0.5, minimization_technique=0, use_line_search=False, pre_chosen_k=True, random_state=42)
ml_rules_classifier.fit(X_train, y_train)

probability:
[[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0

RuntimeError: No active exception to reraise

In [None]:
# Show a D x N zero matrix next to the equivalent nested-list construction.
D, N = 3, 5
(np.zeros(shape=(D, N)), [[0 for _ in range(N)] for _ in range(D)])

(array([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]),
 [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])