In [67]:
import copy
import numpy as np
import pandas as pd
from sklearn import preprocessing

class __dict(dict):
    def __missing__(self,key):
        return key

In [68]:
#dataset = pd.read_csv("reduzida-alon-pn-freq-2.CSV", sep=',')
dataset = pd.read_csv("dataset_abalone.csv", sep=',')
dataset.columns = dataset.columns.str.replace('"',"")
dataset.columns = dataset.columns.str.replace(' ',"")


label_encoder = preprocessing.LabelEncoder()
converted = dataset.apply(lambda x: label_encoder.fit_transform(x))
#converted.head()

In [69]:
def construct_features(df, target='last', value=None):
    '''
    This function constructs features according to the rules specified in Gamberger and Lavrac (2002).
    Each feature is an indicator function that establishes constraints on the values
    of individuals in the dataset. Features are constructed in the following way:
    
    * For discrete attributes, features are A_i=v_ip and A_i!=v_in. The values v_ip and v_in como
    from the domain of attribute A_i considering only positive and negative samples respectively.
    
    * For continuous attributes, features are:
        1 - A_i <= (v_ip+v_in)/2, for each pair of consecutive values v_ip and v_in (in this order), 
        conditioned that v_ip comes from the domain of A_i in positive samples and v_in 
        from the domain of A_i in negative samples.
        
        2 - A_i > (v_in+v_ip)/2, for each pair of consecutive values v_in and v_ip (in this order),
        and v_in and v_ip are values from domain of A_i as described above.
    
    * For integer attributes, features are generated considering the rules for discrete and continuous
    attributes.    
    '''
    
    _target = __dict({'last': df.columns.values.tolist()[-1], 
                           'first':df.columns.values.tolist()[0]})
    target = _target[target]
    if value is None: value = df[target].unique()[0]
    pos = df[target]==value    
    
    tmp = df.drop(target,axis=1)
    numeric_cols = tmp.select_dtypes(include=['floating','integer']).columns.values
    categorical_cols =  tmp.select_dtypes(include=['object','category','integer']).columns.values
    del tmp
    
    # column, value, op
#     for col in categorical_cols:
#         for value in df.loc[~pos,col].unique():
#             print(Feature(col,value,'ne'))
    
    features = set()
        
    features.update([Feature(col,value,'eq') for col in categorical_cols 
                     for value in df.loc[pos,col].unique()])
    
    features.update([Feature(col,value,'ne') for col in categorical_cols
                     for value in df.loc[~pos,col].unique()])
    
    features
    
    def chooseOP(e):
        if e: return 'le'
        return 'gt'
    chooseOP = np.vectorize(chooseOP)
    
    for col in numeric_cols:
        tmp = df.sort_values(by=col)
        pos = tmp[target]==value
        indices = np.where(pos != np.roll(pos,-1))[0]        
        if indices[0]==0: indices = indices[1:]
        if indices[-1]==df.shape[0]: indices = indices[:-2]         
        features.update([Feature(col,value,op) for value,op in zip((tmp[col].values[indices] + tmp[col].values[indices+1])/2,
                                                                   chooseOP(pos[indices]))])

    return features



# ---------------------------------------------------------------------


from numpy import greater, less_equal, equal, not_equal

class Feature:   
    __operators = {'gt':greater,
                   'le':less_equal,
                   'eq': equal,
                   'ne': not_equal}
    
    __symbols = {'gt':'>',
                   'le':'\u2264',
                   'eq': '=',
                   'ne': "\u2260"}

    def __init__(self, column, value, op):

        self.__col = column
        self.__value = value
        self.__func = Feature.__operators[op]
        self.__sym = Feature.__symbols[op]
    
    def __call__(self, data):
        return self.__func(data[self.__col],self.__value)
    
    def __str__(self):
        return '{} {} {}'.format(self.__col,self.__sym,self.__value)
    
    def __repr__(self):
        return self.__str__()
    
    def __eq__(self,other):
        if not isinstance(other, type(self)): return False
        return self.__col==other.__col and self.__value==other.__value and self.__sym == other.__sym
    
    def __hash__(self):
        return hash((self.__col,self.__value,self.__sym))

# -----------------------------------------------------------

from functools import reduce
import numpy as np

class Rule:


    def __init__(self, ft=set(), target=None):
        '''
        Constructor
        '''
        self.__feat = ft
        self.__target = target
        self.__objs = None
    
    def __call__(self, data):
        print("call")
        self.__objs = reduce(lambda x,y: np.logical_and(x,y), map(lambda x: x(data),self.__feat))
        return self.__objs
    
    def __add__(self, elem):
        r = Rule(self.feat.copy(),self.target)
        if isinstance(elem, Feature):
            r.feat.update([elem])
        elif isinstance(elem, Rule):
            assert(self.target==elem.target)
            r.feat.update(elem.feat)
        else:
            raise ValueError("Invalid type of param elem: {}\n Should either be Feature or Rule.".format(type(elem)))
        
        return r
    
    def __iadd__(self, elem):
        if isinstance(elem, Feature):
            self.feat.update([elem])
        elif isinstance(elem, Rule):
            assert(self.target==elem.target)
            self.feat.update(elem.feat)
        else:
            raise ValueError("Invalid type of param elem: {}\n Should either be Feature or Rule.".format(type(elem)))
        return self
    
    def __str__(self):
        return " \u2227 ".join(map(str,self.feat)) + " \u2192 " + str(self.target)
    
    def __repr__(self):
        return str(self)    

    def _get_feat(self):
        return self.__feat

    def _get_target(self):
        return self.__target

    def _get_objs(self):
        return self.__objs

    def _set_feat(self, value):
        self.__feat = set(value)

    def _set_target(self, value):
        self.__target = value

    feat = property(_get_feat, _set_feat, None, "The antecedent of this rule.")
    target = property(_get_target, _set_target, None, "The consequent of this rule.")
    objs = property(_get_objs, None, None, "The set of examples that satisfy the rule")  

In [75]:
def _map_target_column(columns, target):
    mapping = __dict({'last': columns[-1], 'first':columns[0]})
    return mapping[target]

def wracc(df, rule, target='last'): 
    target = _map_target_column(df.columns.values.tolist(), target)
    examples = rule(df)
    #print(rule(df))
    positive = df[target] == rule.target
    N = df.shape[0]
    probClass = np.sum(positive)/N
    probCond = np.sum(examples)/N
    accuracy = np.sum(positive & examples)/N
    return probCond * (accuracy - probClass)

ft = list(construct_features(dataset, 'rings', 10))
    
r = Rule()
r.feat = [ft[0],ft[10]]
r.target = 10
#print(r(dataset))
#print(r)
print(wracc(converted,  r))

call
-0.0


In [71]:
ft

[shell ≤ 0.0885,
 diameter > 0.37,
 diameter > 0.4325,
 diameter > 0.495,
 shucked > 0.0725,
 viscera > 0.1615,
 whole > 1.7407499999999998,
 shell ≤ 0.085,
 shucked > 0.12875,
 height ≤ 0.18,
 shell ≤ 0.125,
 shell ≤ 0.25,
 shell ≤ 0.4375,
 shucked > 0.59275,
 shucked > 0.65525,
 viscera > 0.52425,
 shell > 0.136,
 shell > 0.16725,
 shell > 0.22975,
 shell > 0.35324999999999995,
 viscera > 0.20199999999999999,
 shell ≤ 0.2765,
 shell ≤ 0.307,
 viscera > 0.1755,
 viscera > 0.27,
 viscera > 0.27075000000000005,
 viscera > 0.29974999999999996,
 viscera ≤ 0.196,
 viscera > 0.32475,
 viscera > 0.356,
 viscera ≤ 0.38675000000000004,
 shucked > 0.4465,
 whole > 0.2715,
 shucked > 0.47775,
 height > 0.125,
 sex ≠ I,
 shucked > 0.25325,
 shucked > 0.2845,
 shucked > 0.3165,
 shucked > 0.34775,
 shucked > 0.41100000000000003,
 shucked > 0.4415,
 shucked > 0.50325,
 shucked > 0.50475,
 shucked > 0.5329999999999999,
 shell > 0.147,
 diameter ≤ 0.395,
 diameter ≤ 0.52,
 diameter ≤ 0.27,
 shell ≤ 0

In [63]:
ft

[shell ≤ 0.0885,
 diameter > 0.37,
 diameter > 0.4325,
 diameter > 0.495,
 shucked > 0.0725,
 viscera > 0.1615,
 whole > 1.7407499999999998,
 shell ≤ 0.085,
 shucked > 0.12875,
 height ≤ 0.18,
 shell ≤ 0.125,
 shell ≤ 0.25,
 shell ≤ 0.4375,
 shucked > 0.59275,
 shucked > 0.65525,
 viscera > 0.52425,
 shell > 0.136,
 shell > 0.16725,
 shell > 0.22975,
 shell > 0.35324999999999995,
 viscera > 0.20199999999999999,
 shell ≤ 0.2765,
 shell ≤ 0.307,
 viscera > 0.1755,
 viscera > 0.27,
 viscera > 0.27075000000000005,
 viscera > 0.29974999999999996,
 viscera ≤ 0.196,
 viscera > 0.32475,
 viscera > 0.356,
 viscera ≤ 0.38675000000000004,
 shucked > 0.4465,
 whole > 0.2715,
 shucked > 0.47775,
 height > 0.125,
 sex ≠ I,
 shucked > 0.25325,
 shucked > 0.2845,
 shucked > 0.3165,
 shucked > 0.34775,
 shucked > 0.41100000000000003,
 shucked > 0.4415,
 shucked > 0.50325,
 shucked > 0.50475,
 shucked > 0.5329999999999999,
 shell > 0.147,
 diameter ≤ 0.395,
 diameter ≤ 0.52,
 diameter ≤ 0.27,
 shell ≤ 0

In [9]:
class SSDP():
    def __init__(self, dataset_name, k, metric="wracc", sep=','):
        self.dataset_name = dataset_name
        self.set = sep
        self.k = k
        self.metric = metric
        self.min_val = -10
        self.max_val = 10
        self.n_variables = 10000
        self.max_iterations = 100
        self.population_size = 50
        self.crossover_gamma = 0.1
        self.mutation_mu = 0.1
        self.mutation_sigma = 0.1
        self.number_of_childrens = int(np.round(self.population_size / 2) * 2)

        # Execute
        self.read_data(dataset_name, sep)
        self.initialize_population()
        self.main_loop()

In [4]:
    def read_data(self, dataset_name, sep):
        dataset = pd.read_csv(self.dataset_name, sep=sep)        

SyntaxError: unexpected EOF while parsing (<ipython-input-4-f2acc8daefd5>, line 1)

In [5]:
    def initialize_population(self):
            # Empty individual model
            self.empty_individual = dict()
            self.empty_individual['attrs'] = None

            if self.metric == 'wracc':
                self.empty_individual['cost'] = -np.inf
            else:
                self.empty_individual['cost'] = np.inf

            # K Bests individuals
            self.population_k = [copy.deepcopy(self.empty_individual) for _ in range(self.k)]

            # Make the population size
            self.population = [copy.deepcopy(self.empty_individual) for _ in range(self.population_size)]

            # Fill values on population
            for idx_individual in range(self.population_size):
                self.population[idx_individual]['attrs'] = np.random.uniform(self.min_val,
                                                                        self.max_val,
                                                                        self.n_variables)
                self.population[idx_individual]['cost'] = self.calc_cost(self.population[idx_individual]['attrs'])

                # Fill values on k bests population
                for idx_best in range(self.k):
                    if self.if_metric(self.population[idx_individual]['cost'] , self.population_k[idx_best]['cost']):
                        self.population_k[idx_best] = self.population[idx_individual]
                        break

In [None]:
    def main_loop(self):

        for _ in range(3):
            #
            for _ in range(self.number_of_childrens // 2):
                # Permutes a list, randomly. Example: (1,2,3,4) -> (2,1,3,4)
                q = np.random.permutation(self.population_size)
                # Select the firsts of the permutation choose.
                p1 = self.population[q[0]]
                p2 = self.population[q[1]]


                # Perform crossover
                c1, c2 = self.crossover(p1, p2) # under construction

                # Perform mutation
                m1 = self.mutate(p1, self.mutation_mu, self.mutation_sigma)
                m2 = self.mutate(p2, self.mutation_mu, self.mutation_sigma)

                # Apply bounds
                self.apply_bound(c1, self.min_val, self.max_val)
                self.apply_bound(c2, self.min_val, self.max_val)
                self.apply_bound(m1, self.min_val, self.max_val)
                self.apply_bound(m2, self.min_val, self.max_val)

                # Evaluate first offspring
                c1['cost'] = self.calc_cost(c1['attrs'])
                c2['cost'] = self.calc_cost(c2['attrs'])
                m1['cost'] = self.calc_cost(m1['attrs'])
                m2['cost'] = self.calc_cost(m2['attrs'])

                # Get the best 2 values
                p1, p2 = self.compare_news(p1, p2, c1, c2, m1, m2)

                # Update bests population
                for idx_best in range(self.k):
                    if self.if_metric(p1['cost'] , self.population_k[idx_best]['cost']):
                        self.population_k[idx_best] = p1
                        break
                for idx_best in range(self.k):
                    if self.if_metric(p2['cost'] , self.population_k[idx_best]['cost']):
                        self.population_k[idx_best] = p2
                        break

In [None]:
    def mutate(self, x, mu, sigma):
        y = copy.deepcopy(x)
        flags_off_changes = np.random.rand(*x['attrs'].shape) <= mu
        indexes_to_change = np.argwhere(flags_off_changes)
        y['attrs'][indexes_to_change] += sigma * np.random.randn(*indexes_to_change.shape)
        return y

In [None]:
    def apply_bound(self, x, varmin, varmax):
            x['attrs'] = np.maximum(x['attrs'], varmin)
            x['attrs'] = np.minimum(x['attrs'], varmax)

In [None]:
    def compare_news(self, p1, p2, c1, c2, m1, m2):
        all_itens = [p1,p2,c1,c2,m1,m2]
        if self.metric == 'wracc': # > first
            all_itens = sorted(all, key = lambda x: x['cost'] , reverse=True)
        else:                      # < first
            all_itens = sorted(all, key = lambda x: x['cost'] , reverse=False)
        return all_itens[0], all_itens[1]

In [None]:
    def calc_cost(self, individual_attr):
        return sum(individual_attr)
    
    def _map_target_column(self, columns, target):
        mapping = __dict({'last': columns[-1], 'first':columns[0]})
        return mapping[target]
    
    def wracc(self, df, rule, target='last'):       
        '''
        Parameters
        ----------
        df : pandas.DataFrame
            The data where the rule shall be evaluated.
        
        target : string 
                The column of df that contains the target (class) attribute, or either
                'last' for the last column (default) or 'first' for the first.
        
        
        rule : Rule-object 
                The measure is computed taking rule.target as positive
                and the rest as negative examples.
                
        Returns
        -------
        score : float
                The non-normalized weighted relative accuracy of the rule.
                Values vary from -0.25 to 0.25. The larger the value, the more
                significant the rule is, zero means uninteresting. 
        '''
        target = self._map_target_column(df.columns.values.tolist(), target)
        examples = rule(df)
        positive = df[target] == rule.target
        N = df.shape[0]
        probClass = np.sum(positive)/N
        probCond = np.sum(examples)/N
        accuracy = np.sum(positive & examples)/N
        return probCond * (accuracy - probClass)

In [6]:


if __name__ == "__main__": 
    ssdp = SSDP(2)
    print(ssdp.population_k)

TypeError: __init__() missing 1 required positional argument: 'k'