In [3]:
import numpy as np
import random
import gdal 
import os
from copy import copy
from joblib import Parallel, delayed,parallel_backend

In [17]:
class OCAT():
    """Rule learner producing, per class value, a conjunction of disjunctive clauses.

    NOTE(review): the class name suggests the "One Clause At a Time" approach;
    confirm against the reference the author followed.

    Each learned clause is a set of threshold tests ``column >= v`` / ``column < v``.
    `displayRule` joins the tests of a clause with ``or`` and the clauses of a
    class with ``and``.

    The feature vector of a row is taken as ``row[:-1]``, so the target column
    is assumed to be the last column of the data.

    Attributes (all filled by `preprocessing`, which the constructor runs):
        domain:    sorted observed values per raw column, plus the class values
                   under the key `target`.
        bdomains:  ``{binary column index: [0, 1]}``.
        conflicts: feature vectors seen under more than one class.
        indexes:   per class value, the positive and negative example ids.
        auxiliar:  per class value, examples grouped by binary column and bit.
        terms:     candidate terms ``(binary column index, bit)``.
        rules:     result of the last `train` call.
    """

    def __init__(self,data,targetColumn):
        self.data = data
        self.target = targetColumn
        self.domain = {}
        self.bdomains = {}
        self.conflicts = {}
        self.indexes = {}
        self.auxiliar = {}
        self.rules = []
        self.terms = []
        self.preprocessing()

    def checkConflict(self,problem,cls,key):
        """Return `cls` if `key` is already stored under ``problem[cls]``, else None."""
        if key in problem[cls]:
            return cls
        return None

    def getLearningProblem(self,data,target):
        """Group the rows of `data` by class and count duplicated feature vectors.

        Returns:
            (problem, conflicts) where ``problem[cls][key] = [values, count]``
            (``key`` is ``str(values)``) and ``conflicts[key]`` lists the classes
            that already held `key` when a row of another class carrying the
            same features was met. Such a later row is not stored; the entry of
            the class seen first is kept.
        """
        problem = {}
        conflicts = {}
        for row in data:
            cls = row[target]
            values = row[:-1]  # features: everything but the last column
            problem.setdefault(cls, {})
            key = str(values)
            conflictedKeys = [c for c in problem if c != cls and key in problem[c]]
            if conflictedKeys:
                conflicts[key] = conflictedKeys
            else:
                entry = problem[cls].setdefault(key, [values,0])
                entry[1] += 1
        return problem,conflicts

    def dictToMat(self,dic):
        """Flatten a two-level dict into a list of rows, appending the outer key to each row."""
        matrix = []
        for key in dic:
            for key2 in dic[key]:
                temp = copy(dic[key][key2])
                if isinstance(temp, np.ndarray):
                    temp = np.append(temp,key)
                else:
                    temp.append(key)
                matrix.append(temp)
        return matrix

    def getDomain(self,data,target):
        """Collect the observed values of each attribute in the learning sub-problems.

        `data` rows have the form ``[values, count, cls]``. The class values are
        stored under the key `target`. Every value list is sorted ascending.
        """
        domain = {}
        for row in data:
            for j,col in enumerate(row[0]):
                domain.setdefault(j, set()).add(col)
        domain[target] = set()
        for row in data:
            domain[target].add(row[2])
        for key in domain:
            domain[key] = sorted(domain[key])
        return domain

    def getBinaryDomain(self,binarized):
        """Return ``{binary column index: [0, 1]}`` for every column of the binarized rows."""
        if len(binarized) == 0:
            # No examples: there are no binary columns to describe.
            return {}
        return {col: [0,1] for col in range(len(binarized[0][0]))}

    def binarize(self,data,domain):
        """Binarize the learning sub-problems.

        Every raw column ``j`` expands into one bit per value ``v`` of
        ``domain[j]``; the bit is 1 when the raw value is ``>= v``.
        The count and the class of each row are carried over unchanged.
        """
        binarizedMatrix = []
        for row in data:
            bits = [1 if col >= val else 0
                    for j,col in enumerate(row[0])
                    for val in domain[j]]
            binarizedMatrix.append([bits,row[1],row[2]])
        return binarizedMatrix

    def getPosNeg(self,domain,binarized,indexes,target):
        """Group the positive and negative examples of every class by binary column and bit.

        Returns:
            ``result[cls][side][column][bit]`` = set of ``(row index, count)``
            with ``side`` in ``("pos", "neg")``.
        """
        # Read the class values through domain[target], as getIndexes does.
        pos_neg = {val: {"pos":{},"neg":{}} for val in domain[target]}
        for val in indexes:
            for side in ("pos","neg"):
                bucket = pos_neg[val][side]
                for t in indexes[val][side]:
                    i = t[0]
                    row = binarized[i]
                    for j,col in enumerate(row[0]):
                        bucket.setdefault(j, {}).setdefault(col, set()).add((i,row[1]))
        return pos_neg

    def getIndexes(self,data,domain,target):
        """Build, for every class value, the sets of positive and negative examples.

        Examples are identified by ``(row index, count)``. The negatives of a
        class are the positives of all the other classes.
        """
        print("dindex")
        indexes = {val: {"pos":set(),"neg":set()} for val in domain[target]}
        print("pos index")
        for i,row in enumerate(data):
            indexes[row[2]]["pos"].add((i,row[1]))
        print("neg index")
        for key in indexes:
            others = [indexes[key2]["pos"] for key2 in indexes if key != key2]
            indexes[key]["neg"] = set().union(*others)
        return indexes

    def getTerms(self,domain,target):
        """Return the candidate terms ``(key, 0)`` and ``(key, 1)`` for every key of `domain` except `target`."""
        terms = []
        for key in domain:
            if key != target:
                terms.append((key,0))
                terms.append((key,1))
        return terms

    def preprocessing(self):
        """Run the whole preparation pipeline and store its results on the instance."""
        print("problem")
        problem,self.conflicts = self.getLearningProblem(self.data,self.target)
        print("matrix")
        matrix = self.dictToMat(problem)
        print("domain")
        self.domain = self.getDomain(matrix,self.target)
        print("binarized")
        binarized = self.binarize(matrix,self.domain)
        print("bdomains")
        self.bdomains = self.getBinaryDomain(binarized)
        print("indexes")
        self.indexes = self.getIndexes(binarized,self.domain,self.target)
        print("auxiliar")
        self.auxiliar = self.getPosNeg(self.domain,binarized,self.indexes,self.target)
        print("terms")
        # The binarized domain holds feature bits only, so nothing must be
        # excluded. Passing the raw target index here used to drop the binary
        # column that happened to share that index.
        self.terms = self.getTerms(self.bdomains,None)

    def getFitnessValue(self,term,auxiliar,removedPos,removedNeg):
        """Score `term` as (weight of remaining positives) / (weight of remaining negatives) matching it.

        Args:
            term: ``(binary column index, bit)``.
            auxiliar: ``{"pos": ..., "neg": ...}`` as built by `getPosNeg` for one class.
            removedPos / removedNeg: examples to ignore.

        Returns:
            ``[term, fitness]``. The fitness is None when the column is unknown
            or no positive example ever matches the term.
        """
        column, value = term[0], term[1]
        positives = auxiliar["pos"]
        negatives = auxiliar["neg"]
        if column not in positives or column not in negatives:
            return [term,None]
        if value not in positives[column]:
            return [term,None]
        remainingPos = positives[column][value].difference(removedPos)
        if len(remainingPos) > 0:
            posEj = sum(weight for _, weight in remainingPos)
        else:
            # Keep the term selectable, but with a negligible weight.
            posEj = 0.000000001
        remainingNeg = negatives[column].get(value,set()).difference(removedNeg)
        negEj = sum(weight for _, weight in remainingNeg)
        if negEj == 0:
            # No remaining negative matches the term. This now also covers a
            # term absent from every negative example, which used to receive a
            # near-zero fitness although it is the same situation.
            return [term,2*posEj]
        return [term,posEj/negEj]

    def getProbabilities(self,termsFitness):
        """Return ``[index, cumulative share of the total fitness]`` for every entry of `termsFitness`."""
        total = 0
        for term in termsFitness:
            total += term[1]
        probabilities = []
        cur = 0
        for i,term in enumerate(termsFitness):
            cur += term[1]/total
            probabilities.append([i, cur])
        return probabilities

    def _rouletteSelect(self,probabilities,rand):
        """Return the first index whose cumulative probability reaches `rand`.

        Falls back to the last index when rounding leaves the final cumulative
        value below `rand`, and to 0 when `probabilities` is empty.
        """
        selected = 0
        for index, cumulative in probabilities:
            selected = index
            if rand <= cumulative:
                break
        return selected

    def createClause(self,auxiliar,element):
        """Build the clause made of the negation of every bit of the negative example `element`."""
        clausule = set()
        for j in auxiliar["neg"]:
            for col in auxiliar["neg"][j]:
                if element in auxiliar["neg"][j][col]:
                    clausule.add((j,1 if col == 0 else 0))
        return clausule

    def _buildClause(self,pos,auxiliar,terms,removedNeg,nJobs):
        """Randomly grow one clause until it matches every positive example.

        Returns None when the candidate terms run out before all the positives
        are matched.
        """
        removedPos = set()
        candidates = list(terms)
        clausule = set()
        while len(pos.difference(removedPos)) > 0:
            scored = Parallel(n_jobs=nJobs)(delayed(self.getFitnessValue)(term,auxiliar,removedPos,removedNeg) for term in candidates)
            scored = [entry for entry in scored if entry[1] is not None]
            if not scored:
                # Nothing left to pick: report a failed attempt instead of
                # raising IndexError on the empty ranking.
                return None
            scored.sort(key=lambda entry: entry[1],reverse=True)
            # Only the better half of the ranking takes part in the draw.
            probabilities = self.getProbabilities(scored[:len(scored)//2])
            selected = self._rouletteSelect(probabilities,random.random())
            term = scored[selected][0]
            clausule.add(term)
            # A binary column is used at most once per clause.
            candidates = [other for other in candidates if other[0] != term[0]]
            if term[1] in auxiliar["pos"][term[0]]:
                removedPos = removedPos.union(auxiliar["pos"][term[0]][term[1]])
        return clausule

    def obtainRules(self,pos,neg,auxiliar,terms,maxAttempts=1000,nJobs=8):
        """Learn clauses until every negative example is rejected by at least one of them.

        Args:
            pos, neg: sets of ``(row index, count)`` for one class.
            auxiliar: ``{"pos": ..., "neg": ...}`` as built by `getPosNeg` for that class.
            terms: candidate terms ``(binary column index, bit)``.
            maxAttempts: consecutive unproductive clauses tolerated before one
                remaining negative is rejected with a clause built by `createClause`.
            nJobs: ``n_jobs`` handed to joblib when scoring the terms.

        Returns:
            list of clauses, each a set of terms.
        """
        removedNeg = set()
        clausules = []
        count = 0
        backend = 'threading'
        with parallel_backend(backend):
            while len(neg.difference(removedNeg)) > 0:
                if count % 10 == 0:
                    print(len(neg.difference(removedNeg)))
                clausule = self._buildClause(pos,auxiliar,terms,removedNeg,nJobs)
                if clausule is None:
                    rejected = set()
                else:
                    # A negative is rejected when it fails every term, i.e. it
                    # carries the opposite bit on each column of the clause.
                    rejected = set().union(neg)
                    for item in clausule:
                        val = 1 if item[1] == 0 else 0
                        rejected = rejected.intersection(auxiliar["neg"][item[0]].get(val,set()))
                if len(rejected.difference(removedNeg)) > 0:
                    removedNeg = removedNeg.union(rejected)
                    clausules.append(clausule)
                    count = 0
                else:
                    count += 1
                    if count % 100 == 0:
                        print(f"difficult element {count}/{maxAttempts} to delete it")
                    if count >= maxAttempts:
                        count = 0
                        problematicElements = neg.difference(removedNeg)
                        element = random.choice(list(problematicElements))
                        clausules.append(self.createClause(auxiliar,element))
                        removedNeg.add(element)
                        print(element,neg.difference(removedNeg))
        return clausules

    def train(self,maxAttempts=1000,nJobs=8):
        """Learn the rules of every class value and return them expressed on the raw columns.

        `maxAttempts` and `nJobs` are forwarded to `obtainRules`.
        """
        rules = []
        for val in self.indexes:
            print(f"training for value {val}")
            rules.append([val,self.obtainRules(self.indexes[val]["pos"],self.indexes[val]["neg"],self.auxiliar[val],self.terms,maxAttempts,nJobs)])
        self.rules = rules
        self.rules = self.unbinarize(rules)
        return self.rules

    def unbinarize(self,rules):
        """Translate terms on binary columns into ``(raw column, threshold, operator)`` triples.

        Bit 1 becomes ``">="`` and bit 0 becomes ``"<"``.
        """
        # Position k holds the raw column and threshold behind binary column k,
        # following the order in which `binarize` emitted the bits.
        columnMap = []
        for key in self.domain:
            for threshold in self.domain[key]:
                columnMap.append((key,threshold))
        newRules = []
        for row in rules:
            newRow = []
            for col in row[1]:
                clause = set()
                for term in col:
                    oCol, val = columnMap[term[0]]
                    op = ">=" if term[1] == 1 else "<"
                    clause.add((oCol,val,op))
                newRow.append(clause)
            newRules.append([row[0],newRow])
        return newRules

    def displayRule(self,labels):
        """Render the learned rules as the source of a Python function named ``evaluate``.

        Args:
            labels: argument names, indexed by raw column.

        Returns:
            source text with one ``if`` per class; each condition is the
            ``and`` of its clauses and each clause the ``or`` of its terms.
        """
        fun = f"def evaluate({' , '.join(labels)}):" + "\n"
        rules = []
        for row in self.rules:
            clausules = []
            for clausule in row[1]:
                terms = []
                for term in clausule:
                    terms.append(f"{labels[term[0]]} {term[2]} {term[1]}")
                temp = " or ".join(terms)
                clausules.append(f"({temp})")
            # A class without clauses has nothing to reject; emit a valid
            # condition instead of the syntactically invalid "if :".
            temp = " and ".join(clausules) if clausules else "True"
            rules.append("\t"+f"if {temp}:"+"\n\t\t"+f"return {row[0]}")
        temp = '\n'.join(rules)
        fun = f"{fun}{temp}"
        return fun

In [27]:
# Toy dataset: four feature columns followed by the class label (1 or 2) in
# the last column. Two feature vectors, [0,2,0,0] and [1,0,1,0], appear under
# both labels.
data = np.array([[0,2,0,0,2],[0,1,0,1,2],[1,0,1,0,2],[1,0,0,1,2],
        [1,0,1,0,1],[0,0,0,1,1],[1,1,1,1,1],[0,0,0,0,1],[1,0,0,0,1],[1,1,1,0,1],[0,2,0,0,1]])

In [28]:
# Build the model on the toy data; column 4 holds the class label. The
# constructor runs the preprocessing, which prints one line per stage.
model = OCAT(data,4)

problem
matrix
domain
binarized
bdomains
indexes
dindex
pos index
neg index
auxiliar
terms


In [32]:
# Feature vectors found under more than one class, keyed by str(features);
# the value lists the classes that already held the vector.
model.conflicts

{'[1 0 1 0]': [2], '[0 2 0 0]': [2]}

In [30]:
# Scratch cell: a bare tuple literal, not read by any other visible cell.
1,0,1,1,1,0,1,0,0,2

(1, 0, 1, 1, 1, 0, 1, 0, 0, 2)

In [31]:
# Learn the rules for every class of the toy data.
# NOTE(review): the recorded output of this cell ends with KeyboardInterrupt,
# i.e. this run was stopped before train() returned.
model.train()

training for value 1
4
2
1
training for value 2
5
4
2
1
1
1
1
1
1
1
1
1
1
difficult element 100/1000 to delete it
1
1
1
1
1
1
1
1
1
1
difficult element 200/1000 to delete it
1
1
1
1
1
1
1
1
1
1
difficult element 300/1000 to delete it
1
1
1
1
1
1
1
1
1
1
difficult element 400/1000 to delete it
1
1
1
1
1
1
1
1
1
1
difficult element 500/1000 to delete it
1
1
1
1
1
1
1
1
1
1
difficult element 600/1000 to delete it
1
1
1
1
1
1
1
1
1
1


KeyboardInterrupt: 

In [437]:
# Execute the generated source so that `evaluate` becomes defined in the
# notebook namespace.
# NOTE(review): exec() always returns None, so `fun` is None here.
fun = exec(model.displayRule(["x1","x2","x3","x4"]))

In [439]:
# Classify row 3 of the toy data from its features (label column dropped).
# Relies on `evaluate` having been created by the exec() call on the
# generated rule source.
evaluate(*data[3,:-1])

2

In [10]:
def readraster(file):
    """Open a raster file with GDAL and read its first band.

    Args:
        file: path of the raster to open.

    Returns:
        tuple ``(dataSource, band)``: the object returned by ``gdal.Open`` and
        the result of ``ReadAsArray()`` on raster band 1.

    Raises:
        RuntimeError: if GDAL could not open `file`.
    """
    dataSource = gdal.Open(file)
    if dataSource is None:
        # Without gdal.UseExceptions(), a failed open yields None; fail here
        # with the file name rather than with an AttributeError further down.
        raise RuntimeError(f"GDAL could not open raster file: {file}")
    band = dataSource.GetRasterBand(1).ReadAsArray()
    return (dataSource, band)

In [33]:
# Two groups of raster layers; the file names carry different years
# (1991/1994 vs 1999/2001). Note that the layer order differs between groups:
# the "Actual_*" layer is last in the first group and first in the second.
stateFiles1 = ["cbddist.tif","roaddist.tif","dda_2021_government_restricted.tif","den1991.tif","slope.tif","Actual_1994.tif"]
stateFiles2 = ["Actual_1999.tif","cbddist.tif","roaddist.tif","dda_2021_government_restricted.tif","den2001.tif","slope.tif"]
stateFiles = [stateFiles1,stateFiles2]
states = []
# Folder holding the rasters, relative to the working directory.
path = "data"
# Read band 1 of every file; states[k] is the list of band arrays of group k.
for i,files in enumerate(stateFiles):
    state = []
    for file in files:
        # Only the band array is kept; `source` is not used afterwards here.
        source,band = readraster(f"{path}/{file}")
        state.append(band)
    states.append(state)

In [34]:
# Stack the layers of the first group only; the second group is not used.
state = np.array(states[0])
# Drop the reference to the list of band arrays.
states = None

In [35]:
# Reverse the axes so that the layer axis comes last, then flatten to one row
# per cell with one column per layer (the recorded shape is (3525570, 6)).
# Assumes every layer has the same 2-D shape — TODO confirm.
newState = state.transpose().reshape(-1,np.shape(state)[0])
# Drop the reference to the stacked array.
state = None

In [36]:
# Sanity check: (number of cells, number of layers).
newState.shape

(3525570, 6)

In [37]:
# Build the model on the raster table; column 5 (the "Actual_1994.tif" layer,
# last in stateFiles1) is the class label.
model2 = OCAT(newState,5)

problem
matrix
domain
binarized
bdomains
indexes
dindex
pos index
neg index
auxiliar
terms


In [38]:
# Learn the rules on the raster data.
# NOTE(review): the recorded output of this cell ends with KeyboardInterrupt,
# i.e. this run was stopped before train() returned.
model2.train()

training for value 0
6412
22
22
22
19
19
19
5
5
5
5
5
5
5
5
5
5
difficult element 100/1000 to delete it
5
5
5
5
5
5
5
5
5
5
difficult element 200/1000 to delete it
5
5
5
5
5
5
5
5
5
5
difficult element 300/1000 to delete it
5
5
5
5
5
5
5
5
5
5
difficult element 400/1000 to delete it
5
5
5
5
5
5
5
5
5
5
difficult element 500/1000 to delete it
5
5
5
5
5
5
5
5
5
5
difficult element 600/1000 to delete it
5
5
5
5
5
5
5
5
5
5
difficult element 700/1000 to delete it
5
5
5
5
5
5
5
5
5
5
difficult element 800/1000 to delete it
5
5
5
5
5
5
5
5
5
5
difficult element 900/1000 to delete it
5
5
5
5
5
5
5
5
5
5
difficult element 1000/1000 to delete it
(761, 1) {(601, 2), (542, 1), (600, 1), (599, 1)}
4
4
2
2
2
2
2
2
2
2
2
2
difficult element 100/1000 to delete it
2
2
2
2
2
2
2
2
2
2
difficult element 200/1000 to delete it
2
2
2
2
2
2
2
2
2
2
difficult element 300/1000 to delete it
2
2
2
2
2
2
2
2
2
2
difficult element 400/1000 to delete it
2
2
2
2
2
2
2
2
2
2
difficult element 500/1000 to delete it
2

KeyboardInterrupt: 

In [11]:
# NOTE(review): no module-level `getLearningProblem` is defined in the visible
# notebook, only the OCAT method of that name, which returns a
# (problem, conflicts) tuple. Presumably a leftover from before the class
# existed — confirm, then remove or port.
problem = getLearningProblem(newState,5)

In [12]:
# NOTE(review): no module-level `dictToMat` is defined in the visible
# notebook, only the OCAT method of that name. Presumably a leftover from
# before the class existed — confirm, then remove or port.
matrix = dictToMat(problem)

In [13]:
# NOTE(review): no module-level `getDomain` is defined in the visible
# notebook, only the OCAT method of that name. Presumably a leftover from
# before the class existed — confirm, then remove or port.
domain = getDomain(matrix,5)

In [28]:
# NOTE(review): no module-level `binarize` is defined in the visible
# notebook, only the OCAT method of that name. Presumably a leftover from
# before the class existed — confirm, then remove or port.
binarized = binarize(matrix,domain)

In [36]:
# NOTE(review): no module-level `getPosNeg` is defined in the visible
# notebook, and the OCAT method of that name takes
# (domain, binarized, indexes, target), not three arguments. Presumably a
# leftover from before the class existed — confirm, then remove or port.
posNeg = getPosNeg(domain,binarized,5)

In [453]:
# Scratch check of set.difference: elements of the first set that are missing
# from the second one.
set([1,2,3,4]).difference(set([1,2,4,8,9]))

{3}