In [1]:
import random
from collections import Counter
import numpy as np
import copy

In [2]:
def split_data(data, label=0, length=50):
    strings = [data[i:i+length] for i in range(0, len(data) - length, length)]
    random.shuffle(strings)
    strings = [(s, label) for s in strings]

    test = strings[:len(strings) * 10 // 100]
    training = strings[len(strings) * 10 // 100:]
    return test, training

In [3]:
def entropy(data):
    v = Counter([b for _, b in data]).values()
    v = list(v)
    d = np.array(v) / (sum(v))
    return - sum(d * np.log(d))

In [4]:
def split(train, feat):
    ## first compute the entropy
    Hx = entropy(train)
    if Hx < 0.000001:
        raise Exception("Entropy very low")
    L1 = []
    L2 = []
    for t in train:
        if feat in t[0]:
            L1 += [t]
        else:
            L2 += [t]

    E1 = entropy(L1)
    E2 = entropy(L2)
    L = float(len(train))

    H = Hx - E1 * len(L1)/L - E2 * len(L2)/L
    return H, L1, L2, feat

In [5]:
def build_jungle(train, features, levels=20, numfeatures=100):
    DAG = {0: copy.copy(train)}
    Candidate_sets = [0]
    next_ID = 0
    M = 20

    for level in range(levels):
        result_sets = []
        for tdata_idx in Candidate_sets:
            tdata = DAG[tdata_idx]

            if entropy(tdata) == 0.0:
                next_ID += 1
                idx1 = next_ID
                result_sets += [idx1]
                DAG[idx1] = tdata + []
                del DAG[tdata_idx][:]
                DAG[tdata_idx] += [True, idx1, idx1]
                continue

            X = (split(tdata, F) for F in random.sample(features, numfeatures))
            H, L1, L2, F = max(X)

            # Branch = (F, M1, M2)
            next_ID += 1
            idx1 = next_ID
            DAG[idx1] = L1
            next_ID += 1
            idx2 = next_ID
            DAG[idx2] = L2

            result_sets += [idx1, idx2]
            del DAG[tdata_idx][:]
            DAG[tdata_idx] += [F, idx1, idx2]

        ## Now optimize the result sets here
        random.shuffle(result_sets)

        basic = result_sets[:M]
        for r in result_sets[M:]:
            maxv = None
            maxi = None
            for b in basic:
                L = float(len(DAG[r] + DAG[b]))
                e1 = len(DAG[r]) * entropy(DAG[r])
                e2 = len(DAG[b]) * entropy(DAG[b])
                newe = L * entropy(DAG[r] + DAG[b])
                score = abs(e1 + e2 - newe)
                if maxv is None:
                    maxv = score
                    maxi = b
                    continue
                if score < maxv:
                    maxv = score
                    maxi = b
            DAG[maxi] += DAG[r]
            del DAG[r]
            DAG[r] = DAG[maxi]

        Candidate_sets = basic

    for tdata_idx in Candidate_sets:
        tdata = DAG[tdata_idx]
        C1 = Counter([b for _, b in tdata])
        del DAG[tdata_idx][:]
        DAG[tdata_idx] += [None, C1]

    return DAG

In [6]:
def classify_jungle(DAG, item):
    branch = DAG[0]
    while branch[0] is not None:
        try:
            fet, L1, L2 = branch
            if fet == True or fet in item:
                branch = DAG[L1]
            else:
                branch = DAG[L2]
        except:
            print (len(branch))
            raise
    return branch[1]

In [7]:
if __name__ == "__main__":
    # dataEN = file("../data/pg23428.txt").read()
    # dataFR = file("../data/pg5711.txt").read()

    dataEN = open("C:\\Users\hc\PycharmProjects\BA_dt\data\origin\pg23428.txt").read()
    dataFR = open("C:\\Users\hc\PycharmProjects\BA_dt\data\origin\pg5711.txt").read()

    
    length = 200

    testEN, trainEN = split_data(dataEN, label=0, length=length)
    testFR, trainFR = split_data(dataFR, label=1, length=length)

    print ("testData: EN=%s FR=%s" % (len(testEN), len(testFR)))
    print ("training: EN=%s FR=%s" % (len(trainEN), len(trainFR)))

testData: EN=225 FR=505
training: EN=2029 FR=4552


In [8]:
    print(testEN)

[('eories and all the so-called "laws" taught by economists.\n\nAnd yet it is certain that the day when any civilized association of\nindividuals would ask itself, _what are the needs of all, and the means\n', 0), ("he State.\n\nIt is a fact that little companies exist, in spite of the State's\npartiality. If in France, land of centralization, we only see five or\nsix large companies, there are more than a hundred an", 0), ('lais to\nConstantinople, without delays, without even changing carriages (when\nyou travel by express). More than that: a parcel deposited at a station\nwill find its addressee anywhere, in Turkey or in ', 0), ('nt.\n\nFOOTNOTE:\n\n[9] _Kropotkin: In Russian and French Prisons._ London, 1887.\n\n\n\n\nCHAPTER XIII\n\nTHE COLLECTIVIST WAGES SYSTEM\n\n\nI\n\nIn their plans for the reconstruction of society the collectivists\nco', 0), ('m for art,\nscience, and amusement?--in a word, for everything that is not comprised\nin the category of absolute necessities? If the

In [9]:
    print(trainEN)



In [10]:
    print(testFR)

[(" s'챕tranglait dans une\ntristesse immense.\n\n--Puis, des malins sont toujours l횪, pour vous promettre que 챌a peut\ns'arranger, si l'on s'en donne seulement la peine...  On se monte la\nt챗te, on souffre te", 1), (' se ruer sur Gaston-Marie.  Au loin, dans le\nsoleil clair, il voyait les beffrois de plusieurs fosses, Mirou sur la\ndroite, Madeleine et Cr챔vecoeur, c척te 횪 c척te.  Le travail grondait\npartout, les coup', 1), ("s deux fosses, il les 챕tudia, il\nd챕termina les points o첫 devaient porter les recherches.  Peu 횪 peu,\ncette chasse l'enflammait, il 챕tait, 횪 son tour, pris d'une fi챔vre de\nd챕vouement, malgr챕 son ironiq", 1), ("saient ensemble.\nMais cela les 챕gayait, d'챗tre ainsi, de songer 횪 de vieilles choses,\nqu'ils rem창chaient en commun, sans avoir besoin d'en causer.  A\nR챕quillart, ils s'asseyaient sur une poutre, c척te ", 1), ('t vers le\ncabaretier, le saisissait par les 챕paules, le secouait, en lui criant\nses r챕ponses dans la face.\n\n--Mais, tonnerre de Dieu! je veux b

In [11]:
    print(trainFR)

[("s,\nd챕tachant par instants les silhouettes d챕mesur챕es du p챔re Bonnemort et\nde son cheval jaune.  Et, au-del횪, dans la plaine rase, l'ombre avait\ntout submerg챕, Montsou, Marchiennes, la for챗t de Vandame", 1), ('oll챕e 횪 la porte, excit챕 par la d챕bauche de ses ma챤tres.\n\nAlors, M. Hennebeau ne bougea plus.  Il regardait toujours le lit.  Le\nlong pass챕 de souffrance se d챕roulait, son mariage avec cette femme,\nle', 1), ("th챕ories.  Une r챕pugnance l'en avait d챕tach챕 peu 횪 peu, le malaise de\nses go청ts affin챕s, la mont챕e lente de tout son 챗tre vers une classe\nsup챕rieure.\n\nA ce moment, la voix de Rasseneur se perdit au mi", 1), ("les camarades.  Jamais,\nd'ailleurs, il ne les avait dirig챕s, c'챕taient eux qui le menaient,\nqui l'obligeaient 횪 faire des choses qu'il n'aurait pas faites, sans\nle branle de cette cohue poussant derri", 1), ("ne clameur: 짬A bas\nles pantalons rouges!쨩 Ces hommes qui avaient 챕cout챕, impassibles,\nd'un visage immobile et muet, les appels 횪 la fraternit챕, l

In [12]:
    train = trainEN + trainFR
    print(train)



In [13]:
    random.shuffle(train)    
    print(train)



In [14]:
    test = testEN + testFR
    print(test)

[('eories and all the so-called "laws" taught by economists.\n\nAnd yet it is certain that the day when any civilized association of\nindividuals would ask itself, _what are the needs of all, and the means\n', 0), ("he State.\n\nIt is a fact that little companies exist, in spite of the State's\npartiality. If in France, land of centralization, we only see five or\nsix large companies, there are more than a hundred an", 0), ('lais to\nConstantinople, without delays, without even changing carriages (when\nyou travel by express). More than that: a parcel deposited at a station\nwill find its addressee anywhere, in Turkey or in ', 0), ('nt.\n\nFOOTNOTE:\n\n[9] _Kropotkin: In Russian and French Prisons._ London, 1887.\n\n\n\n\nCHAPTER XIII\n\nTHE COLLECTIVIST WAGES SYSTEM\n\n\nI\n\nIn their plans for the reconstruction of society the collectivists\nco', 0), ('m for art,\nscience, and amusement?--in a word, for everything that is not comprised\nin the category of absolute necessities? If the

In [15]:
    random.shuffle(test)
    print(test)

[("it-ce\npas un cri de famine que roulait le vent de mars, au travers de cette\ncampagne nue? Les rafales s'챕taient enrag챕es, elles semblaient\napporter la mort du travail, une disette qui tuerait beaucoup", 1), ("et il retrouvait ainsi la chaleur, l'odeur de la\nfornication, l'adult챔re vivant, dans les pots qui tra챤naient dans les\ncuvettes encore pleines, dans le d챕sordre des linges, des meubles, de\nla pi챔ce en", 1), ('e to gain time to snatch up the whip--it is natural that\n"practical" men should extol this method of perpetuating the wage\nsystem. What need to rack our brains when we have the time-honoured\nmethod of', 0), (" regrettable, quelque chose\nde perdu, l횪-bas, dans le pays noir, tr챔s loin du pav챕 parisien qui\nfaisait l'opinion.  On oublierait vite, la Compagnie avait re챌u\nl'ordre officieux d'챕touffer l'affaire e", 1), (" and to tolerate a Government that\nwould make itself felt in the smallest details of a citizen's life, even\nif that Government had no other aim tha

In [16]:
    sometrain = random.sample(train, 1000)
    print(sometrain)

[(" Br청l챕, qui les dominait de sa taille maigre.\n\nMais il y eut un brusque arr챗t, la surprise d'une minute d챕terminait\nun peu du calme que les supplications d'횋tienne ne pouvaient obtenir.\nC'챕taient simp", 1), ('complished without so\nmuch as jarring the dishes on their dinner tables will find themselves\nmistaken. It is true that Governments can change without disturbing\nworthy citizens at dinner, but the crim', 0), ('grand-p챔re, enfin on ne sait pas, tout au commencement, quand on a\ndonn챕 le premier coup de pioche l횪-bas, 횪 R챕quillart.\n\nR챗veur, M. Gr챕goire regardait cette femme et ces enfants pitoyables,\navec leur', 1), ("le silence.\nEnfin, le premier put dire:\n\n--Mais il y a des hommes au fond, camarades!\n\nLe vacarme redoubla, des voix partaient de toutes parts.\n\n--Tant pis! fallait pas descendre!...  C'est bien fait ", 1), ("ue 횪\nl'ouvrier! Eh! justement, les choses changeraient bient척t, parce que\nl'ouvrier r챕fl챕chissait 횪 cette heure.  Du temps du vieux, le mineu

In [17]:
    features = set()
    while len(features) < 700:
        fragment, _ = random.choice(sometrain)
        l = int(round(random.expovariate(0.20)))
        b = random.randint(0, max(0, length - l))
        feat = fragment[b:b+l]

        ## Test
        C = 0
        for st, _ in sometrain:
            if feat in st:
                C += 1

        f = float(C) / 1000
        if f > 0.01 and f < 0.99 and feat not in features:
            features.add(feat)

    features = list(features)
    print(features)

['e trav', 'xe', 'ow', ' avaient ', 'ui ', 'men', 'reg', 'e of a', '\ng', 'ement', 'ais', 'll', 'emsel', 'uan', 's ch', 'a, e', 's,', '\na', ' fer', 't elle', ' m', 'ces', 'fo', 'onne', 'h a', 'it pas', 'very ', 'coup', ' leur ', 'er a', ' De', ' rai', '\nf', 'amp', 'se d', 'u d', 'aien', 'ce', 'com', 'tt', '  Ce', 'plu', '\nLe ', " l'a", ' so', 'y,', 'em', 'ma', 'pes', 'poi', 'f the ', 'ill', 'cla', 'cal', 'enant, ', 'es ', 'e fo', 'volutio', 'x ', 'rl', 'ce ', 'du ', ' nou', 'face', 'le', ' enc', 'sa', ' gr챔', 'r챕', 'ks', '.\n\n-', 'ex', "'ils", 'sem', 'han', 'ci', ' se', 'l r', 'he ', 'r les', 'ev', ' secou', ' b', ' alla', 'u ', "'e", 'an', 'appe', 's un', 'la', 'n si', 'to', 's s', 'ga', 'uy', 'Bon', ' r', 's to', 'eces', 'nt\n', 'sait', ' fu', 'ting ', 'mb', 's,\n', 'e da', 'at', 'des ', '-', 'es qui', 'tag', 'ha', ' de sa', 'tr', 'le p', ' reg', 'd w', 'ra', 're en', ' t', 're', ' p', 'ter,', 'over', ' les', 'Mais', 'hat', 'tic', 'le\n', 'on r', 'cou', 'pren', 'ea', 'que', 'ite 

In [18]:
    jungle = []
    for i in range(10):
        print ("*--- Build tree : %s ---*" % i)
        size = len(train) / 3
        size = int(size)
        training_sample = random.sample(train, size)
        tree = build_jungle(training_sample, features, numfeatures=100)
        print("\n", tree ,"\n")
        jungle += [tree]

*--- Build tree : 0 ---*

 {0: ['the', 1, 2], 1: ['dans', 3, 4], 2: [' of', 5, 6], 3: [True, 13, 13], 4: ['the ', 7, 8], 5: ['챕', 9, 10], 6: ['챕', 11, 12], 7: [True, 23, 23], 8: ['of', 18, 19], 9: [True, 15, 15], 10: [True, 14, 14], 11: [' w', 21, 22], 12: ['de ', 16, 17], 13: [True, 20, 20], 14: [True, 30, 30], 15: [True, 31, 31], 16: [' d', 33, 34], 17: [' d', 28, 29], 18: ['i ', 26, 27], 19: ['q', 36, 37], 20: [True, 35, 35], 21: [True, 32, 32], 22: [True, 24, 24], 23: [True, 25, 25], 24: [True, 48, 48], 25: [True, 43, 43], 26: [True, 42, 42], 27: [True, 39, 39], 28: ['wh', 44, 45], 29: ['x ', 51, 52], 30: [True, 40, 40], 31: [True, 50, 50], 32: [True, 49, 49], 33: [True, 41, 41], 34: [True, 38, 38], 35: [True, 55, 55], 36: ['ting ', 46, 47], 37: [' la ', 53, 54], 38: [True, 56, 56], 39: [True, 61, 61], 40: [True, 63, 63], 41: [True, 57, 57], 42: [True, 67, 67], 43: [True, 66, 66], 44: [True, 62, 62], 45: [True, 73, 73], 46: [True, 58, 58], 47: [True, 74, 74], 48: [True, 68, 68], 49


 {0: ['of', 1, 2], 1: ['eu', 3, 4], 2: ['ed', 5, 6], 3: ['he ', 13, 14], 4: ['ait ', 9, 10], 5: ['.  ', 11, 12], 6: ['th', 7, 8], 7: ['ing ', 20, 21], 8: ['.  ', 16, 17], 9: ['the', 24, 25], 10: ['챔', 26, 27], 11: ['he ', 18, 19], 12: [' 횪 ', 22, 23], 13: [True, 28, 28], 14: [True, 15, 15], 15: [True, 30, 30], 16: [True, 31, 31], 17: [' l', 44, 45], 18: [True, 37, 37], 19: [True, 39, 39], 20: [True, 32, 32], 21: ['ne', 41, 42], 22: [True, 38, 38], 23: ['the', 34, 35], 24: [True, 29, 29], 25: [True, 40, 40], 26: [True, 36, 36], 27: [True, 43, 43], 28: [True, 33, 33], 29: [True, 56, 56], 30: [True, 58, 58], 31: [True, 63, 63], 32: [True, 50, 50], 33: [True, 60, 60], 34: [True, 49, 49], 35: ['and', 64, 65], 36: [True, 48, 48], 37: [True, 59, 59], 38: [True, 53, 53], 39: [True, 47, 47], 40: [True, 46, 46], 41: ['ld ', 61, 62], 42: [True, 55, 55], 43: [True, 54, 54], 44: [True, 57, 57], 45: [' se', 51, 52], 46: [True, 78, 78], 47: [True, 82, 82], 48: [True, 77, 77], 49: [True, 87, 87], 50:


 {0: [' the', 1, 2], 1: [True, 5, 5], 2: ['th', 3, 4], 3: ['y', 9, 10], 4: ['k', 7, 8], 5: [True, 6, 6], 6: [True, 14, 14], 7: ['of', 15, 16], 8: ['or ', 11, 12], 9: ['ait ', 17, 18], 10: [True, 13, 13], 11: ['횪 ', 29, 30], 12: [' d', 22, 23], 13: [True, 19, 19], 14: [True, 20, 20], 15: ['챕t', 25, 26], 16: [True, 21, 21], 17: [True, 24, 24], 18: ['de ', 27, 28], 19: [True, 36, 36], 20: [True, 43, 43], 21: [True, 40, 40], 22: [True, 37, 37], 23: ['te', 32, 33], 24: [True, 35, 35], 25: [True, 44, 44], 26: [True, 31, 31], 27: ['k', 38, 39], 28: [True, 42, 42], 29: [True, 41, 41], 30: [True, 34, 34], 31: [True, 49, 49], 32: [True, 45, 45], 33: ['챕', 50, 51], 34: [True, 54, 54], 35: [True, 55, 55], 36: [True, 46, 46], 37: [True, 53, 53], 38: [True, 58, 58], 39: [True, 47, 47], 40: [True, 48, 48], 41: [True, 52, 52], 42: [True, 59, 59], 43: [True, 56, 56], 44: [True, 57, 57], 45: [True, 72, 72], 46: [True, 68, 68], 47: [True, 73, 73], 48: [True, 61, 61], 49: [True, 70, 70], 50: [True, 69, 6


 {0: ['th', 1, 2], 1: ['챕', 5, 6], 2: [' of', 3, 4], 3: [' la ', 9, 10], 4: ['s i', 11, 12], 5: ['the ', 7, 8], 6: [' 횪', 13, 14], 7: [True, 19, 19], 8: [True, 24, 24], 9: [True, 20, 20], 10: ['ujo', 22, 23], 11: ['wh', 15, 16], 12: [True, 21, 21], 13: [True, 25, 25], 14: [" l'", 17, 18], 15: [True, 35, 35], 16: [True, 36, 36], 17: [True, 33, 33], 18: ['i ', 30, 31], 19: [True, 37, 37], 20: [True, 32, 32], 21: [True, 34, 34], 22: [True, 28, 28], 23: [True, 29, 29], 24: [True, 27, 27], 25: [True, 26, 26], 26: [True, 40, 40], 27: [True, 47, 47], 28: [True, 46, 46], 29: [True, 42, 42], 30: [True, 49, 49], 31: [True, 38, 38], 32: [True, 41, 41], 33: [True, 44, 44], 34: [True, 48, 48], 35: [True, 43, 43], 36: [True, 45, 45], 37: [True, 39, 39], 38: [True, 54, 54], 39: [True, 56, 56], 40: [True, 55, 55], 41: [True, 51, 51], 42: [True, 58, 58], 43: [True, 53, 53], 44: [True, 59, 59], 45: [True, 57, 57], 46: [True, 60, 60], 47: [True, 52, 52], 48: [True, 61, 61], 49: [True, 50, 50], 50: [True


 {0: [' of', 1, 2], 1: ['횪 ', 5, 6], 2: [' w', 3, 4], 3: ['aien', 12, 13], 4: [' the', 8, 9], 5: [True, 7, 7], 6: ['i ', 10, 11], 7: [True, 21, 21], 8: [True, 15, 15], 9: [' l', 18, 19], 10: ['ne', 16, 17], 11: ['tte ', 22, 23], 12: [True, 14, 14], 13: [True, 20, 20], 14: [True, 33, 33], 15: [True, 25, 25], 16: [True, 31, 31], 17: [True, 32, 32], 18: [True, 26, 26], 19: ['on', 28, 29], 20: [True, 24, 24], 21: [True, 27, 27], 22: [True, 30, 30], 23: ['que ', 34, 35], 24: [True, 38, 38], 25: [True, 37, 37], 26: [True, 44, 44], 27: [True, 50, 50], 28: ['rec', 42, 43], 29: ['횪', 39, 40], 30: [True, 47, 47], 31: [True, 45, 45], 32: [True, 41, 41], 33: [True, 36, 36], 34: ['tha', 48, 49], 35: [True, 46, 46], 36: [True, 57, 57], 37: [True, 61, 61], 38: [True, 62, 62], 39: [True, 66, 66], 40: [True, 54, 54], 41: [True, 56, 56], 42: ['t par', 52, 53], 43: [True, 55, 55], 44: [True, 59, 59], 45: [True, 65, 65], 46: [True, 51, 51], 47: [True, 60, 60], 48: [True, 63, 63], 49: [True, 64, 64], 50: 

In [19]:
    testdata = test
    results_jungle = Counter()
    print(testdata)

[("it-ce\npas un cri de famine que roulait le vent de mars, au travers de cette\ncampagne nue? Les rafales s'챕taient enrag챕es, elles semblaient\napporter la mort du travail, une disette qui tuerait beaucoup", 1), ("et il retrouvait ainsi la chaleur, l'odeur de la\nfornication, l'adult챔re vivant, dans les pots qui tra챤naient dans les\ncuvettes encore pleines, dans le d챕sordre des linges, des meubles, de\nla pi챔ce en", 1), ('e to gain time to snatch up the whip--it is natural that\n"practical" men should extol this method of perpetuating the wage\nsystem. What need to rack our brains when we have the time-honoured\nmethod of', 0), (" regrettable, quelque chose\nde perdu, l횪-bas, dans le pays noir, tr챔s loin du pav챕 parisien qui\nfaisait l'opinion.  On oublierait vite, la Compagnie avait re챌u\nl'ordre officieux d'챕touffer l'affaire e", 1), (" and to tolerate a Government that\nwould make itself felt in the smallest details of a citizen's life, even\nif that Government had no other aim tha

In [20]:
    for item, cat in testdata:
        # Jungle
        c = Counter()
        for tree in jungle:
            c += classify_jungle(tree, item)
        res = (max(c, key=lambda x: c[x]), cat)
        results_jungle.update([res])

    
    print ("Results           Jungle")
    print ("True positives:     %4d"  % (results_jungle[(1, 1)]))
    print ("True negatives:     %4d"  % (results_jungle[(0, 0)]))
    print ("False positives:    %4d"  % (results_jungle[(1, 0)]))
    print ("False negatives:    %4d"  % (results_jungle[(0, 1)]))

Results           Jungle
True positives:      505
True negatives:      222
False positives:       3
False negatives:       0
