In [76]:
import random
import sympy as sp
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import multiprocessing
import time as t
import warnings

from tqdm import tqdm
import os
import re

## Getting reference data from Feynman dataset

In [15]:
D_MAX = 10
VARIABLES = [f'x{i}' for i in range(D_MAX)]
SYMBOLS = list(sp.symbols(' '.join(VARIABLES)))

class Sqr(sp.Function):
    @classmethod
    def eval(cls, x):
        return x ** 2
    
class Inv(sp.Function):
    @classmethod
    def eval(cls, x):
        return 1 / x
    
class Abs(sp.Function):
    @classmethod
    def eval(cls, x):
        return sp.Abs(x)

class Atan(sp.Function):
    @classmethod
    def eval(cls, x):
        return sp.atan(x)

class Asin(sp.Function):
    @classmethod
    def eval(cls, x):
        return sp.asin(x)

class Acos(sp.Function):
    @classmethod
    def eval(cls, x):
        return sp.acos(x)

class Tanh (sp.Function):
    @classmethod
    def eval(cls, x):
        return sp.tanh(x)

symbol_dict = {str(var): var for var in SYMBOLS}
symbol_dict['inv'] = Inv
#symbol_dict['sqr'] = Sqr
symbol_dict['abs'] = Abs
symbol_dict['arctan'] = Atan
symbol_dict['arcsin'] = Asin
symbol_dict['arccos'] = Acos
symbol_dict['tanh'] = Tanh


BINARY_FEYNMAN = ['+', '-', '*', '/']
UNARY_FEYNMAN = ['inv','abs', 'sqr','sqrt','sin', 'cos', 'tan','arctan','log', 'exp', 'asin', 'acos', 'cosh', 'tanh']

In [16]:
#use healed files attached

df1 = pd.read_csv('FeynmanEquations.csv')
df2 = pd.read_csv('BonusEquations.csv')
df2 = df2.set_index(pd.Index([100+i for i in range (df2.shape[0])]))
df = pd.concat((df1, df2))
df = df[df['Formula'].notnull()]

NameError: name 'pd' is not defined

In [257]:
feynman_expressions = []

for index, row in df.iterrows():
    for i in range (1, int(row['# variables']+1)):
        row['Formula'] = row['Formula'].replace(row[f"v{i}_name"], f"x{i}")
    row['Formula'] = row['Formula'].replace("pi", "3.1415")
    feynman_expressions.append((int(row['Number']), int(row['# variables']), row['Formula']))

df['Formula'] = [feynman_expressions[i][2] for i in range(len(feynman_expressions))]

In [157]:
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def simplify(number, num_var, expr):
    sympy_expr = sp.sympify(expr, locals=symbol_dict)
    return (number, num_var, sp.simplify(sympy_expr))

def parallel_simplification(feynman_expressions, timeout=3):
    results = []
    with ThreadPoolExecutor() as executor:
        # Submit tasks to generate and evaluate random expressions
        futures = {executor.submit(simplify, number, num_var, expr) : (number, num_var, expr) for (number, num_var, expr) in feynman_expressions}
        for future in futures:
            try:
                result = future.result(timeout=timeout)  # Set the timeout for each thread
                results.append(result)
            except TimeoutError:
                number = futures[future][0]
                print(f"Timeout occurred for expression: {number}")
                results.append(None)  # Append None or any placeholder for timed-out tasks
    return results

In [259]:
if __name__ == '__main__':
    results = parallel_simplification(feynman_expressions, timeout=3)

In [None]:
def pref_to_inf(expr):
    stack = []
    for i in range(len(expr)-1,-1,-1):
        e = expr[i]
        if e in BINARY:
            right = stack.pop()
            left = stack.pop()
            stack.append(f"({left} {e} {right})")
        elif e in UNARY:
            right = stack.pop()
            stack.append(f"{e}({right})")
        else :
            stack.append(str(e))
    return stack[0]

In [159]:
def inf_to_pref(expr):
    precedence = {
        ' + ': 1, ' - ': 1,
        ' * ': 2, ' / ': 2
    }
    stack = []  # Operator stack
    output = []  # Prefix output

    def precedence_of(op):
        return precedence.get(op, 0)
    
    expr = expr[::-1]  # Reverse the expression for right-to-left traversal

    i = 0
    while i < len(expr) :
        token = expr[i]
        if i < len(expr)-3 and (expr[i]+expr[i+1]+expr[i+2]) in precedence:
            while (stack and precedence_of(stack[-1]) > precedence_of(expr[i]+expr[i+1]+expr[i+2])):
                output.append(stack.pop())
            stack.append(expr[i]+expr[i+1]+expr[i+2])
            output.append(' ')
            i += 3
        elif not token in '()' :
            output.append(token) # Add operand to the output
            i+=1
        elif token == ')':  # Closing parenthesis (reversed input)
            stack.append(token)
            i+=1
        elif token == '(':
            while stack and stack[-1] != ')':
                output.append(stack.pop())
            stack.pop()
            i+=1# Pop the closing parenthesis
                
    while stack:
        output.append(stack.pop())

    return ''.join(output[::-1]).split()  # Reverse the output for prefix format

In [260]:
pref_results = []

for (number, num_var, expr) in results :
    expr = str(expr)
    expr = expr.replace('(', '( ')
    expr = expr.replace(')', ' )')
    expr = expr.replace('*', ' * ')
    expr = expr.replace('/', ' / ')
    pref_expr = inf_to_pref(expr)
    pref_results.append((number, num_var, pref_expr))

In [236]:
def TDD (tree):
    i = 0
    res = [[]]
    for e in tree :
        if e in BINARY_FEYNMAN :
            res[i].append([e, 0, 0, 2])
            i += 1
            res.append([])
            Ascend=False
        elif e in UNARY_FEYNMAN :
            res[i].append([e, None, 'unary', 1])
            i += 1
            res.append([])
            Ascend=False
        else :
            res[i].append([e, None, 'leaf', 0])
            Ascend=True
        
        while Ascend and i>0 and res[i] != [] :
            if res[i-1][-1][1] != None:
                side = res[i-1][-1][3]
                if side == 2 :
                    res[i-1][-1][1] += 1
                elif side == 1 :
                    res[i-1][-1][2] += 1
            res[i-1][-1][3] -= 1
            Ascend=False
            if res[i-1][-1][3] == 0:
                i -= 1
                Ascend=True
        j=i-1
        while j>0 :
            if res[j-1][-1][1] != None :
                side = res[j-1][-1][3]
                if side == 2 :
                    res[j-1][-1][1] += 1
                elif side == 1 :
                    res[j-1][-1][2] += 1
            j -= 1
    while res[-1] == [] :
        res.pop()
    return res

In [261]:
feynman_trees = []
for (line, (number, num_var, pref_expr)) in enumerate(pref_results) :
    tree = TDD (pref_expr)
    depth = len(tree)
    left, right = 0, 0
    distr = []
    for (pos, e) in enumerate(tree) :
        if e != [] :
            local = len(e)/(2**pos)
            distr.append(local)
            for op in e :
                if op[1] != None :
                    left += op[1]
                    right += op[2]
    nodes = sum (distr)
    fullness = nodes/(2**depth-1)
    skewness = left/(left+right)

    feynman_trees.append([number, num_var, tree, depth, fullness, skewness, distr])

In [262]:
df['tree'] = [feynman_trees[i][2] for i in range(len(feynman_trees))]
df['depth'] = [feynman_trees[i][3] for i in range(len(feynman_trees))]
df['fullness'] = [feynman_trees[i][4] for i in range(len(feynman_trees))]
df['skewness'] = [feynman_trees[i][5] for i in range(len(feynman_trees))]
df['distr'] = [feynman_trees[i][6] for i in range(len(feynman_trees))]

df[['depth', 'fullness', 'skewness']].groupby('depth').mean()

Unnamed: 0_level_0,fullness,skewness
depth,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0.666667,0.5
3,0.408163,0.539116
4,0.194928,0.635386
5,0.100554,0.61186
6,0.047194,0.596878
7,0.023089,0.660049
8,0.012892,0.690192
9,0.005512,0.654033
10,0.00264,0.664058
11,0.001383,0.680295


In [263]:
df[['# variables', 'depth', 'fullness', 'skewness']].groupby('# variables').mean()

Unnamed: 0_level_0,depth,fullness,skewness
# variables,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,6.0,0.040675,0.526316
2.0,3.933333,0.314062,0.628046
3.0,5.459459,0.179168,0.613949
4.0,6.575758,0.076986,0.659358
5.0,7.684211,0.064631,0.592002
6.0,10.0,0.034003,0.665522
7.0,15.666667,4.2e-05,0.872115
8.0,9.0,0.005336,0.644628
9.0,9.0,0.006834,0.613445


In [278]:
df1 = pd.concat((df.depth, pd.DataFrame(df.distr.values.tolist()).add_prefix('level_')), axis=1)
df1.groupby('depth').mean()

Unnamed: 0_level_0,level_0,level_1,level_2,level_3,level_4,level_5,level_6,level_7,level_8,level_9,level_10,level_11,level_12,level_13,level_14,level_15
depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2,1.0,1.0,,,,,,,,,,,,,,
3,1.142857,1.0,0.714286,,,,,,,,,,,,,
4,1.0,0.956522,0.652174,0.315217,,,,,,,,,,,,
5,1.0625,0.84375,0.703125,0.367188,0.140625,,,,,,,,,,,
6,1.0,0.821429,0.553571,0.321429,0.196429,0.080357,,,,,,,,,,
7,1.0,0.722222,0.611111,0.291667,0.166667,0.107639,0.032986,,,,,,,,,
8,1.2,0.9,0.5,0.325,0.1625,0.13125,0.05,0.01875,,,,,,,,
9,1.0,0.791667,0.479167,0.229167,0.125,0.109375,0.05599,0.018229,0.007812,,,,,,,
10,1.0,0.625,0.5,0.265625,0.140625,0.078125,0.046875,0.027344,0.012695,0.00415,,,,,,
11,1.0,0.75,0.5625,0.28125,0.125,0.054688,0.027344,0.015625,0.007812,0.004883,0.001953,,,,,


### Histogram for operators

In [323]:
dfhist = df.groupby('depth')['Formula'].apply(''.join).reset_index()

dfhist['+'] = dfhist.Formula.str.count('\+')
dfhist['-'] = dfhist.Formula.str.count('-')
dfhist['*'] = dfhist.Formula.str.count('\*')
dfhist['/'] = dfhist.Formula.str.count('/')
for operator in UNARY_FEYNMAN :
   dfhist[f"{operator}"] = dfhist.Formula.str.count(f"{operator}") 

dfhist

  dfhist['+'] = dfhist.Formula.str.count('\+')
  dfhist['*'] = dfhist.Formula.str.count('\*')


Unnamed: 0,depth,Formula,+,-,*,/,inv,abs,sqr,sqrt,sin,cos,tan,arctan,log,exp,asin,acos,cosh,tanh
0,2,x1*x2x1*x2x1/x2x1/x2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,3,x1*x2*x3x1*x2*sin(x3)arcsin(x1*sin(x2))arcsin(...,1,4,26,8,0,0,0,0,4,2,0,0,0,0,0,0,0,0
2,4,x1*x4+x2*x5+x3*x6(x1*x3+x2*x4)/(x1+x2)x1*x2*x3...,7,4,49,26,0,0,2,1,1,0,0,0,1,0,0,0,0,0
3,5,x1*x2*x4/(4*3.1415*x3*x4*sqr(x4))x1*x3/(4*3.14...,3,5,50,18,0,0,9,0,2,7,1,0,0,0,0,0,0,1
4,6,exp(-0.5*sqr(x1))/sqrt(2*3.1415)(x3+x2)/(1+x3*...,7,5,52,21,0,0,11,4,0,2,0,0,0,5,0,0,0,0
5,7,sqrt(sqr(x2-x1)+sqr(x4-x3))0.5*x1*(sqr(x2)+sqr...,4,6,26,14,0,0,12,1,1,3,0,0,0,2,0,0,0,0
6,8,0.5*x1*(sqr(x2)+sqr(x3)+sqr(x4))sqrt(sqr(x1)+s...,5,4,16,6,0,0,9,1,2,2,0,0,0,0,0,0,0,0
7,9,x3*x4*x5/(sqr(x5-x4)+sqr(x7-x6)+sqr(x9-x8))x1/...,7,15,28,23,0,0,27,7,1,1,0,0,0,0,0,0,0,0
8,10,exp(-0.5*sqr(x2/x1))/(sqrt(2*3.1415)*x1)x3/(2*...,3,6,30,22,0,0,23,6,3,2,0,0,0,2,0,0,0,0
9,11,exp(-0.5*sqr((x2-x3)/x1))/(sqrt(2*3.1415)*x1)(...,1,5,15,10,0,0,11,3,0,0,0,0,0,1,0,0,0,0


In [324]:
dfvar = df.groupby('# variables')['Formula'].apply(''.join).reset_index()

dfvar['+'] = dfvar.Formula.str.count('\+')
dfvar['-'] = dfvar.Formula.str.count('-')
dfvar['*'] = dfvar.Formula.str.count('\*')
dfvar['/'] = dfvar.Formula.str.count('/')
for operator in UNARY_FEYNMAN :
   dfvar[f"{operator}"] = dfvar.Formula.str.count(f"{operator}") 

dfvar

  dfvar['+'] = dfvar.Formula.str.count('\+')
  dfvar['*'] = dfvar.Formula.str.count('\*')


Unnamed: 0,# variables,Formula,+,-,*,/,inv,abs,sqr,sqrt,sin,cos,tan,arctan,log,exp,asin,acos,cosh,tanh
0,1.0,exp(-0.5*sqr(x1))/sqrt(2*3.1415),0,1,2,1,0,0,2,1,0,0,0,0,0,1,0,0,0,0
1,2.0,exp(-0.5*sqr(x2/x1))/(sqrt(2*3.1415)*x1)x1*x2x...,2,2,21,12,0,0,6,1,2,0,0,0,0,1,0,0,0,0
2,3.0,exp(-0.5*sqr((x2-x3)/x1))/(sqrt(2*3.1415)*x1)x...,9,16,69,49,0,0,29,10,5,8,0,0,0,1,0,0,0,0
3,4.0,sqrt(sqr(x2-x1)+sqr(x4-x3))x1*x2*x4/(4*3.1415*...,15,17,103,49,0,0,40,8,5,8,0,0,0,2,0,0,0,0
4,5.0,x1*(x2+x3*x4*sin(x5))x5*x1*x2*(1/x4-1/x3)x3/(2...,6,17,85,35,0,0,31,3,1,2,1,0,1,4,0,0,0,1
5,6.0,x1*x4+x2*x5+x3*x6(0.5*x1*x2*sqr(x3))*(8*3.1415...,11,8,62,17,0,0,28,3,1,2,0,0,0,1,0,0,0,0
6,7.0,sqr(x1*x2*x3*x4*x5/(4*x6*sqr(sin(x7/2))))sqrt(...,2,1,18,8,0,0,12,1,2,0,0,0,0,0,0,0,0,0
7,8.0,x1*x2/(x3*x4)+(x1*x5)/(x6*sqr(x7)*x3*x4)*x8,1,0,7,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0
8,9.0,x3*x4*x5/(sqr(x5-x4)+sqr(x7-x6)+sqr(x9-x8)),2,3,2,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0


In [328]:
dfhist.drop(['depth', 'Formula'], axis=1).sum()

+          48
-          65
*         369
/         174
inv         0
abs         0
sqr       152
sqrt       27
sin        16
cos        20
tan         1
arctan      0
log         1
exp        10
asin        0
acos        0
cosh        0
tanh        1
dtype: int64

## Evaluating our datasets

### Naïve reccurrent generation

In [2]:
D_MAX = 4 #max dimension of input
u_max = 10 #max number of unary
N_MAX = 100 #max number of input values
BINARY = ['+', '-', '*']
BINARY_WEIGHTS = [1, 1, 1]
UNARY = ['inv','abs', 'sqr','sqrt','sin', 'cos', 'tan','arctan','log', 'exp']
UNARY_WEIGHTS = [5,1,3,3,1,1,0.2,0.2,0.2,1]
PRECISION = 2 #number of digits
VARIABLES = [f'x{i}' for i in range(D_MAX)]
SYMBOLS = list(sp.symbols(' '.join(VARIABLES)))

TREE_WEIGHTS=[1,2] #[binary ; leaf] : to be tuned so that matches canonical tree statistics. It is not the actual ratio which is roughly equal to 1.

class Sqr(sp.Function):
    @classmethod
    def eval(cls, x):
        return x ** 2
    
class Inv(sp.Function):
    @classmethod
    def eval(cls, x):
        return 1 / x
    
class Abs(sp.Function):
    @classmethod
    def eval(cls, x):
        return sp.Abs(x)

class Atan(sp.Function):
    @classmethod
    def eval(cls, x):
        return sp.atan(x)

symbol_dict = {str(var): var for var in SYMBOLS}
symbol_dict['inv'] = Inv
symbol_dict['sqr'] = Sqr
symbol_dict['abs'] = Abs
symbol_dict['arctan'] = Atan

In [44]:
# Générateur récursif d'expressions mathématiques
# risque de ne pas capturer ration binaires-dimension. Les autre sparamètres devraient être observables par fullness-roughness
# la structure est exactement la même que Lample infra, juste obtenir le ratio de génération pour que la profondeur pilote le nombre de binaires autant que possible à l'étape de génération d'arbre

def generate_tree(depth=3):
    if depth == 0:
        return ['Leaf']
    
    # Choix d'une structure d'expression (opérateur binaire ou fonction unaire)
    else :
        expr_type = random.choices(['binary', 'leaf'], weights=TREE_WEIGHTS)[0]
        
        if expr_type == 'binary':
            # Générer une expression binaire
            op = random.choices(BINARY, weights=BINARY_WEIGHTS)
            left = generate_tree(depth-1)
            right = generate_tree(depth-1)
            return op + left + right
        
        elif expr_type == 'leaf':
            return ['Leaf']

def insert_multi(position, list1, list2):
    return list1[:position] + list2 + list1[position:]

def generate_integer(precision=2, max_exp=1):
    sign = random.choice([-1,1])
    mantissa = random.randint(1,10**precision)
    min_power = -max_exp - (precision + 1) // 2
    max_power = max_exp - (precision + 1) // 2   # ??
    exponent = random.randint(min_power, max_power+1)
    return round(sign * mantissa * 10**exponent, precision)

def generate_expression() :
    D = random.randint(1, D_MAX)
    variables = VARIABLES[:D]
    b_max = 5 + D #max number of binary
    b = random.randint(D-1, D+b_max)
    depth = b//2 #to be tuned according to statistics

    expr = generate_tree(depth)

    u = random.randint(0, u_max)
    # print(u)
    unary = random.choices(UNARY, weights=UNARY_WEIGHTS, k=u)
    while len(unary) > 0 :
        i = random.randint(0, len(expr)-1)
        una = unary.pop(0)
        expr = insert_multi(i, expr, [una])

    ession = []
    i = 0
    for x in expr :
        lateral = bool(random.getrandbits(1))
        if x == 'Leaf' :
            x = random.choice(variables)
            m, p = generate_integer(PRECISION, max_exp=0), generate_integer(PRECISION,max_exp=0)
            if lateral :
                 ession += ['+', p, '*', m, x]
            else :
                ession += ['+', '*', m, x, p]
        elif x in UNARY :
            m, p = generate_integer(PRECISION, max_exp=0), generate_integer(PRECISION,max_exp=0)
            if lateral :
                ession += ['+', p, '*', m, x]
            else :
                ession += ['+', '*', m, x, p]
        else :
            ession.append(x)
        i += 1
        
    return ession

In [45]:
naive = []
for k in range (1000):
    ession = generate_expression()
    naive.append((k, ession)) #beware that D is now depth of the tree !!

In [46]:
def pref_to_inf(expr):
    stack = []
    for i in range(len(expr)-1,-1,-1):
        e = expr[i]
        if e in BINARY:
            right = stack.pop()
            left = stack.pop()
            stack.append(f"({left} {e} {right})")
        elif e in UNARY:
            right = stack.pop()
            stack.append(f"{e}({right})")
        else :
            stack.append(str(e))
    return stack[0]

#### Simplification

In [6]:
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def simplify(number, expr):
    infixed_expr = pref_to_inf(expr)
    sympy_expr = sp.sympify(infixed_expr, locals=symbol_dict)
    return (number, sp.simplify(sympy_expr))

def parallel_simplification(feynman_expressions, timeout=3):
    results = []
    with ThreadPoolExecutor() as executor:
        # Submit tasks to generate and evaluate random expressions
        futures = {executor.submit(simplify, number, expr) : (number, expr) for (number, expr) in feynman_expressions}
        for future in tqdm(futures):
            try:
                result = future.result(timeout=timeout) # Set the timeout for each thread
                results.append(result)
            except TimeoutError:
                number = futures[future][0]
                print(f"Timeout occurred for expression: {number}")
                results.append(None)  # Append None or any placeholder for timed-out tasks
    return results

In [None]:
if __name__ == '__main__':
    results = parallel_simplification(naive, timeout=3)

  1%|▊                                                                                 | 1/100 [00:03<05:02,  3.05s/it]

Timeout occurred for expression: 0


  2%|█▋                                                                                | 2/100 [00:07<05:56,  3.64s/it]

Timeout occurred for expression: 1
Timeout occurred for expression: 2


  6%|████▉                                                                             | 6/100 [00:17<04:23,  2.80s/it]

Timeout occurred for expression: 5


 10%|████████                                                                         | 10/100 [00:33<05:52,  3.92s/it]

Timeout occurred for expression: 9


 11%|████████▉                                                                        | 11/100 [00:38<06:07,  4.13s/it]

Timeout occurred for expression: 10


 15%|████████████▏                                                                    | 15/100 [00:49<04:55,  3.48s/it]

Timeout occurred for expression: 14


 17%|█████████████▊                                                                   | 17/100 [00:56<05:06,  3.69s/it]

Timeout occurred for expression: 16


 18%|██████████████▌                                                                  | 18/100 [00:59<05:00,  3.67s/it]

Timeout occurred for expression: 17


 19%|███████████████▍                                                                 | 19/100 [01:07<04:46,  3.53s/it]


In [10]:
def inf_to_pref(expr):
    precedence = {
        ' + ': 1, ' - ': 1,
        ' * ': 2, ' / ': 2
    }
    stack = []  # Operator stack
    output = []  # Prefix output

    def precedence_of(op):
        return precedence.get(op, 0)
    
    expr = expr[::-1]  # Reverse the expression for right-to-left traversal

    i = 0
    while i < len(expr) :
        token = expr[i]
        if i < len(expr)-3 and (expr[i]+expr[i+1]+expr[i+2]) in precedence:
            while (stack and precedence_of(stack[-1]) > precedence_of(expr[i]+expr[i+1]+expr[i+2])):
                output.append(stack.pop())
            stack.append(expr[i]+expr[i+1]+expr[i+2])
            output.append(' ')
            i += 3
        elif not token in '()' :
            output.append(token) # Add operand to the output
            i+=1
        elif token == ')':  # Closing parenthesis (reversed input)
            stack.append(token)
            i+=1
        elif token == '(':
            while stack and stack[-1] != ')':
                output.append(stack.pop())
            stack.pop()
            i+=1# Pop the closing parenthesis
                
    while stack:
        output.append(stack.pop())

    return ''.join(output[::-1]).split()  # Reverse the output for prefix format

In [260]:
pref_results = []

for (number, expr) in results :
    expr = str(expr)
    expr = expr.replace('(', '( ')
    expr = expr.replace(')', ' )')
    expr = expr.replace('*', ' * ')
    pref_expr = inf_to_pref(expr)
    pref_results.append((number, pref_expr))

#### End of it

In [47]:
pref_results = naive #in case not sympy

In [48]:
def TDD (tree):
    i = 0
    res = [[]]
    for e in tree :
        if e in BINARY :
            res[i].append([e, 0, 0, 2])
            i += 1
            res.append([])
            Ascend=False
        elif e in UNARY :
            res[i].append([e, None, 'unary', 1])
            i += 1
            res.append([])
            Ascend=False
        else :
            res[i].append([e, None, 'leaf', 0])
            Ascend=True
        
        while Ascend and i>0 and res[i] != [] :
            if res[i-1][-1][1] != None:
                side = res[i-1][-1][3]
                if side == 2 :
                    res[i-1][-1][1] += 1
                elif side == 1 :
                    res[i-1][-1][2] += 1
            res[i-1][-1][3] -= 1
            Ascend=False
            if res[i-1][-1][3] == 0:
                i -= 1
                Ascend=True
        j=i-1
        while j>0 :
            if res[j-1][-1][1] != None :
                side = res[j-1][-1][3]
                if side == 2 :
                    res[j-1][-1][1] += 1
                elif side == 1 :
                    res[j-1][-1][2] += 1
            j -= 1
    while res[-1] == [] :
        res.pop()
    return res

In [49]:
naive_trees = []
for (line, (number, pref_expr)) in enumerate(pref_results) :
    tree = TDD (pref_expr)
    depth = len(tree)
    left, right = 0, 0
    distr = []
    for (pos, e) in enumerate(tree) :
        if e != [] :
            local = len(e)/(2**pos)
            distr.append(local)
            for op in e :
                if op[1] != None :
                    left += op[1]
                    right += op[2]
    nodes = sum (distr)
    fullness = nodes/(2**depth-1)
    skewness = left/(left+right)

    naive_trees.append([number, tree, depth, fullness, skewness, distr])

In [52]:
dg = pd.DataFrame({'Formula': [pref_to_inf(naive[i][1]) for i in range(len(naive))]})
dg['tree'] = [naive_trees[i][1] for i in range(len(naive_trees))]
dg['depth'] = [naive_trees[i][2] for i in range(len(naive_trees))]
dg['fullness'] = [naive_trees[i][3] for i in range(len(naive_trees))]
dg['skewness'] = [naive_trees[i][4] for i in range(len(naive_trees))]
dg['distr'] = [naive_trees[i][5] for i in range(len(naive_trees))]

dg[['depth', 'fullness', 'skewness']].groupby('depth').mean()

Unnamed: 0_level_0,fullness,skewness
depth,Unnamed: 1_level_1,Unnamed: 2_level_1
3,0.3571429,0.521505
4,0.2276596,0.459166
5,0.1267281,0.418524
6,0.05349011,0.300716
7,0.02723917,0.285246
8,0.01461064,0.295172
9,0.00729131,0.263795
10,0.00360088,0.229841
11,0.001801178,0.21686
12,0.0009028586,0.195917


In [53]:
dg1 = pd.concat((dg.depth, pd.DataFrame(dg.distr.values.tolist()).add_prefix('level_')), axis=1)
dg1.groupby('depth').mean()

Unnamed: 0_level_0,level_0,level_1,level_2,level_3,level_4,level_5,level_6,level_7,level_8,level_9,...,level_21,level_22,level_23,level_24,level_25,level_26,level_27,level_28,level_29,level_30
depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1.0,1.0,0.5,,,,,,,,...,,,,,,,,,,
4,1.0,1.0,1.0,0.414894,,,,,,,...,,,,,,,,,,
5,1.0,1.0,1.0,0.700893,0.227679,,,,,,...,,,,,,,,,,
6,1.0,1.0,0.688525,0.358607,0.234631,0.088115,,,,,...,,,,,,,,,,
7,1.0,1.0,0.7875,0.3625,0.153125,0.1125,0.04375,,,,...,,,,,,,,,,
8,1.0,1.0,0.869565,0.491848,0.1875,0.091712,0.06284,0.022249,,,...,,,,,,,,,,
9,1.0,1.0,0.8,0.4625,0.24625,0.115625,0.05625,0.033594,0.011641,,...,,,,,,,,,,
10,1.0,1.0,0.809091,0.420455,0.226136,0.127273,0.057102,0.023864,0.01456,0.00522,...,,,,,,,,,,
11,1.0,1.0,0.844828,0.443966,0.199353,0.098599,0.053879,0.025189,0.011449,0.007139,...,,,,,,,,,,
12,1.0,1.0,0.826531,0.418367,0.215561,0.121173,0.061224,0.031569,0.013074,0.005142,...,,,,,,,,,,


Conclusion de tout ça : le pilotage avec le ratio est bon, par contre il y a un pb de skewness, il ne vient pas de la combinaison linéaire, à méditer, ça empire dans les arbres longs.

### Qualifying Kamienny 2022 usage data

In [57]:
os.getcwd()

'C:\\Users\\nbachelard\\Desktop\\ENS\\physique\\ML\\projet\\py\\dataset\\statistics'

In [115]:
storage_location = 'C:\\Users\\nbachelard\\Desktop\\ENS\\physique\\ML\\projet\\py\\dataset\\data'

l_bruts = [f for f in os.listdir(storage_location) if os.path.isfile(os.path.join(storage_location, f))]

os.chdir (storage_location)

#### Healing files I fucked up last time

In [113]:
for filename in l_bruts :
    print("processing : "+filename)
    suba = 'N'
    subb = '_'
    idx1 = filename.index(suba)
    idx2 = filename.index(subb, idx1)
    apriori_size = ''
    for idx in range(idx1 + len(suba), idx2):
        apriori_size += filename[idx]
    apriori_size = int(apriori_size)
    
    corr = np.load(filename, allow_pickle=True)
    removed = 0
    for i in range(apriori_size) :
        if not np.isnan(corr[i]) :
            if np.isnan(np.sum(corr[i]['values'])):
                corr[i] = np.NaN
                removed += 1
    actual_size = apriori_size - removed
    new_filename = filename[:idx1]+f"N{actual_size}"+filename[idx2:]
    np.save(new_filename, corr)

processing : D10_u10_N100_10.npy
processing : D10_u10_N100_10_gpu.npy
processing : D10_u10_N100_11.npy
processing : D10_u10_N100_11_gpu.npy
processing : D10_u10_N100_12.npy
processing : D10_u10_N100_12_gpu.npy
processing : D10_u10_N100_13.npy
processing : D10_u10_N100_13_gpu.npy
processing : D10_u10_N100_14.npy
processing : D10_u10_N100_14_gpu.npy
processing : D10_u10_N100_15.npy
processing : D10_u10_N100_15_gpu.npy
processing : D10_u10_N100_16.npy
processing : D10_u10_N100_16_gpu.npy
processing : D10_u10_N100_17.npy
processing : D10_u10_N100_17_gpu.npy
processing : D10_u10_N100_18.npy
processing : D10_u10_N100_18_gpu.npy
processing : D10_u10_N100_19.npy
processing : D10_u10_N100_19_gpu.npy
processing : D10_u10_N100_20.npy
processing : D10_u10_N100_20_gpu.npy
processing : D2_u10_N100_0.npy
processing : D2_u10_N100_1.npy
processing : D2_u10_N100_10.npy
processing : D2_u10_N100_10_gpu.npy
processing : D2_u10_N100_11.npy
processing : D2_u10_N100_11_gpu.npy
processing : D2_u10_N100_12.npy


#### End of it

In [121]:
kamienny = []

for filename in l_bruts :
    corr = np.load(filename, allow_pickle=True)
    apriori_size = np.shape(corr)[0]
    for i in range(apriori_size) :
        try :
            kamienny.append(corr[i]['expression'])
        except TypeError :
            pass

In [144]:
random.shuffle(kamienny)
k_study = kamienny[:1000]

k_considered = []
for (rank, expr) in enumerate(k_study):
    k_considered.append((rank, expr))

In [147]:
def pref_to_inf(expr):
    stack = []
    for i in range(len(expr)-1,-1,-1):
        e = expr[i]
        if e in BINARY:
            right = stack.pop()
            left = stack.pop()
            stack.append(f"({left} {e} {right})")
        elif e in UNARY:
            right = stack.pop()
            stack.append(f"{e}({right})")
        else :
            stack.append(str(e))
    return stack[0]

In [149]:
def TDD (tree):
    i = 0
    res = [[]]
    for e in tree :
        if e in BINARY :
            res[i].append([e, 0, 0, 2])
            i += 1
            res.append([])
            Ascend=False
        elif e in UNARY :
            res[i].append([e, None, 'unary', 1])
            i += 1
            res.append([])
            Ascend=False
        else :
            res[i].append([e, None, 'leaf', 0])
            Ascend=True
        
        while Ascend and i>0 and res[i] != [] :
            if res[i-1][-1][1] != None:
                side = res[i-1][-1][3]
                if side == 2 :
                    res[i-1][-1][1] += 1
                elif side == 1 :
                    res[i-1][-1][2] += 1
            res[i-1][-1][3] -= 1
            Ascend=False
            if res[i-1][-1][3] == 0:
                i -= 1
                Ascend=True
        j=i-1
        while j>0 :
            if res[j-1][-1][1] != None :
                side = res[j-1][-1][3]
                if side == 2 :
                    res[j-1][-1][1] += 1
                elif side == 1 :
                    res[j-1][-1][2] += 1
            j -= 1
    while res[-1] == [] :
        res.pop()
    return res

In [150]:
kamienny_trees = []
for (line, (number, pref_expr)) in enumerate(k_considered) :
    tree = TDD (pref_expr)
    depth = len(tree)
    left, right = 0, 0
    distr = []
    for (pos, e) in enumerate(tree) :
        if e != [] :
            local = len(e)/(2**pos)
            distr.append(local)
            for op in e :
                if op[1] != None :
                    left += op[1]
                    right += op[2]
    nodes = sum (distr)
    fullness = nodes/(2**depth-1)
    skewness = left/(left+right)

    kamienny_trees.append([number, tree, depth, fullness, skewness, distr])

In [151]:
dh = pd.DataFrame({'Formula': [pref_to_inf(k_considered[i][1]) for i in range(len(k_considered))]})
dh['tree'] = [kamienny_trees[i][1] for i in range(len(kamienny_trees))]
dh['depth'] = [kamienny_trees[i][2] for i in range(len(kamienny_trees))]
dh['fullness'] = [kamienny_trees[i][3] for i in range(len(kamienny_trees))]
dh['skewness'] = [kamienny_trees[i][4] for i in range(len(kamienny_trees))]
dh['distr'] = [kamienny_trees[i][5] for i in range(len(kamienny_trees))]

dh[['depth', 'fullness', 'skewness']].groupby('depth').mean()

Unnamed: 0_level_0,fullness,skewness
depth,Unnamed: 1_level_1,Unnamed: 2_level_1
3,0.3571429,0.333333
5,0.1290323,0.363636
6,0.08333333,0.493333
7,0.03207021,0.328889
8,0.01729984,0.399941
9,0.008026541,0.374429
10,0.003593139,0.175676
11,0.002438782,0.66358
12,0.0006975446,0.095238
13,0.0004485598,0.306812


In [152]:
dh1 = pd.concat((dh.depth, pd.DataFrame(dh.distr.values.tolist()).add_prefix('level_')), axis=1)
dh1.groupby('depth').mean()

Unnamed: 0_level_0,level_0,level_1,level_2,level_3,level_4,level_5,level_6,level_7,level_8,level_9,...,level_17,level_18,level_19,level_20,level_21,level_22,level_23,level_24,level_25,level_26
depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1.0,1.0,0.5,,,,,,,,...,,,,,,,,,,
5,1.0,1.0,1.0,0.75,0.25,,,,,,...,,,,,,,,,,
6,1.0,1.0,1.0,1.0,0.875,0.375,,,,,...,,,,,,,,,,
7,1.0,1.0,0.833333,0.625,0.395833,0.166667,0.052083,,,,...,,,,,,,,,,
8,1.0,1.0,1.0,0.75,0.395833,0.177083,0.0625,0.026042,,,...,,,,,,,,,,
9,1.0,1.0,0.833333,0.625,0.375,0.15625,0.067708,0.03125,0.013021,,...,,,,,,,,,,
10,1.0,1.0,1.0,0.5,0.0625,0.0625,0.03125,0.007812,0.007812,0.003906,...,,,,,,,,,,
11,1.0,1.0,1.0,1.0,0.625,0.1875,0.09375,0.046875,0.023438,0.011719,...,,,,,,,,,,
12,1.0,1.0,0.5,0.125,0.125,0.0625,0.015625,0.015625,0.007812,0.001953,...,,,,,,,,,,
13,1.0,1.0,0.833333,0.458333,0.166667,0.09375,0.0625,0.028646,0.018229,0.007812,...,,,,,,,,,,


Conclusion de tout ça : c'est ok, modulo le pb de combinaison linéaire, à voir si on regénère one day.