In [1]:
import numpy as np
import json
import csv

import os
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

import pandas as pd


In [2]:
NUM_DATA = 5

NUM_VARS = 20
VAR_NAME_TYPE = 'numerical' # 'numerical' | 'alphabetical' | 'words'

PERMUTE_EQNS = True
MIN_NUM_PERMS = 1
MAX_NUM_PERMS = 1
MAX_PERM_SIZE = NUM_VARS

PERMUTE_NAMES = True

NUM_NAME_HINTS = 6 # ITERATE
NUM_EQN_HINTS = 2

NUM_EXAMPLES = 0 # TODO implement later


EXAMPLE_CONFIG = {
    "style": 'verbose', 
    "pad_equals_prompt" : False, 
    "pad_equals_eval" : True,
    }
'''
style =  'regular' | 'minimal' | 'verbose' (assume all variables evaluated in best order)
if:
    a=1
    b=-a
then:
    b=-1 minimal
    b=-(+a)=-1 regular
    b=-(+1)=-(+1)=-1 verbose 
'''

VALUE = 1 #ex. a = 1, b = -a, ...

NP_SEED=5
np.random.seed(NP_SEED)




OPENAI_MODEL = "text-davinci-002"
# _NUM_FINAL_VARS = NUM_VARS_LIST[-1]
# _NUM_TOKENS_GEN = 20 # max 20 min 16 average 18
# _NUM_BASE_TOKENS = 50 # ideally 14
# MAX_COMPLETION_TOKENS = int(_NUM_TOKENS_GEN*(_NUM_FINAL_VARS-NUM_EQN_HINTS) + _NUM_BASE_TOKENS)
MAX_COMPLETION_TOKENS = 500



In [3]:
_BASE_NAME = f'[{NUM_DATA}]{NUM_NAME_HINTS}_hints,{NUM_VARS}_var,model_{OPENAI_MODEL}'

FILE_PATH = rf'data/{_BASE_NAME}.json'
CONFIG_PATH = rf'data/[config]{_BASE_NAME}.json'



_CONFIG = {
"NUM_DATA":NUM_DATA,
"NUM_VARS":NUM_VARS,
"VAR_NAME_TYPE":VAR_NAME_TYPE,
"PERMUTE_EQNS":PERMUTE_EQNS,
"MIN_NUM_PERMS":MIN_NUM_PERMS,
"MAX_NUM_PERMS":MAX_NUM_PERMS,
"MAX_PERM_SIZE":MAX_PERM_SIZE,
"PERMUTE_NAMES":PERMUTE_NAMES,
"NUM_NAME_HINTS":NUM_NAME_HINTS,
"NUM_EQN_HINTS":NUM_EQN_HINTS,
"NUM_EXAMPLES":NUM_EXAMPLES,
"EXAMPLE_CONFIG":EXAMPLE_CONFIG,
"VALUE":VALUE,
"NP_SEED":NP_SEED,
"OPENAI_MODEL":OPENAI_MODEL,
"MAX_COMPLETION_TOKENS":MAX_COMPLETION_TOKENS,
}

In [4]:
class Cycle:
    def __init__(self, cycle): # cycle = (0,1,2)
        self.cycle = cycle
    def permute(self, elt, start_at_one=False): # how cycle acts on elt ex. (0,1,2) 2 = 0
        if elt not in self.cycle:
            return elt
        arg_elt = self.cycle.index(elt)
        return self.cycle[(arg_elt+1) % len(self.cycle)]
    def getCycle(self):
        return self.cycle

def permute(arr,cycles): # cycles = [(c1),(c2),...]; c1 = (1,2)
    cycles = [Cycle(cyc) for cyc in cycles]

    hats = list(range(len(arr)))
    new_hats = []
    for hat,hat_idx in enumerate(hats):
        for c in cycles[::-1]:
            hat = c.permute(hat)
        new_hats.append(hat)
    return [arr[new_hat] for new_hat in new_hats]

def strOfPerm(perm): #perm of form [(c1),(c2),(c3),...]
    perm_str=''
    for cyc in perm:
        cyc_str = ','.join([str(elt) for elt in cyc])
        perm_str += f'({cyc_str})'
    return perm_str

def testCycles():
    ctest=Cycle([0,1,3,2,5,6])
    print(f'we have cycle {ctest.getCycle()}')
    print("ctest.permute(3):",ctest.permute(3))
    print("ctest.permute(6):",ctest.permute(6))
    print()
    c1=(0,1)
    c2=(1,2)
    c3=(0,1,2)
    print(f'we have cycles c1={c1}, c2={c2}, c3={c3}')
    print('permute using [c1,c2]')
    print(permute(["a","b","c"],[c1,c2]))
    print()

    print('permute using [c3]')
    print(permute(["a","b","c"],[c3]))
    print('indeed they are the same')
# testCycles()

In [5]:
'''
returns system of equation strings, WITHOUT permuting the equations
'''


def generateSystemOfEqns(
    # config
    num_vars=NUM_VARS,
    var_name_type=VAR_NAME_TYPE,
    permute_eqns=PERMUTE_EQNS,
    permute_names=PERMUTE_NAMES,
    num_name_hints=NUM_NAME_HINTS,
    num_eqn_hints=NUM_EQN_HINTS,
    num_examples=NUM_EXAMPLES,
    example_config=EXAMPLE_CONFIG,
    init_val=VALUE,
    # non-config
    numeric_var_label='a',
    numeric_var_numbers_range=range(NUM_VARS)
):

    def signedNum(x): return f'+{str(x)}' if x > 0 else str(x)
    def signOfNum(x): return "+" if x > 0 else "-"

    # the variable names ex. a,b,c,a0,c42,...
    names = []
    permuted_names = []
    if 'num' in var_name_type:  # numerical names ex. like a0, a42, ...
        numbers = np.random.choice(
            numeric_var_numbers_range, size=num_vars, replace=False)
        names = [f'{numeric_var_label}{num}' for num in numbers]
        permuted_names = list(np.array(names)[np.random.choice(
            range(num_vars), size=num_vars, replace=False)]) if permute_names else names

    elif 'word' in var_name_type:
        words_str = 'Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.'.replace(',','').replace('.','')
        words_str_rand_spaces = ''
        for i,ltr in enumerate(words_str):
            if ltr == ' ' and i%2 == 0:
                words_str_rand_spaces += '[]'
            else:
                words_str_rand_spaces += ltr
        
        words = words_str_rand_spaces.split(' ')
        words = [word.replace('[]', ' ') for word in words]
        select_idxs = np.random.choice(len(words), size=num_vars, replace=False)
        select_words = [words[idx] for idx in select_idxs]

        names = select_words
        permuted_names = list(np.array(names)[np.random.choice(
            range(num_vars), size=num_vars, replace=False)]) if permute_names else names
        

    else:  # alphabetical names ex. a,b,...,y,z (at most 26)
        names = list(map(chr, range(97, 97+26)))[:num_vars]
        permuted_names = list(map(chr, np.random.choice(
            range(26), size=num_vars, replace=False)+97)) if permute_names else names

    # the variable values ex. 1 or -1
    values = [init_val]+list(2*np.random.randint(0, 2, size=num_vars-1)-1)

    # signs g_i where var_i = g_{i-1} var_{i-1} where i[1,num_vars)
    signs = [signOfNum(values[idx+1]/values[idx]) for idx in range(num_vars-1)]

    def eqn_at_idx(idx):
        _PAD = example_config['pad_equals_prompt']

        if idx == 0:
            _s = f"{permuted_names[0]}={signedNum(init_val)}"
        else:
            # ex. z=+1, n=-z, ...
            _s = f"{permuted_names[idx]}={signs[idx-1]}{permuted_names[idx-1]}"

        if _PAD:
            _s = _s.replace('=', ' = ')
        return _s

    def eval_expr_at_idx(idx):  # example: we have a=-1 and are evaluating b=+a
        _PAD = example_config['pad_equals_eval']
        _STYLE = example_config['style']

        _s = ''

        if idx == 0:
            _s = f"{permuted_names[0]}={signedNum(init_val)}"
        else:
            _s += f"{permuted_names[idx]}"  # 'b'
            if _STYLE == 'verbose' or _STYLE == 'regular':
                _s += f"={signs[idx-1]}{permuted_names[idx-1]}"  # ' = +a'
            if _STYLE == 'verbose':
                # ' = +(-1)'
                _s += f"={signs[idx-1]}({signedNum(values[idx-1])})"
            _s += f"={signedNum(values[idx])}"  # ' = -1'

        if _PAD:
            _s = _s.replace('=', ' = ')

        # ex. b=+a=+(+1)=+1, ...
        return _s

    equations = [eqn_at_idx(idx) for idx in range(0, num_vars)]
    evaluated = [eval_expr_at_idx(idx) for idx in range(0, num_vars)]
    eval_dict = {permuted_names[i]: values[i] for i in range(num_vars)}

    name_perm_str = (
        f'{names}->{permuted_names}').replace('\'', '').replace(' ', '')
    return equations, evaluated, eval_dict, (name_perm_str, names, permuted_names)


In [6]:
data = []

for data_idx in range(NUM_DATA):
    # generate System of Equations
    ordered_eqns, ordered_eval, eval_dict, _misc = generateSystemOfEqns()
    name_perm_str, ordered_names, permuted_names = _misc

    # permute equations
    if PERMUTE_EQNS:
        _num_perms = np.random.randint(MIN_NUM_PERMS,high=MAX_NUM_PERMS+1)

        eqn_perm = []
        for _ in range(_num_perms):
            _size_of_perm = min(np.random.randint(2,NUM_VARS), MAX_PERM_SIZE)
            _perm = list(np.random.choice(range(NUM_VARS),_size_of_perm, replace=False))
            eqn_perm.append(_perm)
    else:
        eqn_perm = [()]
    permuted_eqns = permute(ordered_eqns,eqn_perm)
    
    #build prompt
    prompt_parts = []
    _name_hint = ' ' + ','.join([permuted_names[i] for i in range(NUM_NAME_HINTS)])
    _dotdotdot = ',...' if NUM_NAME_HINTS > 0 and NUM_NAME_HINTS < NUM_VARS else ''
    prompt_parts.append('\n'.join(permuted_eqns))
    prompt_parts.append(f'\nEvaluating all the variables{_name_hint}{_dotdotdot}:') # prompt_parts.append('\nEvaluating all the variables according to the hint:')
    prompt_parts.append('\n'.join(ordered_eval[:NUM_EQN_HINTS]))
    prompt_text = '\n'.join(prompt_parts) + '\n'


    #send prompt
    response = openai.Completion.create(
    model=OPENAI_MODEL,
    prompt=prompt_text,
    temperature=0,
    max_tokens=MAX_COMPLETION_TOKENS,
    top_p=0,
    frequency_penalty=0,
    presence_penalty=0,
    # stop=["(START"],
    )
    completion_text = response.choices[0].text

    expected_completion_eqns = ordered_eval[NUM_EQN_HINTS:]
    expected_completion_text = '\n'.join(expected_completion_eqns)


    data_pt = {
        "prompt": prompt_text,
        "expected_completion": expected_completion_text,
        "completion": completion_text,
        "eval_dict": eval_dict,


        "eqn_perm": strOfPerm(eqn_perm),
        "name_perm": name_perm_str,

        "index": data_idx,
    }

    data.append(data_pt)
    

In [7]:
df = pd.DataFrame(data, columns=data_pt.keys())
df.to_json(FILE_PATH, orient = 'records')

OSError: Cannot save file into a non-existent directory: 'pad_eval'

In [None]:
df_config = pd.DataFrame([_CONFIG], columns=_CONFIG.keys())
df_config.to_json(CONFIG_PATH, orient = 'records')

In [None]:
print(data[0]['prompt'])

passages and more=+1
in the 1960s with=-passages and more
of Lorem Ipsum=-in the 1960s with
the=+popularised
Lorem=+leap into
printing and typesetting industry=-the
five centuries but also=-printing and typesetting industry
leap into=+five centuries but also
popularised=+of Lorem Ipsum
dummy=+Lorem
the=-dummy
text of=-the

Evaluating all the variables passages and more,in the 1960s with,of Lorem Ipsum,popularised,the,printing and typesetting industry,five centuries but also,leap into,Lorem,dummy,the,text of:
passages and more = +1
in the 1960s with = -passages and more = -(+1) = -1

