In [1]:
"""
Tree-of-Thought prompting
"""

'\nTree-of-Thought prompting\n'

In [2]:
import ast
import sys
sys.path.append('../../')
from run_process import get_unused_data
sys.path.append('../../../')
from utils import stats_results_unused

In [3]:
# Experiment configuration
NUM_FILES = 305  # samples are read as 0.py .. (NUM_FILES - 1).py
SAMPLES_FOLDER_NAME = "../random_samples_variables"  # folder holding the sample files
GPT_SAVED_FILE_NAME = "unused_variables_gpt"  # file the GPT results are written to / read from

In [4]:
# Load every sample file (<SAMPLES_FOLDER_NAME>/0.py, 1.py, ...) as one source string each
random_cells = []

for sample_idx in range(NUM_FILES):
    with open(f'{SAMPLES_FOLDER_NAME}/{sample_idx}.py', 'r') as sample_file:
        sample_source = sample_file.read()
    random_cells.append(sample_source)

In [5]:
# Task instruction: sent as the first user message of the few-shot conversation
# built by get_cot_prompt. It asks for two headed sections, 'Unused variables'
# and 'Updated code'.
main_prompt = """A variable is unused if it is assigned using an equals operator but is not referenced after being defined. Remove unused variables in the code delimited by triple backticks. First, identify unused variables in the code. Then, remove the identified unused variables from the code. If a variable is unused in an unpacking statement, change it to an underscore. Output under the headings 'Unused variables' and 'Updated code' respectively."""

# Few-shot example 1 (user turn): one unused top-level variable next to a used
# one and a function definition.
ex1_input = """```python
time = 5
d = 5
print(d)

def calc():
    a = 5
    b = 4
    return a + b
```"""

# Few-shot example 1 (assistant turn): reasoning, the 'Unused variables:' list,
# then the 'Updated code:' block with 'time' removed.
ex1_output = """First, we identify unused variables. Analyzing the code, the only variables present are 'time' and 'd'. We don't consider 'calc' as it is a function, not a variable. We see that after 'time' is declared it is not referenced later, so it is unused. 'd', however, is used later in a print statement.

Unused variables:
['time']

Now, we remove these unused variables from the code. We simply remove the variable 'time' and leave the rest of the code as is.

Updated code:
```python
d = 5
print(d)

def calc():
    a = 5
    b = 4
    return a + b
```"""

# Few-shot example 2 (user turn): code consisting only of unused variables.
ex2_input = """```python
val1 = 'hello'
val2 = 'world'
```"""

# Few-shot example 2 (assistant turn): both variables are listed and the
# updated code block is left empty.
ex2_output = """First, we identify unused variables. Analyzing the code, the only variables present are 'val1' and 'val2'. Both these variables are not used after declaration so they are both unused.

Unused variables:
['val1', 'val2']

Now, we remove these unused variables from the code. As the code only contains these variables and nothing else, after removing we end up with empty code.

Updated code:
```python
```"""

# Few-shot example 3 (user turn): an unused name inside an unpacking statement.
ex3_input = """```python
z = [1, 2, 3]
a, b, c = z
print(a + b)
```"""

# Few-shot example 3 (assistant turn): the unused unpacking target is replaced
# with an underscore rather than deleted.
ex3_output = """First, we identify unused variables. Analyzing the code, the variable definitions present are 'z', 'a', 'b', and 'c'. 'z' is used after definition in an unpacking statement and 'a' and 'b' are used after definition in a print statement. However, 'c' is unused after definition.

Unused variables:
['c']

Now, we remove these unused variables from the code. Since the only unused variable is 'c', which was assigned in an unpacking statement, we replace it with an underscore.

Updated code:
```python
z = [1, 2, 3]
a, b, _ = z
print(a + b)
```"""

# Few-shot example 4 (user turn): an unused variable whose name also appears as
# an attribute call ('center.path()') inside a function.
ex4_input = """```python
def associate():
    return center.path()

path = associate()
```"""

# Few-shot example 4 (assistant turn): the whole 'path = associate()' statement
# is removed from the updated code.
ex4_output = """First, we identify unused variables. Analyzing the code, the only variable definition present is 'path'. We ignore 'associate' as it is a function, not a variable. We see that 'path' is not referenced after declaration, so it is unused.

Unused variables:
['path']

Now, we remove these unused variables from the code. We simply remove the variable definition for 'path'.

Updated code:
```python
def associate():
    return center.path()
```"""

# Instruction for voting between candidate unused-variable lists.
# This is a plain (non-f) string, so '{s}' is sent to the model literally.
identify_vote_prompt = """Given an original task and multiple choices, choose the best answer for the original task. Analyze each choice in detail, then conclude in the last line 'The best choice is {s}', where s is the integer id of the choice. If all choices are equally good, return the smallest id. If no choice is good, return 0.
Original task: A variable is unused if it is assigned using an equals operator but is not referenced after being defined. Identify unused variables in the code delimited by triple backticks. If there are no unused variables, return an empty list."""

# Voting example (user turn): the code followed by numbered 'Choice N:' entries,
# the same layout that get_identify_vote_msgs produces.
identify_vote_ex1_input = """```python
time = 5
d = 5
print(d)

def calc():
    a = 5
    b = 4
    return a + b
```

Choice 1:
[]
Choice 2:
['time']
Choice 3:
['time', 'calc']"""

# Voting example (assistant turn): per-choice analysis ending with the
# 'The best choice is N' line requested by the instruction.
identify_vote_ex1_output = """Choice 1 is incorrect. Looking at the code, we see that there is a variable definition 'time' that is not referenced after being defined, so the empty list is incorrect.
Choice 2 is correct. Looking at the code, we see that there is a variable definition 'time' that is not referenced after being defined, so the list containing 'time' is correct.
Choice 3 is incorrect. While 'time' is an unused variable definition, 'calc' is a function, not a variable. Therefore, 'calc' is not an unused variable.

The best choice is 2."""

# Instruction for voting between candidate rewritten code blocks.
# This is a plain (non-f) string, so '{s}' is sent to the model literally.
code_vote_prompt = """Given an original task and multiple choices, choose the best answer for the original task. Analyze each choice in detail, then conclude in the last line 'The best choice is {s}', where s is the integer id of the choice. If all choices are equally good, return the smallest id. If no choice is good, return 0.
Original task: Remove the variable definitions for the variables specified by the user in the code delimited by triple backticks. If there are no variable definitions specified by the user, return the original code."""

# Voting example (user turn): original code, the 'Variables to remove:' list,
# then numbered candidate code blocks.
code_vote_ex1_input = """Original code:
```python
time = 5
d = 5
print(d)

def calc():
    a = 5
    b = 4
    return a + b
```

Variables to remove:
['time']

Choice 1:
```python
time = 5
def calc():
    a = 5
    b = 4
    return a + b
```
Choice 2:
```python
d = 5
print(d)

def calc():
    a = 5
    b = 4
    return a + b
```
Choice 3:
```python
time = 5
d = 5
print(d)

def calc():
    a = 5
    b = 4
    return a + b
```"""

# Voting example (assistant turn): per-choice analysis ending with the
# 'The best choice is N' line requested by the instruction.
code_vote_ex1_output = """Choice 1 is incorrect. Choice 1 does not remove the specified variable 'time' and instead removes the wrong variable 'd'.
Choice 2 is correct. It properly removes the variable definition specified which is 'time' and leaves the rest of the code unchanged.
Choice 3 is incorrect. It simply returns the original code, which is not what we want.

The best choice is 2."""

def get_cot_prompt(input_code):
    """Build the few-shot chat messages for one piece of code.

    The conversation starts with the task instruction, continues with the four
    worked examples as alternating user/assistant turns, and ends with
    ``input_code`` wrapped in a python code fence as the final user turn.

    Args:
        input_code: source text to place in the final user message.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts.
    """
    example_pairs = (
        (ex1_input, ex1_output),
        (ex2_input, ex2_output),
        (ex3_input, ex3_output),
        (ex4_input, ex4_output),
    )

    messages = [{"role": "user", "content": main_prompt}]
    for example_input, example_output in example_pairs:
        messages.append({"role": "user", "content": example_input})
        messages.append({"role": "assistant", "content": example_output})
    messages.append({"role": "user", "content": f"```python\n{input_code}\n```"})
    return messages

def _parse_unused_names(choice):
    """Extract the names listed after 'Unused variables:' in one completion choice.

    Returns a list of names (possibly empty), or None when the heading is
    missing, the answer contains 'None', or the format is not recognised.
    """
    content = None
    try:
        content = choice['message']['content']
        listing = content.split('Unused variables:')[1].strip("\n")
    except Exception:
        # Missing heading or unexpected message shape. Only ordinary errors are
        # caught, so KeyboardInterrupt / SystemExit still propagate.
        print("unexpected format for unused variables", content)
        return None

    # The model answered with "None"
    if 'None' in listing:
        return None

    # Python-style list, e.g. ['a', 'b']
    if "[" in listing:
        items = listing.split("[")[1].split("]")[0].split(",")
        # Strip whitespace plus any quoting (single quote, double quote, backtick)
        return [item.strip().strip("'\"`") for item in items if item.strip() != ""]

    # Bullet-point list, e.g. "- `a`\n- `b`"
    if "-" in listing:
        names = []
        for bullet in listing.strip("- ").split("\n- "):
            parts = bullet.split("`")
            if len(parts) > 1:
                # Name is the text between the first pair of backticks
                names.append(parts[1])
            else:
                # No backticks: keep only the bullet's own line so prose that
                # follows the list does not end up inside the name
                names.append(parts[0].split("\n")[0].strip())
        return names

    print("unexpected format for unused variables", listing)
    return None


def get_identified_names(identify_trials, identify_completions):
    """Collect the unused-variable names proposed by each completion choice.

    Args:
        identify_trials: number of choices to read from ``identify_completions``.
        identify_completions: completion object exposing ``choices``; each choice
            provides ``finish_reason`` and ``['message']['content']``.

    Returns:
        A list with one entry per trial: a sorted list of names, or None when
        the choice did not finish with 'stop', could not be parsed, or named
        no variables.
    """
    identified_names = []

    for i in range(identify_trials):
        choice = identify_completions.choices[i]

        unused_names = None
        # Only completions that ended normally are parsed
        if choice.finish_reason == 'stop':
            unused_names = _parse_unused_names(choice)

        if unused_names is not None:
            unused_names.sort()

        # An empty list is reported as None, the same as "nothing identified"
        identified_names.append(unused_names or None)

    return identified_names

def get_identify_vote_msgs(cell_src):
    """Return a builder of vote messages for the identify step of ``cell_src``.

    The returned function takes the candidate ``choices`` and produces the chat
    messages: vote instruction, one worked example, then the fenced code
    followed by the choices numbered from 1.
    """
    def func(choices):
        numbered_choices = "".join(
            f"Choice {number}:\n{choice}\n"
            for number, choice in enumerate(choices, start=1)
        )
        final_msg = f"```python\n{cell_src}\n```\n\n" + numbered_choices

        messages = [
            {"role": "user", "content": identify_vote_prompt},
            {"role": "user", "content": identify_vote_ex1_input},
            {"role": "assistant", "content": identify_vote_ex1_output},
        ]
        messages.append({"role": "user", "content": final_msg})
        return messages

    return func

def get_code_vote_msgs(original_code):
    """Return a builder of vote messages for the code-removal step.

    The returned function takes the names to remove (``unused``) and the
    candidate rewritten code blocks (``choices``) and produces the chat
    messages: vote instruction, one worked example, then a final user message
    laid out like ``code_vote_ex1_input`` (original code, 'Variables to
    remove:' heading, numbered choices).
    """
    def func(unused, choices):
        # The heading is preceded by a blank line ("\n\n"), matching the worked
        # example. The previous "\n\V..." was an invalid escape sequence that
        # put a literal backslash in front of the heading.
        final_msg = f"Original code:\n```python\n{original_code}\n```\n\nVariables to remove:\n{unused}\n\n"

        for i, choice in enumerate(choices):
            final_msg += f"Choice {i + 1}:\n{choice}\n"

        return [
            {"role": "user", "content": code_vote_prompt},
            {"role": "user", "content": code_vote_ex1_input},
            {"role": "assistant", "content": code_vote_ex1_output},
            {"role": "user", "content": final_msg},
        ]

    return func

In [6]:
# Tree-of-Thought settings. These are passed to solve_toc in the (commented-out)
# generation cell and used as multipliers in the cost estimate.
# NOTE(review): exact meaning depends on solve_toc, which is not shown here.
identify_trials = 5              # presumably completions sampled for the identify step
code_trials = 3                  # presumably completions sampled for the code step
identify_vote_trials = 6         # presumably votes cast over the identify candidates
code_vote_trials = 4             # presumably votes cast over the code candidates
identify_stop = 'Updated code'   # presumably the stop sequence ending the identify step

In [7]:
# Estimate cost
import sys
sys.path.append("../../..")
import utils

def estimate_tokens():
    """Build stand-in input and output texts for a full run, for cost estimation.

    For every sample in ``random_cells`` the texts of the four stages (identify,
    identify vote, code, code vote) are appended: prompts and the sample go to
    the input text, pieces of the worked-example answers repeated by the
    matching trial count go to the output text.

    Returns:
        Tuple ``(in_tok, out_tok)`` of two strings.
    """
    # Loop-invariant pieces, built once
    few_shot_text = (main_prompt + ex1_input + ex1_output + ex2_input + ex2_output
                     + ex3_input + ex3_output + ex4_input + ex4_output)
    identify_vote_text = identify_vote_prompt + identify_vote_ex1_input + identify_vote_ex1_output
    code_vote_text = code_vote_prompt + code_vote_ex1_input + code_vote_ex1_output

    # ex1_output is split into an "identify" part and a "code" part.
    # NOTE(review): the split point is derived from len(ex1_input), not
    # len(ex1_output) - kept as in the original; confirm this is intended.
    split_at = int(len(ex1_input) / 2)
    identify_sample = ex1_output[:split_at]
    code_sample = ex1_output[split_at:]

    # Collect pieces in lists and join once instead of repeatedly extending
    # two ever-growing strings.
    in_parts = []
    out_parts = []
    for cell_src in random_cells:
        fenced_src = f"```python\n{cell_src}\n```"

        # trial
        in_parts.append(few_shot_text)
        in_parts.append(fenced_src)
        out_parts.append(identify_sample * identify_trials)

        # vote trial
        in_parts.append(identify_vote_text)
        in_parts.append(f"```python\n{cell_src}\n```\n\n")
        out_parts.append(identify_vote_ex1_output * identify_vote_trials)

        # code
        in_parts.append(few_shot_text)
        in_parts.append(fenced_src)
        in_parts.append(identify_sample)
        out_parts.append(code_sample * code_trials)

        # vote code ("\n\n" before the heading; "\V" was an invalid escape sequence)
        in_parts.append(code_vote_text)
        in_parts.append(f"Original code:\n```python\n{cell_src}\n```\n\nVariables to remove:['test', 'test']\n\n")
        out_parts.append(code_vote_ex1_output * code_vote_trials)

    return ''.join(in_parts), ''.join(out_parts)

in_tok, out_tok = estimate_tokens()

utils.gpt_35_turbo_token_dollar_cost(in_tok, out_tok)

2.119024

In [8]:
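# # GPT Tree of Thought (this cell is intentionally left commented out; the results it saved are loaded from GPT_SAVED_FILE_NAME in the next cell)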
# import sys
# sys.path.append('../../../')
# from tree_of_thought import solve_toc

# # identify and remove unused using GPT
# gpt_results = []
# for i in range(NUM_FILES):
#     input_msgs_cot = get_cot_prompt(random_cells[i])
#     get_identified_names_func = get_identified_names
#     get_identify_votes_msgs_func = get_identify_vote_msgs(random_cells[i])
#     get_code_votes_msgs_func = get_code_vote_msgs(random_cells[i])

#     print(f'Processing file {i}')
#     identified, updated_code = solve_toc(input_msgs_cot, identify_trials, code_trials, identify_vote_trials, code_vote_trials, identify_stop, get_identified_names_func, get_identify_votes_msgs_func, get_code_votes_msgs_func)
#     print(f'File {i} - {identified}')
#     gpt_results.append({'identified': identified, 'updated_code': updated_code})

# # save the results to a file
# with open(GPT_SAVED_FILE_NAME, 'w') as f:
#     f.write(str(gpt_results))

In [9]:
# Read the saved GPT results back from file. The generation cell writes them with
# str(gpt_results), so the content is parsed as a Python literal with
# ast.literal_eval rather than executed with eval().
with open(GPT_SAVED_FILE_NAME, 'r') as f:
    gpt_results = ast.literal_eval(f.read())

In [10]:
# Split the results into parallel lists, one entry per file.
# 'identified' is stored as text, so it is parsed with ast.literal_eval (no code
# execution); 'updated_code' is used as stored.
gpt_identified = [ast.literal_eval(var['identified']) for var in gpt_results]
gpt_code = [var['updated_code'] for var in gpt_results]

In [11]:
# Write one file per sample into gpt_code/<i>.py.
# When nothing was identified, or no code came back, the original sample is
# written unchanged.
import os

if not os.path.exists('gpt_code'):
    os.makedirs('gpt_code')

for i, code in enumerate(gpt_code):
    with open(f'gpt_code/{i}.py', 'w') as out_file:
        keep_original = gpt_identified[i] is None or code is None
        out_file.write(random_cells[i] if keep_original else code)

In [12]:
# Unused-variable data for the original samples; each entry is sized with len()
# and the sizes are summed.
before = get_unused_data(NUM_FILES, SAMPLES_FOLDER_NAME, 'variable')

total_before = sum(map(len, before))
print(f'Total before: {total_before}')

Total before: 586


In [13]:
# Same measurement on the files written to gpt_code/.
after = get_unused_data(NUM_FILES, 'gpt_code', 'variable')

total_after = sum(map(len, after))
print(f'Total after: {total_after}')

Total after: 413


In [14]:
# Change in the total count relative to the "before" total, as a percentage
# (negative means fewer after the rewrite).
print('Total percentage difference: {}%'.format((total_after - total_before) / total_before * 100))

Total percentage difference: -29.522184300341298%


In [15]:
stats_results_unused(gpt_identified, before)

GPT before count: 283
Vulture before count: 586
------------
True positives: 176
False positives: 107
False negatives: 395
------------
Files with at least one false positive (and no false negatives)
13: 1 false positives
22: 4 false positives
33: 1 false positives
35: 1 false positives
61: 2 false positives
64: 1 false positives
87: 1 false positives
97: 2 false positives
109: 2 false positives
135: 1 false positives
146: 3 false positives
179: 4 false positives
190: 1 false positives
197: 1 false positives
206: 1 false positives
240: 1 false positives
265: 5 false positives
279: 1 false positives
299: 1 false positives
------------
Files with at least one false negative (and no false positives)
0: 1 false negatives
1: 1 false negatives
3: 2 false negatives
4: 4 false negatives
5: 1 false negatives
6: 7 false negatives
7: 2 false negatives
12: 2 false negatives
14: 1 false negatives
18: 3 false negatives
19: 1 false negatives
20: 1 false negatives
21: 1 false negatives
23: 2 false neg

In [32]:
print(random_cells[5])

reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_hidden3 = 50
n_hidden4 = 50
n_hidden5 = 50
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
    hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name="hidden3")
    hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4")
    hidden5 = tf.layers.dense(hidden4, n_hidden5, activation=tf.nn.relu, name="hidden5")
    logits = tf.layers.dense(hidden5, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")


In [33]:
print(gpt_code[5])

reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_hidden3 = 50
n_hidden4 = 50
n_hidden5 = 50
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
    hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name="hidden3")
    hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4")
    hidden5 = tf.layers.dense(hidden4, n_hidden5, activation=tf.nn.relu, name="hidden5")
    logits = tf.layers.dense(hidden5, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")


In [34]:
before[5]

['loss']

In [35]:
after[5]

['loss']

In [36]:
gpt_identified[5]

In [16]:
# delete 'gpt_code' folder
!rm -rf gpt_code