In [None]:
"""
Tree-of-Thought prompting
"""

In [None]:
import sys
sys.path.append('../../')
from run_process import get_unused_data
sys.path.append('../../../')
from utils import stats_results_unused

In [None]:
# Experiment configuration shared by the cells below.
# Number of sample files to process; files are addressed as 0.py .. (NUM_FILES - 1).py.
NUM_FILES = 20
# Folder the sampled cells are read from.
SAMPLES_FOLDER_NAME = '../random_cells_unused_variables_vulture_seed42'
# File the GPT results are written to (as the str() of a list) and later read back from.
GPT_SAVED_FILE_NAME = 'unused_variables_gpt'

In [None]:
# Read the sampled cells from SAMPLES_FOLDER_NAME: one source string per file,
# in index order, so random_cells[i] holds the full text of {i}.py.
random_cells = []

for i in range(NUM_FILES):
    file_name = f'{SAMPLES_FOLDER_NAME}/{i}.py'
    # NOTE(review): no explicit encoding is passed, so the platform default applies.
    with open(file_name, 'r') as f:
        random_cells.append(f.read())

In [None]:
# Task instruction sent as the first user message of the chain-of-thought prompt
# (see get_cot_prompt). It asks for two headed sections, 'Unused variables' and
# 'Updated code'; get_identified_names later splits on the first heading.
main_prompt = """A variable is unused if it is assigned using an equals operator but is not referenced after being defined. Remove unused variables in the code delimited by triple backticks. First, identify unused variables in the code. Then, remove the identified unused variables from the code. If a variable is unused in an unpacking statement, change it to an underscore. Output under the headings 'Unused variables' and 'Updated code' respectively. If there are no unused variables, return an empty list ([]) and the unmodified code."""

# Few-shot example 1 (user turn): one unused top-level variable next to a function.
ex1_input = """```python
time = 5
d = 5
print(d)

def calc():
    a = 5
    b = 4
    return a + b
```"""

# Few-shot example 1 (assistant turn).
# NOTE(review): the reasoning says the only variables are 'time' and 'd', although
# 'a' and 'b' are also assigned inside calc() -- confirm this wording is intended.
ex1_output = """First, we identify unused variables. Analyzing the code, the only variables present are 'time' and 'd'. We don't consider 'calc' as it is a function, not a variable. We see that after 'time' is declared it is not referenced later, so it is unused. 'd', however, is used later in a print statement.

Unused variables:
['time']

Now, we remove these unused variables from the code. We simply remove the variable 'time' and leave the rest of the code as is.

Updated code:
```python
d = 5
print(d)

def calc():
    a = 5
    b = 4
    return a + b
```"""

# Few-shot example 2 (user turn): every variable is unused.
ex2_input = """```python
val1 = 'hello'
val2 = 'world'
```"""

# Few-shot example 2 (assistant turn): the updated code is an empty code fence.
ex2_output = """First, we identify unused variables. Analyzing the code, the only variables present are 'val1' and 'val2'. Both these variables are not used after declaration so they are both unused.

Unused variables:
['val1', 'val2']

Now, we remove these unused variables from the code. As the code only contains these variables and nothing else, after removing we end up with empty code.

Updated code:
```python
```"""

# Few-shot example 3 (user turn): an unused name inside an unpacking assignment.
ex3_input = """```python
z = [1, 2, 3]
a, b, c = z
print(a + b)
```"""

# Few-shot example 3 (assistant turn): the unused unpacking target becomes '_'.
ex3_output = """First, we identify unused variables. Analyzing the code, the variable definitions present are 'z', 'a', 'b', and 'c'. 'z' is used after definition in an unpacking statement and 'a' and 'b' are used after definition in a print statement. However, 'c' is unused after definition.

Unused variables:
['c']

Now, we remove these unused variables from the code. Since the only unused variable is 'c', which was assigned in an unpacking statement, we replace it with an underscore.

Updated code:
```python
z = [1, 2, 3]
a, b, _ = z
print(a + b)
```"""

# Few-shot example 4 (user turn): an unused variable assigned from a function call.
ex4_input = """```python
def associate():
    return center.path()

path = associate()
```"""

# Few-shot example 4 (assistant turn): the whole assignment statement is removed.
ex4_output = """First, we identify unused variables. Analyzing the code, the only variable definition present is 'path'. We ignore 'associate' as it is a function, not a variable. We see that 'path' is not referenced after declaration, so it is unused.

Unused variables:
['path']

Now, we remove these unused variables from the code. We simply remove the variable definition for 'path'.

Updated code:
```python
def associate():
    return center.path()
```"""

# Instruction for the voting step over candidate lists of unused variables
# (first user message built by get_identify_vote_msgs). Choices are numbered
# from 1; the prompt reserves 0 for "no choice is good".
identify_vote_prompt = """Given an original task and multiple choices, choose the best answer for the original task. Analyze each choice in detail, then conclude in the last line 'The best choice is {s}', where s is the integer id of the choice. If all choices are equally good, return the smallest id. If no choice is good, return 0.
Original task: A variable is unused if it is assigned using an equals operator but is not referenced after being defined. Identify unused variables in the code delimited by triple backticks. If there are no unused variables, return an empty list."""

# Few-shot vote example (user turn): the code followed by the numbered choices,
# in the same layout that get_identify_vote_msgs produces for the real request.
identify_vote_ex1_input = """```python
time = 5
d = 5
print(d)

def calc():
    a = 5
    b = 4
    return a + b
```

Choice 1:
[]
Choice 2:
['time']
Choice 3:
['time', 'calc']"""

# Few-shot vote example (assistant turn): per-choice analysis, then the verdict line.
identify_vote_ex1_output = """Choice 1 is incorrect. Looking at the code, we see that there is a variable definition 'time' that is not referenced after being defined, so the empty list is incorrect.
Choice 2 is correct. Looking at the code, we see that there is a variable definition 'time' that is not referenced after being defined, so the list containing 'time' is correct.
Choice 3 is incorrect. While 'time' is an unused variable definition, 'calc' is a function, not a variable. Therefore, 'calc' is not an unused variable.

The best choice is 2."""

# Instruction for the voting step over candidate rewritten code
# (first user message built by get_code_vote_msgs). Choices are numbered
# from 1; the prompt reserves 0 for "no choice is good".
code_vote_prompt = """Given an original task and multiple choices, choose the best answer for the original task. Analyze each choice in detail, then conclude in the last line 'The best choice is {s}', where s is the integer id of the choice. If all choices are equally good, return the smallest id. If no choice is good, return 0.
Original task: Remove the variable definitions for the variables specified by the user in the code delimited by triple backticks. If there are no variable definitions specified by the user, return the original code."""

# Few-shot vote example (user turn): original code, the variables to remove,
# then the numbered candidate rewrites.
code_vote_ex1_input = """Original code:
```python
time = 5
d = 5
print(d)

def calc():
    a = 5
    b = 4
    return a + b
```

Variables to remove:
['time']

Choice 1:
```python
time = 5
def calc():
    a = 5
    b = 4
    return a + b
```
Choice 2:
```python
d = 5
print(d)

def calc():
    a = 5
    b = 4
    return a + b
```"""

# Few-shot vote example (assistant turn): per-choice analysis, then the verdict line.
code_vote_ex1_output = """Choice 1 is incorrect. Choice 1 does not remove the specified variable 'time' and instead removes the wrong variable 'd'.
Choice 2 is correct. It properly removes the variable definition specified which is 'time' and leaves the rest of the code unchanged.

The best choice is 2."""

def get_cot_prompt(input_code):
    """Build the chain-of-thought chat messages for one code cell.

    The conversation is: the task instruction, the four worked examples as
    alternating user/assistant turns, and finally `input_code` wrapped in a
    ```python fence as the last user turn.

    Args:
        input_code: source text of the cell to analyse.

    Returns:
        A list of {"role", "content"} message dicts.
    """
    few_shot_pairs = (
        (ex1_input, ex1_output),
        (ex2_input, ex2_output),
        (ex3_input, ex3_output),
        (ex4_input, ex4_output),
    )

    messages = [{"role": "user", "content": main_prompt}]
    for shot_input, shot_output in few_shot_pairs:
        messages.append({"role": "user", "content": shot_input})
        messages.append({"role": "assistant", "content": shot_output})

    # The actual request uses the same fenced layout as the example inputs.
    messages.append({"role": "user", "content": f"```python\n{input_code}\n```"})
    return messages

def _parse_unused_names(section):
    """Parse the text that follows the 'Unused variables:' heading.

    Recognised layouts, checked in this order:
      * text containing 'None'        -> None
      * a bracketed list  ['a', "b"]  -> ['a', 'b'] (either quote style is stripped)
      * a dash bullet list            -> one name per bullet; a backtick-quoted
                                         name is preferred when present

    Returns:
        A list of names (possibly empty), or None when the text says 'None'
        or matches no known layout.
    """
    if 'None' in section:
        return None

    if "[" in section:
        # Only the first [...] group is considered; any trailing prose is ignored.
        inner = section.split("[")[1].split("]")[0]
        return [
            item.strip().strip("'\"")
            for item in inner.split(",")
            if item.strip() != ""
        ]

    if "-" in section:
        names = []
        for bullet in section.strip("- ").split("\n- "):
            parts = bullet.split("`")
            # "`name` explanation" -> name; a plain bullet is kept whole.
            names.append(parts[1] if len(parts) > 1 else parts[0])
        return names

    print("unexpected format for unused variables", section)
    return None


def get_identified_names(identify_trials, identify_completions):
    """Extract the list of unused-variable names from each sampled completion.

    Args:
        identify_trials: number of choices to read from `identify_completions`.
        identify_completions: completion response; each `choices[i]` must expose
            `.finish_reason` and `['message']['content']`.

    Returns:
        A list with one entry per trial: the sorted list of names, or None when
        the completion did not finish with 'stop' or could not be parsed.
    """
    identified_names = []

    for i in range(identify_trials):
        choice = identify_completions.choices[i]
        unused_names = None

        if choice.finish_reason == 'stop':
            content = None
            try:
                content = choice['message']['content']
                section = content.split('Unused variables:')[1].strip("\n")
            except (AttributeError, IndexError, KeyError, TypeError):
                # Missing message/content or missing heading. `content` was fetched
                # up front so reporting the problem cannot raise a second time.
                print("unexpected format for unused variables", content)
            else:
                unused_names = _parse_unused_names(section)

        if unused_names is not None:
            unused_names.sort()
        identified_names.append(unused_names)

    return identified_names

def get_identify_vote_msgs(cell_src):
    """Return a builder of vote messages for candidate unused-variable lists.

    Args:
        cell_src: source text of the cell the candidates refer to.

    Returns:
        A function `func(choices)` that produces the chat messages: the vote
        instruction, one worked example, and the request listing `cell_src`
        followed by the choices numbered from 1.
    """
    def func(choices):
        numbered = [
            f"Choice {number}:\n{choice}\n"
            for number, choice in enumerate(choices, start=1)
        ]
        final_msg = f"```python\n{cell_src}\n```\n\n" + "".join(numbered)

        return [
            {"role": "user", "content": identify_vote_prompt},
            {"role": "user", "content": identify_vote_ex1_input},
            {"role": "assistant", "content": identify_vote_ex1_output},
            {"role": "user", "content": final_msg},
        ]

    return func

def get_code_vote_msgs(original_code):
    """Return a builder of vote messages for candidate rewritten code.

    Args:
        original_code: source text of the cell before any removal.

    Returns:
        A function `func(unused, choices)` that produces the chat messages: the
        vote instruction, one worked example, and the request containing the
        original code, the variables to remove and the choices numbered from 1.
    """
    def func(unused, choices):
        # Layout mirrors code_vote_ex1_input: the closing fence, a blank line, then
        # the heading. (The previous "\n\V" was an invalid escape that put a literal
        # backslash in front of "Variables" and dropped the blank line.)
        final_msg = f"Original code:\n```python\n{original_code}\n```\n\nVariables to remove:\n{unused}\n\n"

        for i, choice in enumerate(choices):
            final_msg += f"Choice {i + 1}:\n{choice}\n"

        return [
            {"role": "user", "content": code_vote_prompt},
            {"role": "user", "content": code_vote_ex1_input},
            {"role": "assistant", "content": code_vote_ex1_output},
            {"role": "user", "content": final_msg},
        ]

    return func

In [None]:
# GPT Tree of Thought
import sys
sys.path.append('../../../')
from tree_of_thought import solve_toc

# Settings handed positionally to solve_toc.
# NOTE(review): presumably sample counts per stage plus the stop sequence for the
# identify stage -- confirm against tree_of_thought.solve_toc.
identify_trials = 10
code_trials = 3
identify_vote_trials = 10
code_vote_trials = 5
identify_stop = "Updated code"

# The name parser does not depend on the file, so it is bound once up front.
get_identified_names_func = get_identified_names

# identify and remove unused using GPT: one result dict per sampled cell
gpt_results = []
for i in range(NUM_FILES):
    input_msgs_cot = get_cot_prompt(random_cells[i])
    get_identify_votes_msgs_func = get_identify_vote_msgs(random_cells[i])
    get_code_votes_msgs_func = get_code_vote_msgs(random_cells[i])

    print(f'Processing file {i}')
    identified, updated_code = solve_toc(
        input_msgs_cot,
        identify_trials,
        code_trials,
        identify_vote_trials,
        code_vote_trials,
        identify_stop,
        get_identified_names_func,
        get_identify_votes_msgs_func,
        get_code_votes_msgs_func,
    )
    print(f'File {i} - {identified}')
    gpt_results.append(dict(identified=identified, updated_code=updated_code))

# save the results to a file (as the str() of the list, read back by the next cell)
with open(GPT_SAVED_FILE_NAME, 'w') as f:
    f.write(str(gpt_results))

In [None]:
# read in gpt result from file
# The file holds the str() of a list of dicts, so it is parsed with
# ast.literal_eval, which only accepts Python literals. The previous eval()
# would have executed any code found in the file.
import ast

with open(GPT_SAVED_FILE_NAME, 'r') as f:
    gpt_results = ast.literal_eval(f.read())

In [None]:
# Split the per-file result dicts into two parallel lists, indexed by file number.
gpt_identified = [result['identified'] for result in gpt_results]
gpt_code = [result['updated_code'] for result in gpt_results]

In [None]:
# save the updated code to files
# if the code is None we write the original code
import os

# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs('gpt_code', exist_ok=True)

for i, code in enumerate(gpt_code):
    with open(f'gpt_code/{i}.py', 'w') as f:
        # Fall back to the untouched cell when either stage produced no result.
        if gpt_identified[i] is None or code is None:
            f.write(random_cells[i])
        else:
            f.write(code)

In [None]:
# print random_cells to new folder
# TODO I think is just temporary for now bc of 20 files, later we will use all the files

import os

# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs('random_cells', exist_ok=True)

# One file per loaded cell, named by its index, mirroring the gpt_code/ layout.
for i, code in enumerate(random_cells):
    with open(f'random_cells/{i}.py', 'w') as f:
        f.write(code)

In [None]:
# Findings for the original cells written to random_cells/.
# NOTE(review): presumably one collection of unused variables per file -- confirm
# against run_process.get_unused_data.
before = get_unused_data(NUM_FILES, 'random_cells', 'variable')

# Total number of findings across all files.
total_before = sum(map(len, before))
print(f'Total before: {total_before}')

In [None]:
# Findings for the GPT-rewritten cells written to gpt_code/.
# NOTE(review): presumably one collection of unused variables per file -- confirm
# against run_process.get_unused_data.
after = get_unused_data(NUM_FILES, 'gpt_code', 'variable')

# Total number of findings across all files.
total_after = sum(map(len, after))
print(f'Total after: {total_after}')

In [None]:
# List percentage difference between before and after for total
if total_before:
    print(f'Total percentage difference: {(total_after - total_before) / total_before * 100}%')
else:
    # With nothing flagged before, a relative change is undefined; report that
    # instead of failing the cell with ZeroDivisionError.
    print('Total percentage difference: n/a (total before is 0)')

In [None]:
# Pass the names GPT identified and the 'before' findings to the project helper.
# NOTE(review): presumably reports identification statistics -- confirm against
# utils.stats_results_unused. Kept as the cell's last expression so that any
# return value is displayed by the notebook.
stats_results_unused(gpt_identified, before)