In [None]:
"""
Tree-of-Thought prompting
"""

In [None]:
# Number of sample files processed; inputs are read as '{FOLDER_NAME}/{i}.py' for i in range(NUM_FILES).
NUM_FILES = 20
# Folder containing the input sample cells.
FOLDER_NAME = '../random_samples_formatting'
# File the raw Tree-of-Thought results (identified issues + updated code) are dumped to.
GPT_SAVED_FILE_NAME = 'formatted_code_gpt'
# Folder the reformatted '{i}.py' files are written to.
GPT_SAVED_FOLDER_NAME = 'reformatted_gpt'

In [None]:
# Load the text of every sample file ('0.py', '1.py', ...) found under
# FOLDER_NAME, so that the list position equals the file index.
random_cells = []

for sample_idx in range(NUM_FILES):
    with open(f'{FOLDER_NAME}/{sample_idx}.py', 'r') as sample_file:
        sample_text = sample_file.read()
    random_cells.append(sample_text)

In [None]:
# Task instruction sent as the first user message of the chain-of-thought prompt.
# It asks for two fixed headings; get_identified_names() later splits the reply
# on the 'Identified formatting issues' heading, so the wording here matters.
main_prompt = """Format the code delimited by triple backticks according to PEP 8 conventions. Do not add, remove, or change anything else. First, identify formatting issues in the code. Then, fix these issues in the code. Structure your response under the following headings: 'Identified formatting issues' and 'Formatted code'.
Formatting issues fall under the following major categories:
E1: Indentation
E2: Whitespace
E3: Blank line
E4: Import
E5: Line length
E7: Statement
W1: Indentation warning
W2: Whitespace warning
W5: Line break warning"""

# Few-shot material: for shots 1..4 read the original file ('shotN.py') and
# then its hand-formatted counterpart ('shotN_after.py'), in that order.
shots_input, shots_output = [], []
for shot_no in range(1, 5):
    for suffix, bucket in (('', shots_input), ('_after', shots_output)):
        with open(f'../shots/shot{shot_no}{suffix}.py', 'r') as shot_file:
            bucket.append(shot_file.read())

# Few-shot example 1, user turn: the first shot file wrapped in a python code fence.
ex1_input = f"""```python
{shots_input[0]}
```"""

# Few-shot example 1, assistant turn: reasoning, the issue list under the
# 'Identified formatting issues:' heading, then the formatted first shot.
ex1_output = f"""First, we identify formatting issues in the code that violate PEP 8 conventions according to the major categories provided. We can see that the two import statements should be on a separate line. The indentation of the return statement in the function 'evaluate' is also tabbed instead of spaced. Finally, the function 'evaluate' is missing two blank lines after its definition.

Identified formatting issues:
- E401: multiple imports on one line
- W191: indentation contains tabs
- E305: expected 2 blank lines after class or function definition, found 1

Now, we fix these issues in the code. For E401, we separate the two import statements onto separate lines. For W191, we replace the tab with four spaces. For E305, we add two blank lines after the function 'evaluate'.

Formatted code:
```python
{shots_output[0]}
```"""

# Few-shot example 2, user turn: the second shot file wrapped in a python code fence.
ex2_input = f"""```python
{shots_input[1]}
```"""

# Few-shot example 2, assistant turn (operator whitespace, line break, semicolon).
# NOTE(review): the first paragraph contains a truncated phrase ("operator in
# the.") — confirm the intended wording before editing, since this text is sent
# to the model verbatim and changing it changes the experiment.
ex2_output = f"""First, we identify formatting issues in the code that violate PEP 8 conventions according to the major categories provided. We can see that there is missing whitespace around the '*' operator in the. There is also a line break after the '*' operator instead of being before the operator. Finally, there is a semicolon at the end of the line where there shouldn't be.

Identified formatting issues:
- E226: missing whitespace around arithmetic operator
- W504: line break after binary operator
- E703: statement ends with a semicolon

Now, we fix these issues in the code. For E226, we add whitespace around the '*' operator. For W504, we line break before the '*' operator, bringing the operator down to the next line. For E703, we remove the semicolon at the end of the line.

Formatted code:
```python
{shots_output[1]}
```"""

# Few-shot example 3, user turn: the third shot file wrapped in a python code fence.
ex3_input = f"""```python
{shots_input[2]}
```"""

# Few-shot example 3, assistant turn (whitespace-only blank line, long line,
# indentation that is not a multiple of four).
ex3_output = f"""First, we identify formatting issues in the code that violate PEP 8 conventions according to the major categories provided. Firstly, one of the blank lines contains whitespace and should be removed. Secondly, the if statement is too long and should be split into multiple lines. Finally, the indentation of the variable assignment in the if statement is not a multiple of 4.

Identified formatting issues:
- W293: blank line contains whitespace
- E501: line too long
- E111: indentation is not a multiple of 4

Now, we fix these issues in the code. For W293, we remove the whitespace in the blank line. For E501, we split the if statement into two lines. For E111, we add an extra space to the indent of the variable assignment in the if statement so it is a multiple of 4.

Formatted code:
```python
{shots_output[2]}
```"""

# Few-shot example 4, user turn: the fourth shot file wrapped in a python code fence.
ex4_input = f"""```python
{shots_input[3]}
```"""

# Few-shot example 4, assistant turn: the longest example, covering five
# issues (blank lines, line length, whitespace, continuation-line indentation).
ex4_output = f"""First, we identify formatting issues in the code that violate PEP 8 conventions according to the major categories provided. Firstly, there should be two blank lines after the import statement, not one. Secondly, the return statement in the function 'find' is too long and should be split into multiple lines. Additionally, a blank line contains whitespace and should be removed. The second line of the print statement is also under-indented. Finally, the closing bracket of the variable assignment for 'updated_offset' does not match the indentation of the opening bracket's line.

Identified formatting issues:
- E302: expected 2 blank lines, found 1
- E501: line too long
- W293: blank line contains whitespace
- E128: continuation line under-indented for visual indent
- E123: closing bracket does not match indentation of opening bracket's line

Now, we fix these issues in the code. For E302, we add an extra blank line after the import statement. For E501, we split the return statement into two lines. For W293, we remove the whitespace in the blank line. For E128, we indent the second line of the print statement so that 'count' is aligned with 'new_prompt' in the line above. For E123, we remove indentation from the closing bracket of the variable assignment for 'updated_offset' so it is indented at the same level as the first line.

Formatted code:
```python
{shots_output[3]}
```"""

# Instruction for the voting step over candidate issue lists. This is a plain
# (non-f) string, so '{s}' is sent to the model literally as a placeholder.
identify_vote_prompt = """Given an original task and multiple choices, choose the best answer for the original task. Analyze each choice in detail, then conclude in the last line 'The best choice is {s}', where s is the integer id of the choice. If all choices are equally good, return the smallest id. If no choice is good, return 0.
Original task: Identify formatting issues in the code delimited by triple backticks according to PEP 8 conventions.
Formatting issues fall under the following major categories:
E1: Indentation
E2: Whitespace
E3: Blank line
E4: Import
E5: Line length
E7: Statement
W1: Indentation warning
W2: Whitespace warning
W5: Line break warning"""

# Worked voting example, user turn: the first shot plus three candidate issue
# lists, laid out in the same 'Code:' / 'Choice N:' shape that
# get_identify_vote_msgs() produces.
identify_vote_ex1_input = f"""Code:
```python
{shots_input[0]}
```

Choice 1:
- E401: multiple imports on one line
- W191: indentation contains tabs
- E305: expected 2 blank lines after class or function definition, found 1
Choice 2:
- E226: missing whitespace around arithmetic operator
- W504: line break after binary operator
- E703: statement ends with a semicolon
Choice 3:
- W391: blank line at end of file"""

# Worked voting example, assistant turn: per-choice analysis ending with the
# verdict line in the format requested by identify_vote_prompt.
identify_vote_ex1_output = """Choice 1 is the best choice. It correctly identifies each formatting issue according to the major categories provided. Each issue identified is present in the code. No issues are missing and no extra issues are identified.
Choice 2 is not the best choice. None of the issues identified are actually present in the code and none of the issues present in the code are correctly identified.
Choice 3 is not the best choice. W391 does not fall under any of the major categories provided.

The best choice is 1."""

# Instruction for the voting step over candidate rewritten code. This is a
# plain (non-f) string, so '{s}' is sent to the model literally as a placeholder.
code_vote_prompt = """Given an original task and multiple choices, choose the best answer for the original task. Analyze each choice in detail, then conclude in the last line 'The best choice is {s}', where s is the integer id of the choice. If all choices are equally good, return the smallest id. If no choice is good, return 0.
Original task: Fix the following formatting issues in the code delimited by triple backticks. Do not add, remove, or change anything else. Output the formatted code with the identified issues rectified."""

# Worked voting example, user turn: the issues to fix, the first shot, and three
# candidates (unchanged input, the reference output, and a hand-written partial
# fix that still has both imports on one line). Note the string ends with a
# trailing newline.
code_vote_ex1_input = f"""Formatting issues to fix:
- E401: multiple imports on one line
- W191: indentation contains tabs
- E305: expected 2 blank lines after class or function definition, found 1

Code:
```python
{shots_input[0]}
```

Choice 1:
```python
{shots_input[0]}
```
Choice 2:
```python
{shots_output[0]}
```
Choice 3:
```python
import utils, rain


def evaluate(x, y):
    return rain.proccess(x, y)


utils.print(evaluate(3, 4))
```
"""

# Worked voting example, assistant turn: per-choice analysis ending with the
# verdict line in the format requested by code_vote_prompt.
code_vote_ex1_output = """Choice 1 is incorrect. It does not change any of the issues identified and simply outputs the original code.
Choice 2 is correct. It correctly fixes each and every formatting issue identified without changing anything else.
Choice 3 is incorrect. While it fixes W191 and E305, it does not fix E401.

The best choice is 2."""

def get_cot_prompt(cell_src):
    """Build the few-shot chain-of-thought conversation for one code cell.

    The conversation starts with the task instruction, continues with the
    four worked examples as alternating user/assistant turns, and ends with
    ``cell_src`` wrapped in a python code fence as the final user turn.

    Args:
        cell_src: Source code of the cell to be formatted.

    Returns:
        A list of ``{"role": ..., "content": ...}`` message dicts.
    """
    few_shot_pairs = (
        (ex1_input, ex1_output),
        (ex2_input, ex2_output),
        (ex3_input, ex3_output),
        (ex4_input, ex4_output),
    )

    messages = [{"role": "user", "content": main_prompt}]
    for shot_question, shot_answer in few_shot_pairs:
        messages.append({"role": "user", "content": shot_question})
        messages.append({"role": "assistant", "content": shot_answer})
    messages.append({"role": "user", "content": f"```python\n{cell_src}\n```"})
    return messages

def get_identified_names(identify_trials, identify_completions):
    identified_names = []
    for i in range(identify_trials):
        if identify_completions.choices[i].finish_reason == 'stop':
            try:
                issues = identify_completions.choices[i]['message']['content'].split('Identified formatting issues:')[1].strip("\n").split("\n")[0]
            except:
                print("unexpected format for issue", identify_completions.choices[i]['message']['content'])
                issues = None
        else:
            issues = None
        identified_names.append(issues)
    return identified_names

def get_identify_vote_msgs(cell_src):
    """Create a message builder for voting on candidate issue lists.

    Args:
        cell_src: Source code of the cell the candidates refer to.

    Returns:
        A callable taking ``choices`` (the candidate issue lists) and returning
        the chat messages for the vote: the voting instruction, one worked
        example, and a final user message containing ``cell_src`` followed by
        the candidates numbered from 1.
    """
    def build_messages(choices):
        code_section = f"Code:\n```python\n{cell_src}\n```\n\n"
        numbered_choices = "".join(
            f"Choice {number}:\n{candidate}\n"
            for number, candidate in enumerate(choices, start=1)
        )
        return [
            {"role": "user", "content": identify_vote_prompt},
            {"role": "user", "content": identify_vote_ex1_input},
            {"role": "assistant", "content": identify_vote_ex1_output},
            {"role": "user", "content": code_section + numbered_choices},
        ]

    return build_messages

def get_code_vote_msgs(cell_src):
    """Create a message builder for voting on candidate rewritten code.

    Args:
        cell_src: Source code of the cell the candidates were derived from.

    Returns:
        A callable taking ``new_issues`` (the issues to fix) and ``choices``
        (the candidate rewrites) and returning the chat messages for the vote:
        the voting instruction, one worked example, and a final user message
        listing the issues, ``cell_src``, and the candidates numbered from 1.
    """
    def build_messages(new_issues, choices):
        task_section = (
            f"Formatting issues to fix:\n{new_issues}\n\n"
            f"Code:\n```python\n{cell_src}\n```\n\n"
        )
        numbered_choices = "".join(
            f"Choice {number}:\n{candidate}\n"
            for number, candidate in enumerate(choices, start=1)
        )
        return [
            {"role": "user", "content": code_vote_prompt},
            {"role": "user", "content": code_vote_ex1_input},
            {"role": "assistant", "content": code_vote_ex1_output},
            {"role": "user", "content": task_section + numbered_choices},
        ]

    return build_messages

In [None]:
# GPT Tree of Thought
import sys
sys.path.append('../../../')  # make the tree_of_thought module importable
from tree_of_thought import solve_toc

# Sampling budgets handed positionally to solve_toc; presumably the number of
# candidates / votes per stage — confirm against tree_of_thought.
identify_trials = 5
code_trials = 3
identify_vote_trials = 6
code_vote_trials = 4
# NOTE(review): the prompts in this notebook use the heading 'Formatted code',
# not 'Updated code'. If solve_toc treats this as a stop sequence it may never
# match here — confirm against tree_of_thought before changing.
identify_stop = "Updated code"

# identify formatting issues and reformat every sample cell using GPT
gpt_results = []
for i in range(NUM_FILES):
    # Per-file inputs: the few-shot conversation and the two vote-message
    # builders, each bound to this file's source.
    input_msgs_cot = get_cot_prompt(random_cells[i])
    get_identified_names_func = get_identified_names
    get_identify_votes_msgs_func = get_identify_vote_msgs(random_cells[i])
    get_code_votes_msgs_func = get_code_vote_msgs(random_cells[i])

    print(f'Processing file {i}')
    identified, updated_code = solve_toc(input_msgs_cot, identify_trials, code_trials, identify_vote_trials, code_vote_trials, identify_stop, get_identified_names_func, get_identify_votes_msgs_func, get_code_votes_msgs_func)
    print(f'File {i} - {identified}')
    gpt_results.append({'identified': identified, 'updated_code': updated_code})

# save the results to a file (written as the str() of the list, i.e. Python
# literal syntax rather than JSON)
with open(GPT_SAVED_FILE_NAME, 'w') as f:
    f.write(str(gpt_results))

In [None]:
# read in gpt result from file
import ast

with open(GPT_SAVED_FILE_NAME, 'r') as f:
    # The file holds the str() of a list of dicts, i.e. a Python literal.
    # literal_eval parses it without executing arbitrary code, unlike eval().
    gpt_results = ast.literal_eval(f.read())

In [None]:
# Split the per-file result dicts into two parallel lists: the identified
# issues and the rewritten code, both indexed by file number.
gpt_issues = [entry['identified'] for entry in gpt_results]
gpt_new_code = [entry['updated_code'] for entry in gpt_results]

In [None]:
# Write one '{i}.py' per sample into GPT_SAVED_FOLDER_NAME. When the model
# produced no issues or no code for a file, fall back to the original source so
# the before/after comparison still covers every file.
import os

os.makedirs(GPT_SAVED_FOLDER_NAME, exist_ok=True)
for i, code in enumerate(gpt_new_code):
    with open(f'{GPT_SAVED_FOLDER_NAME}/{i}.py', 'w') as f:
        # gpt_issues is the list built from the 'identified' entries; the
        # previous name used here (gpt_changes) is never defined in this file.
        if code is None or gpt_issues[i] is None:
            f.write(random_cells[i])
        else:
            f.write(code)

In [None]:
# Import sys here as well: the only other import of sys lives in the GPT run
# cell, which is skipped when results are reloaded from the saved file.
import sys

sys.path.append('../')
from common import pycodestyle, group_by_error, print_num_reductions, print_percentage_difference, IGNORE_TYPES

In [None]:
# Run the project's pycodestyle helper over the input samples in FOLDER_NAME
# and keep the resulting counts (a mapping; its values are summed below).
error_counts_before = pycodestyle(FOLDER_NAME, NUM_FILES, IGNORE_TYPES)

# Total number of reported errors before reformatting.
total_errors_before = sum(error_counts_before.values())
print(f'Total before: {total_errors_before}')

In [None]:
# Run the same pycodestyle helper over the reformatted files in
# GPT_SAVED_FOLDER_NAME and keep the resulting counts.
error_counts_after = pycodestyle(GPT_SAVED_FOLDER_NAME, NUM_FILES, IGNORE_TYPES)

# Total number of reported errors after reformatting.
total_errors_after = sum(error_counts_after.values())
print(f'Total after: {total_errors_after}')

In [None]:
# Notebook display: error counts measured on the input samples.
error_counts_before

In [None]:
# Notebook display: error counts measured on the reformatted files.
error_counts_after

In [None]:
# Relative change in the total error count (negative means fewer errors after
# reformatting).
if total_errors_before:
    total_pct_diff = (total_errors_after - total_errors_before) / total_errors_before * 100
    print(f'Total percentage difference: {total_pct_diff}%')
else:
    # With no errors in the baseline the relative change is undefined; report
    # that instead of raising ZeroDivisionError.
    print('Total percentage difference: n/a (no errors before reformatting)')

In [None]:
# Aggregate the "before" counts with the project helper group_by_error (the
# grouping key is defined in common, not visible here) and display the result.
error_counts_grouped_before = group_by_error(error_counts_before)
error_counts_grouped_before

In [None]:
# Aggregate the "after" counts with the same project helper and display the result.
error_counts_grouped_after = group_by_error(error_counts_after)
error_counts_grouped_after

In [None]:
# Report the before/after comparison of the grouped counts via the project
# helper (presumably absolute reductions per group — confirm in common).
print_num_reductions(error_counts_grouped_before, error_counts_grouped_after)

In [None]:
# Report the before/after comparison of the grouped counts via the project
# helper (presumably percentage change per group — confirm in common).
print_percentage_difference(error_counts_grouped_before, error_counts_grouped_after)