In [1]:
import sys
sys.path.append('../../')
from utils import print_check_gpt_results

In [2]:
# Number of samples to process; samples are read as <FOLDER_NAME>/<i>.py
# for i in range(NUM_FILES).
NUM_FILES = 20
# Folder that holds the input samples.
FOLDER_NAME = '../random_samples_formatting'
# File the raw stage 1 GPT responses are saved to; stage 2 saves its
# responses to this name with "_code" appended.
GPT_SAVED_FILE_NAME = 'formatted_code_gpt'
# Folder the GPT-reformatted samples are written to.
GPT_SAVED_FOLDER_NAME = 'reformatted_gpt'

In [3]:
# Load every sample (<FOLDER_NAME>/0.py .. <FOLDER_NAME>/<NUM_FILES - 1>.py)
# as text, in index order.
def _read_sample(index):
    """Return the full text of the sample file with the given index."""
    with open(f'{FOLDER_NAME}/{index}.py', 'r') as sample:
        return sample.read()


random_cells = [_read_sample(index) for index in range(NUM_FILES)]

In [4]:
import os

# Read the OpenAI API key from the environment instead of hardcoding it:
# a key stored in the notebook is exposed to everyone who can read the file.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and revoked.
my_key = os.environ.get("OPENAI_API_KEY", "")
if not my_key:
    print("WARNING: OPENAI_API_KEY is not set; OpenAI requests will fail.")

In [5]:
# Few-shot examples: ../shots/shot<n>.py is the example input and
# ../shots/shot<n>_after.py the matching example output, for n = 1..4.
shots_input, shots_output = [], []
for shot_no in range(1, 5):
    before_path = f'../shots/shot{shot_no}.py'
    after_path = f'../shots/shot{shot_no}_after.py'
    with open(before_path, 'r') as fh:
        shots_input.append(fh.read())
    with open(after_path, 'r') as fh:
        shots_output.append(fh.read())

# Stage 1 task description: ask the model to list the PEP 8 issues in a sample.
t1task = """Identify formatting issues in the code delimited by triple backticks according to PEP 8 conventions.
Formatting issues fall under the following major categories:
E1: Indentation
E2: Whitespace
E3: Blank line
E4: Import
E5: Line length
E7: Statement
W1: Indentation warning
W2: Whitespace warning
W3: Blank line warning
W5: Line break warning"""


def _as_python_block(code):
    """Wrap ``code`` in a python code fence, the shape of every stage 1 example input."""
    return f"```python\n{code}\n```"


def _issue_report(*issues):
    """Render the example answer: a header line followed by one "- <issue>" line per issue."""
    return "\n".join(["Identified formatting issues:"] + [f"- {issue}" for issue in issues])


# Few-shot pairs: each input is one of the loaded shot files, each output the
# list of issues the model is expected to report for it.
t1ex1_input = _as_python_block(shots_input[0])
t1ex1_output = _issue_report(
    "E401: multiple imports on one line",
    "W191: indentation contains tabs",
    "E305: expected 2 blank lines after class or function definition, found 1",
)

t1ex2_input = _as_python_block(shots_input[1])
t1ex2_output = _issue_report(
    "E226: missing whitespace around arithmetic operator",
    "W504: line break after binary operator",
    "E703: statement ends with a semicolon",
)

t1ex3_input = _as_python_block(shots_input[2])
t1ex3_output = _issue_report(
    "W293: blank line contains whitespace",
    "E501: line too long",
    "E111: indentation is not a multiple of 4",
    "W391: blank line at end of file",
)

t1ex4_input = _as_python_block(shots_input[3])
t1ex4_output = _issue_report(
    "E302: expected 2 blank lines, found 1",
    "E501: line too long",
    "W293: blank line contains whitespace",
    "E128: continuation line under-indented for visual indent",
    "E123: closing bracket does not match indentation of opening bracket's line",
)

In [6]:
# identify using GPT
import openai
openai.api_key = my_key

# GPT
def identify_issues(cell_src, max_retries=5, retry_delay=2.0):
    """Ask gpt-3.5-turbo to list the PEP 8 formatting issues in ``cell_src``.

    The request consists of the task description, the four few-shot
    example pairs and finally ``cell_src`` wrapped in a python code fence.

    Args:
        cell_src: Source code to analyse.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Base wait in seconds between attempts; the wait grows
            linearly with the attempt number.

    Returns:
        A ``(finish_reason, content)`` tuple taken from the first choice of
        the completion. ``('length', '')`` is returned when the error text
        mentions the maximum context length, and ``('error', '')`` when
        every attempt failed for another reason.
    """
    import time  # local import keeps this cell self-contained

    # The prompt is identical for every attempt, so build it once.
    messages = [
        {"role": "user", "content": t1task},
        {"role": "user", "content": t1ex1_input},
        {"role": "assistant", "content": t1ex1_output},
        {"role": "user", "content": t1ex2_input},
        {"role": "assistant", "content": t1ex2_output},
        {"role": "user", "content": t1ex3_input},
        {"role": "assistant", "content": t1ex3_output},
        {"role": "user", "content": t1ex4_input},
        {"role": "assistant", "content": t1ex4_output},
        {"role": "user", "content": f"```python\n{cell_src}\n```"},
    ]

    for attempt in range(1, max_retries + 1):
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                temperature=0,
                messages=messages,
            )
        except Exception as e:
            if 'maximum context length' in str(e):
                # Retrying cannot help: the prompt itself is too long.
                print('...Error.. too long...' + str(e))
                return 'length', ''
            if attempt == max_retries:
                print('...Error.. giving up...' + str(e))
                break
            print('...Error.. trying again...' + str(e))
            # Wait before retrying so a persistent failure (bad key, outage,
            # rate limit) does not turn into a tight, endless request loop.
            time.sleep(retry_delay * attempt)
        else:
            choice = completion.choices[0]
            return choice.finish_reason, choice.message["content"]
    return 'error', ''

# Stage 1: run issue identification on every sample, keeping the finish
# reason and the raw reply for each one.
gpt_results = []
for i, cell_src in enumerate(random_cells):
    print(f'Processing file {i}')
    reason, reply = identify_issues(cell_src)
    print(f'File {i} - {reason}')
    gpt_results.append({'reason': reason, 'result': reply})

# Persist the results as the textual repr of the list so they can be
# reloaded without calling the API again.
with open(GPT_SAVED_FILE_NAME, 'w') as out_file:
    out_file.write(repr(gpt_results))

Processing file 0
File 0 - stop
Processing file 1
File 1 - stop
Processing file 2
File 2 - stop
Processing file 3
File 3 - stop
Processing file 4
File 4 - stop
Processing file 5
File 5 - stop
Processing file 6
File 6 - stop
Processing file 7
File 7 - stop
Processing file 8
File 8 - stop
Processing file 9
File 9 - stop
Processing file 10
File 10 - stop
Processing file 11
File 11 - stop
Processing file 12
File 12 - stop
Processing file 13
File 13 - stop
Processing file 14
File 14 - stop
Processing file 15
File 15 - stop
Processing file 16
File 16 - stop
Processing file 17
File 17 - stop
Processing file 18
File 18 - stop
Processing file 19
File 19 - stop


In [7]:
import ast

# read in gpt result from file
# The file holds the repr of a list of dicts of plain literals.
# ast.literal_eval only parses Python literals, so unlike eval() it cannot
# execute code if the file content has been modified.
with open(GPT_SAVED_FILE_NAME, 'r') as f:
    gpt_results = ast.literal_eval(f.read())

In [8]:
# checking finish reason of the stage 1 (issue identification) responses
failed_ids_stage1 = print_check_gpt_results(gpt_results)
# Copy instead of aliasing: failed_ids is extended later in the notebook, and
# sharing one list object would also change failed_ids_stage1.
failed_ids = list(failed_ids_stage1)
print("Failed ids: ", failed_ids_stage1)
print("Failed count: ", len(failed_ids_stage1))

stop: 20
Failed ids:  []
Failed count:  0


In [12]:
def _issues_from_reply(entry):
    """Return the issue list text from one stage 1 result entry.

    Gives None when the entry's finish reason is not 'stop', the text after
    the 'Identified formatting issues:' header when the header is present,
    and 'No issues identified' otherwise.
    """
    if entry['reason'] != 'stop':
        return None
    pieces = entry['result'].split('Identified formatting issues:')
    if len(pieces) > 1:
        return pieces[1].strip("\n")
    return 'No issues identified'


# One entry per sample: the issues reported by the model in stage 1
gpt_changes = [_issues_from_reply(entry) for entry in gpt_results]

In [None]:
# Stage 2 task description: fix exactly the issues listed, nothing else.
t2task = (
    "Fix the following formatting issues in the code delimited by triple backticks. "
    "Do not add, remove, or change anything else. "
    "Output the formatted code with the identified issues rectified."
)


def _fix_request(code, *issues):
    """Build an example request: the issue list followed by the fenced code."""
    issue_lines = "\n".join(f"- {issue}" for issue in issues)
    return f"Formatting issues to fix:\n{issue_lines}\n\nCode:\n```python\n{code}\n```"


def _fenced_answer(code):
    """Build an example answer: the code wrapped in a python code fence."""
    return f"```python\n{code}\n```"


# Few-shot pairs: each request lists the issues for one shot file, and each
# answer is the corresponding "_after" file.
t2ex1_input = _fix_request(
    shots_input[0],
    "E401: multiple imports on one line",
    "W191: indentation contains tabs",
    "E305: expected 2 blank lines after class or function definition, found 1",
)
t2ex1_output = _fenced_answer(shots_output[0])

t2ex2_input = _fix_request(
    shots_input[1],
    "E226: missing whitespace around arithmetic operator",
    "W504: line break after binary operator",
    "E703: statement ends with a semicolon",
)
t2ex2_output = _fenced_answer(shots_output[1])

t2ex3_input = _fix_request(
    shots_input[2],
    "W293: blank line contains whitespace",
    "E501: line too long",
    "E111: indentation is not a multiple of 4",
    "W391: blank line at end of file",
)
t2ex3_output = _fenced_answer(shots_output[2])

t2ex4_input = _fix_request(
    shots_input[3],
    "E302: expected 2 blank lines, found 1",
    "E501: line too long",
    "W293: blank line contains whitespace",
    "E128: continuation line under-indented for visual indent",
    "E123: closing bracket does not match indentation of opening bracket's line",
)
t2ex4_output = _fenced_answer(shots_output[3])

In [None]:
# fix the identified formatting issues using GPT
import openai
openai.api_key = my_key

# GPT
def format_code(issues, cell_src, max_retries=5, retry_delay=2.0):
    """Ask gpt-3.5-turbo to fix the listed formatting issues in ``cell_src``.

    The request consists of the task description, the four few-shot
    example pairs and finally the issue list plus ``cell_src`` wrapped in a
    python code fence.

    Args:
        issues: Text listing the formatting issues to fix.
        cell_src: Source code to reformat.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Base wait in seconds between attempts; the wait grows
            linearly with the attempt number.

    Returns:
        A ``(finish_reason, content)`` tuple taken from the first choice of
        the completion. ``('length', '')`` is returned when the error text
        mentions the maximum context length, and ``('error', '')`` when
        every attempt failed for another reason.
    """
    import time  # local import keeps this cell self-contained

    # The prompt is identical for every attempt, so build it once.
    messages = [
        {"role": "user", "content": t2task},
        {"role": "user", "content": t2ex1_input},
        {"role": "assistant", "content": t2ex1_output},
        {"role": "user", "content": t2ex2_input},
        {"role": "assistant", "content": t2ex2_output},
        {"role": "user", "content": t2ex3_input},
        {"role": "assistant", "content": t2ex3_output},
        {"role": "user", "content": t2ex4_input},
        {"role": "assistant", "content": t2ex4_output},
        {"role": "user", "content": f"Formatting issues to fix:\n{issues}\n\nCode:\n```python\n{cell_src}\n```"},
    ]

    for attempt in range(1, max_retries + 1):
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                temperature=0,
                messages=messages,
            )
        except Exception as e:
            if 'maximum context length' in str(e):
                # Retrying cannot help: the prompt itself is too long.
                print('...Error.. too long...' + str(e))
                return 'length', ''
            if attempt == max_retries:
                print('...Error.. giving up...' + str(e))
                break
            print('...Error.. trying again...' + str(e))
            # Wait before retrying so a persistent failure (bad key, outage,
            # rate limit) does not turn into a tight, endless request loop.
            time.sleep(retry_delay * attempt)
        else:
            choice = completion.choices[0]
            return choice.finish_reason, choice.message["content"]
    return 'error', ''

# Stage 2: ask the model to fix, for every sample, the issues that stage 1
# reported for it.
gpt_results_code = []
for i, cell_src in enumerate(random_cells):
    # skip cells that failed before
    if i in failed_ids:
        print(f'Skipping file {i} -- failed in task 1')
        gpt_results_code.append({'reason': 'skipped', 'result': None})
        continue
    print(f'Processing file {i}')
    # format_code takes the issue list as its first argument; gpt_changes
    # holds the stage 1 issue list for each sample at the same index.
    finish_reason, result = format_code(gpt_changes[i], cell_src)
    print(f'File {i} - {finish_reason}')
    gpt_results_code.append({'reason': finish_reason, 'result': result})

# save the results to a file
with open(GPT_SAVED_FILE_NAME + "_code", 'w') as f:
    f.write(str(gpt_results_code))

In [None]:
import ast

# read in gpt result from file
# The file holds the repr of a list of dicts of plain literals (strings and
# None). ast.literal_eval only parses Python literals, so unlike eval() it
# cannot execute code if the file content has been modified.
with open(GPT_SAVED_FILE_NAME + "_code", 'r') as f:
    gpt_results_code = ast.literal_eval(f.read())

In [None]:
# checking finish reason of the stage 2 (code formatting) responses
# This must inspect gpt_results_code; gpt_results holds the stage 1 replies.
failed_ids_stage2 = print_check_gpt_results(gpt_results_code)
# Rebuild the combined list from both stages instead of extending it in
# place: re-running this cell then gives the same result, failed_ids_stage1
# is never modified through a shared list object, and an id reported by both
# stages is counted once.
# NOTE(review): assumes the ids are sortable values such as ints - confirm.
failed_ids = sorted(set(failed_ids_stage1) | set(failed_ids_stage2))
print("Failed ids: ", failed_ids_stage2)
print("Failed count: ", len(failed_ids_stage2))

In [None]:
# Number of ids collected in failed_ids across step 1 and step 2
total_failed = len(failed_ids)
print("Total failed step 1 and step 2: ", total_failed)

In [None]:
sys.path.append('../')
from common import pycodestyle, group_by_error, print_num_reductions, print_percentage_difference, IGNORE_TYPES

In [None]:
def _extract_code(reply):
    """Return the code contained in the first ``` fence of a GPT reply.

    A leading "python" language tag on the fence line is dropped. When the
    reply contains no fence at all, the whole reply is treated as code.
    Leading and trailing newlines are stripped in both cases.
    """
    parts = reply.split("```")
    if len(parts) < 2:
        return reply.strip("\n")
    fenced = parts[1]
    # Only drop "python" when it is the fence's own first line, so code that
    # merely starts with the letters "python" is left intact.
    first_line, newline, rest = fenced.partition("\n")
    if newline and first_line.strip() == 'python':
        fenced = rest
    return fenced.strip("\n")


# One entry per sample: the reformatted code, or None for failed samples.
# NOTE(review): trailing newlines are stripped, so files written from this
# text will not end with a newline (pycodestyle W292) - confirm this is
# intended for the comparison.
gpt_formatted_code = []

for i, result in enumerate(gpt_results_code):
    if i in failed_ids:
        gpt_formatted_code.append(None)
        continue
    gpt_formatted_code.append(_extract_code(result['result']))

In [None]:
# Write each reformatted sample to <GPT_SAVED_FOLDER_NAME>/<index>.py,
# creating the folder first when it does not exist yet.
import os

if not os.path.exists(GPT_SAVED_FOLDER_NAME):
    os.makedirs(GPT_SAVED_FOLDER_NAME)

for i, code in enumerate(gpt_formatted_code):
    # Failed samples have no reformatted code to write.
    if i not in failed_ids:
        with open(f'{GPT_SAVED_FOLDER_NAME}/{i}.py', 'w') as f:
            f.write(code)

In [None]:
# Error counts for the original samples, as returned by the pycodestyle
# helper (failed_ids is passed along, presumably to exclude those samples -
# TODO confirm against the helper).
error_counts_before = pycodestyle(FOLDER_NAME, NUM_FILES, IGNORE_TYPES, failed_ids)

# Sum of all counts, printed as the overall total
total_errors_before = sum(count for count in error_counts_before.values())
print(f'Total before: {total_errors_before}')

In [None]:
# Error counts for the GPT-reformatted samples, as returned by the
# pycodestyle helper (failed_ids is passed along, presumably to exclude those
# samples - TODO confirm against the helper).
error_counts_after = pycodestyle(GPT_SAVED_FOLDER_NAME, NUM_FILES, IGNORE_TYPES, failed_ids)

# Sum of all counts, printed as the overall total
total_errors_after = sum(count for count in error_counts_after.values())
print(f'Total after: {total_errors_after}')

In [None]:
# Display the counts collected from the original samples.
error_counts_before

In [None]:
# Display the counts collected from the GPT-reformatted samples.
error_counts_after

In [None]:
# List percentage difference between before and after for total
# Guard the division: with no errors before formatting the relative change
# is undefined and the expression would raise ZeroDivisionError.
if total_errors_before:
    print(f'Total percentage difference: {(total_errors_after - total_errors_before) / total_errors_before * 100}%')
else:
    print('Total percentage difference: n/a (no errors before formatting)')

In [None]:
# Group the "before" counts with the group_by_error helper from common
# (presumably by error category - TODO confirm) and display the result.
error_counts_grouped_before = group_by_error(error_counts_before)
error_counts_grouped_before

In [None]:
# Group the "after" counts with the group_by_error helper from common
# (presumably by error category - TODO confirm) and display the result.
error_counts_grouped_after = group_by_error(error_counts_after)
error_counts_grouped_after

In [None]:
# Compare the grouped counts before and after reformatting using the
# print_num_reductions helper from common.
print_num_reductions(error_counts_grouped_before, error_counts_grouped_after)

In [None]:
# Compare the grouped counts before and after reformatting using the
# print_percentage_difference helper from common.
print_percentage_difference(error_counts_grouped_before, error_counts_grouped_after)