In [1]:
"""
Shot prompt
4-shot
"""

'\nShot prompt\n4-shot\n'

In [2]:
import sys
sys.path.append('../../')
from run_process import get_unused_data
sys.path.append('../../../')
from utils import stats_results_unused

In [4]:
# define variables
NUM_FILES = 305
SAMPLES_FOLDER_NAME = '../random_samples_variables'
GPT_SAVED_FILE_NAME = 'unused_variables_gpt'

In [5]:
# read in files from folder random_cells
random_cells = []

for i in range(NUM_FILES):
    file_name = f'{SAMPLES_FOLDER_NAME}/{i}.py'
    with open(file_name, 'r') as f:
        random_cells.append(f.read())

In [6]:
task = "A variable is unused if it is assigned using an equals operator but is not referenced after being defined. Identify and remove unused variables in the code delimited by triple backticks. For unused variables in unpacking, change these to underscores. Ignore unused functions, imports, or classes. Output under the headings 'Unused variables' and 'Updated code'"

ex1_input = """```python
time = 5
d = 5
print(d)

def calc():
    a = 5
    b = 4
return a + b
```"""

ex1_output = """Unused variables:
['time']

Updated code:
```python
d = 5
print(d)

def calc():
    a = 5
    b = 4
    return a + b
```"""

ex2_input = """```python
val1 = 'hello'
val2 = 'world'
```"""

ex2_output = """Unused variables:
['val1', 'val2']

Updated code:
```python
```"""

ex3_input = """```python
z = [1, 2, 3]
a, b, c = z
print(a + b)
```"""

ex3_output = """Unused variables:
['c']

Updated code:
```python
z = [1, 2, 3]
a, b, _ = z
print(a + b)
```"""

ex4_input = """"```python
def associate():
    return center.path()

path = associate()
```"""

ex4_output = """Unused variables:
['path']

Updated code:
```python
def associate():
    return center.path()
```"""

In [7]:
# Estimate cost
import sys
sys.path.append("../../..")
import utils

def estimate_tokens():
    in_tok = ''
    out_tok = ''
    for i, cell_src in enumerate(random_cells):
        # estimate prompt
        in_tok += task + ex1_input + ex1_output + ex2_input + ex2_output + ex3_input + ex3_output + ex4_input + ex4_output
        in_tok += f"```python\n{cell_src}\n```"
        # estimate response
        out_tok += cell_src
    return in_tok, out_tok

in_tok, out_tok = estimate_tokens()

utils.gpt_35_turbo_token_dollar_cost(in_tok, out_tok)

0.29550950000000004

In [8]:
# # identify unused
# import openai
# openai.api_key = my_key

# # GPT
# def identify_remove_unused(cell_src):
#     while True:
#         try:
#             completion = openai.ChatCompletion.create(
#                 model="gpt-3.5-turbo",
#                 temperature=0,
#                 messages = [
#                 {"role": "user", "content": task},
#                 {"role": "user", "content": ex1_input},
#                 {"role": "assistant", "content": ex1_output},
#                 {"role": "user", "content": ex2_input},
#                 {"role": "assistant", "content": ex2_output},
#                 {"role": "user", "content": ex3_input},
#                 {"role": "assistant", "content": ex3_output},
#                 {"role": "user", "content": ex4_input},
#                 {"role": "assistant", "content": ex4_output},
#                 {"role": "user", "content": f"```python\n{cell_src}\n```"}
#             ]
#             )
#         except Exception as e:
#             if 'maximum context length' in str(e):
#                 print('...Error.. too long...' + str(e))
#                 return 'length', None
#             else:
#                 print('...Error.. trying again...' + str(e))
#         else:
#             break
#     return completion.choices[0].finish_reason, completion.choices[0].message["content"]

# gpt_results = []
# for i, cell_src in enumerate(random_cells):
#     print(f'Processing file {i}')
#     finish_reason, result = identify_remove_unused(cell_src)
#     print(f'File {i} - {finish_reason}')
#     gpt_results.append({'reason': finish_reason, 'result': result})

# # save the results to a file
# with open(GPT_SAVED_FILE_NAME, 'w') as f:
#     f.write(str(gpt_results))

In [9]:
# read in gpt result from file
with open(GPT_SAVED_FILE_NAME, 'r') as f:
    gpt_results = eval(f.read())

In [10]:
# now split the data into files
gpt_unused_names = []
gpt_updated_code = []

for i, result in enumerate(gpt_results):
    if result['reason'] == 'stop':
        # split the result into unused names and updated code
        result_split = result['result'].split('Updated code:')
        # get the unused names
        unused_names = eval(result_split[0].split('Unused variables:')[1].strip("\n"))
        # get the updated code
        updated_code = result_split[1].split('```')[1]
        if updated_code.startswith('python'):
            updated_code = updated_code[6:]
        updated_code = updated_code.strip('\n')
        # store
        if unused_names == []:
            unused_names = None
        gpt_unused_names.append(unused_names)
        gpt_updated_code.append(updated_code)
    else:
        # if we error we assume no unused functions
        gpt_unused_names.append(None)
        gpt_updated_code.append(None)

In [11]:
# save the updated code to files
import os
if not os.path.exists('gpt_code'):
    os.makedirs('gpt_code')

failed = 0
for i, code in enumerate(gpt_updated_code):
    with open(f'gpt_code/{i}.py', 'w') as f:
        if gpt_unused_names[i] is None or gpt_updated_code[i] is None:
            failed += 1
            f.write(random_cells[i])
        else:
            f.write(code)

print(f'Failed {failed} times')

Failed 2 times


In [12]:
before = get_unused_data(NUM_FILES, SAMPLES_FOLDER_NAME, 'variable')

total_before = sum(len(item) for item in before)
print(f'Total before: {total_before}')

Total before: 586


In [13]:
after = get_unused_data(NUM_FILES, 'gpt_code', 'variable')

total_after = sum(len(item) for item in after)
print(f'Total after: {total_after}')

Total after: 157


In [14]:
# List percentage difference between before and after for total
print(f'Total percentage difference: {(total_after - total_before) / total_before * 100}%')

Total percentage difference: -73.20819112627987%


In [15]:
stats_results_unused(gpt_unused_names, before)

GPT before count: 526
Vulture before count: 586
------------
True positives: 446
False positives: 80
False negatives: 119
------------
Files with at least one false positive (and no false negatives)
2: 2 false positives
28: 4 false positives
53: 1 false positives
61: 1 false positives
97: 5 false positives
109: 1 false positives
121: 2 false positives
123: 2 false positives
135: 1 false positives
151: 1 false positives
152: 3 false positives
158: 4 false positives
179: 1 false positives
195: 1 false positives
268: 1 false positives
289: 1 false positives
------------
Files with at least one false negative (and no false positives)
4: 1 false negatives
6: 6 false negatives
12: 2 false negatives
15: 2 false negatives
17: 1 false negatives
23: 1 false negatives
29: 1 false negatives
30: 1 false negatives
32: 1 false negatives
38: 1 false negatives
44: 1 false negatives
46: 2 false negatives
51: 2 false negatives
54: 1 false negatives
83: 1 false negatives
89: 1 false negatives
92: 1 false 

In [32]:
print(random_cells[6])

reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300 # 재사용
n_hidden2 = 50  # 재사용
n_hidden3 = 50  # 재사용
n_hidden4 = 20  # 새로 만듦!
n_outputs = 10  # 새로 만듦!

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")


In [33]:
print(gpt_updated_code[6])

reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300 # 재사용
n_hidden2 = 50  # 재사용
n_hidden3 = 50  # 재사용
n_hidden4 = 20  # 새로 만듦!

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")


In [34]:
before[6]

['n_hidden1', 'n_hidden2', 'n_hidden3', 'n_hidden4', 'n_outputs', 'X', 'y']

In [35]:
after[6]

['n_hidden1', 'n_hidden2', 'n_hidden3', 'n_hidden4', 'X', 'y']

In [36]:
gpt_unused_names[6]

['n_outputs']

In [21]:
# delete 'gpt_code' folder
!rm -rf gpt_code