In [None]:
"""
Few-shot and decomposed prompting: decompose the rename into two steps - suggest a new variable name, then apply it to the code
"""

In [1]:
# Read the OpenAI API key from the environment instead of committing it to the notebook.
# NOTE(review): the key that used to be hard-coded on this line has been exposed and should be revoked.
import os
import warnings

my_key = os.environ.get("OPENAI_API_KEY")
if not my_key:
    # Do not abort here so the non-API cells can still be run; API calls will fail until the key is set.
    warnings.warn("OPENAI_API_KEY is not set; OpenAI requests will fail until it is provided.")

In [2]:
import sys
sys.path.append('../../../')
from utils import print_check_gpt_results

In [3]:
# Experiment configuration
NUM_FILES = 20  # number of sampled files, read below as 0.py .. 19.py
FOLDER_NAME = "../../determining_files_rename/random_samples_variables"  # folder with the samples and readmes.txt
GPT_SAVED_FILE_NAME = "rename_variable_gpt"  # file the raw GPT replies are written to

In [4]:
# Load the sampled source files and their project descriptions from FOLDER_NAME.
def _read_text(path):
    """Return the full text content of the file at `path`."""
    with open(path, 'r') as handle:
        return handle.read()


# One source string per sample, in index order (0.py, 1.py, ...).
random_cells = [_read_text(f'{FOLDER_NAME}/{idx}.py') for idx in range(NUM_FILES)]

# readmes.txt is stored as a Python expression and is indexed by file number later on.
# NOTE(review): eval() executes whatever the file contains; if it only ever holds a
# literal, ast.literal_eval would be the safer choice - confirm the file format first.
readmes = eval(_read_text(f'{FOLDER_NAME}/readmes.txt'))

In [35]:
# Stage 1 (name suggestion) prompt material.
# t1task is the task instruction; it is sent as the first message of the stage-1 conversation.
t1task = "Suggest a better name for the variable specified in the code delimited by triple backticks that is more meaningful and better reflects its usage and/or better aligns with the project's purpose. Only suggest for the single variable specified. Structure your response under the following headings: 'New variable name' (the new variable name) and 'Explanation' (a 1-2 sentence explanation of the new variable name)."

# Few-shot example 1: user prompt (input) and the assistant reply to imitate (output).
# NOTE(review): this example labels the variable with 'Variable:' while examples 2-4 use
# 'Variable to rename:' - confirm whether the difference is intentional.
t1ex1_input = """Project purpose:
This project is about analyzing top movie genre trends from 2000 to 2010. We will look at the top genres and top movies for each year.

Variable:
dat

Code:
```python
import pandas as pd

dat = pd.read_csv('data.csv')
dat.head()
```"""

t1ex1_output = """New variable name:
movie_data_df

Explanation:
The new variable name 'movie_data_df' reflects its usage as a dataframe containing movie data."""

# Few-shot example 2: the variable is also used as the iterable of a for loop.
t1ex2_input = """Project purpose:
The focus of this project is determining the most common crime types in LA.

Variable to rename:
group

Code:
```python
group = df.groupby('gender')

for gender, grouped_data in group:
    print(f"Gender: {gender}")
    print(grouped_data['crime_type'].value_counts())
    print("\n")
```"""

t1ex2_output = """New variable name:
gender_crime_groups

Explanation:
The new variable name 'gender_crime_groups' aligns with its purpose of grouping crime data by gender."""

# Few-shot example 3: the variable is indexed, tested for truthiness and has methods called on it.
t1ex3_input = """Project purpose:
This repository contains a collection of graph theory assignments for MAT381.

Variable to rename:
queue

Code:
```python
queue[0] = queue[0].lower()
while queue:
    vertex = queue.popleft()
    print(vertex, end=' ')

    for neighbor in graph[vertex]:
        if neighbor not in visited:
            queue.append(neighbor)
            visited.add(neighbor)
```"""

t1ex3_output = """New variable name:
vertex_queue

Explanation:
By renaming queue to vertex_queue, it's now clearer that the variable is being used to store vertices."""

# Few-shot example 4: the variable is one target of a tuple-unpacking assignment.
t1ex4_input = """Project purpose:
This project is for analyzing the relationship between the number of hours studied and the exam scores of students.

Variable to rename:
result

Code:
```python
result, info = pearsonr(queue['Hours'], queue['Scores'])
print(result)
```"""

t1ex4_output = """New variable name:
hours_scores_correlation

Explanation:
The variable name 'hours_scores_correlation' reflects the purpose of the variable, which is to store the correlation between the number of hours studied and the exam scores of students."""

In [36]:
# rename using GPT
import openai
# hand the key held in my_key to the openai module so the request below is authenticated
openai.api_key = my_key

# GPT
def rename(purpose, cell_src, name, max_retries=6, backoff_seconds=1.0):
    """Ask GPT to suggest a better name for one variable in a code cell (stage 1).

    The conversation consists of the task instruction, the four few-shot
    examples and the actual request, sent to gpt-3.5-turbo at temperature 0.

    Args:
        purpose: Project-purpose text placed under 'Project purpose:'.
        cell_src: Source code placed inside the fenced python block.
        name: Name of the variable to be renamed.
        max_retries: Maximum number of API attempts (at least one is made).
        backoff_seconds: Delay before the first retry; doubled after every
            further failed attempt.

    Returns:
        A (finish_reason, content) tuple taken from the first choice on success,
        ('length', None) when the API reports that the maximum context length
        was exceeded, or ('error', None) when every attempt failed.
    """
    import time  # local import: keeps this cell independent of the notebook's import cells

    messages = [
        {"role": "user", "content": t1task},
        {"role": "user", "content": t1ex1_input},
        {"role": "assistant", "content": t1ex1_output},
        {"role": "user", "content": t1ex2_input},
        {"role": "assistant", "content": t1ex2_output},
        {"role": "user", "content": t1ex3_input},
        {"role": "assistant", "content": t1ex3_output},
        {"role": "user", "content": t1ex4_input},
        {"role": "assistant", "content": t1ex4_output},
        {"role": "user", "content": f"Project purpose:\n{purpose}\n\nVariable to rename:\n{name}\n\nCode:\n```python\n{cell_src}\n```"},
    ]

    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                temperature=0,
                messages=messages,
            )
        except Exception as e:
            # An over-long prompt can never succeed, so do not retry it.
            if 'maximum context length' in str(e):
                print('...Error.. too long...' + str(e))
                return 'length', None
            if attempt + 1 < attempts:
                print('...Error.. trying again...' + str(e))
                # Exponential backoff; an unbounded immediate retry would spin forever
                # on permanent failures such as an invalid API key.
                time.sleep(backoff_seconds * (2 ** attempt))
            else:
                print('...Error.. giving up...' + str(e))
        else:
            first_choice = completion.choices[0]
            return first_choice.finish_reason, first_choice.message["content"]
    # Callers treat every reason other than 'stop' as a failed file.
    return 'error', None

# Stage 1 driver: request a new name for `variable_def` in every sampled cell.
gpt_results = []
for idx, source in enumerate(random_cells):
    print(f'Processing file {idx}')
    reason, reply = rename(readmes[idx], source, 'variable_def')
    print(f'File {idx} - {reason}')
    gpt_results.append({'reason': reason, 'result': reply})

# Persist the raw replies as the text form of the list so a later cell can reload them.
with open(GPT_SAVED_FILE_NAME, 'w') as out_file:
    out_file.write(str(gpt_results))

Processing file 0
File 0 - stop
Processing file 1
File 1 - stop
Processing file 2
File 2 - stop
Processing file 3
File 3 - stop
Processing file 4
File 4 - stop
Processing file 5
File 5 - stop
Processing file 6
File 6 - stop
Processing file 7
File 7 - stop
Processing file 8
File 8 - stop
Processing file 9
File 9 - stop
Processing file 10
File 10 - stop
Processing file 11
File 11 - stop
Processing file 12
File 12 - stop
Processing file 13
File 13 - stop
Processing file 14
File 14 - stop
Processing file 15
File 15 - stop
Processing file 16
File 16 - stop
Processing file 17
File 17 - stop
Processing file 18
File 18 - stop
Processing file 19
File 19 - stop


In [37]:
# read in gpt result from file
import ast

# The file holds the str() of a list of {'reason': ..., 'result': ...} dicts whose values are
# strings or None. literal_eval parses exactly such literals and, unlike eval, cannot execute
# code that ended up in the file (its content originates from model output).
with open(GPT_SAVED_FILE_NAME, 'r') as f:
    gpt_results = ast.literal_eval(f.read())

In [39]:
# Split each stage-1 reply into the suggested name and its explanation.
def _parse_name_response(response_text):
    """Return (new_name, explanation) parsed from a stage-1 reply.

    The reply is expected to contain a 'New variable name:' heading followed by an
    'Explanation:' heading. Returns (None, None) when either heading is missing or
    the name is empty, so a malformed reply is recorded as a failure instead of
    raising IndexError and aborting the whole cell.
    """
    if not response_text or 'New variable name:' not in response_text:
        return None, None

    parts = response_text.split('New variable name:')[1].split('Explanation:')
    if len(parts) < 2:
        return None, None

    new_name = parts[0].strip()
    explanation = parts[1].strip()

    # The name is sometimes wrapped in a single pair of backticks: keep only the inside.
    if len(new_name.split('`')) == 3:
        new_name = new_name.split('`')[1]

    if not new_name:
        return None, None
    return new_name, explanation


gpt_new_names = []
gpt_explanation = []

for result in gpt_results:
    if result['reason'] == 'stop':
        updated_name, explanation = _parse_name_response(result['result'])
    else:
        # if we error we assume nothing
        updated_name, explanation = None, None
    gpt_new_names.append(updated_name)
    gpt_explanation.append(explanation)

In [40]:
# Stage 2 (apply the rename) few-shot examples. Each input carries the instruction plus the
# original code; each output is the same code with only the named variable renamed.
# The code snippets and new names are the ones used in the stage-1 examples.

# Example 1: 'dat' -> 'movie_data_df'.
t2ex1_input = """Rename the variable 'dat' to 'movie_data_df' in the code delimited by triple backticks. Do not change anything else.
```python
import pandas as pd

dat = pd.read_csv('data.csv')
dat.head()
```"""

t2ex1_output = """```python
import pandas as pd

movie_data_df = pd.read_csv('data.csv')
movie_data_df.head()
```"""

# Example 2: 'group' -> 'gender_crime_groups'; 'grouped_data' must stay untouched.
t2ex2_input = """Rename the variable 'group' to 'gender_crime_groups' in the code delimited by triple backticks. Do not change anything else.
```python
group = df.groupby('gender')

for gender, grouped_data in group:
    print(f"Gender: {gender}")
    print(grouped_data['crime_type'].value_counts())
    print("\n")
```"""

t2ex2_output = """```python
gender_crime_groups = df.groupby('gender')

for gender, grouped_data in gender_crime_groups:
    print(f"Gender: {gender}")
    print(grouped_data['crime_type'].value_counts())
    print("\n")
```"""

# Example 3: 'queue' -> 'vertex_queue' across indexing, loop condition and method calls.
t2ex3_input = """Rename the variable 'queue' to 'vertex_queue' in the code delimited by triple backticks. Do not change anything else.
```python
queue[0] = queue[0].lower()
while queue:
    vertex = queue.popleft()
    print(vertex, end=' ')

    for neighbor in graph[vertex]:
        if neighbor not in visited:
            queue.append(neighbor)
            visited.add(neighbor)
```"""

t2ex3_output = """```python
vertex_queue[0] = vertex_queue[0].lower()
while vertex_queue:
    vertex = vertex_queue.popleft()
    print(vertex, end=' ')

    for neighbor in graph[vertex]:
        if neighbor not in visited:
            vertex_queue.append(neighbor)
            visited.add(neighbor)
```"""

# Example 4: 'result' -> 'hours_scores_correlation'; the other unpacking target 'info' is unchanged.
t2ex4_input = """Rename the variable 'result' to 'hours_scores_correlation' in the code delimited by triple backticks. Do not change anything else.
```python
result, info = pearsonr(queue['Hours'], queue['Scores'])
print(result)
```"""

t2ex4_output = """```python
hours_scores_correlation, info = pearsonr(queue['Hours'], queue['Scores'])
print(hours_scores_correlation)
```"""

In [41]:
# rename using GPT
# same openai import and key assignment as in the earlier name-suggestion cell
import openai
openai.api_key = my_key

# GPT
def rename(cell_src, before, after, max_retries=6, backoff_seconds=1.0):
    """Ask GPT to rename one variable inside a code cell (stage 2).

    NOTE: this definition replaces the earlier rename(purpose, cell_src, name)
    in the notebook namespace; after this cell runs, only this signature exists.

    The conversation consists of the four rename few-shot examples and the
    actual request, sent to gpt-3.5-turbo at temperature 0.

    Args:
        cell_src: Source code placed inside the fenced python block.
        before: Current name of the variable.
        after: Name the variable should be renamed to.
        max_retries: Maximum number of API attempts (at least one is made).
        backoff_seconds: Delay before the first retry; doubled after every
            further failed attempt.

    Returns:
        A (finish_reason, content) tuple taken from the first choice on success,
        ('length', None) when the API reports that the maximum context length
        was exceeded, or ('error', None) when every attempt failed.
    """
    import time  # local import: keeps this cell independent of the notebook's import cells

    messages = [
        {"role": "user", "content": t2ex1_input},
        {"role": "assistant", "content": t2ex1_output},
        {"role": "user", "content": t2ex2_input},
        {"role": "assistant", "content": t2ex2_output},
        {"role": "user", "content": t2ex3_input},
        {"role": "assistant", "content": t2ex3_output},
        {"role": "user", "content": t2ex4_input},
        {"role": "assistant", "content": t2ex4_output},
        {"role": "user", "content": f"Rename the variable '{before}' to '{after}' in the code delimited by triple backticks. Do not change anything else.\n```python\n{cell_src}\n```"},
    ]

    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                temperature=0,
                messages=messages,
            )
        except Exception as e:
            # An over-long prompt can never succeed, so do not retry it.
            if 'maximum context length' in str(e):
                print('...Error.. too long...' + str(e))
                return 'length', None
            if attempt + 1 < attempts:
                print('...Error.. trying again...' + str(e))
                # Exponential backoff; an unbounded immediate retry would spin forever
                # on permanent failures such as an invalid API key.
                time.sleep(backoff_seconds * (2 ** attempt))
            else:
                print('...Error.. giving up...' + str(e))
        else:
            first_choice = completion.choices[0]
            return first_choice.finish_reason, first_choice.message["content"]
    # Callers treat every reason other than 'stop' as a failed file.
    return 'error', None

# Stage 2 driver: apply the suggested name to each cell, skipping files without a stage-1 result.
gpt_results_code = []
for idx, source in enumerate(random_cells):
    stage_one_failed = gpt_new_names[idx] is None or gpt_explanation[idx] is None
    if stage_one_failed:
        print(f'Skipping file {idx} as we failed before')
        gpt_results_code.append({'reason': 'skipped', 'result': None})
        continue

    print(f'Processing file {idx}')
    reason, reply = rename(source, 'variable_def', gpt_new_names[idx])
    print(f'File {idx} - {reason}')
    gpt_results_code.append({'reason': reason, 'result': reply})

# Persist the raw replies as the text form of the list so a later cell can reload them.
with open(GPT_SAVED_FILE_NAME + "_code", 'w') as out_file:
    out_file.write(str(gpt_results_code))

Processing file 0
File 0 - stop
Processing file 1
File 1 - stop
Processing file 2
File 2 - stop
Processing file 3
File 3 - stop
Processing file 4
File 4 - stop
Processing file 5
File 5 - stop
Processing file 6
File 6 - stop
Processing file 7
File 7 - stop
Processing file 8
File 8 - stop
Processing file 9
File 9 - stop
Processing file 10
File 10 - stop
Processing file 11
File 11 - stop
Processing file 12
File 12 - stop
Processing file 13
File 13 - stop
Processing file 14
File 14 - stop
Processing file 15
File 15 - stop
Processing file 16
File 16 - stop
Processing file 17
File 17 - stop
Processing file 18
File 18 - stop
Processing file 19
File 19 - stop


In [42]:
# read in gpt result from file
import ast

# The file holds the str() of a list of {'reason': ..., 'result': ...} dicts whose values are
# strings or None. literal_eval parses exactly such literals and, unlike eval, cannot execute
# code that ended up in the file (its content originates from model output).
with open(GPT_SAVED_FILE_NAME + "_code", 'r') as f:
    gpt_results_code = ast.literal_eval(f.read())

In [44]:
# Extract the renamed code from each stage-2 reply.
def _extract_code_block(response_text):
    """Return the text following the first ``` fence of a reply, up to the next fence.

    A leading 'python' language tag and surrounding newlines are removed.
    Returns None when the reply is empty or contains no fence at all, so such a
    reply is recorded as a failure instead of raising IndexError.
    """
    if not response_text:
        return None

    pieces = response_text.split('```')
    if len(pieces) < 2:
        return None

    code = pieces[1]
    if code.startswith('python'):
        code = code[6:]  # drop the language tag that follows the opening fence
    return code.strip('\n')


gpt_new_code = []

for result in gpt_results_code:
    if result['reason'] == 'stop':
        gpt_new_code.append(_extract_code_block(result['result']))
    else:
        # skipped or failed requests carry no code
        gpt_new_code.append(None)

In [52]:
# Count the number of times the variable name is/isn't successfully changed
import sys
sys.path.append('../../determining_files_rename')
from ast_determine_usable_items import compare_code

# Tally how many files were renamed successfully.
pass_count = 0
fail_count = 0

for i in range(NUM_FILES):
    if gpt_new_names[i] is None or gpt_new_code[i] is None or gpt_explanation[i] is None:
        # One of the two GPT stages produced nothing usable for this file.
        fail_count += 1
        print(f'Failed on file {i} (no usable GPT output)')
    # compare_code comes from ast_determine_usable_items; presumably it checks that the only
    # difference between the two sources is the rename - confirm against that module.
    elif compare_code(random_cells[i], gpt_new_code[i], 'variable_def', gpt_new_names[i]):
        pass_count += 1
    else:
        fail_count += 1
        print(f'Failed on file {i}')

total_count = pass_count + fail_count
if total_count:
    print(f'Pass count: {pass_count}, {pass_count / total_count * 100}%')
    print(f'Fail count: {fail_count}, {fail_count / total_count * 100}%')
else:
    # Avoid ZeroDivisionError when NUM_FILES is 0.
    print('No files were evaluated')

Failed on file 12
Pass count: 19, 95.0%
Fail count: 1, 5.0%
