In [7]:
# Function for extracting the code from the responses
import regex as re

def extract_code(response):
    scraper_code = re.search(r"(?<=```)[\s\S]+?(?=```)", response)
    if scraper_code != None:
        scraper_code = scraper_code.group(0)
    else:
        scraper_code = response
    # check if first line is "python"
    if scraper_code.startswith("python"):
        scraper_code = scraper_code[6:]
    return scraper_code

In [2]:
# Function for generating the CSV files for the test set
import os
import pandas as pd
import json

def run_scripts(csv_folder, solution_code_folder):
    err_dict = {}
    for filename in os.listdir(solution_code_folder):
        if filename.endswith('.py'):
            import_line = f'from {solution_code_folder.replace("/", ".")} import {filename[:-3]}'
            print(import_line)

            try:
                exec(import_line)
            except:
                err_dict[filename[:-3]] = 0
                continue

            try:
                df = pd.read_csv('scraped_data.csv')
            except:
                err_dict[filename[:-3]] = 1
                continue

            err_dict[filename[:-3]] = 2

            # Save the CSV file
            df.to_csv(f'{csv_folder}/{filename[:-3]}.csv', index=False)

In [3]:
# Function for jaccard, dice and overlap similarity
def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

def dice_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    return float(2 * intersection) / (len(list1) + len(list2))

def overlap_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    return float(intersection) / min(len(list1), len(list2))

In [4]:
# Function for calculating score of model for each script in the test set
def calculate_test_accuracy(generated_csv_folder, correct_csv_folder):
    # Calculate the score for each script
    score_dict = {}
    for filename in os.listdir(correct_csv_folder):
        generated_csv = pd.read_csv(f'{generated_csv_folder}/{filename}')
        correct_csv = pd.read_csv(f'{correct_csv_folder}/{filename}')

        # Load all columns of each CSV file and store each column in a list in a dictionary
        correct_dict = {}
        generated_dict = {}
        for column in correct_csv.columns:
            correct_dict[column] = correct_csv[column].tolist()
            generated_dict[column] = generated_csv[column].tolist()
        
        # For each correct column, find the generated column with the highest similarity score
        total_score = 0
        for column in correct_dict.keys():
            max_score = 0
            for column2 in generated_dict.keys():
                max_score = max(max_score, jaccard_similarity(correct_dict[column], generated_dict[column2]))
            total_score += max_score

        # Calculate the average score for this csv file
        score_dict[filename[:-3]] = total_score / len(correct_dict.keys())

    # Save the score dictionary as a JSON file
    with open('score_dict.json', 'w') as f:
        json.dump(score_dict, f)

In [5]:
# Function for calculating Mann-Whitney U test for finding significance of difference in mean between two models
import numpy as np
from scipy.stats import mannwhitneyu

def calculate_mann_whitney_u_test(model_name1, model_name2):
    # Load the score dictionary for each model
    with open(f'score_dict_{model_name1}.json', 'r') as f:
        score_dict_model1 = json.load(f)
    with open(f'score_dict_{model_name2}.json', 'r') as f:
        score_dict_model2 = json.load(f)
    
    # Get the two lists of scores
    scores_model1 = score_dict_model1.values()
    scores_model2 = score_dict_model2.values()

    # Calculate the Mann-Whitney U test
    stat, p = mannwhitneyu(scores_model1, scores_model2)
    print('Statistics=%.3f, p=%.3f' % (stat, p))

    # Interpretation of the test
    alpha = 0.05
    if p > alpha:
        print(f'Fail to reject H0: {model_name1} and {model_name2} have the same distribution')
    else:
        print(f'Reject H0: {model_name1} and {model_name2} have different distributions')


def mean_score_of_model(model_name):
    # Load the score dictionary for each model
    with open(f'score_dict_{model_name}.json', 'r') as f:
        score_dict_model = json.load(f)
    
    # Get the list of scores
    scores_model = score_dict_model.values()

    # Calculate the mean score
    mean_score = np.mean(list(scores_model))
    print(f'Mean score of {model_name}: {mean_score}')

In [9]:
import os

# Should be done for each model folder
models_data_folder = "models_data"
for model_folder in os.listdir(models_data_folder):
    model_folder = 'finetuned'
    model_folder = f'{models_data_folder}/{model_folder}'
    # Extract the code from the responses
    generated_responses_folder = f'{model_folder}/generated_responses'
    for filename in os.listdir(generated_responses_folder):
        with open(f'{generated_responses_folder}/{filename}', 'r') as f:
            response = f.read()
        code = extract_code(response)
        with open(f'{model_folder}/extracted_code/{filename[:-4]}_code.py', 'w') as f:
            f.write(code)

In [8]:
import os

# TODO: Make sure the scripts are saved with the correct names

# Should be done for each model folder
models_data_folder = "models_data"
for model_folder in os.listdir(models_data_folder):
    model_folder = 'finetuned'
    # Run the scripts
    generated_csv_folder = f'{model_folder}/generated_csv_files'
    correct_csv_folder = f'{models_data_folder}/correct_csv_files'
    # websites_csv = pd.read_csv('websites_evaluation.csv')
    run_scripts(generated_csv_folder, f'{model_folder}/extracted_code')

    # Calculate the score for each script
    calculate_test_accuracy(generated_csv_folder, correct_csv_folder)

# Calculate the Mann-Whitney U test for finding significance of difference in mean between two models
model_name1 = 'model1'
model_name2 = 'model2'
calculate_mann_whitney_u_test(model_name1, model_name2)

# Calculate the mean score of each model
mean_score_of_model(model_name1)
mean_score_of_model(model_name2)

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'finetuned/extracted_code'

In [17]:
# Convert list of strings to multiple txt files
import json

with open('generated_responses.json', 'r') as f:
    generated_responses = json.load(f)

responses_list = generated_responses['finetuned_lr0.001_e1_r16_seed42']
dtu_count = 0
airbnb_count = 0
imdb_count = 0
for i in range(len(responses_list)):
    if "downloaded_pages/airbnb.html" in responses_list[i]:
        filename = f'airbnb_{airbnb_count}.txt'
        airbnb_count += 1
    elif "downloaded_pages/DTU_entrepreneurship.html" in responses_list[i]:
        filename = f'DTU_entrepreneurship_{dtu_count}.txt'
        dtu_count += 1
    elif "downloaded_pages/imdb.html" in responses_list[i]:
        filename = f'imdb_{imdb_count}.txt'
        imdb_count += 1
    else:
        filename = f'unknown_{i}.txt'

    with open(f'models_data/finetuned/generated_responses/{filename}', 'w') as f:
        f.write(responses_list[i])

In [11]:
# Generate correct CSV files
import os


run_scripts('models_data/correct_csv_files', 'solution_code_human/imdb')

from solution_code_human.imdb import imdb_0
from solution_code_human.imdb import imdb_1
from solution_code_human.imdb import imdb_2
from solution_code_human.imdb import imdb_3
from solution_code_human.imdb import imdb_4
from solution_code_human.imdb import imdb_5
from solution_code_human.imdb import imdb_6
from solution_code_human.imdb import imdb_7
from solution_code_human.imdb import imdb_8
from solution_code_human.imdb import imdb_9
