In [1]:
import pandas as pd

# Load the prompt-vs-context results table from a path relative to the working directory.
# Its 'codes_by_prompts' and 'codes_with_context' columns are read by process_generated_codes.
prompt_vs_context = pd.read_csv('data/MTBP/Prompt_vs_context/prompt_vs_context.csv')

In [2]:
def keep_only_generated_code_prompts(codes):
    """Drop the first line of every code string.

    Args:
        codes: List of strings. Presumably the first line of each one is the
            prompt that preceded the generated code -- TODO confirm.

    Returns:
        A new list, in the same order, where each entry is the text that
        follows the first newline of the corresponding input (an empty string
        when the input contains no newline).
    """
    # partition() splits at the first '\n' only; index 2 is everything after it.
    return [snippet.partition('\n')[2] for snippet in codes]

def keep_only_generated_code_context(codes):
    """Extract one section from every code string, selected by list position.

    Args:
        codes: List of strings; the entry at position ``k`` is passed to
            ``process_function_with_context`` together with ``k``.

    Returns:
        A list with one extracted string per input, in the same order.
    """
    return [
        process_function_with_context(snippet, position)
        for position, snippet in enumerate(codes)
    ]

def process_function_with_context(text, prompt_index):
    """Return the non-comment lines of one '#'-delimited section of ``text``.

    Sections are numbered from 0: section ``k`` consists of the lines that
    follow the (k+1)-th line starting with '#' up to the next such line.
    Lines that appear before the first '#' line belong to section -1.

    Args:
        text: Multi-line string to scan.
        prompt_index: Number of the section to extract.

    Returns:
        The selected lines joined with '\\n'. Every line is stored with its
        leading and trailing whitespace stripped, so indentation is not
        preserved. Returns an empty string when the section has no lines.
    """
    kept_lines = []
    comments_seen = 0

    for raw_line in text.split('\n'):
        stripped = raw_line.strip()

        # Any line starting with '#' opens a new section and is never kept.
        if stripped.startswith('#'):
            comments_seen += 1
            continue

        # Section numbers are zero-based: the first '#' line opens section 0.
        if comments_seen - 1 == prompt_index:
            kept_lines.append(stripped)

    return '\n'.join(kept_lines)

In [3]:
import ast
import re
from collections import Counter


def process_generated_codes(data):
    """Extract the generated code for every row of ``data`` in both settings.

    Args:
        data: DataFrame with the columns 'codes_by_prompts' and
            'codes_with_context'. Each cell holds the string form of a Python
            literal (parsed with ``ast.literal_eval``); presumably a list of
            code strings -- TODO confirm against the CSV.

    Returns:
        A tuple ``(generated_prompts, generated_contexts)``. Each is a list
        with one inner list per row of ``data``: the output of
        ``keep_only_generated_code_prompts`` and
        ``keep_only_generated_code_context`` respectively.

    Raises:
        ValueError, SyntaxError: If a cell is not a valid Python literal.
    """
    generated_prompts = []
    generated_contexts = []

    for _, row in data.iterrows():
        codes_prompt = ast.literal_eval(row['codes_by_prompts'])
        codes_context = ast.literal_eval(row['codes_with_context'])

        # Each helper already walks its whole list, so one call per row is
        # enough; the context list is processed even if the prompt list is empty.
        generated_prompts.append(keep_only_generated_code_prompts(codes_prompt))
        generated_contexts.append(keep_only_generated_code_context(codes_context))

    return generated_prompts, generated_contexts

def calculate_lengths(codes_prompt, codes_context):
    """Measure the length of every code string in two nested lists.

    Args:
        codes_prompt: List of lists of strings.
        codes_context: List of lists of strings.

    Returns:
        A tuple ``(lengths_prompt, lengths_context)`` mirroring the nesting of
        the inputs, with each string replaced by its ``len()``.
    """
    def _nested_lengths(groups):
        # One inner list of lengths per inner list of codes.
        return [list(map(len, group)) for group in groups]

    return _nested_lengths(codes_prompt), _nested_lengths(codes_context)


def get_most_common_words(codes_list1, codes_list2, topn=10):
    """Count word tokens across two nested lists of code strings.

    Args:
        codes_list1: List of lists of strings.
        codes_list2: List of lists of strings, scanned after ``codes_list1``.
        topn: How many of the most frequent words to return (default 10).

    Returns:
        A list of ``(word, count)`` tuples as produced by
        ``Counter.most_common(topn)``, most frequent first.
    """
    word_counts = Counter()

    # Scan snippet by snippet; a word is a maximal run of \w characters, so
    # tokens never span two snippets.
    for snippets in codes_list1 + codes_list2:
        for snippet in snippets:
            word_counts.update(re.findall(r'\b\w+\b', snippet))

    return word_counts.most_common(topn)

In [4]:
# Extract the generated code for every row, in the per-prompt and with-context settings.
gen_p, gen_c = process_generated_codes(prompt_vs_context)
# Most frequent word tokens over both settings combined (default topn=10).
most_common_words = get_most_common_words(gen_p, gen_c)
# Character length of every generated code string, nested per row.
lengths_prompt, lengths_context = calculate_lengths(gen_p, gen_c)