In [4]:
import pandas as pd
from transformers import PLBartForConditionalGeneration, PLBartTokenizer

In [5]:
### Read the code snippets
code_clusters = pd.read_csv("public/code_clusters_9999.csv")
code_snippets = code_clusters['Code_Block'].to_numpy()

In [6]:
### Create the model
tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
code_snippet = """
def fibonacci(n):
    if n <= 0:
        return 0
    elif n == 1:
        return 1
    else:
        return fibonacci(n-1) + fibonacci(n-2)
"""
inputs = tokenizer(code_snippet, return_tensors="pt", max_length=512, truncation=True)
# print("inputs", inputs)
model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-python-en_XX")
translated_tokens = model.generate(
    **inputs,
    decoder_start_token_id=tokenizer.lang_code_to_id["__en_XX__"],
    max_length=50,            # Maximum length of the generated summary
    min_length=10,            # Minimum length of the generated summary
    num_beams=40,              # Number of beams for beam search
    no_repeat_ngram_size=2,   # Prevent repeating 2-grams
    early_stopping=False,      # Stop early when num_beams sentences are finished
    length_penalty=1.0,       # Length penalty
    temperature=0.7,          # Temperature for next token sampling
    top_k=50,                 # Top-k filtering
    top_p=0.98,               # Nucleus filtering
    repetition_penalty=1.2    # Penalty for repetition
)
summary = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
print(code_snippet)
print(summary)




def fibonacci(n):
    if n <= 0:
        return 0
    elif n == 1:
        return 1
    else:
        return fibonacci(n-1) + fibonacci(n-2)

['Returns 1 or 2 depending on the value of n. If n is not a number this function will return 0 or 1.']


In [7]:
from transformers import PLBartForConditionalGeneration, PLBartTokenizer

# Load the model and tokenizer
model_name = "uclanlp/plbart-base"
tokenizer = PLBartTokenizer.from_pretrained(model_name)
model = PLBartForConditionalGeneration.from_pretrained(model_name)

# Define the code snippet to summarize
code_snippet = """
def fibonacci(n):
    if n <= 0:
        return 0
    elif n == 1:
        return 1
    else:
        return fibonacci(n-1) + fibonacci(n-2)
"""

# Prepend the code with a task-specific prefix indicating the language and task
input_text = f"<python> summarize: {code_snippet}"

# Tokenize the input code snippet
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate the summary
summary_ids = model.generate(inputs.input_ids, max_length=50, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the summary
print("Summary:", summary)


Summary: if 0: elif n == 1: return 1 else: return fibonacci(n-1) + fibonacci(n-2)


In [8]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = PLBartTokenizer.from_pretrained("trl-internal-testing/tiny-random-PLBartForConditionalGeneration")
model = AutoModelForSeq2SeqLM.from_pretrained("trl-internal-testing/tiny-random-PLBartForConditionalGeneration")

# Define the code snippet to summarize
code_snippet = """
def fibonacci(n):
    if n <= 0:
        return 0
    elif n == 1:
        return 1
    else:
        return fibonacci(n-1) + fibonacci(n-2)
"""

# Prepend the code with a task-specific prefix indicating the language and task
input_text = f"<python> summarize: {code_snippet}"

# Tokenize the input code snippet
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate the summary
summary_ids = model.generate(inputs.input_ids, max_length=50, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the summary
print("Summary:", summary)



OSError: Can't load tokenizer for 'trl-internal-testing/tiny-random-PLBartForConditionalGeneration'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'trl-internal-testing/tiny-random-PLBartForConditionalGeneration' is the correct path to a directory containing all relevant files for a PLBartTokenizer tokenizer.

In [None]:
### model choice
from transformers import PLBartTokenizer, PLBartForConditionalGeneration

# Load the tokenizer and model
tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-python-en_XX")



In [None]:
def generate_summary(input):
    # Tokenize the input code snippet
    inputs = tokenizer(code_snippet, return_tensors="pt", max_length=512, truncation=True)

    # Generate the summary with hyperparameters
    translated_tokens = model.generate(
        **inputs,
        decoder_start_token_id=tokenizer.lang_code_to_id["__en_XX__"],
        max_length=50,            # Maximum length of the generated summary
        min_length=30,            # Minimum length of the generated summary
        num_beams=20,             # Number of beams for beam search
        no_repeat_ngram_size=2,   # Prevent repeating 2-grams
        early_stopping=False,     # Stop early when num_beams sentences are finished
        length_penalty=0.5,       # Length penalty
        temperature=0.7,          # Temperature for next token sampling
        top_k=50,                 # Top-k filtering
        top_p=0.95,               # Nucleus filtering
        repetition_penalty=1.2    # Penalty for repetition
    )

    # Decode and print the summary
    summary = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return summary

In [None]:
# Define the code snippet
summaries = []
for code_snippet in code_snippets[:10]:
    # Decode and print the summary
    summary = generate_summary(translated_tokens)
    summaries.append(summary)
    
df = pd.DataFrame({"Code_Block": code_snippets[:10], "Summary": summaries}, columns=["Code_Block", "Summary"])
df.to_csv("public/code_summaries.csv", index=False)




In [9]:
#Read json file
import json

with open('public/conala-paired-test.json') as f:
    data = []
    for line in f:
        data.append(json.loads(line))


eval_df = pd.DataFrame(data)[['snippet', 'rewritten_intent']]
eval_df

Unnamed: 0,snippet,rewritten_intent
0,"os.kill(os.getpid(), signal.SIGUSR1)",send a signal `signal.SIGUSR1` to the current ...
1,bytes.fromhex('4a4b4c').decode('utf-8'),decode a hex string '4a4b4c' to UTF-8.
2,all(x == myList[0] for x in myList),check if all elements in list `myList` are ide...
3,"print('%*s : %*s' % (20, 'Python', 20, 'Very G...",format number of spaces between strings `Pytho...
4,d.decode('cp1251').encode('utf8'),
...,...,...
495,"re.findall('http://[^t][^s""]+\\.html', document)",match urls whose domain doesn't start with `t`...
496,"mystring.replace(' ', '! !').split('!')",split a string `mystring` considering the spac...
497,"open(path, 'r')",open file `path` with mode 'r'
498,[[sum(item) for item in zip(*items)] for items...,sum elements at the same index in list `data`


In [10]:
eval_snippets = eval_df['snippet'].to_numpy()[:100]
eval_intents = eval_df['rewritten_intent'].to_numpy()[:100]

In [13]:
from transformers import AutoTokenizer, AutoModelWithLMHead, SummarizationPipeline

pipeline = SummarizationPipeline(
    model=AutoModelWithLMHead.from_pretrained("SEBIS/code_trans_t5_base_code_documentation_generation_php_multitask"),
    tokenizer=AutoTokenizer.from_pretrained("SEBIS/code_trans_t5_base_code_documentation_generation_php_multitask", skip_special_tokens=True),
    device=0
)
tokenized_code = eval_snippets[0]
pipeline([tokenized_code], max_length=50)

Your max_length is set to 50, but your input_length is only 17. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)


os.kill(os.getpid(), signal.SIGUSR1)


[{'summary_text': 'return 0 }"'}]

In [101]:
generated_intents = []
for eval_snippet in eval_snippets:
    summary = generate_summary(eval_snippet)
    generated_intents.append(summary)

In [102]:
import sacrebleu

bleu = sacrebleu.corpus_bleu(generated_intents, [[eval_intents[i]] for i in range(len(eval_intents))], smooth_method='exp')
print(f"BLEU-4 Score: {bleu.score}")

BLEU-4 Score: 2.2437748935664263
