### 1. Ucitavanje dataset-a sa HF

In [95]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd

# Load the datasets
dataset_python = load_dataset("bigcode/the-stack-smol-xs", "python")
dataset_c = load_dataset("bigcode/the-stack-smol-xs", "c")
dataset_cpp = load_dataset("bigcode/the-stack-smol-xs", "c++")

# Split the datasets into training and evaluation sets
train_test_split = lambda dataset: dataset['train'].train_test_split(test_size=0.2)

split_python = train_test_split(dataset_python)
split_c = train_test_split(dataset_c)
split_cpp = train_test_split(dataset_cpp)

# Convert to DataFrames
df_python_train = pd.DataFrame(split_python['train'])
df_python_eval = pd.DataFrame(split_python['test'])

df_c_train = pd.DataFrame(split_c['train'])
df_c_eval = pd.DataFrame(split_c['test'])

df_cpp_train = pd.DataFrame(split_cpp['train'])
df_cpp_eval = pd.DataFrame(split_cpp['test'])


In [96]:
df_python_train.head()

Unnamed: 0,content,lang,size,ext,max_stars_count,avg_line_length,max_line_length,alphanum_fraction
0,# -*- coding: utf-8 -*-\n'''\nReturn data to a...,Python,4218,py,,25.257485,101,0.621622
1,# -*- coding: utf-8 -*-\n# Copyright 2019 Cohe...,Python,584,py,,22.461538,64,0.648973
2,#\n# This file is part of pySMT.\n#\n# Copyr...,Python,7256,py,,39.221622,80,0.580072
3,import os\nimport signal\nimport subprocess\ni...,Python,1292,py,35.0,28.086957,84,0.692724
4,"def raises(err, lamda):\n try:\n lam...",Python,2982,py,3.0,27.357798,78,0.551643


In [97]:
df_python_eval.head()

Unnamed: 0,content,lang,size,ext,max_stars_count,avg_line_length,max_line_length,alphanum_fraction
0,import sqlite3\n\n\ndef init():\n global da...,Python,576,py,11.0,23.04,47,0.604167
1,# This file is a part of Arjuna\n# Copyright 2...,Python,3844,py,13.0,33.719298,136,0.706296
2,import matplotlib.pyplot as plt\nimport seabor...,Python,116081,py,1.0,530.050228,10170,0.711874
3,import numpy as np\n\nimport mss\n\nfrom redis...,Python,6780,py,2.0,35.684211,176,0.631416
4,import six\n\nfrom odin import bases\nfrom odi...,Python,2523,py,22.0,30.39759,119,0.662703


### 2. Ucitavanje pretreniranog modela

In [98]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

##### Tokenizovanje podataka

In [99]:
from datasets import Dataset
train_dataset_python = Dataset.from_pandas(df_python_train) 

In [100]:
train_dataset_python['content']

["# -*- coding: utf-8 -*-\n'''\nReturn data to a mongodb server\n\nRequired python modules: pymongo\n\n\nThis returner will send data from the minions to a MongoDB server. To\nconfigure the settings for your MongoDB server, add the following lines\nto the minion config files::\n\n    mongo.db: <database name>\n    mongo.host: <server ip address>\n    mongo.user: <MongoDB username>\n    mongo.password: <MongoDB user password>\n    mongo.port: 27017\n\nAlternative configuration values can be used by prefacing the configuration.\nAny values not found in the alternative configuration will be pulled from\nthe default location::\n\n    alternative.mongo.db: <database name>\n    alternative.mongo.host: <server ip address>\n    alternative.mongo.user: <MongoDB username>\n    alternative.mongo.password: <MongoDB user password>\n    alternative.mongo.port: 27017\n\n  To use the mongo returner, append '--return mongo' to the salt command. ex:\n\n    salt '*' test.ping --return mongo_return\n\n  T

In [101]:
tokenizer.pad_token = tokenizer.eos_token

In [125]:
# tokenize train_dataset_python with the GPT2 tokenizer

def tokenize_function(examples):
    tokens = tokenizer(examples["content"], max_length=1024, padding='max_length', truncation=True)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens


In [126]:
# use tokenize_function to train_dataset_python
train_dataset_python = train_dataset_python.map(tokenize_function, batched=True)

# tokenize eval_dataset_python with the GPT2 tokenizer
eval_dataset_python = Dataset.from_pandas(df_python_eval)
eval_dataset_python = eval_dataset_python.map(tokenize_function, batched=True)

Map: 100%|██████████| 80/80 [00:00<00:00, 96.15 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 50.78 examples/s]


In [127]:
model.config.pad_token_id = tokenizer.pad_token_id

In [128]:
training_arguments = TrainingArguments(
    output_dir="./gpt2-python",
    num_train_epochs=1,
    per_device_train_batch_size=4
)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset_python,
    eval_dataset=eval_dataset_python
)

In [129]:
train_dataset_python

Dataset({
    features: ['content', 'lang', 'size', 'ext', 'max_stars_count', 'avg_line_length', 'max_line_length', 'alphanum_fraction', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 80
})

In [130]:
train_dataset_python['content']

["# -*- coding: utf-8 -*-\n'''\nReturn data to a mongodb server\n\nRequired python modules: pymongo\n\n\nThis returner will send data from the minions to a MongoDB server. To\nconfigure the settings for your MongoDB server, add the following lines\nto the minion config files::\n\n    mongo.db: <database name>\n    mongo.host: <server ip address>\n    mongo.user: <MongoDB username>\n    mongo.password: <MongoDB user password>\n    mongo.port: 27017\n\nAlternative configuration values can be used by prefacing the configuration.\nAny values not found in the alternative configuration will be pulled from\nthe default location::\n\n    alternative.mongo.db: <database name>\n    alternative.mongo.host: <server ip address>\n    alternative.mongo.user: <MongoDB username>\n    alternative.mongo.password: <MongoDB user password>\n    alternative.mongo.port: 27017\n\n  To use the mongo returner, append '--return mongo' to the salt command. ex:\n\n    salt '*' test.ping --return mongo_return\n\n  T

In [131]:
eval_dataset_python

Dataset({
    features: ['content', 'lang', 'size', 'ext', 'max_stars_count', 'avg_line_length', 'max_line_length', 'alphanum_fraction', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 20
})

In [132]:
trainer.train()


100%|██████████| 20/20 [06:20<00:00, 19.04s/it]

{'train_runtime': 380.7458, 'train_samples_per_second': 0.21, 'train_steps_per_second': 0.053, 'train_loss': 1.6358243942260742, 'epoch': 1.0}





TrainOutput(global_step=20, training_loss=1.6358243942260742, metrics={'train_runtime': 380.7458, 'train_samples_per_second': 0.21, 'train_steps_per_second': 0.053, 'total_flos': 41806725120000.0, 'train_loss': 1.6358243942260742, 'epoch': 1.0})

In [133]:
trainer.evaluate()

100%|██████████| 3/3 [00:13<00:00,  4.45s/it]


{'eval_loss': 1.172829270362854,
 'eval_runtime': 21.6043,
 'eval_samples_per_second': 0.926,
 'eval_steps_per_second': 0.139,
 'epoch': 1.0}

### Generating outputs

In [134]:
# Function to generate text using the trained model
def generate_code(prompt, model, tokenizer, max_length=10240):
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1, no_repeat_ngram_size=2, top_p=0.95, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [145]:
# Example of generating code
prompt = "def add_numbers(a, b, c):"
prompt2 = '''
import


'''
generated_code = generate_code(prompt2, model, tokenizer)
print(generated_code)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



import


from django.db import models



from flask import Flask

 from djang.py import import logging




### Generate code on evaluation dataset

In [136]:
print(eval_dataset_python[0]['content'])

import sqlite3


def init():
    global database_file
    global db
    global cursor

    database_file = "db/database.db"
    db = sqlite3.connect(database_file)
    cursor = db.cursor()

    # Nomes das colunas do database
    # sql = "select * from database where 1=0;"
    # cursor.execute(sql)
    # p = [d[0] for d in cursor.description]
    # print(p)

    # def query(command, arguments=[]):
    #     _db = sqlite3.connect(database_file)
    #     _c = _db.cursor()
    #     _c.execute(command, arguments)
    #     results = _c.fetchall()
    #     return results



In [137]:
predictions = []
for example in eval_dataset_python:
    input_text = example['content']
    print(input_text)
    generated_code = generate_code(input_text, model, tokenizer)
    print(generated_code)
    print("-------------------------------")
    predictions.append(generated_code)
print(predictions)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


import sqlite3


def init():
    global database_file
    global db
    global cursor

    database_file = "db/database.db"
    db = sqlite3.connect(database_file)
    cursor = db.cursor()

    # Nomes das colunas do database
    # sql = "select * from database where 1=0;"
    # cursor.execute(sql)
    # p = [d[0] for d in cursor.description]
    # print(p)

    # def query(command, arguments=[]):
    #     _db = sqlite3.connect(database_file)
    #     _c = _db.cursor()
    #     _c.execute(command, arguments)
    #     results = _c.fetchall()
    #     return results



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


import sqlite3


def init():
    global database_file
    global db
    global cursor

    database_file = "db/database.db"
    db = sqlite3.connect(database_file)
    cursor = db.cursor()

    # Nomes das colunas do database
    # sql = "select * from database where 1=0;"
    # cursor.execute(sql)
    # p = [d[0] for d in cursor.description]
    # print(p)

    # def query(command, arguments=[]):
    #     _db = sqlite3.connect(database_file)
    #     _c = _db.cursor()
    #     _c.execute(command, arguments)
    #     results = _c.fetchall()
    #     return results

-------------------------------
# This file is a part of Arjuna
# Copyright 2015-2021 Rahul Verma

# Website: www.RahulVerma.net

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#   http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software

IndexError: index out of range in self

In [None]:
# # Generate predictions on the evaluation dataset
# def generate_predictions(dataset, model, tokenizer):
#     predictions = []
#     for example in dataset:
#         input_text = example['content']
#         generated_code = generate_code(input_text, model, tokenizer)
#         print(generated_code)
#         predictions.append(generated_code)
#     return predictions

In [None]:
# generated_codes = generate_predictions(eval_dataset_python, model, tokenizer)
# print(generated_codes[:5])  # Print the first 5 generated predictions

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Input text:  # -*- coding: utf-8 -*-
# Generated by Django 1.9.1 on 2016-11-14 19:51
from __future__ import unicode_literals

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('annotation', '0031_auto_20161111_1943'),
    ]

    operations = [
        migrations.AddField(
            model_name='masterobservation',
            name='observation_time',
            field=models.PositiveIntegerField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='observation',
            name='observation_time',
            field=models.PositiveIntegerField(blank=True, null=True),
        ),
    ]

Generating code...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


# -*- coding: utf-8 -*-
# Generated by Django 1.9.1 on 2016-11-14 19:51
from __future__ import unicode_literals

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('annotation', '0031_auto_20161111_1943'),
    ]

    operations = [
        migrations.AddField(
            model_name='masterobservation',
            name='observation_time',
            field=models.PositiveIntegerField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='observation',
            name='observation_time',
            field=models.PositiveIntegerField(blank=True, null=True),
        ),
    ]

Input text:  from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

from hyperopt_synthetic import run_one_exp as hyperopt_synthetic_opt
from xbbo_synthetic import run_one_exp as xbbo_synthetic_opt

max_call = 50
if __name__ == "__main__":
    rng = np.random.RandomState(42)
    result_opt

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

from hyperopt_synthetic import run_one_exp as hyperopt_synthetic_opt
from xbbo_synthetic import run_one_exp as xbbo_synthetic_opt

max_call = 50
if __name__ == "__main__":
    rng = np.random.RandomState(42)
    result_opts = defaultdict(list)
    for i in range(3):
        seed = rng.randint(1e5)
        # result_opts['hyperopt-rand'].append(hyperopt_synthetic_opt('rand', max_call,seed))
        result_opts['hyperopt-tpe'].append(hyperopt_synthetic_opt('tpe', max_call,seed))
        # result_opts['hyperopt-atpe'].append(hyperopt_synthetic_opt('atpe', max_call,seed))
        # result_opts['hyperopt-mix'].append(hyperopt_synthetic_opt('mix', max_call,seed))
        result_opts['hyperopt-anneal'].append(hyperopt_synthetic_opt('anneal', max_call,seed))
        result_opts['XBBO-tpe'].append(xbbo_synthetic_opt('tpe', max_call,seed))
        result_opts['XBBO-anneal'].append(xbbo_synthetic_opt('anneal',m

IndexError: index out of range in self