In [1]:
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [2]:
# Map each split name to its plain-text source file, then load both splits
# through the "text" dataset builder.
split_files = {"train": "train_c_code.txt", "test": "test_c_code.txt"}
dataset = load_dataset("text", data_files=split_files)

Found cached dataset text (/home/moneebullah25/.cache/huggingface/datasets/text/default-6f7851ccb47a0532/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Display the loaded DatasetDict to check the available splits, their
# feature columns and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3561
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2677
    })
})

In [4]:
# Fetch the pre-trained GPT-2 checkpoint. The tokenizer and the LM-head model
# are both resolved from the same checkpoint identifier.
model_name = "gpt2"  # any other GPT-2 variant identifier can be substituted
tokenizer, model = (
    GPT2Tokenizer.from_pretrained(model_name),
    GPT2LMHeadModel.from_pretrained(model_name),
)

In [5]:
# Build the training set: TextDataset is handed the tokenizer, the path of the
# raw training text and the block size to use.
# NOTE(review): TextDataset is reportedly deprecated in recent transformers
# releases in favour of the `datasets` library — confirm against the installed
# version before relying on it long term.
TRAIN_FILE = "train_c_code.txt"
BLOCK_SIZE = 128  # adjust as needed
train_dataset = TextDataset(tokenizer=tokenizer, file_path=TRAIN_FILE, block_size=BLOCK_SIZE)



In [6]:
# Collator used by the Trainer to assemble batches.
# mlm=False: masked language modeling is switched off (set it to True to
# enable masked language modeling instead).
collator_options = {"tokenizer": tokenizer, "mlm": False}
data_collator = DataCollatorForLanguageModeling(**collator_options)

In [7]:
# Training configuration. The values are gathered in one dict so they can be
# inspected or tweaked before the TrainingArguments object is built from them.
training_config = dict(
    output_dir="./code_completion_model",
    overwrite_output_dir=True,
    num_train_epochs=5,             # adjust as needed
    per_device_train_batch_size=4,  # adjust as needed
    save_steps=10_000,
    save_total_limit=2,
)
training_args = TrainingArguments(**training_config)

In [8]:
# Wire the model, its training configuration, the batching collator and the
# training set into a single Trainer instance.
trainer = Trainer(model=model, args=training_args,
                  data_collator=data_collator, train_dataset=train_dataset)

In [9]:
# Run fine-tuning. The value returned by train() is bound to a name so it
# stays available to later cells; the bare name on the last line displays it
# as the cell output.
# NOTE(review): the output saved with this cell is a RuntimeError raised from
# PyTorch's CUDA driver API (driver_api.cpp), and the cells below were never
# executed in the recorded run — verify the CUDA/driver setup before re-running.
train_output = trainer.train()
train_output

RuntimeError: handle_0 INTERNAL ASSERT FAILED at "/opt/conda/conda-bld/pytorch_1695392035891/work/c10/cuda/driver_api.cpp":15, please report a bug to PyTorch. 

In [None]:
# Persist the fine-tuned model and the tokenizer into the same directory so
# that both can later be reloaded from a single path.
FINE_TUNED_DIR = "./fine_tuned_code_completion_model"
model.save_pretrained(FINE_TUNED_DIR)
tokenizer.save_pretrained(FINE_TUNED_DIR)

In [None]:
# Reload the fine-tuned tokenizer and model from disk. From here on
# `model_name` holds a local directory path rather than a checkpoint identifier.
model_name = "./fine_tuned_code_completion_model"  # replace with the path to your fine-tuned model
tokenizer, model = (
    GPT2Tokenizer.from_pretrained(model_name),
    GPT2LMHeadModel.from_pretrained(model_name),
)

In [None]:
# Put the model into evaluation mode before generating.
model.eval()

# Prompt that the model is asked to complete.
prompt = "void main() {"

# Calling the tokenizer (instead of .encode) returns the attention mask along
# with the token ids; both tensors are moved to the device the model lives on
# so generation works regardless of where the model was placed.
encoded_prompt = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = encoded_prompt["input_ids"]

# Adjust the max length of the generated sequence as needed
max_length = 100

# Generate the sample.
# The attention mask and pad token id are passed explicitly: without them
# generate() has to guess both and emits warnings. The tokenizer's
# end-of-sequence token is used as the padding id, which is the value
# generate() would otherwise fall back to on its own.
output = model.generate(
    input_ids,
    attention_mask=encoded_prompt["attention_mask"],
    max_length=max_length,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the first (and only) returned sequence back to text, dropping any
# special tokens.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Code:")
print(generated_text)