In [1]:
!pip install datasets==3.6.0

Collecting datasets==3.6.0
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-3.6.0


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, pipeline
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, TaskType

In [3]:
# -----------------------------
# 1️⃣ Model & Tokenizer
# -----------------------------
MODEL_NAME = "EleutherAI/gpt-neo-125M"  # small model for 6GB VRAM
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # GPT models need pad_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [4]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Model is on device: {device}")

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Model is on device: cuda


In [18]:
# -----------------------------
# 2️⃣ Load Dataset
# -----------------------------
# Using a small subset of CodeSearchNet Python for testing
dataset = load_dataset("code_search_net", "python", split="train[:1%]")  # 1% for quick training

In [19]:
dataset

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 4122
})

In [20]:
dataset[0]

{'repository_name': 'proycon/pynlpl',
 'func_path_in_repository': 'pynlpl/formats/folia.py',
 'func_name': 'AbstractElement.addidsuffix',
 'whole_func_string': 'def addidsuffix(self, idsuffix, recursive = True):\n        """Appends a suffix to this element\'s ID, and optionally to all child IDs as well. There is sually no need to call this directly, invoked implicitly by :meth:`copy`"""\n        if self.id: self.id += idsuffix\n        if recursive:\n            for e in self:\n                try:\n                    e.addidsuffix(idsuffix, recursive)\n                except Exception:\n                    pass',
 'language': 'python',
 'func_code_string': 'def addidsuffix(self, idsuffix, recursive = True):\n        """Appends a suffix to this element\'s ID, and optionally to all child IDs as well. There is sually no need to call this directly, invoked implicitly by :meth:`copy`"""\n        if self.id: self.id += idsuffix\n        if recursive:\n            for e in self:\n          

In [21]:
dataset['whole_func_string'][0]

'def addidsuffix(self, idsuffix, recursive = True):\n        """Appends a suffix to this element\'s ID, and optionally to all child IDs as well. There is sually no need to call this directly, invoked implicitly by :meth:`copy`"""\n        if self.id: self.id += idsuffix\n        if recursive:\n            for e in self:\n                try:\n                    e.addidsuffix(idsuffix, recursive)\n                except Exception:\n                    pass'

In [22]:
# Build input-output pairs: docstring + function name + language as input, code as target
inputs, outputs = [], []
for example in dataset:
    docstring = example.get('func_documentation_string', '')
    func_name = example.get('func_name', '')
    language = example.get('language', 'python')
    code = example['whole_func_string']

    # Input: docstring + function name + language
    input_text = f"# Language: {language}\n# Function: {func_name}\n\"\"\"{docstring}\"\"\"\n{func_name}("
    output_text = code[len(func_name):]  # rest of function code after the name
    inputs.append(input_text)
    outputs.append(output_text)

In [23]:
# -----------------------------
# 4️⃣ Tokenize Dataset
# -----------------------------
import torch

MAX_LEN = 512
tokenized_texts = []

for inp, out in zip(inputs, outputs):
    enc = tokenizer(inp + out, truncation=True, padding="max_length", max_length=MAX_LEN)
    enc["labels"] = enc["input_ids"].copy()  # important!
    # mask padding
    enc["labels"] = [l if l != tokenizer.pad_token_id else -100 for l in enc["labels"]]
    tokenized_texts.append(enc)

full_dataset = Dataset.from_dict({
    "input_ids": [t["input_ids"] for t in tokenized_texts],
    "attention_mask": [t["attention_mask"] for t in tokenized_texts],
    "labels": [t["labels"] for t in tokenized_texts],  # <- add labels
})


In [24]:
# -----------------------------
# 5️⃣ Train / Eval Split
# -----------------------------
split_index = int(0.9 * len(full_dataset))
train_dataset = full_dataset.select(range(split_index))
eval_dataset = full_dataset.select(range(split_index, len(full_dataset)))

In [25]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal LM, not masked LM
)


In [26]:
# -----------------------------
# 4️⃣ LoRA Configuration
# -----------------------------
lora_config = LoraConfig(
    r=16,  # low-rank for low VRAM
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # attention layers only
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM
)

In [27]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only LoRA params are trainable

trainable params: 589,824 || all params: 125,788,416 || trainable%: 0.4689




In [28]:
# -----------------------------
# 5️⃣ Training Arguments
# -----------------------------
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    fp16=True if device=="cuda" else False,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none"
)

In [29]:
# -----------------------------
# 6️⃣ Trainer
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)



In [30]:
# -----------------------------
# 9️⃣ Train
# -----------------------------
trainer.train()

Step,Training Loss,Validation Loss
100,1.3253,1.432426
200,1.237,1.394757
300,1.27,1.356186
400,1.2126,1.332668
500,1.2059,1.319527
600,1.2674,1.309617
700,1.2477,1.301351
800,1.2168,1.295668
900,1.1311,1.291234
1000,1.1475,1.287957


TrainOutput(global_step=1392, training_loss=1.2111523399750392, metrics={'train_runtime': 812.1756, 'train_samples_per_second': 13.7, 'train_steps_per_second': 1.714, 'total_flos': 2926612821049344.0, 'train_loss': 1.2111523399750392, 'epoch': 3.0})

In [31]:
# -----------------------------
# 10️ Save Model & Tokenizer
# -----------------------------
model.save_pretrained("./lora_code_model")
tokenizer.save_pretrained("./lora_code_model")
print("LoRA fine-tuning complete! Saved at ./lora_code_model")

LoRA fine-tuning complete! Saved at ./lora_code_model


In [32]:
prompt = "# Language: python\n# Function: add_numbers\n\"\"\"Adds two numbers\"\"\"\nadd_numbers("
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# generation
outputs = model.generate(
    **inputs,
    max_new_tokens=100,  # give more space for full function
    do_sample=True,
    temperature=0.3,     # less randomness
    top_p=0.9,
)


generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_code)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# Language: python
# Function: add_numbers
"""Adds two numbers"""
add_numbers(numbers):
        """Adds two numbers"""
        numbers = [numbers[0] for n in numbers]
        if len(numbers) < 2:
            raise ValueError("numbers must be integers")
        if len(numbers) > 2:
      


In [34]:
from huggingface_hub import notebook_login
notebook_login()
# Push to Hub (your username/repo_name)
repo_name = "Noobhacker69/Coding-CasualLLM-LORA"  # you choose the name
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:  24%|##3       |  567kB / 2.37MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Noobhacker69/Coding-CasualLLM-LORA/commit/856bbe7d23d6682fe5f99cd5bae38c63d434b43c', commit_message='Upload tokenizer', commit_description='', oid='856bbe7d23d6682fe5f99cd5bae38c63d434b43c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Noobhacker69/Coding-CasualLLM-LORA', endpoint='https://huggingface.co', repo_type='model', repo_id='Noobhacker69/Coding-CasualLLM-LORA'), pr_revision=None, pr_num=None)