In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch, platform, warnings
from datasets import load_dataset
from trl import SFTTrainer
from huggingface_hub import notebook_login
# Model id of the base checkpoint that is loaded and fine-tuned below.
# To fine-tune on the free version of Google Colab, use a sharded checkpoint
# instead, e.g. bn22/Mistral-7B-Instruct-v0.1-sharded.
base_model = "mistralai/Mistral-7B-v0.1"


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Prompt template with two positional slots: the task instruction first, then
# the expected Python answer. Built from explicit pieces so that the trailing
# space at the end of the first line stays visible.
pythonCodes_prompt = (
    "Below is an instruction that describes a code task. Write a response that \n"
    "appropriately return the python code.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)


def formatting_prompts_func(examples):
    """Render a batch of instruction/output pairs through ``pythonCodes_prompt``.

    Args:
        examples: Mapping holding parallel sequences under the keys
            ``"instruction"`` and ``"output"``.

    Returns:
        A dict with the single key ``"text"`` whose value is a list with one
        formatted prompt per pair, in input order. Pairing stops at the
        shorter of the two sequences.
    """
    pairs = zip(examples["instruction"], examples["output"])
    return {"text": [pythonCodes_prompt.format(task, answer) for task, answer in pairs]}

from datasets import load_dataset
# Load the "train" split of the instruction dataset.
dataset = load_dataset("flytech/python-codes-25k", split = "train")
# Hold out 10% of the rows as a test split. The explicit seed pins the shuffle,
# so re-running the notebook on a fresh kernel yields the same partition
# (without it, every run trains and evaluates on different rows).
dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
# Prompt templating is left disabled: the records already carry a "text" column
# (see the samples printed below). formatting_prompts_func returns a "text" key
# too, so enabling this map would replace that column with the templated prompt.
#dataset = dataset.map(formatting_prompts_func, batched = True,)
print(len(dataset["train"]))
print(len(dataset["test"]))

# Show a few training records as a sanity check of the available columns.
for i in range(5):
    print(dataset["train"][i])

Downloading readme: 100%|██████████| 3.19k/3.19k [00:00<00:00, 8.03MB/s]
Downloading data: 100%|██████████| 26.4M/26.4M [00:02<00:00, 11.7MB/s]
Downloading data: 100%|██████████| 25.4M/25.4M [00:01<00:00, 24.0MB/s]
Generating train split: 49626 examples [00:00, 101106.30 examples/s]

44663
4963
{'output': '```python\ndef reverseWords(sentence): \n  return \' \'.join(sentence.split()[::-1])\n\n# Output: "sentence a is This"\n```', 'text': 'Write a Python code that reverses the order of words in a sentence. "This is a sentence" Of course! ```python\ndef reverseWords(sentence): \n  return \' \'.join(sentence.split()[::-1])\n\n# Output: "sentence a is This"\n```', 'instruction': 'Write a Python code that reverses the order of words in a sentence. "This is a sentence"', 'input': ''}
{'output': '```python\nimport numpy as np\nfrom sklearn.neighbors import KNeighborsClassifier\n\ndata = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])\n\n# Create the model\nmodel = KNeighborsClassifier()\n\n# Train the model\nmodel.fit(data[:, 0].reshape(-1, 1), data[:, 1].reshape(-1, 1))\n\n# Output the result\nprint(model.predict([[2]])) # [[3]]\n```', 'text': 'Create a Python program to classify a given set of data using a k-nearest neighbors algorithm data = [[1, 2], [2, 3],




In [4]:
# Load base model (Mistral 7B) with 4-bit NF4 quantization; bfloat16 is used as
# the compute dtype and double quantization is turned off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0}  # put the whole model on device 0
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
# Load tokenizer.
# trust_remote_code is deliberately not enabled: the flag allows code shipped in
# the model repository to run locally and should only be set for repositories
# that actually need custom code and are trusted.
tokenizer = AutoTokenizer.from_pretrained(base_model)
# NOTE(review): reusing EOS as the padding token is a common shortcut, but a
# collator that masks padding positions in the labels would then also mask the
# real EOS token — confirm the fine-tuned model still learns to stop generating.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
# Last expression of the cell: displays both flags to confirm that BOS and EOS
# are added during tokenization.
tokenizer.add_bos_token, tokenizer.add_eos_token

Loading checkpoint shards: 100%|██████████| 2/2 [00:44<00:00, 22.36s/it]


(True, True)

In [5]:
# LoRA adapter settings: rank 16 with alpha 16, 5% dropout, no bias terms.
# Adapters are attached to the four attention projections plus gate_proj.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
)
# Prepare the quantized base model for k-bit training, then wrap it with the
# adapters. `model` is rebound to the wrapped model, so this cell should be run
# once per freshly loaded base model.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

In [6]:
# Training arguments.
# Hyperparameters should be adjusted to the hardware you are using.
training_arguments = TrainingArguments(
    # --- run length, checkpointing and logging cadence ---
    output_dir="outputs2",
    num_train_epochs=1,
    max_steps=-1,
    save_steps=5000,
    logging_steps=30,
    # --- batching: 8 samples per device, gradients accumulated over 2 steps ---
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # --- optimizer ---
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # --- learning-rate schedule ---
    # NOTE(review): the plain "constant" schedule appears to take no warmup, in
    # which case warmup_ratio below has no effect. If a warmup phase is wanted,
    # "constant_with_warmup" is probably the intended scheduler — confirm.
    lr_scheduler_type="constant",
    warmup_ratio=0.3,
    # --- mixed precision: both fp16 and bf16 are switched off ---
    fp16=False,
    bf16=False,
)
# Supervised fine-tuning trainer: trains on the "text" column of the training
# split, one example per sequence (packing off), with no explicit
# max_seq_length.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    dataset_text_field="text",
    max_seq_length=None,
    packing=False,
)

Map:   0%|          | 0/44663 [00:00<?, ? examples/s]

Map: 100%|██████████| 44663/44663 [00:04<00:00, 10105.58 examples/s]
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [7]:
# Run the fine-tuning loop with the trainer configured above. As the last
# expression of the cell, the returned TrainOutput (global step, average
# training loss, runtime metrics) is displayed when training finishes.
trainer.train()




Step,Training Loss
30,0.8716
60,0.9133
90,0.7263
120,0.812
150,0.7502
180,0.629
210,0.7577
240,0.6245
270,0.7121
300,0.6619


TrainOutput(global_step=2791, training_loss=0.6029868846681117, metrics={'train_runtime': 26876.2486, 'train_samples_per_second': 1.662, 'train_steps_per_second': 0.104, 'total_flos': 3.010660678019973e+17, 'train_loss': 0.6029868846681117, 'epoch': 1.0})

In [8]:
# Directory name under which the fine-tuned artifacts are saved.
new_model = "mistral_code"
# NOTE(review): the trained model was wrapped with get_peft_model, so this
# presumably writes only the LoRA adapter weights, not the full base model — confirm.
trainer.model.save_pretrained(new_model)

In [9]:
# Save the tokenizer files into the same `new_model` directory as the model
# artifacts; the returned tuple of written file paths is shown as cell output.
trainer.tokenizer.save_pretrained(new_model)

('mistral_code/tokenizer_config.json',
 'mistral_code/special_tokens_map.json',
 'mistral_code/tokenizer.json')

In [16]:
# Persist the trainer's state.
# NOTE(review): presumably written under the TrainingArguments output_dir
# ("outputs2") — confirm the location before relying on it for resuming.
trainer.save_state()