# Installation des librairies manquantes

In [None]:
# Remove the transformers build that is already present, then install it again
# unpinned together with the other libraries imported further down
# (bitsandbytes for BitsAndBytesConfig, einops, peft for LoRA).
! pip uninstall transformers -y
! pip install transformers
! pip install bitsandbytes
! pip install einops
! pip install peft
# trl is intentionally left out: the SFTTrainer import below is disabled too.
#! pip install trl

In [None]:
# Bug that depends on the datasets version: a more recent version than the one
# pre-installed in the environment is required, so it is replaced by a pinned release.
! pip uninstall datasets -y
! pip install datasets==2.13.1

In [None]:
# Sanity check after the reinstall cells: show which versions are actually active.
import transformers
import datasets

print(transformers.__version__, datasets.__version__, sep="\n")

# Restaurant dataset

In [None]:
import einops
import torch
from datasets import load_dataset,Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig,get_peft_model
#from trl import SFTTrainer,DataCollatorForCompletionOnlyLM

In [None]:
# Download the restaurant Q&A dataset and keep the first 877 rows of its train split.
restaurant_splits = load_dataset("Argen7um/restrant-qa")
dataset0 = restaurant_splits['train'].select(range(877))
dataset0

In [None]:
# Display the raw prompt of the first example (row access instead of column access).
dataset0[0]['Prompt']

In [None]:
# Instruction paragraph (up to and including the "[context]:" tag) that is
# stripped from every prompt before tokenization.
removed_text = '\nUsers may communicate with you in English or Chinese, and you should respond in the language in which the question was asked. \nEnsure to maintain a hospitable and supportive tone, embodying the warm and welcoming spirit of our restaurant. Your goal is to enhance the dining experience of our guests by facilitating seamless and delightful interactions, ensuring they receive accurate and helpful information to make their dining experience memorable\n[context]:'
# Reference copy of prompt text that is NOT part of removed_text
# (presumably the portion of the system prompt that stays in place):
#"You possess extensive knowledge about the restaurant’s menu, operating hours, reservation policies, food ingredients, special diet accommodations, and health and safety practices.'
#Feel free to assist the users by providing informative and friendly responses to questions about restaurant locations, directions, parking facilities, and public transport access. Additionally, you are adept at explaining menu items, offering suggestions, explaining dish ingredients, and helping with special dietary requests such as vegetarian, vegan, gluten-free, and allergy-specific needs.

# Read the column once instead of once per row, and iterate over whatever
# dataset0 contains rather than a hard-coded row count.
raw_prompts = dataset0['Prompt']
text = [raw_prompt.replace(removed_text, '') for raw_prompt in raw_prompts]

In [None]:
import pandas as pd

# Put the cleaned prompts in a single-column frame and convert it to a Dataset.
dataset_df = pd.DataFrame({'Prompt': text})
dataset1 = Dataset.from_pandas(dataset_df)
dataset1

In [None]:
# Print the first three cleaned prompts, each preceded by two blank lines.
for row_idx in range(3):
    print('\n', dataset1['Prompt'][row_idx], sep='\n')

In [None]:
# Disabled alternative preprocessing, kept as a string literal so that none of
# it runs: it would split each prompt into a question column and an answer
# column and rebuild dataset1 from them.
# NOTE(review): the snippet indexes dataset0['train'][i], but dataset0 is
# already the 'train' split where it is created above, so this would need
# adjusting before being re-enabled.
"""
question = []
response = []
for i in range(877):
    question.append(dataset0['train'][i]['Prompt'].split('[question]:')[1].split('[answer]:')[0].replace(' [/INST]\n',''))
    response.append(dataset0['train'][i]['Prompt'].split('[question]:')[1].split('[answer]:')[1].replace('</s>',''))
import pandas as pd
data = pd.DataFrame(columns = ['question','answer'])
data['question'] = question
data['answer'] = response
data

dataset1 = Dataset.from_pandas(data)
dataset1
"""

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the base model; padding is done on the right and the
# end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize(element):
    """Tokenize the ``"Prompt"`` entry of ``element`` with the notebook tokenizer.

    Sequences are truncated to at most 2048 tokens and the tokenizer is told
    not to add special tokens.

    Args:
        element: Mapping holding the prompt text(s) under the key ``"Prompt"``.

    Returns:
        The object returned by the tokenizer call for those prompts.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 2048,
        "add_special_tokens": False,
    }
    encoded = tokenizer(element["Prompt"], **tokenizer_options)
    return encoded

# Tokenize the whole dataset in batches; the raw "Prompt" strings are dropped
# because only the token columns are used from here on.
dataset_tokenized = dataset1.map(
    tokenize,
    remove_columns=["Prompt"],
    batched=True,
    # (disabled) num_proc=os.cpu_count() would run the mapping in parallel
)

In [None]:
# Inspect how the '[answer]' marker is tokenized and what token id 24115
# decodes to (24115 is the id the collate function looks for).
answer_marker_encoding = tokenizer('[answer]')
decoded_24115 = tokenizer.decode([24115])
answer_marker_encoding, decoded_24115

In [None]:
def collate(elements, *, pad_token_id=None, answer_token_id=24115,
            answer_offset=2, ignore_index=-100):
    """Build a padded causal-LM batch whose loss covers only the answer tokens.

    Every sample keeps its full token sequence (question AND answer) as
    ``input_ids`` so that ``labels`` has the same shape and stays aligned
    position by position, as causal-LM training expects. Label positions
    belonging to the prompt (everything before the answer) and to the padding
    are set to ``ignore_index`` so they do not contribute to the loss.

    Args:
        elements: Iterable of samples, each a mapping with an ``"input_ids"``
            sequence of token ids.
        pad_token_id: Id used to right-pad ``input_ids``. Defaults to the
            notebook tokenizer's ``pad_token_id``.
        answer_token_id: Token id marking the start of the answer section
            (24115 is the id inspected in the cell above).
        answer_offset: Number of tokens, counted from the marker token, to skip
            before the answer text starts.
        ignore_index: Label value for positions excluded from the loss.

    Returns:
        dict with ``"input_ids"``, ``"labels"`` and ``"attention_mask"``
        tensors, all of shape (batch, longest sequence in the batch).

    Raises:
        ValueError: If a sample does not contain ``answer_token_id`` (for
            instance because the answer marker was truncated away).
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    token_lists = [list(e["input_ids"]) for e in elements]
    tokens_maxlen = max(len(t) for t in token_lists)  # length of longest input
    input_ids, labels, attention_masks = [], [], []

    for tokens in token_lists:
        # Locate the start of the answer inside this sample.
        try:
            marker_pos = tokens.index(answer_token_id)
        except ValueError as err:
            raise ValueError(
                f"answer marker token {answer_token_id} not found in sample; "
                "cannot separate the question from the answer"
            ) from err
        # Clamp so a marker sitting at the very end cannot overrun the sample.
        answer_start = min(marker_pos + answer_offset, len(tokens))
        pad_len = tokens_maxlen - len(tokens)

        # The model sees the whole sequence, right-padded to the batch length.
        input_ids.append(tokens + [pad_token_id] * pad_len)
        # Only the answer tokens are supervised; prompt and padding are masked.
        labels.append(
            [ignore_index] * answer_start
            + tokens[answer_start:]
            + [ignore_index] * pad_len
        )
        attention_masks.append([1] * len(tokens) + [0] * pad_len)

    batch = {
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels),
        "attention_mask": torch.tensor(attention_masks),
    }
    return batch

In [None]:
# Disabled earlier variant of collate(), kept as a string literal so that it
# is not executed: it pads the full token sequence for input_ids while the
# labels hold only the tokens after the answer marker.
"""
def collate(elements):
    tokenlist=[e["input_ids"] for e in elements]
    tokens_maxlen=max([len(t) for t in tokenlist])  # length of longest input
    input_ids,labels,attention_masks = [],[],[]
    
    for tokens in tokenlist:
        
        # how many pad tokens to add for this sample
        index_answer = tokens.index(24115)
        pad_len=tokens_maxlen-len(tokens)
        pad_len_answer = tokens_maxlen-len(tokens[index_answer+2:])
        
        # pad input_ids with pad_token, labels with ignore_index (-100) and set attention_mask 1 where content, otherwise 0
        input_ids.append( tokens + [tokenizer.pad_token_id]*pad_len )   
        labels.append( tokens[index_answer+2:] + [tokenizer.pad_token_id]*pad_len_answer )    
        attention_masks.append( [1]*len(tokens) + [0]*pad_len ) 

    batch={
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels),
        "attention_mask": torch.tensor(attention_masks)
    }
    return batch
    
"""

# Téléchargement du modèle pré-entraîné

In [None]:
# BitsAndBytes enables fine-tuning with quantization, which reduces the memory
# footprint and the amount of computation.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the pre-trained base model. The quantization config is passed
# explicitly: without it bnb_config is never used and the weights are loaded
# unquantized in float16.
# (A smaller alternative used for experiments: "facebook/opt-350m".)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,  # torch.bfloat16 is the alternative
    trust_remote_code=True,
)

## Configuration du PEFT LoRA

In [None]:
# LoRA hyper-parameters.
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64  # 128 is the alternative that was tried

# The adapter targets must be module names that exist in the loaded model.
# The Hugging Face Mistral implementation names its attention projections
# q_proj / k_proj / v_proj / o_proj; the previous "Wqkv" and "out_proj"
# entries match nothing there, which left the attention layers without any
# adapter. The MLP targets are unchanged.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "up_proj",
        "down_proj",
    ],
)


In [None]:
# Wrap the base model with the LoRA adapters described by peft_config.
model = get_peft_model(model=model, peft_config=peft_config)

In [None]:
# --- Output location ---------------------------------------------------------
output_dir = "/kaggle/working/"

# --- Batch geometry: 1 sample per device, gradients accumulated over 16 steps -
per_device_train_batch_size = 1
gradient_accumulation_steps = 16

# --- Optimisation --------------------------------------------------------------
optim = "paged_adamw_32bit"
learning_rate = 1e-4
lr_scheduler_type = "linear"
warmup_ratio = 0.03
max_grad_norm = 0.3
max_steps = 55 * 14

# --- Logging / checkpoint cadence ----------------------------------------------
# save_steps is defined but not passed below: checkpoint saving is turned off.
logging_steps = 55
save_steps = 55

training_arguments = TrainingArguments(
    # where and what to write
    output_dir=output_dir,
    report_to='none',
    save_strategy='no',  # 'epoch' is the alternative that was tried
    save_total_limit=1,
    logging_steps=logging_steps,
    # batching
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=True,
    # numerics and optimisation
    fp16=True,
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    # no evaluation strategy is configured for this run
)

In [None]:
# Assemble the Trainer from the LoRA-wrapped model, the tokenized prompts and
# the custom collate function; no evaluation dataset is supplied.
trainer = Trainer(
    args=training_arguments,
    model=model,
    train_dataset=dataset_tokenized,
    data_collator=collate,
    tokenizer=tokenizer,
)

In [None]:
# Run the fine-tuning loop with the trainer built in the previous cell.
trainer.train()

In [None]:
# Save the trained model into the Kaggle working directory.
trainer.save_model('/kaggle/working/')

In [None]:
# Disabled inference example; nothing in this cell is executed.
# The first string literal holds code that would reload the saved adapter
# model and its tokenizer from /kaggle/input/mistral with 4-bit loading.
"""
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained("/kaggle/input/mistral", 
                                                  load_in_4bit=True
                                                )
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/mistral")
"""

# The sample prompt below is commented out line by line, so `text` is not
# redefined by this cell.
#text = """Below is a question from Human. Write a response.\n    
### Question:\n Is it recommended to base clinical and therapeutic decisions solely on ChatGPT's
#knowledge.?\n    
### Response:"""
# Second string literal: code that would generate up to 50 new tokens for
# `text` on the GPU and print the decoded output.
"""
inputs = tokenizer(text, return_tensors="pt").to('cuda')
out = model.generate(**inputs, max_new_tokens=50)

print(tokenizer.decode(out[0]))
"""