In [14]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/codellama-34b-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

Unsloth currently does not work on multi GPU setups - sadly we are a 2 brother team so enabling it will require much more work, so we have to prioritize. Please understand!We do have a beta version, which you can contact us about!
Thank you for your understanding and we appreciate it immensely!

Multiple CUDA devices detected but we require a single device.
We will override CUDA_VISIBLE_DEVICES to first device: 0.
  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: NVIDIA A100-PCIE-40GB. Max memory: 39.394 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.45it/s]


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.6 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


<a name="Data"></a>
### Data Prep
We now use the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). You can replace this code section with your own data prep.

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!

If you want to use the `ChatML` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing).

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).

In [3]:
import pandas as pd
import numpy as np
from datasets import Dataset

# Load the dataset
file_path = 'training.xlsx'
df = pd.read_excel(file_path)

# Map sentiment labels
label_mapping = {'NEG': 'Negative', 'NEU': 'Neutral', 'POS': 'Positive'}
df['SENTIMENT'] = df['SENTIMENT'].map(label_mapping)

# Extract features and labels
X = np.array(df['TEXT'].tolist())
y = np.array(df['SENTIMENT'].tolist())

# Prepare the data dictionary
data_dict = {
    'X_train': X,
    'y_train': y,
}

formatted_dataset = Dataset.from_dict(data_dict)

In [4]:
prompt = """Based on the following French text content, identify the sentiment of the text from sentiments_list: ['Positive', 'Neutral', 'Negative']
If the text does not fit into any of the above mentioned sentiments, please return the answer nearest to any of the sentiments mentioned in sentiments_list.
Return only one sentiment from the sentiments_list above.

Examples:
Positive:
- "J'adore cette nouvelle fonctionnalité! Elle est vraiment incroyable et facilite tellement les choses."
- "Quel excellent service! Je suis vraiment satisfait de mon expérience."

Neutral:
- "Le produit fonctionne comme prévu."
- "Les instructions étaient claires et précises."

Negative:
- "Je suis très déçu par la qualité de ce produit. Il ne fonctionne pas correctement."
- "Le service client était très mauvais et n'a pas résolu mon problème."
### text:
{}

### category:
{}
"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    text = examples["X_train"]
    category     = examples["y_train"]
    #categories_list = examples["categories_list"]
    contents = []
    for X_train, y_train in zip(text, category):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        content = prompt.format(X_train, y_train) + EOS_TOKEN
        contents.append(content)
    return { "response" : contents, }
pass

dataset = formatted_dataset.map(formatting_prompts_func, batched = True,)

Map: 100%|███████████████████████████████████████████████████████████████| 1132/1132 [00:00<00:00, 145911.25 examples/s]


In [5]:
dataset

Dataset({
    features: ['X_train', 'y_train', 'response'],
    num_rows: 1132
})

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "response",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60, # Set num_train_epochs = 1 for full training runs
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2): 100%|████████████████████████████████████████████████████| 1132/1132 [00:00<00:00, 2604.41 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [7]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-PCIE-40GB. Max memory = 39.394 GB.
4.5 GB of memory reserved.


In [7]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,132 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 108,920,832


Step,Training Loss
1,1.645
2,1.6508
3,1.7021
4,1.6663
5,1.6203
6,1.514
7,1.1778
8,0.9657
9,0.6841
10,0.5343


In [9]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

84.2305 seconds used for training.
1.4 minutes used for training.
Peak reserved memory = 5.072 GB.
Peak reserved memory for training = 0.572 GB.
Peak reserved memory % of max memory = 12.875 %.
Peak reserved memory for training % of max memory = 1.452 %.


In [8]:
model.save_pretrained("h_anno_codellama34B")

In [9]:
tokenizer.save_pretrained("h_anno_codellama34B")

('h_anno_codellama34B/tokenizer_config.json',
 'h_anno_codellama34B/special_tokens_map.json',
 'h_anno_codellama34B/tokenizer.model',
 'h_anno_codellama34B/added_tokens.json',
 'h_anno_codellama34B/tokenizer.json')

#### Testing the model using one instance

In [6]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/codellama-34b-bnb-4bit", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

prompt = """Based on the following French text content, identify the sentiment of the text from sentiments_list: ['Positive', 'Neutral', 'Negative']
If the text does not fit into any of the above mentioned sentiments, please return the answer nearest to any of the sentiments mentioned in sentiments_list.
Return only one sentiment from the sentiments_list above.

Examples:
Positive:
- "J'adore cette nouvelle fonctionnalité! Elle est vraiment incroyable et facilite tellement les choses."
- "Quel excellent service! Je suis vraiment satisfait de mon expérience."

Neutral:
- "Le produit fonctionne comme prévu."
- "Les instructions étaient claires et précises."

Negative:
- "Je suis très déçu par la qualité de ce produit. Il ne fonctionne pas correctement."
- "Le service client était très mauvais et n'a pas résolu mon problème."
### text:
{}

### category:
{}
"""

inputs = tokenizer(
[
    prompt.format(
        "Tu es belle", # text
        "", # category
    ),
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['<s> Based on the following French text content, identify the sentiment of the text from sentiments_list: [\'Positive\', \'Neutral\', \'Negative\']\nIf the text does not fit into any of the above mentioned sentiments, please return the answer nearest to any of the sentiments mentioned in sentiments_list.\nReturn only one sentiment from the sentiments_list above.\n\nExamples:\nPositive:\n- "J\'adore cette nouvelle fonctionnalité! Elle est vraiment incroyable et facilite tellement les choses."\n- "Quel excellent service! Je suis vraiment satisfait de mon expérience."\n\nNeutral:\n- "Le produit fonctionne comme prévu."\n- "Les instructions étaient claires et précises."\n\nNegative:\n- "Je suis très déçu par la qualité de ce produit. Il ne fonctionne pas correctement."\n- "Le service client était très mauvais et n\'a pas résolu mon problème."\n### text:\nTu es belle\n\n### category:\n\n### sentiment:\nPositive\n\n### sentiment_confidence_scores:\n{\'Positive\': 0.9999999403953552, \'Neutr

#### By running the following code we will get dataset['training_result'] containing the sentiment for each text in dataset['X_train'] using our trained model

In [None]:
%%capture

import torch

if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="h_anno_codellama34B",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

training_results = [] # Create an empty list to store the results

for text in dataset['X_train']: # Iterate through each text in dataset['X_train']
    prompt_text = prompt.format(text, "") # Format the prompt with the current text
    inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda") # Tokenize the input
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True) # Generate the output
    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True) # Decode the output and extract the sentiment
    sentiment = decoded_output[0].strip()
    training_results.append(sentiment) # Append the sentiment to the results list

dataset = dataset.add_column('training_result', training_results)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [None]:
dataset

In [None]:
dataset['training_result'][0]

In [16]:
dataset['training_result'][2]

'Based on the following French text content, identify the sentiment of the text from sentiments_list: [\'Positive\', \'Neutral\', \'Negative\']\nIf the text does not fit into any of the above mentioned sentiments, please return the answer nearest to any of the sentiments mentioned in sentiments_list.\nReturn only one sentiment from the sentiments_list above.\n\nExamples:\nPositive:\n- "J\'adore cette nouvelle fonctionnalité! Elle est vraiment incroyable et facilite tellement les choses."\n- "Quel excellent service! Je suis vraiment satisfait de mon expérience."\n\nNeutral:\n- "Le produit fonctionne comme prévu."\n- "Les instructions étaient claires et précises."\n\nNegative:\n- "Je suis très déçu par la qualité de ce produit. Il ne fonctionne pas correctement."\n- "Le service client était très mauvais et n\'a pas résolu mon problème."\n### text:\nJe voudrais qu\'on arrête de se moquer de moi et ma culture\n\n### category:\n\nNeutral'

In [17]:
dataset['training_result'][3]

'Based on the following French text content, identify the sentiment of the text from sentiments_list: [\'Positive\', \'Neutral\', \'Negative\']\nIf the text does not fit into any of the above mentioned sentiments, please return the answer nearest to any of the sentiments mentioned in sentiments_list.\nReturn only one sentiment from the sentiments_list above.\n\nExamples:\nPositive:\n- "J\'adore cette nouvelle fonctionnalité! Elle est vraiment incroyable et facilite tellement les choses."\n- "Quel excellent service! Je suis vraiment satisfait de mon expérience."\n\nNeutral:\n- "Le produit fonctionne comme prévu."\n- "Les instructions étaient claires et précises."\n\nNegative:\n- "Je suis très déçu par la qualité de ce produit. Il ne fonctionne pas correctement."\n- "Le service client était très mauvais et n\'a pas résolu mon problème."\n### text:\nSalut Julie c\'est avec toi que je voulais parler\n\n### category:\n\nNeutral'

In [None]:
#if we want to save early
#dataset.to_excel('unfinetuned_Mistral.xlsx', index=False)

In [18]:
df2 = pd.DataFrame(dataset)
substring_to_remove = 'Based on the following French text content, identify the sentiment of the text from sentiments_list: [\'Positive\', \'Neutral\', \'Negative\']\nIf the text does not fit into any of the above mentioned sentiments, please return the answer nearest to any of the sentiments mentioned in sentiments_list.\nReturn only one sentiment from the sentiments_list above.\n\nExamples:\nPositive:\n- "J\'adore cette nouvelle fonctionnalité! Elle est vraiment incroyable et facilite tellement les choses."\n- "Quel excellent service! Je suis vraiment satisfait de mon expérience."\n\nNeutral:\n- "Le produit fonctionne comme prévu."\n- "Les instructions étaient claires et précises."\n\nNegative:\n- "Je suis très déçu par la qualité de ce produit. Il ne fonctionne pas correctement."\n- "Le service client était très mauvais et n\'a pas résolu mon problème."\n### text:'

df2['training_result'] = df2['training_result'].str.replace(substring_to_remove, '', regex=False) # Remove the substring from the 'training_result' column

sentiments_list = ['Negative', 'Neutral', 'Positive'] # Define the list of words to search for

def find_and_replace_sentiment(text): # Function to find and replace with the sentiment word
    for sentiment in sentiments_list:
        if sentiment in text:
            return sentiment
    return None

df2['training_result'] = df2['training_result'].apply(find_and_replace_sentiment) # Apply the function to the 'training_result' column


In [19]:
df2

Unnamed: 0,X_train,y_train,response,training_result
0,façon j'veux ps d'une crasseuse comme toi,Negative,"Based on the following French text content, id...",Negative
1,Claire c'est pas le mot,Neutral,"Based on the following French text content, id...",Neutral
2,Je voudrais qu'on arrête de se moquer de moi e...,Positive,"Based on the following French text content, id...",Neutral
3,Salut Julie c'est avec toi que je voulais parler,Neutral,"Based on the following French text content, id...",Neutral
4,la vie t'es fat ima,Negative,"Based on the following French text content, id...",Negative
...,...,...,...,...
1127,tu kleh ou quoi,Neutral,"Based on the following French text content, id...",Negative
1128,sale feuj,Negative,"Based on the following French text content, id...",Negative
1129,juif de mort,Negative,"Based on the following French text content, id...",Negative
1130,amine vient 1v1 devant le lycéé,Negative,"Based on the following French text content, id...",Neutral


In [20]:
df2.to_excel('finetuned_Codellama34B.xlsx', index=False) #Save finally