# LLM Finetuning using AutoTrain

In this notebook, we will finetune a Qwen/Qwen2.5-7b-instruct model using AutoTrain Advanced.


In [16]:
from datasets import load_dataset

import os
os.environ['HF_ACCESS_TOKEN'] = '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'

In [17]:
import pwb_toolbox.datasets as pwb_ds

# Load a dataset, for example, "Stocks-Daily-Price"
df = pwb_ds.load_dataset("Cryptocurrencies-Daily-Price")

# Your dataset is now ready to use!

In [15]:
df

Unnamed: 0,symbol,date,open,high,low,close,volume
0,AAVE,2024-02-26,100.330000,103.930000,97.890000,102.780000,7.225922e+04
1,AAVE,2024-02-27,102.940000,105.820000,101.050000,104.800000,6.284598e+04
2,AAVE,2024-02-28,104.920000,108.730000,97.870000,104.490000,7.446371e+04
3,AAVE,2024-02-29,104.430000,114.960000,103.160000,105.850000,8.382286e+04
4,AAVE,2024-03-01,105.850000,110.690000,105.830000,110.620000,3.172164e+04
...,...,...,...,...,...,...,...
29604,ZRX,2025-02-05,0.322564,0.326608,0.304277,0.309000,8.450443e+05
29605,ZRX,2025-02-06,0.309341,0.317677,0.287485,0.291695,1.555549e+06
29606,ZRX,2025-02-07,0.291533,0.314455,0.285354,0.294516,1.401990e+06
29607,ZRX,2025-02-08,0.294000,0.314000,0.292274,0.313982,8.660158e+05


In [5]:
dataset = load_dataset('squad')

Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 1422453.28 examples/s]
Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 2333357.54 examples/s]


In [9]:
dataset_crypto = load_dataset('paperswithbacktest/Cryptocurrencies-Daily-Price')

DatasetNotFoundError: Dataset 'paperswithbacktest/Cryptocurrencies-Daily-Price' is a gated dataset on the Hub. You must be authenticated to access it.

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
bert = pd.read_parquet('/Users/shaanp/projects/testing/datasets/cryptobert-dataset.parquet')
bert.head

Unnamed: 0,text
0,**user report** :--:--:--:-- **total submissio...
1,"purchase but no positions? dont lie, you paper..."
2,post the fucking million with today’s date the...
3,congratulations! and fuck you!
4,now use that money to make a time machine to b...


In [43]:
gptcrypto = pd.read_parquet('/Users/shaanp/projects/testing/datasets/gptcrpyot-dataset.parquet')
gptcrypto.head

Unnamed: 0,text
0,**user report** :--:--:--:-- **total submissio...
1,"purchase but no positions? dont lie, you paper..."
2,post the fucking million with today’s date the...
3,congratulations! and fuck you!
4,now use that money to make a time machine to b...
...,...
3206198,when is coinbase giving us our flare drops? it...
3206199,what is time for open and close on a chart? it...
3206200,when do we think the courts will give a decisi...
3206201,can you not buy xrp on coinbase? where can i buy


In [1]:
from autotrain.params import LLMTrainingParams
from autotrain.project import AutoTrainProject

In [2]:
HF_USERNAME = "Episte"
HF_TOKEN = "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" # get it from https://huggingface.co/settings/token
# It is recommended to use secrets or environment variables to store your HF_TOKEN
# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset

In [5]:
pip --upgrade torch


Usage:   
  /Users/shaanp/projects/testing/venv/bin/python -m pip <command> [options]

no such option: --upgrade
Note: you may need to restart the kernel to use updated packages.


In [25]:
params = LLMTrainingParams(
    model="Qwen/Qwen2.5-7B-Instruct",
    project_name="autotrainqwen",
    data_path="cogneolabs/Cogneo-Crypto-Sentiment",
    train_split="train",
    text_column="description",  # Changed from 'text' to 'description'
    trainer="default",
    epochs=1,
    batch_size=2,
    lr=3e-5,
    warmup_ratio=0.1,
    gradient_accumulation=4,
    optimizer="adamw_torch",
    scheduler="linear",
    weight_decay=0.0,
    max_grad_norm=1.0,
    seed=42,
    quantization="int4",
    target_modules="all-linear",
    block_size=1024,
    model_max_length=2048,
    padding="right",
    add_eos_token=True,
    # Optional: if you want to combine columns, you can process them beforehand
    # text_column="description,title",  # This would combine both columns
    log="none",
    peft=False,  # Set to False as in the current config
    merge_adapter=False
)

If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:

```python
params = LLMTrainingParams(
    data_path="/Users/shaanp/projects/testing/0000.csv", # this is the path to folder where train.jsonl/train.csv is located
    text_column="text", # this is the column name in the CSV/JSONL file which contains the text
    train_split = "0000" # this is the filename without extension
    .
    .
    .
)
```

In [26]:
# this will train the model locally
project = AutoTrainProject(
    params=params,
    backend="local",
    process=True
)
project.create()

[1mINFO    [0m | [32m2025-02-06 17:42:46[0m | [36mautotrain.backends.local[0m:[36mcreate[0m:[36m20[0m - [1mStarting local training...[0m
[1mINFO    [0m | [32m2025-02-06 17:42:46[0m | [36mautotrain.commands[0m:[36mlaunch_command[0m:[36m514[0m - [1m['accelerate', 'launch', '--num_machines', '1', '--num_processes', '1', '--mixed_precision', 'bf16', '-m', 'autotrain.trainers.clm', '--training_config', 'autotrain-llama32-1b-finetune/training_params.json', '--mixed_precision', 'bf16', '-m', 'autotrain.trainers.clm', '--training_config', 'autotrain-llama32-1b-finetune/training_params.json', '--mixed_precision', 'bf16', '-m', 'autotrain.trainers.clm', '--training_config', 'autotrainqwen/training_params.json', '--mixed_precision', 'no', '-m', 'autotrain.trainers.clm', '--training_config', 'autotrainqwen/training_params.json', '--mixed_precision', 'no', '-m', 'autotrain.trainers.clm', '--training_config', 'autotrainqwen/training_params.json', '--mixed_precision', 'no', '-m

Traceback (most recent call last):
  File "/Users/shaanp/projects/testing/venv/bin/accelerate", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/Users/shaanp/projects/testing/venv/lib/python3.12/site-packages/accelerate/commands/accelerate_cli.py", line 48, in main
    args.func(args)
  File "/Users/shaanp/projects/testing/venv/lib/python3.12/site-packages/accelerate/commands/launch.py", line 1146, in launch_command
    args, defaults, mp_from_config_flag = _validate_launch_command(args)
                                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/shaanp/projects/testing/venv/lib/python3.12/site-packages/accelerate/commands/launch.py", line 1066, in _validate_launch_command
    raise ValueError("bf16 mixed precision requires PyTorch >= 1.10 and a supported device.")
ValueError: bf16 mixed precision requires PyTorch >= 1.10 and a supported device.


42095