In [None]:
# Clone the mistral-finetune repo
!git clone https://github.com/mistralai/mistral-finetune.git
!pip install -r /content/mistral-finetune/requirements.txt
!pip install huggingface_hub

# Login to Huggingface
from huggingface_hub import notebook_login

notebook_login()

Cloning into 'mistral-finetune'...
remote: Enumerating objects: 449, done.[K
remote: Counting objects: 100% (190/190), done.[K
remote: Compressing objects: 100% (88/88), done.[K
remote: Total 449 (delta 146), reused 109 (delta 101), pack-reused 259 (from 1)[K
Receiving objects: 100% (449/449), 234.50 KiB | 19.54 MiB/s, done.
Resolving deltas: 100% (230/230), done.
Collecting fire (from -r /content/mistral-finetune/requirements.txt (line 1))
  Downloading fire-0.6.0.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.4/88.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mistral-common>=1.3.1 (from -r /content/mistral-finetune/requirements.txt (line 4))
  Downloading mistral_common-1.3.3-py3-none-any.whl.metadata (4.1 kB)
Collecting torch==2.2 (from -r /content/mistral-finetune/requirements.txt (line 9))
  Downloading torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collec

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import snapshot_download
from pathlib import Path

mistral_models_path = Path.home().joinpath('mistral_models', '7B-v0.3')
mistral_models_path.mkdir(parents=True, exist_ok=True)

snapshot_download(repo_id="mistralai/Mistral-7B-Instruct-v0.3", allow_patterns=["params.json", "consolidated.safetensors", "tokenizer.model.v3"], local_dir=mistral_models_path)

!mkdir -p /content/mistral_models
!cp -r /root/mistral_models/7B-v0.3 /content/mistral_models
!rm -r /root/mistral_models/7B-v0.3

# Confirm the files have been downloaded
!ls /content/mistral_models/7B-v0.3

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

consolidated.safetensors:   0%|          | 0.00/14.5G [00:00<?, ?B/s]

tokenizer.model.v3:   0%|          | 0.00/587k [00:00<?, ?B/s]

params.json:   0%|          | 0.00/202 [00:00<?, ?B/s]

consolidated.safetensors  params.json  tokenizer.model.v3


In [None]:
# Mount Google Drive at /content/drive so that files under
# /content/drive/MyDrive can be read and written by later cells.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd

# Read the JSONL dataset (one JSON record per line) from Google Drive
file_path = '/content/drive/MyDrive/ACL/MultilingualLLMBias/GPT3.5-finetune-data/stacked_combined.jsonl'
df = pd.read_json(file_path, lines=True)

# Draw a reproducible 90% sample for training; the rows that were not drawn
# form the remaining 10% evaluation set
df_train = df.sample(frac=0.90, random_state=200)
df_eval = df.drop(index=df_train.index)

# Write each split as a .jsonl file in the current working directory
for split_frame, split_filename in (
    (df_train, "ultrachat_chunk_train.jsonl"),
    (df_eval, "ultrachat_chunk_eval.jsonl"),
):
    split_frame.to_json(split_filename, orient="records", lines=True)



In [None]:
# Create the directory that the training config reads its datasets from
!mkdir -p /content/data

# Move the train/eval splits into /content/data
# (they are validated and reformatted by a later cell, not here)
!mv ultrachat_chunk_train.jsonl /content/data/ultrachat_chunk_train.jsonl
!mv ultrachat_chunk_eval.jsonl /content/data/ultrachat_chunk_eval.jsonl

# Confirm the files exist
!ls /content/data


ultrachat_chunk_eval.jsonl  ultrachat_chunk_train.jsonl


In [None]:

# Navigate to the mistral-finetune directory.
# The working directory persists for later cells: example.yaml is written
# with a relative path and `torchrun -m train` is launched from here.
%cd /content/mistral-finetune/

# Validate and reformat the data by running the repo's utils.reformat_data
# module on each split
!python -m utils.reformat_data /content/data/ultrachat_chunk_train.jsonl
!python -m utils.reformat_data /content/data/ultrachat_chunk_eval.jsonl



/content/mistral-finetune


In [None]:
import os
import yaml

# Expose only the first GPU to child processes, with devices enumerated in
# PCI bus order
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

# Training configuration, kept as YAML text so it can be edited in place
config = """
# data
data:
  instruct_data: "/content/data/ultrachat_chunk_train.jsonl"  # Ensure correct path
  data: ""  # Optionally fill with pretraining data
  eval_instruct_data: "/content/data/ultrachat_chunk_eval.jsonl"  # Ensure correct path

# model
model_id_or_path: "/content/mistral_models/7B-v0.3"  # Ensure correct path
lora:
  rank: 64

# optim
# tokens per training steps = batch_size x num_GPUs x seq_len
# we recommend sequence length of 32768
# If you run into memory error, you can try reduce the sequence length
seq_len: 8192
batch_size: 1
num_microbatches: 8
max_steps: 100
optim:
  lr: 1.e-4
  weight_decay: 0.1
  pct_start: 0.05

# other
seed: 0
log_freq: 1
eval_freq: 100
no_eval: False
ckpt_freq: 100

save_adapters: True  # save only trained LoRA adapters. Set to `False` to merge LoRA adapter into the base model and save full fine-tuned model

run_dir: "/content/test_ultra"  # Ensure correct path
"""

# Parse the YAML text and re-serialise it to example.yaml in the current
# working directory (the YAML comments above are not carried over by the
# round trip)
with open('example.yaml', 'w') as config_file:
    parsed_config = yaml.safe_load(config)
    yaml.dump(parsed_config, config_file)



In [None]:
# Ensure the run_dir does not already exist before starting training.
# Only run this if a previous torchrun already created the /content/test_ultra directory.

#!rm -r /content/test_ultra


In [None]:

# Start training with a single worker process (--nproc-per-node 1).
# Relies on the working directory still being /content/mistral-finetune
# (set by the earlier %cd): example.yaml was written there with a relative
# path, and `-m train` is presumably resolved from the repo root — confirm.
!torchrun --nproc-per-node 1 -m train example.yaml

2024-08-15 05:01:10.061962: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-15 05:01:10.080284: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-15 05:01:10.101560: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-15 05:01:10.108083: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-15 05:01:10.123713: I tensorflow/core/platform/cpu_feature_guar

In [None]:
# Copy the training run directory (run_dir in example.yaml) to Google Drive.
# NOTE(review): if /content/drive/MyDrive/ACL/Mistral-7b already exists,
# `cp -r` places the copy at Mistral-7b/test_ultra; otherwise Mistral-7b
# itself becomes the copy — confirm which layout is intended.
!cp -r /content/test_ultra /content/drive/MyDrive/ACL/Mistral-7b

# Below is an inference example


In [None]:
!pip install mistral_inference

Collecting mistral_inference
  Downloading mistral_inference-1.3.1-py3-none-any.whl.metadata (14 kB)
Downloading mistral_inference-1.3.1-py3-none-any.whl (25 kB)
Installing collected packages: mistral_inference
Successfully installed mistral_inference-1.3.1


In [None]:
from mistral_inference.transformer import Transformer
from mistral_inference.generate import generate

from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest


# The base model files (params.json, consolidated.safetensors,
# tokenizer.model.v3) were placed in the 7B-v0.3 sub-directory, not directly
# under /content/mistral_models. Pointing at the parent directory makes the
# tokenizer load fail with "Unrecognized tokenizer file".
model_dir = "/content/mistral_models/7B-v0.3"

tokenizer = MistralTokenizer.from_file(f"{model_dir}/tokenizer.model.v3")  # change to extracted tokenizer file
model = Transformer.from_folder(model_dir)  # change to extracted model dir

# Apply the LoRA adapter saved by the training run; step 100 matches
# max_steps / ckpt_freq in example.yaml.
model.load_lora("/content/test_ultra/checkpoints/checkpoint_000100/consolidated/lora.safetensors")

completion_request = ChatCompletionRequest(messages=[UserMessage(content="Explain Machine Learning to me in a nutshell.")])

# Encode the chat request into token ids using the tokenizer's chat template
tokens = tokenizer.encode_chat_completion(completion_request).tokens

# temperature=0.0 is passed for generation; at most 64 new tokens are produced
out_tokens, _ = generate([tokens], model, max_tokens=64, temperature=0.0, eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)
result = tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0])

print(result)

TokenizerException: Unrecognized tokenizer file: /content/mistral_models/tokenizer.model.v3