In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the read-only input directory (in os.walk
# order) and print its full path, one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llama-3.2/transformers/1b/1/config.json
/kaggle/input/llama-3.2/transformers/1b/1/README.md
/kaggle/input/llama-3.2/transformers/1b/1/USE_POLICY.md
/kaggle/input/llama-3.2/transformers/1b/1/tokenizer.json
/kaggle/input/llama-3.2/transformers/1b/1/tokenizer_config.json
/kaggle/input/llama-3.2/transformers/1b/1/LICENSE.txt
/kaggle/input/llama-3.2/transformers/1b/1/model.safetensors
/kaggle/input/llama-3.2/transformers/1b/1/special_tokens_map.json
/kaggle/input/llama-3.2/transformers/1b/1/.gitattributes
/kaggle/input/llama-3.2/transformers/1b/1/generation_config.json
/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
%%capture

# Install and update all the necessary Python packages.
# `%%capture` (kept as the first line of the cell) silences the installers' output,
# and `%pip` installs into the environment of the running kernel.
# NOTE(review): only wandb is pinned to an exact version; every other package is
# upgraded (-U) to whatever is current at run time, so a later re-run may pick up
# different versions — consider pinning them for reproducibility.
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb==0.17.8

In [3]:
# Load the Python packages and functions for fine-tuning and evaluation
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format
from kaggle_secrets import UserSecretsClient
# Client for reading secrets attached to this Kaggle notebook; used below to fetch
# the secret named "wandb" instead of hardcoding the key.
user_secrets = UserSecretsClient()

In [4]:
# Set up Weights & Biases experiment tracking:
# 1. read the API key stored as the Kaggle secret named "wandb",
# 2. log in with that key,
# 3. open a run of type "training" in the project below.
secret_value_0 = user_secrets.get_secret("wandb")
wandb.login(key=secret_value_0)

wandb_settings = wandb.Settings(start_method="thread")
run = wandb.init(
    project='fine-tuning-llama-models',
    job_type="training",
    anonymous="allow",
    settings=wandb_settings,
)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33movertheskyy[0m ([33movertheskyy-workspaces[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Notebook-wide configuration: input locations and the output model name.

# Training CSV from the "nlp-getting-started" input dataset.
dataset_name = "/kaggle/input/nlp-getting-started/train.csv"

# Local directory containing the pre-trained Llama 3.2 1B checkpoint (base model).
base_model = "/kaggle/input/llama-3.2/transformers/1b/1"

# Name given to the new (fine-tuned) model.
new_model = "llama-3.2-1b-nlp-DisasterTweets"

In [6]:
# Sanity check: confirm that both configured input paths are present on disk
# and report the result for each.
base_model_exists, dataset_exists = map(os.path.exists, (base_model, dataset_name))

print(f"Base model directory exists: {base_model_exists}")
print(f"Dataset file exists: {dataset_exists}")

Base model directory exists: True
Dataset file exists: True


In [20]:
# # Disabled alternative: load the model with a 4-bit QLoRA config (uncomment this cell to use it)
# # Set torch dtype and attention implementation
# if torch.cuda.get_device_capability()[0] >= 8:
#     !pip install -qqq flash-attn
#     torch_dtype = torch.bfloat16
#     attn_implementation = "flash_attention_2"
# else:
#     torch_dtype = torch.float16
#     attn_implementation = "eager"

# # QLoRA config
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch_dtype,
#     bnb_4bit_use_double_quant=True,
# )
# # Load model
# model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     quantization_config=bnb_config,
#     device_map="auto",
#     attn_implementation=attn_implementation
# )

In [28]:
# Load the pre-trained causal language model from the local checkpoint directory:
# - `return_dict=True`: returns the output as a dictionary instead of a tuple.
# - `low_cpu_mem_usage=True`: reduces CPU memory usage during model loading.
# - `torch_dtype=torch.float16`: sets the model's data type to FP16 for memory efficiency.
# - `device_map="auto"`: automatically maps the model to available devices (e.g., GPU).
#
# `trust_remote_code=True` is deliberately NOT passed. That flag permits running
# Python code shipped alongside a checkpoint; the directory used here holds only
# config, tokenizer and safetensors files and Llama is a natively supported
# architecture, so the permission is unnecessary and is better left off.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
)


# Load the tokenizer stored in the same checkpoint directory as the model
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [30]:
# Load the training data from the CSV file into a DataFrame

train_data = pd.read_csv(dataset_name)

# See the structure and data types of the columns.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
train_data.info()
# Check for missing values (count of nulls per column)
print(train_data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None
id             0
keyword       61
location    2533
text           0
target         0
dtype: int64
