In [None]:
%pip install transformers

In [1]:
%pip install -q bitsandbytes datasets accelerate loralib

Note: you may need to restart the kernel to use updated packages.


Fine-tuning LLMs with PEFT and LoRA
https://www.youtube.com/watch?v=Us5ZFp16PaU

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import torch

# Check if CUDA (GPU) is available
if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("No GPU found.")

In [None]:
# Freezing the original weights:

for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    def forward(self, x): return super().forward(x).to(torch.float32)

model.lm_head = CastOutputToFloat(model.lm_head)

In [None]:
# Setting up the LoRa Adapters
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()

    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable: {100 * trainable_params / all_param}%"
    )


In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]= "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM,BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    #  "bigscience/bloom-560m",
    "bigscience/bloom-7b1",
    #quantization_config=quantization_config,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-7b1")


model.safetensors.index.json:   0%|          | 0.00/28.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

ValueError: You are trying to offload the whole model to the disk. Please use the `disk_offload` function instead.

In [3]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Sample data (you can replace this with your actual dataset)
data = {
    'text': ["This is a question", "How are you?", "What is your name?", 
             "Tell me about yourself", "What is your favorite color?"],
    'label': [0, 1, 1, 0, 1]  # Sample labels
}
df = pd.DataFrame(data)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Text preprocessing with TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(kernel='linear'),
    'Random Forest': RandomForestClassifier(n_estimators=100)
}

# Train, predict, and evaluate models
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    
    # Evaluate model
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("-" * 50)

Model: Logistic Regression
Accuracy: 0.0000
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0

--------------------------------------------------
Model: SVM
Accuracy: 0.0000
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0

--------------------------------------------------
Model: Random Forest
Accuracy: 0.0000
Classification Report:
              precision    recall  f1-score   support

          

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier



from transformers import AutoTokenizer


mLPClassifier = MLPClassifier()
randomForestClassifier = RandomForestClassifier()




# LabelEncoder

In [4]:
from sklearn.preprocessing import LabelEncoder

labels = ['positive', 'negative', 'neutral', 'positive', 'negative']
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Display original and encoded labels
print("Original Labels: ", labels)
print("Encoded Labels: ", encoded_labels)   

# To decode the numerical labels back to original form
decoded_labels = label_encoder.inverse_transform(encoded_labels)
print("Decoded Labels: ", decoded_labels)

Original Labels:  ['positive', 'negative', 'neutral', 'positive', 'negative']
Encoded Labels:  [2 0 1 2 0]
Decoded Labels:  ['positive' 'negative' 'neutral' 'positive' 'negative']


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
