## Testing DeepSeek Loading

In [2]:
import argparse
import multiprocessing
import os

import torch
import transformers
from accelerate import PartialState
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    logging,
    set_seed,
)
from trl import SFTTrainer

# Quantization settings handed to `from_pretrained`: 4-bit NF4 storage with
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# LoRA adapter settings: rank 8, applied to the modules whose names are
# listed below, for a causal language-modeling task.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=(
        ["q_proj", "o_proj", "k_proj", "v_proj"]
        + ["gate_proj", "up_proj", "down_proj"]
    ),
    r=8,
)
# Load the model on one target device, then freeze every parameter except
# the token embeddings and the first layer.
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
# GPU index to use when the machine has that many devices.
PREFERRED_CUDA_INDEX = 2

# Optional Hugging Face access token; None falls back to anonymous access.
token = os.environ.get("HF_TOKEN", None)
# print out selected device
print(PartialState().process_index)
# Check for GPU availability
use_cuda = torch.cuda.is_available()
print(use_cuda)

# Arguments shared by the GPU and CPU paths, so the load call exists once.
load_kwargs = {
    "quantization_config": bnb_config,
    "attention_dropout": 0.1,
    "trust_remote_code": True,
    "token": token,
}

if use_cuda:
    print("GPU")
    gpu_count = torch.cuda.device_count()
    # Fall back to GPU 0 when the preferred index does not exist here;
    # "cuda:2" is an invalid device ordinal on machines with < 3 GPUs.
    gpu_index = PREFERRED_CUDA_INDEX if PREFERRED_CUDA_INDEX < gpu_count else 0
    device = torch.device(f"cuda:{gpu_index}")
    # Place the weights at load time instead of calling `model.to(...)`
    # afterwards: some transformers versions reject `.to()` on
    # bitsandbytes-quantized models.
    load_kwargs["device_map"] = {"": gpu_index}
else:
    # Handle no GPU availability
    print("No GPU")
    gpu_count = 0
    device = torch.device("cpu")
    # NOTE(review): the 4-bit bitsandbytes config is still passed here, as in
    # the original; whether quantized loading works without CUDA depends on
    # the installed bitsandbytes/transformers versions -- confirm.

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)

if gpu_count > 1:
    print("Using DataParallel for multiple GPUs")
    # DataParallel requires the module's parameters to live on device_ids[0],
    # so the chosen device is listed first and used as the output device.
    ordered_ids = [gpu_index] + [i for i in range(gpu_count) if i != gpu_index]
    model = torch.nn.DataParallel(
        model,
        device_ids=ordered_ids,
        output_device=gpu_index,
    )

# Freeze all except embeddings and first layer.
# The trailing dot keeps "model.layers.0." from matching any other layer index.
TRAINABLE_NAME_PARTS = ("model.embed_tokens", "model.layers.0.")
for name, param in model.named_parameters():
    wanted = any(part in name for part in TRAINABLE_NAME_PARTS)
    # Only floating-point tensors can require gradients; presumably the 4-bit
    # quantized weights are stored as integer tensors, so they stay frozen
    # rather than raising when the flag is set.
    param.requires_grad = wanted and param.is_floating_point()
ModuleNotFoundError: No module named 'peft'