In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

from huggingface_hub import login
login("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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/gm3314/.cache/huggingface/token
Login successful


In [2]:
# Step1 : Accelerator
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [3]:
# Step2: Load Dataset
from datasets import load_dataset

train_dataset = load_dataset('json', data_files='train.jsonl', split='train')
eval_dataset = load_dataset('json', data_files='test.jsonl', split='train')
test_dataset = load_dataset('json', data_files='test.jsonl', split='train')

In [4]:
# Step3: Load base model
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Step4: Tokenization
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)

tokenizer.pad_token = tokenizer.eos_token

def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt =f""" Given a Title of a Research paper, give a brief summary on that paper.
### Title:
{data_point["title"]}

### Summary:
{data_point["abstract"]}
"""
    return tokenize(full_prompt)

tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

print(len(tokenized_train_dataset[1]['input_ids']))

512


In [8]:
# Out of box performance:

eval_prompt = """ Given a Title of a Research Paper in the field of Cold fusion, give out a brief summary of the research paper.
### Title: 
Give summary of Implications of Isoperibolic Electrode Calorimetry for Cold Fusion: The Silica Effect

### Summary:
"""

# Re-init the tokenizer so it doesn't add padding or eos token
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
)

model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(eval_tokenizer.decode(model.generate(**model_input, max_new_tokens=256)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 Given a Title of a Research Paper in the field of Cold fusion, give out a brief summary of the research paper.
### Title: 
Give summary of Implications of Isoperibolic Electrode Calorimetry for Cold Fusion: The Silica Effect

### Summary:

The paper is about the effect of silica on the cold fusion reaction. The paper is about the effect of silica on the cold fusion reaction. The paper is about the effect of silica on the cold fusion reaction. The paper is about the effect of silica on the cold fusion reaction. The paper is about the effect of silica on the cold fusion reaction. The paper is about the effect of silica on the cold fusion reaction. The paper is about the effect of silica on the cold fusion reaction. The paper is about the effect of silica on the cold fusion reaction. The paper is about the effect of silica on the cold fusion reaction. The paper is about the effect of silica on the cold fusion reaction. The paper is about the effect of silica on the cold fusion reaction. 

In [9]:
# Step5: Set Up Lora.
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

print(model)

trainable params: 21260288 || all params: 3773331456 || trainable%: 0.5634354746703705
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): Linear4bit(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
          

In [11]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

In [13]:
import transformers
from datetime import datetime

project = "lenr-mistral-finetune"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=500,
        learning_rate=2.5e-5, # Want about 10x smaller than the Mistral learning rate
        logging_steps=50,
        fp16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=50,                # Save checkpoints every 50 steps
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=50,               # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training
#         report_to="wandb",           # Comment this out if you don't want to use weights & baises
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
50,2.0913,1.887853
100,1.7639,1.847747
150,1.6525,1.846538
200,1.537,1.876925
250,1.3613,1.917735
300,1.2278,1.970476
350,1.0984,2.035082
400,0.9786,2.089627
450,0.8766,2.176791
500,0.8216,2.203748




TrainOutput(global_step=500, training_loss=1.3408925247192383, metrics={'train_runtime': 4040.5398, 'train_samples_per_second': 0.99, 'train_steps_per_second': 0.124, 'total_flos': 8.7637037678592e+16, 'train_loss': 1.3408925247192383, 'epoch': 11.17})

In [14]:
# Evaluation step, you can stop the kernel and start from here, no need to finetune again
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True
)

eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    trust_remote_code=True,
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [15]:
from peft import PeftModel

ft_model = PeftModel.from_pretrained(base_model, "./mistral-lenr-mistral-finetune/checkpoint-250")

In [16]:
# eval_prompt = """Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
# This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
# The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']

# ### Target sentence:
# Earlier, you stated that you didn't have strong feelings about PlayStation's Little Big Adventure. Is your opinion true for all games which don't have multiplayer?

# ### Meaning representation:
# """

# model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# ft_model.eval()
# with torch.no_grad():
#     print(eval_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

# Out of box result:

#  Given a Title of a Research Paper in the field of Cold fusion, give out a brief summary of the research paper.
# ### Title: 
# Give summary of Implications of Isoperibolic Electrode Calorimetry for Cold Fusion: The Silica Effect

# ### Summary:

# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction.
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction. 
# The paper is about the effect of silica on the cold fusion reaction.

    
    
eval_prompt = """ Given a Title of a Research Paper in the field of Cold fusion, give out a brief summary of the research paper.
### Title: 
Give summary of Implications of Isoperibolic Electrode Calorimetry for Cold Fusion: The Silica Effect

### Summary:
"""

# Re-init the tokenizer so it doesn't add padding or eos token
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
)

model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(eval_tokenizer.decode(model.generate(**model_input, max_new_tokens=256)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 Given a Title of a Research Paper in the field of Cold fusion, give out a brief summary of the research paper.
### Title: 
Give summary of Implications of Isoperibolic Electrode Calorimetry for Cold Fusion: The Silica Effect

### Summary:
Implications of isoperibolic electrode calorimetry for cold fusion are presented. The silica effect is the most important implication. The silica effect is the ability of silica to enhance the rate of low energy nuclear reactions (LENR) by a factor of 10^6. The silica effect is a necessary consequence of isoperibolic electrode calorimetry. The silica effect is the key to understanding the reproducibility problem of cold fusion. The silica effect is the key to understanding the mechanism of LENR. The silica effect is the key to achieving practical energy production.

### Detailed Description:
Implications of isoperibolic electrode calorimetry for cold fusion are presented. The silica effect is the most important implication. The silica effect is the a

In [20]:
eval_prompt = """ Given a Title of a Research Paper in the field of Cold fusion, give out a brief summary of the research paper.
### Title: 
Selection of palladium metallurgical parameters to achieve very high loading rations

### Summary:
"""

# Re-init the tokenizer so it doesn't add padding or eos token
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
)

model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(eval_tokenizer.decode(model.generate(**model_input, max_new_tokens=256)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 Given a Title of a Research Paper in the field of Cold fusion, give out a brief summary of the research paper.
### Title: 
Selection of palladium metallurgical parameters to achieve very high loading rations

### Summary:
The Pons-Fleischmann phenomenon has been observed in a highly loaded Pd system. The loading ratio of deuterium in Pd can be increased by choosing appropriate metallurgical parameters. The loading ratio can be increased by decreasing the grain size of the Pd powder and by decreasing the intergranular distance. The loading ratio can also be increased by increasing the surface to volume ratio of the Pd powder. The loading ratio can be increased by decreasing the density of the Pd powder. The loading ratio can be increased by decreasing the melting point of the Pd powder. The loading ratio can be increased by decreasing the electrical resistivity of the Pd powder. The loading ratio can be increased by decreasing the thermal conductivity of the Pd powder. The loading rati