In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline
from datasets import Dataset
import pandas as pd
import torch

In [7]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
print(f"Using device: {device}")

Using device: mps


In [2]:
# Load and prepare the data
def prepare_dataset(csv_file):
    # Read CSV file - assuming it has a column named 'sentence'
    df = pd.read_csv(csv_file)

    # Convert DataFrame to Hugging Face Dataset
    dataset = Dataset.from_pandas(df)

    return dataset


# Tokenization function
def tokenize_function(examples):
    # Add EOS token to each sentence
    texts = [text + tokenizer.eos_token for text in examples["sentence"]]

    # Tokenize with padding
    tokenized = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Create labels (shifted input_ids)
    tokenized["labels"] = tokenized["input_ids"].clone()

    return tokenized


def generate_text(prompt, max_length=50):
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate text
    outputs = model.generate(
        inputs.input_ids,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,  # Controls randomness (higher = more random)
        top_p=0.9,  # Nucleus sampling parameter
        do_sample=True,  # Use sampling instead of greedy decoding
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

In [3]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

In [4]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [None]:

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    print("Adding padding token")
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

In [51]:
# Prepare dataset
raw_dataset = prepare_dataset("sample_dataset.csv")

# Split dataset (80% train, 20% eval)
train_test_split = raw_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)

In [52]:
print(raw_dataset[-1]["sentence"])

Minki is married to Alison Jeon, a Korean-Canadian, who studies English and philosophy at U of Toronto. They met during his exchange student program in 2018 and got married in October 2022. They have a dog named Charlie Rose, a border terrier shih tzu mix who is 5 years old


In [53]:
# Tokenize datasets
tokenized_train = train_test_split["train"].map(
    tokenize_function,
    batched=True,
    remove_columns=train_test_split["train"].column_names,
)

tokenized_eval = train_test_split["test"].map(
    tokenize_function,
    batched=True,
    remove_columns=train_test_split["test"].column_names,
)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [54]:
# Set up training arguments
from datasets import Dataset
import datetime
import os

training_args = TrainingArguments(
    output_dir=f"training_results/llama3.2-1B/{datetime.datetime.now().strftime('%m-%d-%H-%M')}",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=0,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=1,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=1000,
    load_best_model_at_end=True,
    fp16=False,
    use_cpu=True,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

In [55]:
# Start training
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=4, training_loss=2.6908903047442436, metrics={'train_runtime': 1443.1762, 'train_samples_per_second': 0.003, 'train_steps_per_second': 0.003, 'total_flos': 11958019424256.0, 'train_loss': 2.6908903047442436, 'epoch': 1.0})

In [59]:
pipe = pipeline(
    "text-generation",
    model="/Users/minkijung/Documents/Projects/LM_as_Memory/training_results/llama3.2-1B/01-16-11-09/checkpoint-4",
    torch_dtype=torch.bfloat16,
    device_map="cpu",
)

Device set to use cpu


In [62]:
result = pipe("Alison Jeon's husband is ", max_length=300, truncation=True, num_return_sequences=5, temperature=0.7)


In [63]:
for i in range(5):
    print(result[i]["generated_text"])
    print("\n")
    print("--------------------------------------------------")
    print("\n")

Alison Jeon's husband is 1.5 times bigger than she is. How much taller would he be if he were 3 times as heavy as she is?
If a person is 6 feet 2 inches tall and weighs 190 pounds, what is her weight in kilograms?
If a person is 6 feet 2 inches tall and weighs 190 pounds, what is her weight in kilograms?


--------------------------------------------------


Alison Jeon's husband is 1/2 Korean, 1/4 Chinese and 1/4 Vietnamese
Alison Jeon's husband is 1/2 Korean, 1/4 Chinese and 1/4 Vietnamese
Alison Jeon is a Korean-American writer, actress, and producer. She is best known for her role as Kim


--------------------------------------------------


Alison Jeon's husband is 25 years older than her
Alison Jeon's husband is 25 years older than her
Alison Jeon, the wife of South Korean singer and songwriter Seo


--------------------------------------------------


Alison Jeon's husband is 20 years older than her. He is 58 years old. They met when she was 15 years old and he was 35 years old.