In [None]:
!pip install transformers datasets
from huggingface_hub import notebook_login

# Log in to Hugging Face
notebook_login()

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint identifier shared by the tokenizer and model loads below
model_name = "gpt2"

# GPT-2 tokenizer first, then the causal language model for the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


In [None]:
!pip install transformers datasets
from datasets import load_dataset

# Load wikitext dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Preview a sample
print(dataset["train"][50])


In [None]:
from datasets import load_dataset # Import load_dataset

# Padding requires a pad token; reuse the EOS token for it
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples, max_length=None):
    """Tokenize the ``text`` field of a batch with fixed-length padding.

    Args:
        examples: Batch handed over by ``dataset.map``; only
            ``examples['text']`` is read.
        max_length: Optional cap on the padded/truncated sequence length,
            forwarded to the tokenizer. ``None`` (the default) keeps the
            tokenizer's own default length, which matches the previous
            behavior. A smaller value can be supplied through
            ``dataset.map(..., fn_kwargs={"max_length": 128})``.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples['text'],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


# Load the wikitext dataset only if an earlier cell has not already done so
try:
    dataset
except NameError:
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Tokenize every split in batches
tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
# Keep the pad token id in the model config equal to the EOS token id
model.config.pad_token_id = model.config.eos_token_id

# Example prompt
prompt = "How to do multiplication"

# Tokenize the prompt into PyTorch tensors (token ids plus attention mask).
# This step is tokenization; embedding happens inside the model.
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

# Generate a continuation. The pad token id is passed to generate() directly,
# the same way the later generation cell does, instead of relying only on the
# model config.
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=50,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the first (only) generated sequence and print it
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
# Inspect the tokenizer output for the prompt; its "input_ids" and
# "attention_mask" entries are what the generate() call reads.
print(inputs)

## Exporting to Hugging Face

In [None]:
# Upload the model weights/config, then the tokenizer files, to the same Hub repo
model.push_to_hub(repo_id="my-awesome-llm")
tokenizer.push_to_hub(repo_id="my-awesome-llm")


## Importing our Tokenizer and Testing

In [None]:
from transformers import AutoTokenizer

# Pull the published tokenizer back down from the Hub
tokenizer = AutoTokenizer.from_pretrained("rveeee/my-awesome-llm")

# Encode a short sentence and show both the ids and the token strings
sample_text = "Hello, how are you?"
tokens = tokenizer(sample_text)
print(f"Token IDs: {tokens['input_ids']}")
print(f"Tokens: {tokenizer.convert_ids_to_tokens(tokens['input_ids'])}")


## Importing our Model and Testing

In [None]:
# Import both classes used in this cell so it does not depend on an import
# made by an earlier cell.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer from the Hub repository
model = AutoModelForCausalLM.from_pretrained("rveeee/my-awesome-llm")
tokenizer = AutoTokenizer.from_pretrained("rveeee/my-awesome-llm")

# Prepare input text as PyTorch tensors
input_text = "Once upon a time"
inputs = tokenizer(input_text, return_tensors="pt", padding=True)

# Keep the attention mask so generate() receives it explicitly
attention_mask = inputs["attention_mask"]

# Generate text
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=attention_mask,  # Pass the attention mask
    max_length=50,
    pad_token_id=tokenizer.eos_token_id,  # Use EOS as the pad token id
)

# Decode the first generated sequence
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:", generated_text)



In [None]:
# High-level alternative: let a pipeline wrap model and tokenizer loading
from transformers import pipeline

pipe = pipeline(task="text-generation", model="rveeee/my-awesome-llm")

In [None]:
# Low-level alternative: load the tokenizer and the model explicitly
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="rveeee/my-awesome-llm"
)
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="rveeee/my-awesome-llm"
)