# Home Exercise on Language Modeling

Implement a **transformer** to solve the **Language Modeling** task.

- **Data**: [wikitext-103](https://huggingface.co/datasets/Salesforce/wikitext)
- **Loading with Hugging Face Datasets**:
  ```python
  from datasets import load_dataset
  dataset = load_dataset("wikitext", "wikitext-103-v1")
  ```

**Note**: Submit only a **single Jupyter Notebook file** that can handle all tasks, including **data downloading, preprocessing, model training, and model evaluation**. (Submissions that do not follow the guidelines will receive a score of 0.)

## Grading Criteria

For valid submissions, scores will be assigned based on the **leaderboard ranking** (**strictly greater**):

- **Top 25%** → **10 points**
- **25% - 50%** → **9.0 points**
- **50% - 75%** → **8.0 points**
- **75% - 100%** → **7.0 points**


In [None]:
%pip install transformers datasets torch numpy pandas

In [None]:
# Import necessary libraries
import inspect

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# =====================================
# 📥 STEP 1: Load Wikitext-103 Dataset
# =====================================
# Fetch the "wikitext-103-v1" configuration of the "wikitext" dataset.
dataset = load_dataset("wikitext", "wikitext-103-v1")

# Show the available splits and their columns.
print(dataset)

# Pull the raw "text" column out of each split (train, validation, test).
train_texts, val_texts, test_texts = (
    dataset[split_name]["text"] for split_name in ("train", "validation", "test")
)

# =====================================
# 🔢 STEP 2: Tokenization & Preprocessing
# =====================================
# Load tokenizer
MODEL_NAME = "gpt2"  # Using GPT-2 tokenizer for LM
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The GPT-2 tokenizer ships without a padding token, so any padding request
# (in a tokenizer call or when batches are collated) raises a ValueError.
# Reuse the end-of-text token for padding; it is already in the vocabulary,
# so the model's embedding matrix does not need resizing.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Define tokenization function
def tokenize_function(examples, max_length=512):
    """Tokenize the ``"text"`` field of a batch of dataset rows.

    Rows longer than ``max_length`` tokens are truncated. No padding is
    applied here: padding every row to ``max_length`` would make the stored
    dataset and every training batch consist mostly of padding for short
    lines, and it fails outright when the tokenizer has no pad token.
    Padding is left to the data collator, which pads each batch as needed.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings.
        max_length: Maximum number of tokens kept per row.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

# Tokenize dataset
# WikiText keeps blank separator lines as rows with empty text. Such rows
# contain nothing to predict, and a batch made up only of them leaves the
# loss without any target tokens (NaN), so drop them before tokenizing.
non_empty_dataset = dataset.filter(lambda example: example["text"].strip() != "")
tokenized_datasets = non_empty_dataset.map(tokenize_function, batched=True)

# Remove non-tokenized text columns
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")

# Split dataset
train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["validation"]
test_dataset = tokenized_datasets["test"]

# Batch collator for causal language modeling: `mlm=False` selects the
# causal objective instead of masked-LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# =====================================
# 🏗️ STEP 3: Load Pretrained Transformer Model
# =====================================
# Instantiate the pretrained causal-LM checkpoint named by MODEL_NAME,
# then move it onto the selected device.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = model.to(device)

# Define training arguments
# The evaluation-schedule keyword was renamed from `evaluation_strategy` to
# `eval_strategy`, and newer transformers releases reject the old name.
# The install above is unpinned, so use whichever name this version accepts.
training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=500,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    **{eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

# Define trainer
# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Use whichever keyword the installed Trainer accepts.
trainer_arg_names = inspect.signature(Trainer.__init__).parameters
tokenizer_key = "processing_class" if "processing_class" in trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    **{tokenizer_key: tokenizer},
)

# =====================================
# 🚀 STEP 4: Train the Model
# =====================================
# Run the training loop configured on the trainer.
trainer.train()

# =====================================
# 📊 STEP 5: Model Evaluation
# =====================================
# Evaluate on the trainer's eval dataset and report perplexity,
# i.e. the exponential of the reported evaluation loss.
eval_results = trainer.evaluate()
perplexity = torch.tensor(eval_results["eval_loss"]).exp()
print(f"Perplexity: {perplexity.item()}")

# =====================================
# ✍️ STEP 6: Generate New Text
# =====================================
def generate_text(prompt, max_length=100):
    """Continue ``prompt`` with the model and return the decoded text.

    No sampling arguments are passed, so decoding follows the model's default
    generation settings. The former ``temperature=1.0`` argument was dropped
    because temperature only matters when sampling is enabled.

    Args:
        prompt: Text the generation starts from.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        # Pass the padding id explicitly so generate() does not have to fall
        # back to the end-of-text token with a warning on every call.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Run one example prompt through generate_text and print the continuation.
sample_prompt = "The future of artificial intelligence is"
generated_text = generate_text(prompt=sample_prompt)
print("\nGenerated Text:\n", generated_text)
