In [None]:
# 1. Install Dependencies
!pip install huggingface_hub transformers datasets peft accelerate wandb

In [None]:
# 2. Import necessary libraries
import os
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import wandb
from google.colab import drive

In [None]:
# 3. Use secrets for login (in Colab, these can be set via userdata)
# `userdata` has to be imported explicitly; without it the lookups below
# fail with a NameError on a fresh kernel.
from google.colab import userdata

HUGGINGFACE_API_KEY = userdata.get('HUGGINGFACE_API_KEY')  # Hugging Face API Token
WANDB_API_KEY = userdata.get('WANDB_API_KEY')  # Weights & Biases Key


In [None]:


# Authenticate with the Hugging Face Hub
# NOTE(review): HF_HOME is assigned after huggingface_hub/transformers have been
# imported and before Google Drive is mounted -- confirm that the libraries
# actually pick up this cache location.
os.environ["HF_HOME"] = (
    "/content/drive/MyDrive/NLP/MODELS/huggingface_cache"  # Cache directory on Drive
)
login(token=HUGGINGFACE_API_KEY)  # Token comes from the secrets lookup cell


In [None]:

# Authenticate with Weights & Biases using the key read from the secrets
wandb.login(
    key=WANDB_API_KEY,
)


In [None]:

# 4. Make Google Drive available under /content/drive/ so models can be saved there
drive.mount("/content/drive/")


In [None]:

# 5. Load and Preprocess Dataset
# Load the FEVER dataset (fact extraction and verification).
# FEVER ships several configurations and defines no default one, so the
# configuration name has to be passed explicitly; "v1.0" provides the
# "train" split used below.
dataset = load_dataset("fever", "v1.0")

# Inspect the first example of the training split
print(dataset["train"][0])

In [None]:

# 6. Tokenizer and Model Setup
# Hub identifier of the base checkpoint to fine-tune (Llama 2 - 7B);
# replace it with another identifier to fine-tune a different model.
model_name = "meta-llama/Llama-2-7b-hf"


In [None]:

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The Llama tokenizer defines no padding token, and every later call that
# requests padding would raise without one. Reusing the end-of-sequence token
# is the usual fallback for causal language models.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

# Keep the model configuration in sync so generation knows the padding id too.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id


In [None]:

# 7. Preprocessing Dataset
# Restrict training to the first 5,000 examples of the train split
subset_dataset = dataset["train"].select(
    range(0, 5000)
)


In [None]:

# Preprocessing function for tokenizing the dataset
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of FEVER claims for causal language-model fine-tuning.

    Args:
        examples: A batch from the dataset (a mapping of column name to a list
            of values). The text is read from the "claim" column; FEVER has no
            "sentence" column.
        max_length: Number of tokens every example is padded or truncated to.
            Given explicitly so padding does not depend on the tokenizer's
            built-in maximum length.

    Returns:
        The tokenizer output with an added "labels" entry. Labels copy the
        input ids, with padded positions (attention mask 0) replaced by -100
        so they are ignored by the loss.
    """
    encoded = tokenizer(
        examples["claim"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    # The Trainer needs "labels" to compute a causal-LM loss.
    encoded["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

# Apply preprocessing to the subset dataset.
# The original columns are dropped: FEVER's string "label" column would
# otherwise be carried into the batches and clash with the token-level "labels".
tokenized_datasets = subset_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=subset_dataset.column_names,
)


In [None]:

# 8. Training Setup
# Define the training arguments (hyperparameters)
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/NLP/MODELS/FineTunedModel",  # Save model to specified Google Drive path
    # `evaluation_strategy` was renamed to `eval_strategy`; current transformers
    # releases (which the unpinned install pulls in) reject the old keyword.
    eval_strategy="epoch",  # Evaluate every epoch
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=4,  # Training batch size
    per_device_eval_batch_size=8,  # Evaluation batch size
    num_train_epochs=3,  # Number of epochs
    weight_decay=0.01,  # Weight decay for optimization
    logging_dir="/content/drive/MyDrive/NLP/MODELS/Logs",  # Save logs to Google Drive
    logging_steps=10,  # Log every 10 steps
    push_to_hub=False,  # Set to True to upload the model after training
    report_to="wandb",  # Report metrics to Weights & Biases
)


In [None]:

# 9. Trainer Setup
# Hold out 10% of the tokenized subset so that evaluation is not run on the
# very examples the model is trained on. The fixed seed keeps the split
# identical across re-runs.
dataset_splits = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# Initialize the Trainer with the model, training arguments, and the two splits
trainer = Trainer(
    model=model,  # The model to train
    args=training_args,  # Training arguments
    train_dataset=dataset_splits["train"],  # Training portion of the subset
    eval_dataset=dataset_splits["test"],  # Held-out portion of the subset
    tokenizer=tokenizer,  # Tokenizer for preprocessing
)


In [None]:


# Run the fine-tuning loop configured in the Trainer
trainer.train()


In [None]:

# 10. Save and Upload the Model
# Destination folder on Google Drive for the fine-tuned model
model_save_path = "/content/drive/MyDrive/NLP/MODELS/FineTunedModel"

# Write the model to that folder
trainer.save_model(output_dir=model_save_path)


In [None]:

# Optionally, push the fine-tuned model to Hugging Face Hub
# Make sure to create a new model repo on Hugging Face first, then put its id here.
HUB_REPO_ID = "your_huggingface_username/your_model_repo_name"

# The upload is skipped while the placeholder id is still in place, so a full
# run of the notebook does not fail on a repository that does not exist.
if HUB_REPO_ID.startswith("your_huggingface_username/"):
    print("Skipping Hub upload: set HUB_REPO_ID to your own repository first.")
else:
    model.push_to_hub(HUB_REPO_ID)
    tokenizer.push_to_hub(HUB_REPO_ID)


In [None]:

# 11. Inference: Using the Fine-Tuned Model for Inference
# Example inference with the fine-tuned model
def infer_with_model(input_text):
    """Generate a continuation of ``input_text`` with the fine-tuned model.

    Args:
        input_text: Prompt to tokenize and feed to the model.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    # The tokenizer returns CPU tensors; move them to the model's device so
    # generation also works when the model sits on an accelerator after training.
    inputs = inputs.to(model.device)
    output = model.generate(**inputs, max_length=50)
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded_output


In [None]:

# Try the inference helper on a sample prompt and show what the model returns
input_text = "What is the capital of France?"
response = infer_with_model(input_text=input_text)
print("Model's response:", response)
