<a href="https://colab.research.google.com/github/petersun1937/finetune-lm-research_topics/blob/main/finetune_lm_research_topics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
!pip install transformers datasets
!pip install -q datasets rouge-score bert-score

In [None]:
# Switch off the Weights & Biases integration through its environment flag
import os
os.environ.update({"WANDB_DISABLED": "true"})

# **Fine-tune LM (DistilGPT2 or TinyMistral; uncomment the relevant lines to switch models)**

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoTokenizer, AutoModelForCausalLM, AutoConfig
from datasets import load_dataset

# Training data: a JSON-lines file that has to be uploaded to (or mounted in)
# Colab before this cell runs
dataset = load_dataset(
    "json",
    split="train",
    data_files={"train": "research_nlp_chapters_100.jsonl"},
)

# Tokenizer that turns the formatted examples into model inputs
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
#tokenizer = AutoTokenizer.from_pretrained("M4-ai/TinyMistral-248M-v3")
# Reuse the end-of-sequence token for padding (avoids a padding error)
tokenizer.pad_token = tokenizer.eos_token

def format_and_tokenize(example):
    """Render one record with the prompt template and tokenize it.

    The record's ``input`` is placed after a "### Problem:" header and its
    ``output`` after a "### Approach:" header. The module-level ``tokenizer``
    then truncates or pads the text to exactly 512 tokens.
    """
    problem = example['input']
    approach = example['output']
    text = "### Problem: {}\n### Approach: {}".format(problem, approach)
    return tokenizer(text, max_length=512, padding="max_length", truncation=True)

# Tokenize every example and drop the raw text columns so only model inputs
# remain (same as the LoRA cell further down)
tokenized_dataset = dataset.map(format_and_tokenize, remove_columns=dataset.column_names)

# Load and modify config first.
# The config has to come from the same checkpoint as the model it is passed to;
# attn_pdrop / resid_pdrop are the dropout fields of the GPT-2 config.
config = AutoConfig.from_pretrained("distilgpt2")
config.attn_pdrop = 0.1
config.resid_pdrop = 0.1
#config = AutoConfig.from_pretrained("M4-ai/TinyMistral-248M-v3")
#config.attention_dropout = 0.1  # NOTE(review): Mistral-style configs appear to use this field instead of attn_pdrop/resid_pdrop - confirm

# Load model with custom config
model = GPT2LMHeadModel.from_pretrained("distilgpt2", config=config)
#model = AutoModelForCausalLM.from_pretrained("M4-ai/TinyMistral-248M-v3", config=config)
# Keep the embedding matrix in sync with the tokenizer's vocabulary size
model.resize_token_embeddings(len(tokenizer))

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_strategy="no",             # no intermediate checkpoints; the model is saved once below
    logging_steps=10,
    fp16=torch.cuda.is_available(),  # Use mixed precision on GPU if possible
    weight_decay=0.01,
    report_to="none",               # no external experiment trackers (matches the LoRA cell)
)

# Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    # mlm=False selects causal-LM batching instead of masked-LM batching
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# Train
trainer.train()

# Save model
#trainer.save_model("fine-tuned-tinyMistral")
#tokenizer.save_pretrained("fine-tuned-tinyMistral")
trainer.save_model("fine-tuned-distilgpt2")
tokenizer.save_pretrained("fine-tuned-distilgpt2")


# **Test fine-tuned model**

In [None]:
import torch
from tqdm import tqdm
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

# Load Model & Tokenizer
# Imported here so the cell also runs on a fresh kernel: the import list at the
# top of this cell only brings in the Auto* classes.
from transformers import GPT2Tokenizer, GPT2LMHeadModel

#model = AutoModelForCausalLM.from_pretrained("fine-tuned-tinyMistral")
#tokenizer = AutoTokenizer.from_pretrained("fine-tuned-tinyMistral")

tokenizer = GPT2Tokenizer.from_pretrained("fine-tuned-distilgpt2")
model = GPT2LMHeadModel.from_pretrained("fine-tuned-distilgpt2")

# Reuse the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token
# Generate on the GPU when one is available (same device choice as the LoRA evaluation cell)
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Load Evaluation Dataset
eval_data = load_dataset("json", data_files="test_research_nlp_chapters_10.jsonl")["train"]

# Load SBERT for Cosine Similarity
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Helper Functions
def generate_response(prompt, max_len=128):
    """Sample an approach for ``prompt`` from the module-level ``model``.

    Args:
        prompt: Problem statement; it is wrapped in the
            "### Problem: ... / ### Approach:" template before generation.
        max_len: Maximum number of newly generated tokens.

    Returns:
        The generated continuation only (prompt removed), stripped of
        surrounding whitespace. Sampling is enabled, so repeated calls can
        return different text.
    """
    input_text = f"### Problem: {prompt}\n### Approach:"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_len,
        do_sample=True,             # Enable sampling instead of greedy decoding
        temperature=0.8,            # Lower = conservative, Higher = creative
        pad_token_id=tokenizer.eos_token_id,  # explicit pad id, as in the LoRA evaluation cell
    )
    # Decode only the tokens after the prompt. Splitting the decoded text on
    # "### Approach:" would discard part of the answer whenever the model
    # emits that marker again in its own output.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

#def repetition_ratio(text, n=5):
#    lines = [line.strip() for line in text.split("\n") if line.strip()]
#    return len(lines) / len(set(lines)) if lines else 1.0
def repetition_score(text):
    """Return the count of non-blank lines divided by the count of distinct ones.

    Lines are compared after stripping surrounding whitespace. A value of 1.0
    means no line repeats; 1.0 is also returned when there are no non-blank
    lines at all.
    """
    stripped = (raw.strip() for raw in text.split("\n"))
    kept = [line for line in stripped if line]
    if not kept:
        return 1.0
    return len(kept) / len(set(kept))

# Generate Responses
# The ROUGE scorer does not depend on the example, so build it once up front.
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# Extract prompts and references once; they are reused for scoring, the
# embeddings and the results table.
prompts = [ex["input"] for ex in eval_data]
references = [ex["output"] for ex in eval_data]

generated_responses, rouge_scores, repetition_scores = [], [], []
for prompt, gold in zip(prompts, references):
    gen = generate_response(prompt)
    generated_responses.append(gen)

    # ROUGE-L F-measure of the generation against the reference
    rouge_scores.append(rouge.score(gold, gen)['rougeL'].fmeasure)

    # Repetition Score
    repetition_scores.append(repetition_score(gen))

# Cosine Similarity (batch)
gen_embeds = sbert_model.encode(generated_responses, convert_to_tensor=True)
ref_embeds = sbert_model.encode(references, convert_to_tensor=True)
# cos_sim yields the full pairwise matrix; its diagonal pairs generation i with reference i
cosine_scores = util.cos_sim(gen_embeds, ref_embeds).diagonal().cpu().tolist()

# Build DataFrame
df = pd.DataFrame({
    "Prompt": prompts,
    "Generated": generated_responses,
    "Reference": references,
    "ROUGE-L": np.round(rouge_scores, 3),
    "CosineSim": np.round(cosine_scores, 3),
    "RepetitionScore": np.round(repetition_scores, 2)
})

# Print Summary
print(df[["Prompt", "Generated", "Reference", "ROUGE-L", "CosineSim", "RepetitionScore"]])
print("\nAverage ROUGE-L:", round(df["ROUGE-L"].mean(), 3))
print("Average Cosine Similarity:", round(df["CosineSim"].mean(), 3))
print("Average Repetition Score:", round(df["RepetitionScore"].mean(), 2))



# **Test Pretrained Model**

In [None]:
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

# Load Pretrained Model & Tokenizer
# Imported here so the cell also runs on a fresh kernel: the import list at the
# top of this cell only brings in the Auto* classes.
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

#model = AutoModelForCausalLM.from_pretrained("M4-ai/TinyMistral-248M-v3")
#tokenizer = AutoTokenizer.from_pretrained("M4-ai/TinyMistral-248M-v3")

# Reuse the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token
# Generate on the GPU when one is available (same device choice as the LoRA evaluation cell)
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Load Evaluation Dataset
eval_data = load_dataset("json", data_files="test_research_nlp_chapters_10.jsonl")["train"]

# Load SBERT for Cosine Similarity
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Helper Functions
def generate_response(prompt, max_len=128):
    """Sample an approach for ``prompt`` from the module-level ``model``.

    Args:
        prompt: Problem statement; it is wrapped in the
            "### Problem: ... / ### Approach:" template before generation.
        max_len: Maximum number of newly generated tokens.

    Returns:
        The generated continuation only (prompt removed), stripped of
        surrounding whitespace. Sampling is enabled, so repeated calls can
        return different text.
    """
    input_text = f"### Problem: {prompt}\n### Approach:"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_len,
        do_sample=True,             # Enable sampling instead of greedy decoding
        temperature=0.8,            # Lower = conservative, Higher = creative
        pad_token_id=tokenizer.eos_token_id,  # explicit pad id, as in the LoRA evaluation cell
    )
    # Decode only the tokens after the prompt. Splitting the decoded text on
    # "### Approach:" would discard part of the answer whenever the model
    # emits that marker again in its own output.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

#def repetition_ratio(text, n=5):
#    lines = [line.strip() for line in text.split("\n") if line.strip()]
#    return len(lines) / len(set(lines)) if lines else 1.0
def repetition_score(text):
    """Measure line-level repetition in ``text``.

    Splits on newlines, strips each line and ignores blank ones, then returns
    (number of lines) / (number of distinct lines). The result is 1.0 when
    nothing repeats or when no non-blank line exists.
    """
    nonblank = []
    for raw in text.split("\n"):
        cleaned = raw.strip()
        if cleaned:
            nonblank.append(cleaned)
    return len(nonblank) / len(set(nonblank)) if nonblank else 1.0

# Generate Responses
# The ROUGE scorer does not depend on the example, so build it once up front.
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# Extract prompts and references once; they are reused for scoring, the
# embeddings and the results table.
prompts = [ex["input"] for ex in eval_data]
references = [ex["output"] for ex in eval_data]

generated_responses, rouge_scores, repetition_scores = [], [], []
for prompt, gold in zip(prompts, references):
    gen = generate_response(prompt)
    generated_responses.append(gen)

    # ROUGE-L F-measure of the generation against the reference
    rouge_scores.append(rouge.score(gold, gen)['rougeL'].fmeasure)

    # Repetition Score
    repetition_scores.append(repetition_score(gen))

# Cosine Similarity (batch)
gen_embeds = sbert_model.encode(generated_responses, convert_to_tensor=True)
ref_embeds = sbert_model.encode(references, convert_to_tensor=True)
# cos_sim yields the full pairwise matrix; its diagonal pairs generation i with reference i
cosine_scores = util.cos_sim(gen_embeds, ref_embeds).diagonal().cpu().tolist()

# Build DataFrame
df = pd.DataFrame({
    "Prompt": prompts,
    "Generated": generated_responses,
    "Reference": references,
    "ROUGE-L": np.round(rouge_scores, 3),
    "CosineSim": np.round(cosine_scores, 3),
    "RepetitionScore": np.round(repetition_scores, 2)
})

# Print Summary
print(df[["Prompt", "Generated", "Reference", "ROUGE-L", "CosineSim", "RepetitionScore"]])
print("\nAverage ROUGE-L:", round(df["ROUGE-L"].mean(), 3))
print("Average Cosine Similarity:", round(df["CosineSim"].mean(), 3))
print("Average Repetition Score:", round(df["RepetitionScore"].mean(), 2))


# **LoRA fine-tuning (results were poor)**

In [None]:
# Packages for the LoRA cells below (peft is imported there); -q keeps pip output quiet
!pip install -q peft datasets transformers accelerate bitsandbytes

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling)
from peft import LoraConfig, get_peft_model, TaskType

# Training data: the same JSON-lines file, exposed as a single "train" split
dataset = load_dataset(
    "json",
    split="train",
    data_files={"train": "research_nlp_chapters_100.jsonl"},
)

# Tokenizer for the TinyMistral checkpoint; padding reuses the EOS token
tokenizer = AutoTokenizer.from_pretrained("M4-ai/TinyMistral-248M-v3")
tokenizer.pad_token = tokenizer.eos_token

def format_and_tokenize(example):
    """Build the training text for one record and tokenize it.

    The text consists of a "### Problem:" header followed by the record's
    ``input``, then a "### Approach:" header followed by its ``output``. The
    module-level ``tokenizer`` truncates or pads it to exactly 512 tokens.
    """
    problem = example['input']
    approach = example['output']
    text = "### Problem: {}\n### Approach: {}".format(problem, approach)
    return tokenizer(text, max_length=512, padding="max_length", truncation=True)

# Tokenize the whole dataset and keep only the tokenizer's output columns
tokenized_dataset = dataset.map(
    format_and_tokenize,
    remove_columns=dataset.column_names,
)

# Base model: load the checkpoint with its stock config, then keep the
# embedding matrix in sync with the tokenizer's vocabulary size
config = AutoConfig.from_pretrained("M4-ai/TinyMistral-248M-v3")
base_model = AutoModelForCausalLM.from_pretrained(
    "M4-ai/TinyMistral-248M-v3",
    config=config,
)
base_model.resize_token_embeddings(len(tokenizer))

# LoRA adapter settings
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"],  # Adapt based on model internals
)

# Wrap the base model with the adapters and report how many parameters train
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# Training hyperparameters
training_args = TrainingArguments(
    output_dir="./results_lora",
    report_to="none",
    save_strategy="epoch",
    logging_steps=10,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
)

# Trainer with a causal-LM collator (mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer),
)

# Run training, then write model and tokenizer to the same directory
trainer.train()
trainer.save_model("fine-tuned-lora-TinyMistral-248M-v3")
tokenizer.save_pretrained("fine-tuned-lora-TinyMistral-248M-v3")


In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score as bertscore

# Directory written by the LoRA training cell
model_path = "fine-tuned-lora-TinyMistral-248M-v3"

# Tokenizer: padding reuses the EOS token
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

# Model: load, place on the GPU when one is available, switch to eval mode
model = AutoModelForCausalLM.from_pretrained(model_path)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Held-out examples used for evaluation
eval_data = load_dataset("json", data_files="test_research_nlp_chapters_10.jsonl")["train"]

def generate_response(prompt, max_len=128):
    """Greedily generate an approach for ``prompt`` with the module-level model.

    Args:
        prompt: Problem statement; it is wrapped in the same
            "### Problem: ... / ### Approach:" template that
            ``format_and_tokenize`` applies to the training examples.
        max_len: Maximum number of newly generated tokens.

    Returns:
        The generated continuation only (prompt removed), stripped of
        surrounding whitespace.
    """
    # Evaluate with the training-time template; a bare prompt does not match
    # what the model was fine-tuned on.
    input_text = f"### Problem: {prompt}\n### Approach:"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_len,
        do_sample=False,         # greedy decoding; temperature/top_p have no effect here, so they are omitted
        repetition_penalty=1.2,  # Helps reduce output loops
        pad_token_id=tokenizer.eos_token_id
    )
    # Drop the prompt tokens so the metrics score the generated answer only,
    # not the echoed prompt.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Initialize Metrics
from nltk.translate.bleu_score import SmoothingFunction

rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
# Without smoothing, sentence-level BLEU collapses to ~0 as soon as one n-gram
# order has no overlap, which is common for short generations.
bleu_smoothing = SmoothingFunction().method1
bleu_scores, rouge_scores, preds, refs = [], [], [], []

# Run Evaluation
for ex in eval_data:
    gold = ex["output"]
    prompt = ex["input"]
    gen = generate_response(prompt)

    # Print example
    print("\n---")
    print("Prompt:", prompt)
    print("Generated:", gen)
    print("Reference:", gold)

    # BLEU on whitespace tokens; sentence_bleu takes a list of references
    ref_tokens = [gold.split()]
    pred_tokens = gen.split()
    bleu = sentence_bleu(ref_tokens, pred_tokens, smoothing_function=bleu_smoothing)
    bleu_scores.append(bleu)

    # ROUGE-L F-measure of the generation against the reference
    rouge_L = rouge.score(gold, gen)["rougeL"].fmeasure
    rouge_scores.append(rouge_L)

    preds.append(gen)
    refs.append(gold)

# BERTScore over all (prediction, reference) pairs at once
P, R, F1 = bertscore(preds, refs, lang="en", rescale_with_baseline=True)

# Print Metrics
print("\n--- Evaluation Summary ---")
print(f"Average BLEU:      {sum(bleu_scores)/len(bleu_scores):.3f}")
print(f"Average ROUGE-L:   {sum(rouge_scores)/len(rouge_scores):.3f}")
print(f"Average BERTScore-F1: {F1.mean().item():.3f}")
