# 🧠 Gemma Reasoning Model - Inference

**Author:** Om Borda (omborda2002)  
**Competition:** Google Tunix Hack  
**Model:** Gemma 2 2B IT (`google/gemma-2-2b-it`) + LoRA, fine-tuned on ~570k reasoning samples

## Output Format
```
<reasoning>step-by-step thinking</reasoning>
<answer>final answer</answer>
```

In [None]:
# Install the third-party packages this notebook relies on; -q keeps pip's output short.
!pip install -q transformers accelerate bitsandbytes peft

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import os

# Report the runtime environment: PyTorch version, CUDA availability and,
# when a CUDA device is present, the name of device 0.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    pass  # nothing more to report on a CPU-only runtime
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Load Model

In [None]:
# Model locations.
# BASE_MODEL is the Hugging Face model id; LORA_PATH is the adapter
# directory - UPDATE THIS to your Kaggle model path.
BASE_MODEL = "google/gemma-2-2b-it"
LORA_PATH = "/kaggle/input/gemma-reasoning-lora"  # Your uploaded LoRA adapter

# Diagnostic only: report whether the adapter directory is present.
# Nothing is raised here if it is missing.
if not os.path.exists(LORA_PATH):
    print(f"‚úó LoRA path not found: {LORA_PATH}")
    print("  Make sure to add your model as input data")
else:
    print(f"‚úì LoRA adapter found at: {LORA_PATH}")
    print(f"  Contents: {os.listdir(LORA_PATH)}")

In [None]:
# Quantization settings handed to the base-model loader:
# 4-bit weights using the "nf4" quant type with double quantization
# enabled, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Get HF token from Kaggle secrets (best effort).
#
# Result: `hf_token` holds the first non-empty secret found under
# "HUGGINGFACE_TOKEN" or "HF_TOKEN", or None when neither is available.
# `user_secrets` is the secrets client, or None when it could not be created
# (for example when the kaggle_secrets package is not importable).
hf_token = None
try:
    from kaggle_secrets import UserSecretsClient

    user_secrets = UserSecretsClient()
except Exception:
    # Broad on purpose: this lookup is optional and must never abort the
    # notebook, whether the failure is a missing package or a client error.
    user_secrets = None

if user_secrets is not None:
    for secret_name in ("HUGGINGFACE_TOKEN", "HF_TOKEN"):
        try:
            candidate = user_secrets.get_secret(secret_name)
        except Exception:
            # Secret not defined or not attached to this notebook; try the
            # next name. NOTE(review): the exact exception types raised by
            # get_secret are not visible here, hence the broad catch.
            continue
        if candidate:
            hf_token = candidate
            print("‚úì HuggingFace token found")
            break

if hf_token is None:
    print("‚ö† No HF token - may fail for gated models")

# --- Tokenizer -------------------------------------------------------------
# Fetched by BASE_MODEL id from HuggingFace (not from the LoRA path); the
# EOS token is reused as the padding token.
print("Loading tokenizer from HuggingFace...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=hf_token)
tokenizer.pad_token = tokenizer.eos_token
print("‚úì Tokenizer loaded")

# --- Base model ------------------------------------------------------------
# Loaded with the quantization config defined earlier in the notebook and
# automatic device placement.
print("\nLoading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    token=hf_token,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
)
print("‚úì Base model loaded")

# --- LoRA adapter ----------------------------------------------------------
# Wrap the base model with the adapter stored at LORA_PATH, then switch the
# combined model to evaluation mode.
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, LORA_PATH)
model.eval()
print("‚úì Model ready!")

## Inference Function

In [None]:
def generate_response(question, max_tokens=500, temperature=0.7):
    """Generate the model's reply to a single question.

    The question is wrapped in the ``<start_of_turn>`` / ``<end_of_turn>``
    chat markers, tokenized with the module-level ``tokenizer`` and passed to
    the module-level ``model``. Only the newly generated text is returned;
    the echoed prompt is removed.

    Args:
        question: Text of the user question inserted into the prompt.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature. A value <= 0 switches to greedy
            decoding instead of sampling.

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped.
    """
    prompt = f"<start_of_turn>user\n{question}\n<end_of_turn>\n<start_of_turn>model\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    generation_kwargs = {
        "max_new_tokens": max_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature <= 0:
        # Sampling needs a strictly positive temperature; treat
        # non-positive values as a request for deterministic output.
        generation_kwargs["do_sample"] = False
    else:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # The returned sequence starts with the prompt tokens. Drop them by
    # position rather than searching the decoded text for
    # "<start_of_turn>model": decoding with skip_special_tokens=True may
    # strip that marker, which would leave the prompt in the response.
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

print("‚úì Inference function ready")

## Test Model

In [None]:
# Prompts used for a quick manual check of the model's answers.
test_questions = [
    "What is 125 + 347?",
    "Solve: 2x + 5 = 13",
    "A train travels 240 km in 4 hours. What is its speed?",
    "What is the probability of rolling a 6 on a fair die?",
    "If all cats are animals, and Whiskers is a cat, what can we conclude?",
]

# Header banner: a 60-character rule above and below the title.
print("=" * 60, "üß™ MODEL EVALUATION", "=" * 60, sep="\n")

# For each prompt: print the question, a dashed rule, the generated
# response, and a closing rule.
for q in test_questions:
    print(f"\nüìù Question: {q}", "-" * 60, sep="\n")
    response = generate_response(q)
    print(f"ü§ñ Response:\n{response}", "=" * 60, sep="\n")

## Competition Submission

For actual competition submission, load the test data and generate predictions:

In [None]:
# Uncomment and modify for actual competition submission
# import pandas as pd
# 
# # Load test data
# test_df = pd.read_csv("/kaggle/input/competition-test-data/test.csv")
# 
# # Generate predictions
# predictions = []
# for idx, row in test_df.iterrows():
#     question = row['question']  # adjust column name as needed
#     response = generate_response(question)
#     predictions.append(response)
#     
#     if idx % 10 == 0:
#         print(f"Processed {idx+1}/{len(test_df)}")
# 
# # Save submission
# submission = pd.DataFrame({
#     'id': test_df['id'],
#     'prediction': predictions
# })
# submission.to_csv('submission.csv', index=False)
# print("‚úì Submission saved!")

## Summary

**Model:** Gemma 2 2B IT (`google/gemma-2-2b-it`) + LoRA  
**Training Data:** ~570k reasoning samples  
- GSM8K, OpenThoughts, Stratos, Medical-O1, MetaMathQA

**Output Format:**
```
<reasoning>step-by-step thinking</reasoning>
<answer>final answer</answer>
```

**Author:** Om Borda (omborda2002)