In [None]:
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Fetch a pretrained causal language model and its tokenizer from Huggingface
def load_model(model_name="gpt2"):
    """
    Load a causal language model and the matching tokenizer by name.

    The model is moved to the module-level ``device`` and put into
    evaluation mode before it is returned.

    Args:
        model_name (str): Identifier passed to ``from_pretrained``

    Returns:
        tuple: (model, tokenizer)
    """
    print(f"Loading model: {model_name}")

    # Tokenizer first, then the model weights (same order as the log message implies)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # ``to`` and ``eval`` both return the module itself, so the calls can be chained
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device).eval()

    return model, tokenizer

# Function to compute log likelihood of a sentence
def compute_log_likelihood(model, tokenizer, sentence, return_token_likelihoods=False):
    """
    Compute the total log likelihood of a sentence using a causal language model.

    The score is the sum of log P(token_i | tokens_<i) over every position
    that has a preceding token. The first token of the encoded sequence is
    only used as context (it is never predicted), so a sequence with fewer
    than two tokens scores 0.0.

    Note: the value is a *sum*, not the per-token mean reported by the
    model's ``loss`` attribute, so it equals the sum of the token-level
    values returned when ``return_token_likelihoods`` is True.

    Args:
        model: Language model; called as ``model(input_ids=...)`` and expected
            to return an object with a ``logits`` attribute
        tokenizer: Tokenizer for the language model
        sentence (str): Input sentence
        return_token_likelihoods (bool): Whether to return token-level likelihoods

    Returns:
        float: Log likelihood of the sentence
        list (optional): ``(decoded_token, log_prob)`` pairs, one per predicted token
    """
    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors="pt")
    input_ids = inputs["input_ids"]

    # Run on the device the model reports; fall back to the module-level
    # ``device`` for models that do not expose a ``device`` attribute.
    model_device = getattr(model, "device", None)
    if model_device is None:
        model_device = device
    input_ids = input_ids.to(model_device)

    # With fewer than two tokens there is nothing to predict; skip the forward pass
    if input_ids.size(1) < 2:
        return (0.0, []) if return_token_likelihoods else 0.0

    with torch.no_grad():
        outputs = model(input_ids=input_ids)

        # Position i predicts token i + 1: drop the last logit and the first target
        logits = outputs.logits[:, :-1, :].float()  # float32 for a stable softmax
        shift_target_ids = input_ids[:, 1:]

        # Log probability assigned to each actual next token
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
        target_log_probs = log_probs.gather(-1, shift_target_ids.unsqueeze(-1)).squeeze(-1)

    # Only one sentence is encoded per call, so row 0 is the whole batch
    sentence_log_probs = target_log_probs[0]
    log_likelihood = sentence_log_probs.sum().item()

    if return_token_likelihoods:
        token_log_probs = [
            (tokenizer.decode([token_id]), token_log_prob)
            for token_id, token_log_prob in zip(
                shift_target_ids[0].tolist(), sentence_log_probs.tolist()
            )
        ]
        return log_likelihood, token_log_probs

    return log_likelihood

# Load and evaluate BLiMP dataset
def evaluate_blimp_paradigm(model, tokenizer, paradigm_name, num_samples=None):
    """
    Evaluate a model on a BLiMP paradigm.

    Each item contributes a (grammatical, ungrammatical) sentence pair; the
    pair counts as correct when ``compute_log_likelihood`` scores the
    grammatical sentence strictly higher than the ungrammatical one.

    Args:
        model: Language model
        tokenizer: Tokenizer for the model
        paradigm_name (str): Name of the BLiMP paradigm, passed to
            ``load_dataset`` as the configuration name
        num_samples (int, optional): Number of samples to use, None for all

    Returns:
        dict: ``good_scores``, ``bad_scores`` and ``score_diffs`` (lists with
        one entry per pair), ``correct_predictions`` (int) and ``accuracy``
        (float; NaN when no pairs were evaluated)
    """
    # Load the specific BLiMP paradigm from Huggingface datasets
    dataset = load_dataset("nyu-mll/blimp", paradigm_name)

    # Extract sentence pairs
    sentence_pairs = [(item['sentence_good'], item['sentence_bad'])
                      for item in dataset['train']]

    if num_samples is not None:
        sentence_pairs = sentence_pairs[:num_samples]

    results = {
        'good_scores': [],
        'bad_scores': [],
        'score_diffs': [],
        'correct_predictions': 0
    }

    print(f"Evaluating {len(sentence_pairs)} sentence pairs...")

    for good_sentence, bad_sentence in tqdm(sentence_pairs):
        # Compute log likelihoods
        good_ll = compute_log_likelihood(model, tokenizer, good_sentence)
        bad_ll = compute_log_likelihood(model, tokenizer, bad_sentence)

        # Store results
        results['good_scores'].append(good_ll)
        results['bad_scores'].append(bad_ll)
        results['score_diffs'].append(good_ll - bad_ll)

        # Correct prediction if good sentence has higher probability
        if good_ll > bad_ll:
            results['correct_predictions'] += 1

    # Calculate accuracy; it is undefined (NaN) when nothing was evaluated,
    # e.g. num_samples=0, instead of raising ZeroDivisionError
    if sentence_pairs:
        results['accuracy'] = results['correct_predictions'] / len(sentence_pairs)
    else:
        results['accuracy'] = float('nan')
    print(f"Accuracy: {results['accuracy'] * 100:.2f}%")

    return results

# Print a sentence-level and token-level comparison for one minimal pair
def analyze_sentence_pair(model, tokenizer, good_sentence, bad_sentence):
    """
    Print a detailed comparison of a grammatical/ungrammatical sentence pair.

    Both sentences are scored with ``compute_log_likelihood`` (token-level
    output enabled); the overall scores, their difference and every
    token's log probability are written to stdout. Nothing is returned.

    Args:
        model: Language model
        tokenizer: Tokenizer for the model
        good_sentence (str): Grammatical sentence
        bad_sentence (str): Ungrammatical sentence
    """
    intro_lines = (
        "\nDetailed Analysis of Sentence Pair:",
        f"Grammatical:   '{good_sentence}'",
        f"Ungrammatical: '{bad_sentence}'",
    )
    for line in intro_lines:
        print(line)

    # Score the grammatical sentence first, then the ungrammatical one
    good_ll, good_token_lls = compute_log_likelihood(
        model, tokenizer, good_sentence, return_token_likelihoods=True
    )
    bad_ll, bad_token_lls = compute_log_likelihood(
        model, tokenizer, bad_sentence, return_token_likelihoods=True
    )

    score_gap = good_ll - bad_ll
    print("\nOverall Log Likelihood:")
    print(f"Grammatical:   {good_ll:.4f}")
    print(f"Ungrammatical: {bad_ll:.4f}")
    print(f"Difference:    {score_gap:.4f}")

    print("\nToken-level Analysis:")
    # One (heading, scored tokens) entry per sentence, printed in order
    sections = (
        ("Grammatical sentence token log probabilities:", good_token_lls),
        ("\nUngrammatical sentence token log probabilities:", bad_token_lls),
    )
    for heading, scored_tokens in sections:
        print(heading)
        for piece, value in scored_tokens:
            print(f"  '{piece}': {value:.4f}")
In [None]:
# Load model and tokenizer
model, tokenizer = load_model("gpt2")  # You can change to other models like "gpt2-medium", "EleutherAI/gpt-neo-1.3B", etc.

# Example with a specific BLiMP paradigm
# For more paradigms, see: https://huggingface.co/datasets/nyu-mll/blimp
# The value must be a dataset configuration name. "island_effects" is the
# phenomenon label shared by several paradigms, not a configuration itself,
# so one of the island paradigms is used here.
paradigm_name = "adjunct_island"

# Evaluate the paradigm
results = evaluate_blimp_paradigm(model, tokenizer, paradigm_name, num_samples=100)

# Visualize results
# NOTE(review): visualize_results is not defined in the visible part of this
# file — confirm it is defined before this cell runs.
visualize_results(results, paradigm_name)

# Get a sample pair from the dataset for detailed analysis
dataset = load_dataset("nyu-mll/blimp", paradigm_name)
sample = dataset['train'][0]
good_sentence = sample['sentence_good']
bad_sentence = sample['sentence_bad']

# Analyze the sample pair
analyze_sentence_pair(model, tokenizer, good_sentence, bad_sentence)