# 🧠 01 - Corpus Loading and Model Initialization Test

This notebook verifies the corpus loading, SBERT embedding generation, and initial MinHash creation, which are the foundational steps for the `PlagiarismService`.

In [None]:
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
from datasketch import MinHash

# Define constants (matching those in core/config.py)
# NOTE(review): these values are copied by hand; confirm they still agree with core/config.py.
# Corpus text file; being a relative path, it is resolved against the kernel's working directory.
CORPUS_PATH = Path("../corpus/fixed_corpus.txt")
# Name of the SentenceTransformer model loaded in the embedding section.
MODEL_NAME = 'all-MiniLM-L6-v2'
# Number of permutations requested for each MinHash signature in the lexical-hashing section.
LSH_PERMUTATIONS = 128

## 1. Load Corpus
The corpus is split by double newline characters (`\n\n`) to treat paragraphs or major sections as individual documents.

In [None]:
def load_corpus(path: Path):
    """Read a corpus file and split it into documents.

    The file is read as UTF-8 text and split on double newlines (``\\n\\n``).
    Each piece is stripped of surrounding whitespace, and pieces that are
    empty after stripping are discarded.

    Args:
        path: Location of the corpus text file.

    Returns:
        A list of document strings. If the file does not exist, an error
        message is printed and an empty list is returned instead of raising.
    """
    try:
        with open(path, 'r', encoding='utf-8') as corpus_file:
            raw_text = corpus_file.read()
    except FileNotFoundError:
        # Best-effort: report the problem and let the caller handle an empty corpus.
        print(f"Error: Corpus file not found at {path}")
        return []

    documents = []
    for chunk in raw_text.split('\n\n'):
        cleaned = chunk.strip()
        if cleaned:
            documents.append(cleaned)

    print(f"Loaded {len(documents)} documents.")
    return documents

# Read the corpus once; the embedding and MinHash cells below reuse `corpus_documents`.
corpus_documents = load_corpus(CORPUS_PATH)

# Preview the first 150 characters of the first document, if any were loaded.
if len(corpus_documents) > 0:
    print(
        "\n--- First Document Preview ---",
        f"{corpus_documents[0][:150]}...",
        sep="\n",
    )

## 2. Test Semantic Embedding (SBERT)
Load the model and generate fixed-size vectors for the corpus documents.

In [None]:
# Load the SBERT model named in the configuration cell.
print(f"Loading model: {MODEL_NAME}...")
sbert_model = SentenceTransformer(MODEL_NAME)

if corpus_documents:
    print("Generating embeddings...")
    # convert_to_tensor=False asks for a non-tensor result; the code below
    # relies on it exposing a 2-D `.shape` (documents x embedding dimension).
    corpus_embeddings = sbert_model.encode(corpus_documents, convert_to_tensor=False)

    print("\n--- Embedding Stats ---")
    print(f"Shape: {corpus_embeddings.shape}")
    # Verify the dimension of the embedding (for MiniLM-L6-v2, it should be 384)
    print(f"Dimension: {corpus_embeddings.shape[1]}")

    # Make the verification explicit so Restart & Run All fails loudly on a
    # malformed embedding matrix instead of relying on someone reading the output.
    assert corpus_embeddings.shape[0] == len(corpus_documents), (
        f"Expected one embedding per document ({len(corpus_documents)}), "
        f"got {corpus_embeddings.shape[0]}"
    )
    # Compare against the dimension the model itself reports rather than a
    # hard-coded number; the model may report None, in which case skip the check.
    expected_dim = sbert_model.get_sentence_embedding_dimension()
    if expected_dim is not None:
        assert corpus_embeddings.shape[1] == expected_dim, (
            f"Embedding dimension {corpus_embeddings.shape[1]} does not match "
            f"the model's reported dimension {expected_dim}"
        )

## 3. Test Lexical Hashing (MinHash)
Generate MinHash signatures for the first two corpus documents and calculate their approximate Jaccard similarity.

In [None]:
# Compare the first two corpus documents lexically via MinHash signatures.
if len(corpus_documents) >= 2:
    doc1 = corpus_documents[0]
    doc2 = corpus_documents[1]

    # datasketch's MinHash takes the permutation count as `num_perm`;
    # `num_permutations` is not an accepted keyword and raises TypeError.
    m1 = MinHash(num_perm=LSH_PERMUTATIONS)
    m2 = MinHash(num_perm=LSH_PERMUTATIONS)

    # Update MinHash with tokens (lowercased, whitespace-separated words)
    # from the documents; MinHash.update expects bytes, hence the encode.
    for token in doc1.lower().split():
        m1.update(token.encode('utf8'))
    for token in doc2.lower().split():
        m2.update(token.encode('utf8'))

    # Estimate Jaccard similarity of the two token sets from the signatures
    similarity = m1.jaccard(m2)

    print("\n--- MinHash Similarity Test ---")
    print(f"Jaccard Similarity (Approximate): {similarity:.4f}")
else:
    print("Need at least two documents in the corpus to test MinHash similarity.")