## Setup and Imports

In [None]:
from explodingham.models.compression_learning import CompressionKNN
import numpy as np

## Example 1: Simple Binary Classification with Default Parameters

Let's start with a simple example that separates non-repetitive strings (class 0) from strings built by repeating a short pattern (class 1).

In [3]:
# Toy dataset: three strings with no repeated unit (class 0) followed by
# three strings made of a repeated two-letter unit (class 1).
non_repeating = ["abcdefgh", "ijklmnop", "qrstuvwx"]
repeating = ["ababababab", "cdcdcdcdcd", "efefefefefef"]

X_train = non_repeating + repeating
y_train = [0] * len(non_repeating) + [1] * len(repeating)

# Held-out strings; the class each one is expected to receive is noted inline.
X_test = [
    "xyzabc",      # expected: class 0 (no repetition)
    "xyxyxyxy",    # expected: class 1 (repeated "xy")
]

# Echo both sets with 1-based numbering so the reader can see the inputs.
print("Training set:")
for position, (sample, label) in enumerate(zip(X_train, y_train), start=1):
    print(f"  {position}. '{sample}' -> class {label}")

print("\nTest set:")
for position, sample in enumerate(X_test, start=1):
    print(f"  {position}. '{sample}'")

Training set:
  1. 'abcdefgh' -> class 0
  2. 'ijklmnop' -> class 0
  3. 'qrstuvwx' -> class 0
  4. 'ababababab' -> class 1
  5. 'cdcdcdcdcd' -> class 1
  6. 'efefefefefef' -> class 1

Test set:
  1. 'xyzabc'
  2. 'xyxyxyxy'


In [4]:
# Build the classifier with no arguments so the library defaults apply
# (the notebook states these are n_neighbors=5, compression='gzip' -- not
# verified here), then train on the toy set and label the held-out strings.
clf_default = CompressionKNN()
clf_default.fit(X_train, y_train)
predictions = clf_default.predict(X_test)

print("Predictions with default parameters (k=5):")
for sample, predicted_class in zip(X_test, predictions):
    print(f"  '{sample}' -> class {predicted_class}")

TypeError: a bytes-like object is required, not 'str'

In [5]:
# Query the already-fitted default classifier for its confidence on each
# held-out string.
probabilities = clf_default.predict_proba(X_test)

print("\nPrediction probabilities:")
for sample, confidence in zip(X_test, probabilities):
    # `.2%` formats a number as a percentage with two decimals; this assumes
    # each entry of `probabilities` is a scalar -- TODO confirm against the library.
    print(f"  '{sample}' -> {confidence:.2%} confidence")

TypeError: a bytes-like object is required, not 'str'

## Example 2: Varying k (n_neighbors)

Let's see how different values of k affect the predictions.

In [None]:
# Neighbour counts to compare; a fresh classifier is fitted for each one.
k_values = [1, 3, 5]

print("Predictions with different k values:\n")

for k in k_values:
    clf = CompressionKNN(n_neighbors=k)
    clf.fit(X_train, y_train)
    preds, probs = clf.predict(X_test), clf.predict_proba(X_test)

    # One header per k, then one line per held-out string, then a spacer line.
    print(f"k = {k}:")
    for sample, label, confidence in zip(X_test, preds, probs):
        print(f"  '{sample}' -> class {label} ({confidence:.2%} confidence)")
    print()

## Example 3: Language Detection

Use CompressionKNN to detect the language of text samples based on character patterns.

In [None]:
# Training sentences grouped by language code; the groups are flattened below
# into parallel sample/label lists (English, then Spanish, then French).
labelled_sentences = [
    ('en', [
        "the quick brown fox jumps over the lazy dog",
        "hello world how are you doing today",
        "machine learning is a fascinating subject",
    ]),
    ('es', [
        "el rápido zorro marrón salta sobre el perro perezoso",
        "hola mundo cómo estás haciendo hoy",
        "el aprendizaje automático es un tema fascinante",
    ]),
    ('fr', [
        "le renard brun rapide saute par-dessus le chien paresseux",
        "bonjour le monde comment allez-vous aujourd'hui",
        "l'apprentissage automatique est un sujet fascinant",
    ]),
]

X_lang_train = [sentence for _, sentences in labelled_sentences for sentence in sentences]
y_lang_train = [lang for lang, sentences in labelled_sentences for _ in sentences]

# One unseen sentence per language, in the same en / es / fr order.
X_lang_test = [
    "this is a test sentence in english",
    "esta es una oración de prueba en español",
    "ceci est une phrase de test en français",
]

print("Language Detection Demo\n")
print("Training on 9 samples (3 per language: English, Spanish, French)\n")

In [None]:
# Train with k=3 for language detection, then report the predicted language
# and confidence for each unseen sentence.
TEXT_PREVIEW_CHARS = 50  # longest prefix of each sentence echoed in the report

clf_lang = CompressionKNN(n_neighbors=3)
clf_lang.fit(X_lang_train, y_lang_train)

# Predict languages
lang_predictions = clf_lang.predict(X_lang_test)
lang_probabilities = clf_lang.predict_proba(X_lang_test)

print("Language predictions (k=3):\n")
for text, pred, prob in zip(X_lang_test, lang_predictions, lang_probabilities):
    # Append the ellipsis only when characters were actually cut off; the
    # previous version added "..." to every sentence, even short ones.
    if len(text) > TEXT_PREVIEW_CHARS:
        preview = f"{text[:TEXT_PREVIEW_CHARS]}..."
    else:
        preview = text
    print(f"Text: '{preview}'")
    # `.2%` assumes `prob` is a scalar confidence -- TODO confirm against the library.
    print(f"  -> Detected language: {pred} ({prob:.2%} confidence)\n")

## Example 4: Code Snippet Classification

Classify code snippets by programming language based on syntax patterns.

In [None]:
# Training snippets grouped by programming language; flattened below into
# parallel snippet/label lists (all Python first, then all JavaScript).
labelled_snippets = [
    ('python', [
        "def hello():\n    print('Hello')\n    return True",
        "for i in range(10):\n    result = i * 2\n    print(result)",
        "class MyClass:\n    def __init__(self):\n        self.value = 42",
    ]),
    ('javascript', [
        "function hello() {\n  console.log('Hello');\n  return true;\n}",
        "for (let i = 0; i < 10; i++) {\n  let result = i * 2;\n  console.log(result);\n}",
        "class MyClass {\n  constructor() {\n    this.value = 42;\n  }\n}",
    ]),
]

X_code_train = [snippet for _, snippets in labelled_snippets for snippet in snippets]
y_code_train = [lang for lang, snippets in labelled_snippets for _ in snippets]

# Unseen snippets: the first is Python, the second JavaScript.
X_code_test = [
    "import numpy as np\ndef process(data):\n    return np.mean(data)",
    "const arr = [1, 2, 3];\nconst doubled = arr.map(x => x * 2);",
]

print("Code Language Classification Demo\n")

In [None]:
# Try with different k values: refit for each k and report the detected
# language and confidence for every unseen snippet.
SNIPPET_PREVIEW_CHARS = 60  # longest prefix of each snippet echoed in the report

for k in [1, 3]:
    clf_code = CompressionKNN(n_neighbors=k)
    clf_code.fit(X_code_train, y_code_train)

    code_preds = clf_code.predict(X_code_test)
    code_probs = clf_code.predict_proba(X_code_test)

    print(f"\nPredictions with k={k}:")
    for number, (snippet, pred, prob) in enumerate(
        zip(X_code_test, code_preds, code_probs), start=1
    ):
        # Append the ellipsis only when the snippet was actually cut; the
        # previous version added "..." even to snippets shorter than the limit.
        if len(snippet) > SNIPPET_PREVIEW_CHARS:
            preview = f"{snippet[:SNIPPET_PREVIEW_CHARS]}..."
        else:
            preview = snippet
        print(f"\nSnippet {number}:")
        print(f"  {preview}")
        # `.2%` assumes `prob` is a scalar confidence -- TODO confirm against the library.
        print(f"  -> Detected: {pred} ({prob:.2%} confidence)")

## Example 5: Multiclass Classification with DNA Sequences

Classify DNA sequences into different categories based on their patterns.

In [None]:
# Human-readable name for each numeric class id.
class_names = {0: 'AT-rich', 1: 'GC-rich', 2: 'Mixed'}

# Training sequences grouped by class id; flattened below into parallel
# sequence/label lists in class order 0, 1, 2.
labelled_sequences = [
    (0, ["ATGATGATGATGATG", "ATATATATATATATAT", "ATTAATTAATTAATTA"]),
    (1, ["GCGCGCGCGCGCGCGC", "GGCCGGCCGGCCGGCC", "CGCGCGCGCGCGCGCG"]),
    (2, ["ACGTACGTACGTACGT", "AGTCAGTCAGTCAGTC", "TACGTACGTACGTACG"]),
]

X_dna_train = [seq for _, seqs in labelled_sequences for seq in seqs]
y_dna_train = [class_id for class_id, seqs in labelled_sequences for _ in seqs]

# Unseen sequences, one per class in the order AT-rich, GC-rich, Mixed.
X_dna_test = [
    "ATATGTATGTATGTAT",     # expected: AT-rich
    "GCGGCGGCGGCGGCGG",     # expected: GC-rich
    "ACGTCGATCGATCGAT",     # expected: Mixed
]

print("DNA Sequence Classification Demo")
print("Classes: AT-rich (0), GC-rich (1), Mixed (2)\n")

In [None]:
# Fit a 3-neighbour classifier on the DNA training set and score the
# unseen sequences.
clf_dna = CompressionKNN(n_neighbors=3)
clf_dna.fit(X_dna_train, y_dna_train)

dna_preds, dna_probs = clf_dna.predict(X_dna_test), clf_dna.predict_proba(X_dna_test)

print("DNA Classification Results (k=3):\n")
for sequence, class_id, confidence in zip(X_dna_test, dna_preds, dna_probs):
    print(f"Sequence: {sequence}")
    # Translate the numeric prediction into its display name via class_names.
    display_name = class_names[class_id]
    print(f"  -> Class {class_id} ({display_name}) with {confidence:.2%} confidence\n")

## Example 6: Comparison of k Values from 1 to 6

Let's print the prediction and confidence for each k to see how the choice of k affects classification consistency.

In [None]:
# Imbalanced training set: four identical class-A strings and two identical
# class-B strings.
X_compare_train = ["aaaaa"] * 4 + ["bbbbb"] * 2
y_compare_train = ['A'] * 4 + ['B'] * 2

# A single query that is mostly b's, while class A holds the majority of the
# training samples.
X_compare_test = ["bbbba"]

print("Comparing different k values on an ambiguous sample\n")
print("Training set: 4 samples of class A ('aaaaa'), 2 samples of class B ('bbbbb')")
print(f"Test sample: '{X_compare_test[0]}' (mostly B's but with one A)\n")

# Sweep k from 1 up to 6 (the size of the training set), refitting each time.
for k in range(1, 7):
    clf = CompressionKNN(n_neighbors=k)
    clf.fit(X_compare_train, y_compare_train)

    # Only one query string, so take the first (and only) result of each call.
    pred = clf.predict(X_compare_test)[0]
    prob = clf.predict_proba(X_compare_test)[0]

    print(f"k={k}: Predicted class {pred} with {prob:.2%} confidence")

## Example 7: Custom Encoding

The classifier supports custom encoding for string data. By default, it uses UTF-8, but you can specify a different encoding if needed.

In [None]:
def _show_encoding_results(texts, labels, confidences):
    """Print one indented line per text with its predicted label and confidence."""
    for text, pred, prob in zip(texts, labels, confidences):
        print(f"  '{text}' -> {pred} ({prob:.2%} confidence)")


# Three accented words labelled 'french' and three plain words labelled 'english'.
X_encoding_train = ["café", "naïve", "résumé", "hello", "world", "python"]
y_encoding_train = ['french'] * 3 + ['english'] * 3

X_encoding_test = ["jalapeño", "computer"]

print("Testing with different encodings:\n")

# First run: encoding passed explicitly as 'utf-8' (described by the notebook
# as the default).
clf_utf8 = CompressionKNN(n_neighbors=3, encoding='utf-8')
clf_utf8.fit(X_encoding_train, y_encoding_train)
preds_utf8 = clf_utf8.predict(X_encoding_test)
probs_utf8 = clf_utf8.predict_proba(X_encoding_test)

print("With UTF-8 encoding (default):")
_show_encoding_results(X_encoding_test, preds_utf8, probs_utf8)

# Second run: identical data and k, but with encoding='latin-1'.
clf_latin1 = CompressionKNN(n_neighbors=3, encoding='latin-1')
clf_latin1.fit(X_encoding_train, y_encoding_train)
preds_latin1 = clf_latin1.predict(X_encoding_test)
probs_latin1 = clf_latin1.predict_proba(X_encoding_test)

print("\nWith Latin-1 encoding:")
_show_encoding_results(X_encoding_test, preds_latin1, probs_latin1)

## Summary

This notebook demonstrated:

1. **Binary classification** with default parameters
2. **Effect of k parameter** on predictions and confidence
3. **Language detection** using string labels
4. **Code language classification** based on syntax patterns
5. **Multiclass classification** with DNA sequences
6. **k-value comparison** showing how neighbor count affects results
7. **Custom encoding** support for different character encodings

Key takeaways:
- CompressionKNN works well with text/string data where compression patterns reveal similarity
- Strings are automatically converted to bytes using the specified encoding (default: UTF-8)
- The classifier can be imported directly: `from explodingham.models.compression_learning import CompressionKNN`
- Lower k values (1-3) are more sensitive to local patterns
- Higher k values (5+) provide more stable, averaged predictions
- The classifier naturally supports both numeric and string labels
- Compression-based distance is effective for pattern recognition without feature engineering