In [None]:
# Install the confusables package (parses Unicode confusable characters)
!pip install confusables

Collecting confusables
  Downloading confusables-1.2.0-py3-none-any.whl.metadata (5.2 kB)
Downloading confusables-1.2.0-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.5/268.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: confusables
Successfully installed confusables-1.2.0


In [None]:
import random
import numpy as np
import pandas as pd

from confusables import confusable_characters

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Example word list (expand with dictionary, domains, etc.)
words = ["password", "secure", "exchange", "login", "facebook", "microsoft", "apple", "orange"]

# Dedicated seeded generator: Restart & Run All rebuilds the exact same dataset
# without touching the global `random` state.
rng = random.Random(42)

# Parallel lists: data[k] is a string, labels[k] is 0 (clean) or 1 (homoglyph).
data, labels = [], []

for w in words:
    # Clean sample
    data.append(w)
    labels.append(0)

    # Homoglyph substitution: collect, per position, the look-alike characters
    # that actually differ from the original. The original character is filtered
    # out in case the lookup returns it, and `or []` guards a None/empty result.
    # NOTE(review): exact return contract of confusable_characters -- confirm.
    noisy = list(w)
    candidates = {}
    for i, ch in enumerate(noisy):
        homoglyphs = [c for c in (confusable_characters(ch) or []) if c != ch]
        if homoglyphs:
            candidates[i] = homoglyphs

    # Each replaceable position is swapped with 50% probability ...
    chosen = [i for i in candidates if rng.random() < 0.5]
    # ... but at least one swap is forced, so every word contributes a positive
    # sample and the two classes stay balanced.
    if not chosen and candidates:
        chosen = [rng.choice(list(candidates))]
    for i in chosen:
        noisy[i] = rng.choice(candidates[i])

    noisy_word = "".join(noisy)
    if noisy_word != w:
        data.append(noisy_word)
        labels.append(1)

print("Sample dataset:")
for sample, label in zip(data[:10], labels[:10]):
    print(sample, "->", label)


Sample dataset:
password -> 0
pãṦsw𑣈ṛꓓ -> 1
secure -> 0
ŚêcΥre -> 1
exchange -> 0
ｅxḈḩ𝝰ńge -> 1
login -> 0
lo𝓖in -> 1
facebook -> 0
Ꞙac𝑒ꓐỠő𝙺 -> 1


In [None]:
# Represent every string as counts of its character unigrams and bigrams.
# NOTE(review): the vocabulary is learned from all samples, including the ones
# held out for testing in the next cell -- consider fitting on the training
# portion only.
y = np.array(labels)
vectorizer = CountVectorizer(ngram_range=(1, 2), analyzer="char")
X = vectorizer.fit_transform(data)

# X has one row per sample and one column per distinct n-gram.
feature_shape = X.shape
print("Feature matrix shape:", feature_shape)


Feature matrix shape: (16, 133)


In [None]:
# Hold out 30% of the samples for testing; stratifying on y keeps the
# clean/homoglyph ratio similar in both parts, and the fixed seed makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

# Report how many rows landed in each part.
for caption, part in (("Train size:", X_train), ("Test size:", X_test)):
    print(caption, part.shape[0])


Train size: 11
Test size: 5


In [None]:
# Candidate classifiers, keyed by display name; later cells look them up here.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, C=1.0),
    "SVM": SVC(kernel="linear", probability=True),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
}

# Stratified folds need enough members of every class to go around:
# StratifiedKFold warns (or raises) when a class is smaller than n_splits.
# The training set is tiny and its class counts depend on the generated data,
# so cap the fold count at the smallest class size (never below the minimum
# of 2 folds, never above the intended 5).
_, class_counts = np.unique(y_train, return_counts=True)
n_splits = max(2, min(5, int(class_counts.min())))
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Mean and spread of the per-fold accuracy on the training portion only.
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv)
    print(f"{name} CV Accuracy: {scores.mean():.3f} ± {scores.std():.3f}")


Logistic Regression CV Accuracy: 0.367 ± 0.194
SVM CV Accuracy: 0.367 ± 0.194
Random Forest CV Accuracy: 0.433 ± 0.226


In [None]:
# Fit every candidate on the full training portion, then print per-class
# precision/recall/F1 on the held-out test portion under a banner line.
banner = "=" * 40
for name, model in models.items():
    # fit() returns the estimator itself, so the calls can be chained
    y_pred = model.fit(X_train, y_train).predict(X_test)
    report = classification_report(y_test, y_pred)
    print(banner)
    print(f"{name} Test Performance")
    print(report)


Logistic Regression Test Performance
              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.67      1.00      0.80         2

    accuracy                           0.80         5
   macro avg       0.83      0.83      0.80         5
weighted avg       0.87      0.80      0.80         5

SVM Test Performance
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.50      0.50      0.50         2

    accuracy                           0.60         5
   macro avg       0.58      0.58      0.58         5
weighted avg       0.60      0.60      0.60         5

Random Forest Test Performance
              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.67      1.00      0.80         2

    accuracy                           0.80         5
   macro avg       0.83      0.83      

In [None]:
# Hand-written probes: two clean words and look-alike spellings of each.
test_samples = [
    "secure",       # clean
    "ѕесurе",       # homoglyphs (Cyrillic)
    "password",     # clean
    "раsswоrd",     # homoglyphs
]

# Encode with the already-fitted vectorizer and classify with the logistic
# regression model trained in the previous cell.
classifier = models["Logistic Regression"]
X_new = vectorizer.transform(test_samples)
preds = classifier.predict(X_new)

# Label 1 means the sample was flagged as containing homoglyphs.
for idx, sample in enumerate(test_samples):
    if preds[idx] == 1:
        verdict = "Homoglyph"
    else:
        verdict = "Clean"
    print(sample, "->", verdict)


secure -> Clean
ѕесurе -> Homoglyph
password -> Clean
раsswоrd -> Clean
