## **Gibberish classifier**

##### **1. Loading the data**

In [None]:
# Uncomment the following line to install dependencies
# %pip install pandas numpy scikit-learn matplotlib

import re
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, 
                             ConfusionMatrixDisplay, 
                             classification_report, 
                             roc_auc_score, 
                             roc_auc_score,)

In [None]:
def _read_lines(path: str) -> list:
    """Return the lines of a UTF-8 text file, without line terminators."""
    with open(path, encoding="utf-8") as handle:
        return handle.read().splitlines()


# One example per line in each file
good = _read_lines("good.txt")
bad = _read_lines("bad.txt")

# Label convention: 0 = good, 1 = bad
g = pd.DataFrame({"X": good, "y": [0] * len(good)})
b = pd.DataFrame({"X": bad, "y": [1] * len(bad)})

# Stack both classes into a single dataframe
df = pd.concat([g, b])
df

##### **2. Preprocessing**

In [None]:
# Preprocessing is kept deliberately small here. In practice you should
# experiment with other steps -- for instance, would it help to `normalize`
# the strings?
# For now: drop repeated strings, drop missing values, and keep only strings
# that are longer than one character.
df = df.drop_duplicates(subset=["X"]).dropna(subset=["X"])
df = df.loc[df["X"].str.len() > 1]
df

##### **3. Feature Engineering**

$$
\text{Entropy} = -\sum_{i=1}^{n} p(i) \cdot \log_2(p(i))
$$

In [None]:
# 3.1 Entropy

def entropy(document: str) -> float:
    """Return the Shannon entropy, in bits, of the characters in ``document``.

    Every distinct character contributes ``-p * log2(p)``, where ``p`` is the
    fraction of the string made up of that character. An empty string has no
    characters to sum over and yields ``0.0``.
    """
    length = len(document)
    bits = 0.0
    for occurrences in Counter(document).values():
        share = occurrences / length
        bits -= share * np.log2(share)
    return bits

# Let's test our function
# The label is built from the same string that is evaluated, so the printed
# text can never disagree with the input that was actually measured.
for example in ("Hello world", "xfdfefxfd"):
    print(f"The entropy for `{example}` is:", entropy(example))

In [None]:
# 3.2 Proportion of vowels

def proportion_vowel(document: str) -> float:
    """Return the fraction of characters in ``document`` that are vowels.

    Vowels are ``a e i o u y`` plus the accented variants listed in the
    pattern below, matched case-insensitively. The denominator is the full
    string length, so spaces, digits and punctuation lower the proportion.

    Args:
        document: The string to analyse.

    Returns:
        A value between 0.0 and 1.0; ``0.0`` for an empty string.
    """
    # Guard against dividing by zero on empty input.
    if not document:
        return 0.0
    vowels = re.findall("[aeiouyáéíóúýàèìòùäëïöüÿ]", document, re.IGNORECASE)
    return len(vowels) / len(document)

# Let's test our function
# The label is built from the same string that is evaluated, so the printed
# text can never disagree with the input that was actually measured.
for example in ("Hello world", "xfdfefxfd"):
    print(f"The proportion of vowels for `{example}` is:", proportion_vowel(example))

In [None]:
# 3.3 Proportion of non-alphabetic

def proportion_non_alpha(document: str) -> float:
    """Return the fraction of characters that are not a space or ASCII letter.

    Digits, punctuation and any non-ASCII character count as non-alphabetic.
    Note that this includes accented letters such as ``é``.

    Args:
        document: The string to analyse.

    Returns:
        A value between 0.0 and 1.0; ``0.0`` for an empty string.
    """
    # Guard against dividing by zero on empty input.
    if not document:
        return 0.0
    non_alpha = re.findall("[^ a-zA-Z]", document)
    return len(non_alpha) / len(document)

# Let's test our function
# The label is built from the same string that is evaluated, so the printed
# text can never disagree with the input that was actually measured.
for example in ("Hello world", "xfdfefxfd"):
    print(f"The proportion of special characters for `{example}` is:", proportion_non_alpha(example))

$$
\text{Euclidean Distance} = \sqrt{{(x_2 - x_1)^2 + (y_2 - y_1)^2}}
$$


In [None]:
# 3.4 Keystroke distance

# (x, y) position of every letter and digit key. Keys in the same row are
# 2 units apart horizontally; each entry below is
# (keys of the row from left to right, x of the first key, y of the row).
KEYBOARD_GRAPH = {
    key: (first_x + 2 * offset, row_y)
    for row_keys, first_x, row_y in (
        ("qwertyuiop", 1, 5),
        ("asdfghjkl", 1.5, 3),
        ("zxcvbnm", 2.5, 1),
        ("1234567890", 0.5, 7),
    )
    for offset, key in enumerate(row_keys)
}

# Characters that have a position; everything else is ignored by the feature.
VALID_KEYS = set(KEYBOARD_GRAPH)

def keystroke_distance(document: str) -> float:
    """Return the mean Euclidean distance between consecutive typed keys.

    The string is lower-cased and every character that is not in
    ``VALID_KEYS`` is skipped, so the keys on either side of e.g. a space are
    treated as consecutive. Returns ``-1.0`` when fewer than two valid keys
    remain, because there is no pair to measure.
    """
    points = [KEYBOARD_GRAPH[key] for key in document.lower() if key in VALID_KEYS]
    if len(points) < 2:
        return -1.0

    # One hop per adjacent pair of keys: n points give n - 1 hops.
    hops = [
        np.sqrt((next_x - prev_x) ** 2 + (next_y - prev_y) ** 2)
        for (prev_x, prev_y), (next_x, next_y) in zip(points[:-1], points[1:])
    ]
    return sum(hops) / len(hops)


# Let's test our function
# The label is built from the same string that is evaluated, so the printed
# text can never disagree with the input that was actually measured.
for example in ("Hello world", "xfdfefxfd"):
    print(f"The keystroke distance for `{example}` is:", keystroke_distance(example))

##### **4. Create dataset**

##### **5. Create train and test datasets**

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# proportion of good and bad strings the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    df["X"],
    df["y"],
    test_size=0.2,
    stratify=df["y"],
    random_state=42,
)
X_train, y_train

In [None]:

def create_dataset(data):
    """Build the feature table for a collection of strings.

    Args:
        data: Iterable of strings, e.g. a list or a pandas Series.

    Returns:
        A DataFrame with one row per string, in input order, and the columns
        ``length``, ``entropy``, ``vowels``, ``non_alpha`` and ``distance``.
        Rows are numbered from 0; the index of a Series input is not kept.
    """
    # Materialise once: the input is traversed once per feature, which would
    # exhaust a generator (or any other one-shot iterable) after the first
    # column and leave the remaining columns empty.
    documents = list(data)
    return pd.DataFrame(
        {
            "length": [len(doc) for doc in documents],
            "entropy": [entropy(doc) for doc in documents],
            "vowels": [proportion_vowel(doc) for doc in documents],
            "non_alpha": [proportion_non_alpha(doc) for doc in documents],
            "distance": [keystroke_distance(doc) for doc in documents],
        }
    )

# Turn both splits into feature tables with the same columns
X_train_1, X_test_1 = (create_dataset(split) for split in (X_train, X_test))

X_train_1

##### **6. Fit the model**

In [None]:
# Fixed seed so the fitted forest is the same on every run
model = RandomForestClassifier(random_state=7)
model.fit(X=X_train_1, y=y_train)

##### **7. Evaluate the model**

In [None]:

# Hard class predictions, plus the predicted probability of class 1 (bad)
rf_predictions = model.predict(X_test_1)
rf_probs = model.predict_proba(X_test_1)[:, 1]

# ROC AUC is computed from the class-1 probabilities, not the hard labels
roc_value = roc_auc_score(y_true=y_test, y_score=rf_probs)
print("ROC: ", roc_value)

# Confusion matrix
cm = confusion_matrix(y_test, rf_predictions, labels=model.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot()
plt.show()

# Classification report
print(
    "classification report:\n",
    classification_report(y_test, rf_predictions),
)


# Feature importance
importances = model.feature_importances_
# Index the importances by feature name so every bar carries its own label.
# Relabelling the ticks afterwards by position would silently mislabel the
# bars if the series were ever sorted or filtered before plotting.
forest_importances = pd.Series(importances, index=X_train_1.columns)

fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax)
ax.set_title("Feature importance")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

##### **9. Make predictions using the model**

In [None]:
# A few hand-written inputs to try the classifier on
text = [
    "akvnzxkjcvhiu",
    "ojafupavuihuiaevhu",
    "zxvcmbnxjvhgxjvg",
    "This is a good response! :)",
    "F ofsowi7v97",
    "slkjfseiuf sljefsefk jslekfjseklfjsel",
]

# Same feature pipeline as for training, then one class label per string
# (0 = good, 1 = bad)
data = create_dataset(text)
model.predict(data).tolist()

Let's compare it to our 'real' model, the classifier imported from the `gibberish` package.

In [None]:
from gibberish import model as clf

In [None]:
# Score the held-out strings with the packaged model. The result is used
# below both as the score for ROC AUC and, after rounding, as a class label.
# NOTE(review): presumably these are probabilities of class 1 (bad) --
# confirm against the `gibberish` package.
proba = clf.predict(X_test.tolist())

In [None]:
# Same metrics as for the random forest, computed on the same held-out labels
roc_value = roc_auc_score(y_true=y_test, y_score=proba)
print("ROC: ", roc_value)

# Turn each score into a class label by rounding to the nearest integer
# (assumes the scores lie in [0, 1] -- TODO confirm)
class_pred = [round(score) for score in proba]
acc = classification_report(y_test.tolist(), class_pred)
print(acc)

cm = confusion_matrix(y_test, class_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])
disp.plot()
plt.show()

##### **10. Explainable machine learning using [SHAP](https://github.com/shap/shap) (extra)**

In [None]:
import shap
# NOTE(review): presumably loads shap's JavaScript so the interactive force
# plot further down can render in the notebook -- confirm in the shap docs.
shap.initjs()

In [None]:
# Explain a subset of the training features rather than the full table.
# The draw is seeded so the SHAP plots are the same on every run, and capped
# at the number of available rows because asking for more rows than exist
# (without replacement) raises a ValueError.
sample = X_train_1.sample(n=min(1000, len(X_train_1)), random_state=42)
tree_explainer = shap.TreeExplainer(model)
shap_values = tree_explainer.shap_values(sample)

In [None]:
# Plot the summary plot to show feature importance for class 1 (bad).
# Depending on the installed shap release, `shap_values` is either a list
# with one (samples, features) array per class, or a single
# (samples, features, classes) array. In the second form `shap_values[1]`
# would select the second sample rather than class 1, so handle both.
# NOTE(review): confirm the array layout against the installed shap version.
class1_shap = shap_values[1] if isinstance(shap_values, list) else shap_values[:, :, 1]
shap.summary_plot(class1_shap, sample, plot_type="violin")

In [None]:
# Explain a single, digit-heavy input
test_strings = ["2739847293472390482 09230423947 29347"]
print(test_strings, end="\n\n")
test_data = create_dataset(test_strings)
print(test_data)
shap_values = tree_explainer.shap_values(test_data)
# Pick the class-1 (bad) contributions for the first and only row.
# Depending on the installed shap release, `shap_values` is either a list
# with one (samples, features) array per class, or a single
# (samples, features, classes) array -- handle both.
# NOTE(review): confirm the array layout against the installed shap version.
row_shap = shap_values[1][0] if isinstance(shap_values, list) else shap_values[0, :, 1]
shap.force_plot(tree_explainer.expected_value[1], row_shap, test_data.iloc[0])

In [None]:
# the 'real' model
# Run the packaged classifier on the same `test_strings`, for comparison with
# the random forest explanation above.
clf.predict(test_strings)