In [None]:
# Fix the random seed up front so repeated runs of this notebook are reproducible.
# NOTE(review): seed_everything is a project helper; presumably it seeds every
# random number generator the training code relies on — confirm in sniffer.training.
from sniffer.training import seed_everything
seed_everything(0)

# Example Notebook

This notebook provides a few examples of the code that we ran for the CipherSniffer paper.

In this notebook, we use `cipherdata_sample` (a very small subset of the CipherData). It's important to note that the results in the paper are based on models trained on the full dataset.

## Ciphers

These are the 5 ciphers that we applied to create the CipherData.

In [None]:
from sniffer.ciphers import substitution, transposition, reverse, shift, wordflip

In [None]:
# Run every cipher on the same plaintext and print each result next to its label.
text = "canada wins the world cup"

# (label, cipher function) pairs, listed in the order they are applied and printed.
cipher_demos = [
    ("Substitution: ", substitution),
    ("Transposition: ", transposition),
    ("Shift: ", shift),
    ("Reverse: ", reverse),
    ("Wordflip: ", wordflip),
]

for cipher_name, apply_cipher in cipher_demos:
    print(cipher_name, apply_cipher(text))

## Custom GloVe

To train GloVe embeddings on your own corpus, navigate to the [official GloVe Repository](https://github.com/stanfordnlp/GloVe) and replace the `demo.sh` file with `modified_demo.sh` found in this repository. You will also have to create a copy of the `embedding.txt` file with all the text on a single line. This can be done in the terminal with the following command `tr '\n' ' ' < input.txt > output.txt`.

## Tokenizers

In this section, we show how the word-level and subword-level tokenizers are trained. This example skips one small preprocessing step: removing the labels from the text file. That step can be done with the following command, which strips the first two characters of every line: `sed 's/^..//' input.txt > output.txt`.

In [None]:
from sniffer.tokenizers import bpe_train, wordpiece_train, tokenizer_train

In [None]:
# Input corpus and output location shared by all three tokenizer trainers.
infpath = "data/cipherdata_sample/embedding.txt"
outfpath = "./test"

# Call each trainer in turn with the same input and output paths.
for train_tokenizer in (bpe_train, wordpiece_train, tokenizer_train):
    train_tokenizer(infpath, outfpath)

## GRU + BPE Model Training Example

In this example, we show how to train a GRU model with a BPE tokenizer. 

In [None]:
from sniffer.training import load_data, ohe_labels, keras_train, evaluate
from sniffer.tokenizers import subword_level
from sniffer.models import GRU_model_trainable

In [None]:
# Config
# Number of label classes passed to ohe_labels below.
# NOTE(review): presumably the 5 ciphers plus unencrypted text — confirm.
N_LABELS = 6
# Sequence length handed to both the tokenizer step and the model constructor.
# The value depends on the tokenization level; 158 is the subword setting used here.
MAX_SEQUENCE_LENGTH = 158 # 76 -> normal, 158 -> subword, 443 -> character level
# Path to the sample dataset and to the tokenizer file used by subword_level.
cipher_data = "data/cipherdata_sample"
tokenizer_file = "data/tokenizers/BPE_trained.json"

# Loading Data
# load_data returns three objects, unpacked here as the train / valid / test splits.
train, valid, test = load_data(cipher_data)

# Tokenizing Data
# Tokenizes all three splits with the tokenizer file above and also returns the
# vocabulary size, which the model constructor needs.
x_train, x_valid, x_test, vocab_size = subword_level(train, valid, test, tokenizer_file, MAX_SEQUENCE_LENGTH)

# Encoding Labels
# NOTE(review): "ohe" presumably stands for one-hot encoding — confirm in sniffer.training.
y_train, y_valid, y_test = ohe_labels(train, valid, test, N_LABELS)

# Defining Models
# Builds the GRU model from the sequence length and the vocabulary size.
model = GRU_model_trainable(MAX_SEQUENCE_LENGTH, vocab_size)

In [None]:
# Training Model
# Passes the model together with the tokenized train and validation inputs and
# their encoded labels to the project's keras_train helper.
keras_train(model, x_train, y_train, x_valid, y_valid)

In [None]:
# Evaluating model
# Passes the model plus the inputs and labels of all three splits (train, valid,
# test) to the project's evaluate helper. Note the argument order: the three
# inputs come first, followed by the three label sets.
evaluate(model, x_train, x_valid, x_test, y_train, y_valid, y_test)

## BERT Training Example

In this section, we show how the BERT model was trained. Note that the computational resources needed to train this model are much greater than those needed for the GRU and LSTM architectures.

In [None]:
from sniffer.training import load_data
from sniffer.bert import BertClassifier, bert_evaluate, bert_train
from transformers import BertTokenizer

In [None]:
# Data file path
cipher_data = "data/cipherdata_sample"

# Loading Data
# Reloads the three splits; the BERT helpers below receive these splits directly
# together with the tokenizer, rather than pre-tokenized inputs.
train, valid, test = load_data(cipher_data)

# Loading Model and tokenizer
# NOTE: this rebinds `model`, so any model created earlier in the notebook under
# that name is no longer reachable through it.
model = BertClassifier()
# Loads the tokenizer for the 'bert-base-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Training Model
# Passes the BERT model, the train and validation splits, and the tokenizer to
# the project's bert_train helper.
bert_train(model, train, valid, tokenizer)

In [None]:
# Evaluating Model
# Passes the BERT model, the test split, and the tokenizer to the project's
# bert_evaluate helper.
bert_evaluate(model, test, tokenizer)