In [1]:
%run "../code/translator.py"
#%run "../code/validation.py"

In [2]:
import re
import pickle
import numpy as np
import pandas as pd
from itertools import chain, permutations
from collections import Counter
import string
import math
import random
import matplotlib.pyplot as plt
import operator
%matplotlib inline

In [3]:
character_set = string.ascii_lowercase

In [4]:
def process_word(original_word):
    processed_word = original_word.lower()
    
    # Check if letters in character_set
    valid_word = True
    for letter in list(processed_word):
        if letter not in character_set:
            valid_word = False

    if valid_word:
        return processed_word
    else:
        return None

In [5]:
def get_encoded_words_from_msg(msg_enc):
    word_enc_list = set()

    # Run through each unique encoded word and process it accordingly
    for word_enc in set(re.findall(r"[\w']+", msg_enc)):
        processed_word = process_word(word_enc)
        if processed_word is not None:
            word_enc_list.add(processed_word)
            
    return word_enc_list

In [6]:
import unittest

class Testjes(unittest.TestCase):

    def test_process_word(self):
        test_word = "Nederland"
        self.assertEqual('nederland', process_word(test_word))
        
    def test_get_encoded_words_from_msg(self):
        test_sentence = "bla die bla dat Zeker -+"
        self.assertEqual({"bla","die","dat","zeker"}, get_encoded_words_from_msg(test_sentence))
        
suite = unittest.TestLoader().loadTestsFromModule(Testjes())
unittest.TextTestRunner().run(suite)

..
----------------------------------------------------------------------
Ran 2 tests in 0.002s

OK


<unittest.runner.TextTestResult run=2 errors=0 failures=0>

In [7]:
class PossibilityGenerator(object):
    _character_set = string.ascii_lowercase
    _word_dictionary = None
    
    def __init__(self, character_set, language):
        """
        Get the character frequency for a given language
        """
        self._character_set = character_set
        
        if language not in ['en', 'nl']:
            raise NotImplementedError(
                'Language {} not supported'.format(language))
        
        self._load_language_words(language)

    def _load_language_words(self, language):
        # Load the words of the language and process them into a dict
        words = pickle.load(
            open("../data/{}wiktionary.p".format(language), 
                 "rb"))

        self._word_dictionary = dict()
        for word in words:
            processed_word = process_word(word)
            if processed_word is not None:
                try:
                    self._word_dictionary[self._get_key_for_word(processed_word)].append(processed_word)
                except KeyError:
                    self._word_dictionary[self._get_key_for_word(processed_word)] = [processed_word]
    
    def _get_key_for_word(self, word):
        # Determine duplicates
        duplicate_indices = []
        for letter, letter_count in Counter(word).items():
            if letter_count > 1:
                indices = []
                # Find indices of letter
                from_index = 0
                while len(indices) < letter_count:
                    new_index = word.index(letter, from_index)
                    indices.append(new_index)
                    from_index = new_index + 1

                # Add to duplicate indices list
                duplicate_indices.append(tuple(indices))
        duplicate_indices = tuple(sorted(duplicate_indices))

        return (len(word),len(Counter(word)),duplicate_indices)
    
    def get_possible_words(self, word_enc, cipher={}):
        possible_words = self._word_dictionary[self._get_key_for_word(word_enc)]
        cipher_copy = cipher.copy()

        # Determine what to fill cipher with
        # Replace this cipher with a negation of the already known values
        # Those values cannot become the result of the cipher anymore
        if len(cipher.values()) <= 0:
            cipher_fill = '.'
        else:
            cipher_fill = '[^'+''.join(cipher.values())+']'

        # Fill cipher with missing keys
        for c in list(self._character_set):
            if c not in cipher_copy.keys():
                cipher_copy[c] = cipher_fill

        regex_string = decipher_text(word_enc, cipher_copy)

        regex = re.compile(regex_string)

        # Filter out possibilities with current cipher keys
        possible_words = list(filter(regex.search, possible_words))
        return possible_words
    
    def generate_possible_words(self, word_enc, cipher={}):  
        possible_words = self.get_possible_words(word_enc, cipher)

        for word_dec in possible_words:
            cipher_for_this = create_cipher(word_enc, word_dec)
            yield word_dec, {**cipher_for_this, **cipher}

In [8]:
import unittest

class Testjes2(unittest.TestCase):
    
    def setUp(self):
        self.pg = PossibilityGenerator(character_set, 'nl')

    def test_get_possible_words(self):
        word_enc = "Nederland"
        self.assertContains('nederland', self.pg.get_possible_words(word_enc))
        
suite = unittest.TestLoader().loadTestsFromModule(Testjes2())
unittest.TextTestRunner().run(suite)

E
ERROR: test_get_possible_words (__main__.Testjes2)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-8-b95023caad9b>", line 10, in test_get_possible_words
    self.assertContains('nederland', self.pg.get_possible_words(word_enc))
AttributeError: 'Testjes2' object has no attribute 'assertContains'

----------------------------------------------------------------------
Ran 1 test in 11.332s

FAILED (errors=1)


<unittest.runner.TextTestResult run=1 errors=1 failures=0>

# Test message letter frequency

In [9]:
msg = '''
this is a test message to see if everything is working properly
we should be able to see if we can encrypt this one
english is a hard language to comprehend in such a way as it is written now
nobody really knows how many more lines we have to type before this damn thing
finally translates to something useful
let us just keep on trying
'''
msg = '''
dit is een test berichtje om te kijken of alles een beetje werkt enzovoorts
misschien is dit nog niet voldoende maar iets is nog altijd beter dan niets toch
'''
# msg = '''
# dit is een test bericht om te kijken of het ontcijferen van een soortgelijk bericht
# toepasselijk is in de nederland taal 
# contentmanagementsystemen frituurpannen ijdelheid familietrekjes betrekkelijk gezelligheid
# '''
#msg = '''contentmanagementsystemen'''

# Remove linebreaks
msg = msg.replace('\n', ' ').replace('\r', '')

#Encode to something
random_shuffle_characters = random.sample(character_set, len(character_set)) 
cipher_enc = create_cipher(character_set, random_shuffle_characters)
msg_enc = encipher_text(msg, cipher_enc)

#Print encoding
print(msg_enc)

 jwp wc qqr pqcp sqewlbpyq gx pq vwyvqr go zffqc qqr sqqpyq dqevp qrkgaggepc xwcclbwqr wc jwp rgn rwqp agfjgqrjq xzze wqpc wc rgn zfpwyj sqpqe jzr rwqpc pglb 


In [10]:
# Test the actual cipher for decoding
decipher_text(msg_enc, cipher_enc)

' dit is een test berichtje om te kijken of alles een beetje werkt enzovoorts misschien is dit nog niet voldoende maar iets is nog altijd beter dan niets toch '

# Find actual cipher

Cycle through each group of words one by one, decipher them and see if it solves the puzzle. If not, try this with a different group first.

In [13]:
class Solver(object):
    _pg = None
    _character_set = None
    
    def __init__(self, character_set, language):
        self._character_set = character_set
        self._pg = PossibilityGenerator(character_set=character_set, language=language)
        
    def _calc_complexity(self, word_list, cipher={}):
        complexity = 1
        for w in word_list:
            complexity *= len(self._pg.get_possible_words(w,cipher))

        return complexity
    
    def _get_best_cipher_and_words_correct(self, ordered_list_words_enc, cipher_so_far={}):
        if len(ordered_list_words_enc) <= 0:
            return cipher_so_far, 0
        else:
            best_cipher=None
            best_score=0
            for possible_word_dec, cipher_used in self._pg.generate_possible_words(ordered_list_words_enc[0], cipher_so_far):
                new_cipher_so_far, num_correct = self._get_best_cipher_and_words_correct(ordered_list_words_enc[1:], cipher_used)
                if num_correct+1 > best_score:
                    best_cipher = new_cipher_so_far
                    best_score = num_correct+1

            if best_score <= 0:
                # We didn't find anything, ignore this word from now on
                return self._get_best_cipher_and_words_correct(ordered_list_words_enc[1:], cipher_so_far)
            else:
                return best_cipher, best_score
            
    def _generate_subset_words_per_letter(self, word_list, cipher_fixed, max_complexity=1e11, min_set_size=4):
        # Determine the best order to walk through the letters (certain onces will be easier to workout)
        # Now we choose the letters that occur in most words first
        letter_scores = list()
        for char in list(self._character_set):
            word_set = [w for w in word_list if char in w]
            if len(word_set) == 0:
                avg_word_set_complexity = 0
            else:
                avg_word_set_complexity = min([self._calc_complexity([w],{}) for w in word_set])
            score = len(word_set) - avg_word_set_complexity
            letter_scores.append((char, score, word_set))

        letter_scores = sorted(letter_scores, key=lambda x: x[1], reverse=True)

        for char, char_score, word_set in letter_scores:
            complexity = self._calc_complexity(word_set, cipher_fixed)

            # As long as the complexity is too high, reduce the set size
            while complexity > max_complexity and len(word_set) >= min_set_size:
                word_set = word_set[:-1]
                complexity = self._calc_complexity(word_set, cipher_fixed)

            # We only take sets of minimum x words
            if len(word_set) >= min_set_size:
                yield char, word_set
                
    def solve(self, msg_enc):
        word_enc_possibility_dict = dict()
        
        for word_enc in get_encoded_words_from_msg(msg_enc=msg_enc):
            possibilities = self._pg.get_possible_words(word_enc)
            word_enc_possibility_dict[word_enc] = possibilities
        
        # Sort the word_encoded list based on possibilities
        word_enc_list_ordered = sorted(word_enc_possibility_dict, key=lambda k: len(word_enc_possibility_dict[k]), reverse=False)

        cipher_already_fix = {}
        min_set_size = 10
        while min_set_size > 0:
            print("---- Start deciphering with minimum set of {}".format(min_set_size))

            gen_subsets = self._generate_subset_words_per_letter(word_enc_list_ordered, cipher_already_fix, max_complexity=1e12, min_set_size=min_set_size)
            for used_letter, used_set in gen_subsets:
                if used_letter not in cipher_already_fix.keys():
                    print("\t-- Letter {}".format(used_letter))
                    found_cipher, correct_words = self._get_best_cipher_and_words_correct(used_set,cipher_already_fix)
                    correctly_translated = correct_words/len(used_set)

                    try:
                        if found_cipher[used_letter] in cipher_already_fix.values():
                            print("\t\t!!!! PROBLEM: this letter ({}) has been used to translate to already".format(found_cipher[used_letter]))
                        else:
                            cipher_already_fix[used_letter] = found_cipher[used_letter]

                            print("\t\tDecided for {0}:{1}".format(used_letter, found_cipher[used_letter]))
                            print("\t\tNumber of correctly translated words: {0}/{1} ({2}%)".format(correct_words, len(used_set), int(100*correctly_translated)))
                    except KeyError:
                        print("\t\tWe'd hoped to find this one, but couldnt: {}".format(used_letter))

            min_set_size -= 1

        print()
        print("Number of keys found: {}".format(len(cipher_already_fix)))
        return cipher_already_fix

In [14]:
slv = Solver(character_set, language='nl')

In [16]:
found_cipher = slv.solve(msg_enc)

---- Start deciphering with minimum set of 10
---- Start deciphering with minimum set of 9
---- Start deciphering with minimum set of 8
---- Start deciphering with minimum set of 7
---- Start deciphering with minimum set of 6
	-- Letter q
		Decided for q:e
		Number of correctly translated words: 6/6 (100%)
	-- Letter r
		Decided for r:n
		Number of correctly translated words: 7/7 (100%)
	-- Letter p
		Decided for p:t
		Number of correctly translated words: 6/6 (100%)
	-- Letter g
		Decided for g:o
		Number of correctly translated words: 6/6 (100%)
	-- Letter c
		Decided for c:s
		Number of correctly translated words: 7/7 (100%)
	-- Letter w
		Decided for w:i
		Number of correctly translated words: 9/9 (100%)
---- Start deciphering with minimum set of 5
	-- Letter e
		Decided for e:r
		Number of correctly translated words: 5/5 (100%)
---- Start deciphering with minimum set of 4
	-- Letter j
		Decided for j:d
		Number of correctly translated words: 4/4 (100%)
	-- Letter y
		Decided for y

In [17]:
found_cipher_copy = found_cipher.copy()

# Figure out which values are missing

# Fill cipher with missing keys
for c in list(character_set):
    if c not in found_cipher_copy.keys():
        found_cipher_copy[c] = '_'

print(decipher_text(msg_enc, found_cipher_copy))
msg_enc

 dit is een test berichtje om te kijken ou alles een beetje werkt enzovoorts misschien is dit nog niet voldoende maar iets is nog altijd beter dan niets toch 


' jwp wc qqr pqcp sqewlbpyq gx pq vwyvqr go zffqc qqr sqqpyq dqevp qrkgaggepc xwcclbwqr wc jwp rgn rwqp agfjgqrjq xzze wqpc wc rgn zfpwyj sqpqe jzr rwqpc pglb '