In [85]:
from itertools import *
from copy import copy, deepcopy
from heapq import *

Let $\mathcal{A}$ be our alphabet:

In [191]:
def make_alphabet_entry(i):
    alpha = i + ord('a')
    return chr(alpha),i

alphabet = dict(map(make_alphabet_entry, range(26)))
alphabet.update({' ':26})
n = len(alphabet)
alphabet, n

({' ': 26,
  'a': 0,
  'b': 1,
  'c': 2,
  'd': 3,
  'e': 4,
  'f': 5,
  'g': 6,
  'h': 7,
  'i': 8,
  'j': 9,
  'k': 10,
  'l': 11,
  'm': 12,
  'n': 13,
  'o': 14,
  'p': 15,
  'q': 16,
  'r': 17,
  's': 18,
  't': 19,
  'u': 20,
  'v': 21,
  'w': 22,
  'x': 23,
  'y': 24,
  'z': 25},
 27)

We define a function `encode` that consumes a string and produces a list of integer in our field:

In [192]:
def encode(s): return list(map(lambda i: alphabet[i], s))

inverse_alphabet = {v:k for k,v in alphabet.items()}
def decode(e): return "".join(map(lambda i: inverse_alphabet[i], e))

The following plaintext is prose taken from [here]:

[here]:http://www.bartleby.com/209/2.html

In [194]:
with open('plain_text.txt', 'r') as f:
    plain_text = f.read().lower()

remove_chars = [',', '.', ';', ':', '\n', '-', '(', ')', "'", '"']
for rc in remove_chars: 
    plain_text = plain_text.replace(rc, '')

plain_text = "".join(filter(lambda c: not c.isdigit(), plain_text))
m = len(plain_text)
plain_text, m

('the attraction of medieval literature comes perhaps more stronglyfrom some other countries than from england in france and provencein germany and iceland there were literary adventurers more daringand achievements more distinguished it was not in england that themost wonderful things were produced there is nothing in old englishthat takes hold of the mind with that masterful and subduing powerwhich still belongs to the lyrical stanzas of the troubadours andminnesingers to welsh romance or to the epic prose of the icelandhistories   the norman conquest degraded the english language fromits literary rank and brought in a new language for the politerliterature it did not destroy in one sense it did not absolutelyinterrupt english literature but it took away the english literarystandard and threw the country back into the condition of italybefore dantean anarchy of dialects when a new literary languagewas established in the time of chaucer the middle ages were nearlyover and so it happen

In [195]:
assert decode(encode(plain_text)) == plain_text

In [196]:
encoded_plain_text = encode(plain_text)
encoded_plain_text

[19,
 7,
 4,
 26,
 0,
 19,
 19,
 17,
 0,
 2,
 19,
 8,
 14,
 13,
 26,
 14,
 5,
 26,
 12,
 4,
 3,
 8,
 4,
 21,
 0,
 11,
 26,
 11,
 8,
 19,
 4,
 17,
 0,
 19,
 20,
 17,
 4,
 26,
 2,
 14,
 12,
 4,
 18,
 26,
 15,
 4,
 17,
 7,
 0,
 15,
 18,
 26,
 12,
 14,
 17,
 4,
 26,
 18,
 19,
 17,
 14,
 13,
 6,
 11,
 24,
 5,
 17,
 14,
 12,
 26,
 18,
 14,
 12,
 4,
 26,
 14,
 19,
 7,
 4,
 17,
 26,
 2,
 14,
 20,
 13,
 19,
 17,
 8,
 4,
 18,
 26,
 19,
 7,
 0,
 13,
 26,
 5,
 17,
 14,
 12,
 26,
 4,
 13,
 6,
 11,
 0,
 13,
 3,
 26,
 8,
 13,
 26,
 5,
 17,
 0,
 13,
 2,
 4,
 26,
 0,
 13,
 3,
 26,
 15,
 17,
 14,
 21,
 4,
 13,
 2,
 4,
 8,
 13,
 26,
 6,
 4,
 17,
 12,
 0,
 13,
 24,
 26,
 0,
 13,
 3,
 26,
 8,
 2,
 4,
 11,
 0,
 13,
 3,
 26,
 19,
 7,
 4,
 17,
 4,
 26,
 22,
 4,
 17,
 4,
 26,
 11,
 8,
 19,
 4,
 17,
 0,
 17,
 24,
 26,
 0,
 3,
 21,
 4,
 13,
 19,
 20,
 17,
 4,
 17,
 18,
 26,
 12,
 14,
 17,
 4,
 26,
 3,
 0,
 17,
 8,
 13,
 6,
 0,
 13,
 3,
 26,
 0,
 2,
 7,
 8,
 4,
 21,
 4,
 12,
 4,
 13,
 19,
 18,
 26,
 12,
 14,
 17,

Let $k \in \left(\frac{\mathbb{Z}}{n\mathbb{Z}}\right)^{m}$ be a *key* of length $m\in\mathbb{N}$, for example:

In [197]:
# it should be better to randomly generate it to keep it safe
k = encode("ericsmullyan")
k, len(k)

([4, 17, 8, 2, 18, 12, 20, 11, 11, 24, 0, 13], 12)

Now we are in the position to define the `encrypt` function:

In [198]:
def encrypt(message, key):
    return [(p+v)%n for p,v in zip(message, cycle(key))]

def decrypt(cipher, key):
    return [(c-v)%n for c,v in zip(cipher, cycle(key))]

In [203]:
cipher_text = encrypt(encoded_plain_text, k)
decode(cipher_text)

'xymbsembl tvsdhqxlfpofehebhn eyblqudiqkqdqlk bruef bd kpkptdsdonprkzxxsaqvhqktybk ogrjzkwdtdsynmjhworqgrwynqdzvbxcuynb nruhri opy evrqogiyuyixa hqqewxuyoxtuihmbnqkpkiifihitpluofbnfyhmtjlfzbb qehqpymgokycumvcgdqgdcxmavvhf dmtyduvwymfrumkgysmreab ztpydlnruhvzmmkdeezsiabn gopofgpqaj z cktediqxtfpnnpa flvzgrulkyltumdob ztzwa rrxtkjtmslq feamurthwoxosdjpgrybyoxwvxyhvzmmkxysfihnwcluyoxsgfubkest zted yqezlldtilmfvtqeslkdl flvhnpcbnli exrvasdtzqxtuiqatffvloludwqipvybyybsvrxmtjlmzkteywyhtfyuynb avqaqreapkbpvgqxtfdykzc flvhkuqelyahvwjwt qlkkxtuiqvqiyuyk o ukmuklxproaqiuhvzqtpydlvwyhnsz eldemjhwo elkwftrvrz rcuyvxa hqjtff sdxi drhpwhtwlkggexmbx kkdeemtetkkqkwtqedejbtwlbdkaiqddwvrpycdookdzvbfzykcbneiqqvrpbokkofdrjufxndpiyvrjmtifidkbntpz jrxbdpoafyhmbtfmktq fsesbshuikqhrdvviculskiifihitpdmlyaadhqipvlmsbbwmxymbu nydoymfrkmrugdzxtuiqkqepbdtlnmswhkkmeimbfavvhfszmplk nrrzezjtzqxdvebmekdtgsbnmeqvgnletdbrnvohnsz eldeieihgjeumwfsuiuhkelmspxtvqvhqxlwslrcrvqajwlftoalrdrogjlppbb  irznp opbxa hq qrumksypbidm

In [204]:
assert decode(decrypt(cipher_text, k)) == plain_text

In [205]:
def frequency(elem, lst):
    return lst.count(elem)

def coincidence_index(lst, A=alphabet):
    sums = 0
    freqs = {}
    denom = len(lst)
    #if denom == 0 or denom == 1: return 0,{}
    for k,v in A.items(): freqs[k] = frequency(v, lst)
    ci = sum(v*(v-1) for v in freqs.values())/(denom*(denom-1))
    return ci, freqs

def mutual_coincidence_index(fst, snd, A=alphabet):
    fst_ci = coincidence_index(fst, A)[1]
    snd_ci = coincidence_index(snd, A)[1]
    return sum(fst_ci[k] * snd_ci[k] for k in A.keys())/(len(fst)*len(snd))
    #return sum((fst_ci[k]/m)*(snd_ci[k]/m) for k,v in A.items())

In [206]:
coincidence_index(encoded_plain_text)

(0.07593726979645456,
 {' ': 305,
  'a': 115,
  'b': 19,
  'c': 40,
  'd': 67,
  'e': 215,
  'f': 34,
  'g': 39,
  'h': 81,
  'i': 113,
  'j': 0,
  'k': 9,
  'l': 57,
  'm': 29,
  'n': 140,
  'o': 120,
  'p': 21,
  'q': 1,
  'r': 125,
  's': 93,
  't': 162,
  'u': 35,
  'v': 14,
  'w': 29,
  'x': 0,
  'y': 26,
  'z': 1})

In [207]:
coincidence_index(cipher_text)

(0.039283963688410486,
 {' ': 81,
  'a': 44,
  'b': 87,
  'c': 33,
  'd': 98,
  'e': 86,
  'f': 62,
  'g': 45,
  'h': 67,
  'i': 78,
  'j': 46,
  'k': 91,
  'l': 90,
  'm': 92,
  'n': 47,
  'o': 45,
  'p': 76,
  'q': 95,
  'r': 83,
  's': 55,
  't': 92,
  'u': 56,
  'v': 72,
  'w': 61,
  'x': 60,
  'y': 91,
  'z': 57})

In [208]:
def spread(message, block_length):
    return [message[i:i+block_length] for i in range(0, len(message), block_length)]
    
def col(spreaded, c, joiner=""):
    column = [lst[c] if c < len(lst) else None for lst in spreaded]
    ready = list(filter(lambda i: i is not None, column))
    return joiner.join(ready) if joiner is not None else ready

In [209]:
spreaded = spread(cipher_text, 12)
spreaded

[[23, 24, 12, 1, 18, 4, 12, 1, 11, 26, 19, 21],
 [18, 3, 7, 16, 23, 11, 5, 15, 14, 5, 4, 7],
 [4, 1, 7, 13, 26, 4, 24, 1, 11, 16, 20, 3],
 [8, 16, 10, 16, 3, 16, 11, 10, 26, 1, 17, 20],
 [4, 5, 26, 1, 3, 26, 10, 15, 10, 15, 19, 3],
 [18, 3, 14, 13, 15, 17, 10, 25, 23, 23, 18, 0],
 [16, 21, 7, 16, 10, 19, 24, 1, 10, 26, 14, 6],
 [17, 9, 25, 10, 22, 3, 19, 3, 18, 24, 13, 12],
 [9, 7, 22, 14, 17, 16, 6, 17, 22, 24, 13, 16],
 [3, 25, 21, 1, 23, 2, 20, 24, 13, 1, 26, 13],
 [17, 20, 7, 17, 8, 26, 14, 15, 24, 26, 4, 21],
 [17, 16, 14, 6, 8, 24, 20, 24, 8, 23, 0, 26],
 [7, 16, 16, 4, 22, 23, 20, 24, 14, 23, 19, 20],
 [8, 7, 12, 1, 13, 16, 10, 15, 10, 8, 8, 5],
 [8, 7, 8, 19, 15, 11, 20, 14, 5, 1, 13, 5],
 [24, 7, 12, 19, 9, 11, 5, 25, 1, 1, 26, 16],
 [4, 7, 16, 15, 24, 12, 6, 14, 10, 24, 2, 20],
 [12, 21, 2, 6, 3, 16, 6, 3, 2, 23, 12, 0],
 [21, 21, 7, 5, 26, 3, 12, 19, 24, 3, 20, 21],
 [22, 24, 12, 5, 17, 20, 12, 10, 6, 24, 18, 12],
 [17, 4, 0, 1, 26, 25, 19, 15, 24, 3, 11, 13],
 [17, 20, 7, 2

In [210]:
col(spreaded,2,None)

[12,
 7,
 7,
 10,
 26,
 14,
 7,
 25,
 22,
 21,
 7,
 14,
 16,
 12,
 8,
 12,
 16,
 2,
 7,
 12,
 0,
 7,
 0,
 0,
 23,
 25,
 14,
 19,
 12,
 15,
 7,
 13,
 1,
 16,
 19,
 7,
 21,
 0,
 8,
 12,
 7,
 0,
 23,
 7,
 22,
 21,
 12,
 7,
 7,
 22,
 25,
 9,
 7,
 12,
 19,
 1,
 22,
 21,
 16,
 9,
 12,
 26,
 12,
 18,
 21,
 8,
 8,
 12,
 10,
 10,
 7,
 7,
 25,
 12,
 21,
 7,
 7,
 7,
 7,
 0,
 14,
 25,
 26,
 12,
 0,
 7,
 12,
 26,
 15,
 12,
 0,
 12,
 7,
 8,
 25,
 21,
 21,
 1,
 7,
 21,
 25,
 3,
 7,
 12,
 12,
 7,
 21,
 16,
 22,
 25,
 7,
 13,
 25,
 7,
 0,
 22,
 9,
 22,
 25,
 15,
 26,
 21,
 13,
 22,
 22,
 3,
 19,
 12,
 7,
 3,
 8,
 12,
 12,
 0,
 1,
 9,
 11,
 11,
 26,
 2,
 9,
 12,
 22,
 3,
 26,
 15,
 7,
 12,
 26,
 7,
 12,
 21,
 21,
 7,
 7,
 12,
 0,
 0]

In [211]:
def analyze(cipher_text):
    res = {}
    # we discard the case where the key length equals the
    # length of the cipher text, since it is the case of 
    # OneTimePad chiper, which is unbreakable!
    for d in range(2, len(cipher_text)):
        spreaded = spread(cipher_text, d)
        res[d] = []
        for c in range(d):
            column = col(spreaded, c, joiner=None)
            if len(column) < 2: continue
            mci = coincidence_index(column)[0]
            res[d].append(mci)
    return res

def guess_key_length(analysis):
    candidates = {}
    for k,v in analysis.items():
        cs = list(filter(lambda i: i > .06, v))
        if cs and len(cs) > k/2: candidates[k] = cs
    return candidates

In [212]:
analysis = analyze(cipher_text)

In [213]:
guess_key_length(analysis)

{12: [0.0759493670886076,
  0.08248004515036685,
  0.08876884624687575,
  0.07861001370636136,
  0.0708699508183504,
  0.07941627025719585,
  0.07839294463498285,
  0.07251347378735913,
  0.0720235178833905,
  0.07618814306712396,
  0.07235015515270292,
  0.06663400293973543],
 24: [0.0688088283024992,
  0.07400194741966894,
  0.09087958455047063,
  0.07367737747484583,
  0.07075624797143784,
  0.06491398896462187,
  0.08179162609542356,
  0.06134371957156767,
  0.0688088283024992,
  0.06426484907497566,
  0.07530022719896137,
  0.06815968841285297,
  0.08503732554365466,
  0.08471275559883155,
  0.0863356053229471,
  0.07659850697825381,
  0.06751054852320675,
  0.09607270366764038,
  0.07958707958707958,
  0.08658008658008658,
  0.07392607392607392,
  0.08824508824508824,
  0.06859806859806859,
  0.06293706293706294],
 36: [0.07474600870827286,
  0.1204644412191582,
  0.08127721335268505,
  0.08708272859216255,
  0.06458635703918723,
  0.06386066763425254,
  0.08708272859216255,
  0.

In [224]:
def normalize_heap(heap):
    
    def normalize(pair): 
        p,d = pair
        return d, 1-p
    
    return list(map(normalize, heap))

def find_key(cipher_text, key_length, threshold=.06):
    key = {}
    spreaded = spread(cipher_text, key_length)
    for c in range(key_length):
        column = col(spreaded, c, joiner=None)
        for v in range(0,n):
            shifted = list(map(lambda x: (x-v)%n, column))
            key[c,v]={}
            for a in range(key_length):
                #if a == c and v == 0: continue
                another = col(spreaded, a, joiner=None)
                mci = mutual_coincidence_index(shifted, another)
                key[c,v].update({a:mci})
    
    filtered = {}
    for (c, v), cols_mci in key.items():
        if c not in filtered: filtered.update({c:set()})
        for a, mci in cols_mci.items():
            if mci > threshold:# and v not in filtered[c]: 
                #heappush(filtered[c], (1-mci,v))
                filtered[c].add(v)
    
    #return {k:normalize_heap(v) for k,v in filtered.items()}
    return filtered

In [225]:
key_combinations = find_key(cipher_text, key_length=12)
key_combinations

{0: {0, 2, 4, 7, 11, 13, 14, 18, 19, 20, 23},
 1: {0, 4, 5, 6, 9, 13, 15, 17, 20, 24, 26},
 2: {0, 4, 6, 8, 11, 15, 17, 18, 22, 23, 24},
 3: {0, 2, 5, 9, 11, 12, 16, 17, 18, 21, 25},
 4: {0, 1, 5, 6, 7, 10, 14, 16, 18, 21, 25},
 5: {0, 1, 4, 8, 10, 12, 15, 19, 21, 22, 26},
 6: {0, 2, 3, 7, 8, 9, 12, 16, 18, 20, 23},
 7: {0, 3, 7, 9, 11, 14, 18, 20, 21, 25, 26},
 8: {0, 3, 7, 9, 11, 14, 18, 20, 21, 25, 26},
 9: {0, 4, 6, 7, 11, 12, 13, 16, 20, 22, 24},
 10: {0, 3, 7, 9, 10, 14, 15, 16, 19, 23, 25},
 11: {0, 1, 2, 5, 9, 11, 13, 16, 20, 22, 23}}

In [226]:
def discover_key(key_combinations, cipher_text, threshold=.06, sols=1):
    candidates = []
    shifts = [[]] * len(key_combinations)
    for i, vs in key_combinations.items(): shifts[i] = vs
    for key in product(*shifts):
        decrypted = decrypt(cipher_text, key)
        ci = coincidence_index(decrypted)[0]
        if ci > threshold: 
            heappush(candidates, (1-ci, decode(key), decrypted))
            if len(candidates) >= sols: break
    return candidates

In [227]:
# it takes a long time...
discovered = discover_key(key_combinations, cipher_text)

KeyboardInterrupt: 