# Question 2
Find the most common digraphs in a Latin-based alphabet

In [None]:
import nltk
from nltk.corpus import gutenberg
from nltk import bigrams
import pandas as pd

In [None]:
files = gutenberg.fileids()
files

In [None]:
# Collect the Gutenberg file ids that belong to Shakespeare and echo each one.
file_shakespeare = [name for name in files if 'shakespeare' in name]
for name in file_shakespeare:
    print(name)

In [None]:
# words = gutenberg.words(file_shakespeare)
# Use every Gutenberg file rather than only the Shakespeare subset selected above.
words = gutenberg.words(files)
len(words)

In [None]:
# Lower-case the concatenated corpus and keep only the 26 Latin letters,
# discarding digits, punctuation and any other symbol.
latin = 'abcdefghijklmnopqrstuvwxyz'
chars = ''.join(words).lower()
alphabets = [letter for letter in chars if letter in latin]

In [None]:
alphabets

In [None]:
# Pair every letter with its successor. The words were joined without a
# separator, so pairs that straddle a word boundary are included as well.
digraphs = list(bigrams(alphabets))
# Distinct digraphs; not referenced again in the cells shown here.
digraphs_set = set(digraphs)
digraphs

In [None]:
# Count every digraph; most_common() without an argument returns all
# (digraph, count) pairs ordered from most to least frequent.
fdist = nltk.FreqDist(digraphs)
common_digraphs = fdist.most_common()

In [None]:
# Turn the (digraph, count) pairs into a two-column table. The "Probability"
# column holds each digraph's share of all counted digraphs as a percentage.
n_digraphs = len(common_digraphs)
common_digraph_strings = [''.join(pair) for pair, _count in common_digraphs]
common_digraph_freqs = [count for _pair, count in common_digraphs]
sum_freqs = sum(common_digraph_freqs)
common_digraph_probs = [100 * f / sum_freqs for f in common_digraph_freqs]
data = {
    "Digraphs": common_digraph_strings,
    "Probability": common_digraph_probs,
}
df = pd.DataFrame(data)
df

In [None]:
# Save the table for later reuse (load_digraph_lookup reads this file back).
# With to_csv's default index=True the row index is written as an unnamed first column.
df.to_csv('English_digraph_probabilities.csv')

# Question 3
Find the most common trigraphs in a non-Latin-based language (Hindi)

In [None]:
import nltk
from nltk.corpus import gutenberg
from nltk.corpus import indian
from nltk import trigrams

In [None]:
words = indian.words('hindi.pos')
len(words)

In [None]:
chars = ''.join(words)
chars

In [None]:
# Distinct characters of the corpus, in ascending code-point order.
charset = sorted(set(chars))

In [None]:
# Drop the first 9 and the last 11 entries of the sorted character set.
# NOTE(review): the offsets are hard-coded; presumably they bracket the
# punctuation/digit/symbol code points of this particular corpus — verify
# against `charset` whenever the corpus changes.
chars_without_punctuations = charset[9:-11]
# Single string of the retained characters, used for membership tests below.
nonlatinchars = ''.join(chars_without_punctuations)
nonlatinchars

In [None]:
# Keep only the characters that survived the punctuation filter above.
alphabets = [symbol for symbol in chars if symbol in nonlatinchars]

In [None]:
trigraphs = list(trigrams(alphabets))
trigraphs

In [None]:
fdist_tri = nltk.FreqDist(trigraphs)
fdist_tri.most_common()

# Question 4
Use a language dictionary to break a single transposition cipher

In [None]:
Y = 'AKPKNLLALENLLNASYBWDYJAOONMODYROAHU'

In [None]:
import nltk
from nltk.corpus import gutenberg
from math import floor, ceil
import numpy as np

In [None]:
# List the Gutenberg corpus and keep (and echo) only the Shakespeare files.
files = gutenberg.fileids()
file_shakespeare = [name for name in files if 'shakespeare' in name]
for name in file_shakespeare:
    print(name)

In [None]:
words = gutenberg.words(file_shakespeare)
len(words)

In [None]:
# Lower-case first, then de-duplicate. set(words) is case-sensitive, so 'The'
# and 'the' would otherwise both survive as 'the' and be counted twice when
# the vocabulary is used to score candidate plaintexts. dict.fromkeys keeps
# the first-seen order, which also makes the list reproducible across runs.
vocab = list(dict.fromkeys(word.lower() for word in words))
len(vocab)

In [None]:
def transpose_text(Y, key):
    """Undo a simple columnar transposition for a trial column count.

    The ciphertext is cut into `key` consecutive columns of
    ``ceil(len(Y) / key)`` characters each (trailing columns may be shorter or
    empty), and the candidate plaintext is read off row by row.

    Args:
        Y: Ciphertext string.
        key: Number of columns to try; must be a positive integer.

    Returns:
        The candidate plaintext, lower-cased.

    Raises:
        ValueError: If `key` is smaller than 1.
    """
    if key < 1:
        raise ValueError("key must be a positive integer")

    rows = ceil(len(Y) / key)
    # Slicing past the end simply yields shorter (or empty) columns, so no
    # padding is required and ragged layouts cannot break the read-out.
    columns = [Y[k * rows:(k + 1) * rows] for k in range(key)]

    x_trial = []
    for r in range(rows):
        for column in columns:
            if r < len(column):  # skip cells that a short column lacks
                x_trial.append(column[r])
    return ''.join(x_trial).lower()

In [None]:
# Try every column count from 1 to 9 and report how many vocabulary entries
# occur as substrings of the corresponding candidate plaintext.
for key in range(1, 10):
    X_key = transpose_text(Y, key)
    word_cnt = sum(1 for word in vocab if word in X_key)
    print('For key = {}, Word count = {}'.format(key, word_cnt))

# Question 5
Kasiski's method for breaking Vigenère polyalphabetic cipher
Hint: The first step should be producing a candidate list of numbers that could be the key
length. Then, assuming that the underlying cipher is a Vigenère polyalphabetic
cipher, attempt to break the ciphertext into multiple ciphertexts and perform a
frequency analysis on each. The program should produce a reasonable guess to a
certain selection of keys, as well as accompanying plaintexts. Use of a dictionary
file is encouraged to increase the precision.

In [109]:
from nltk.corpus import PlaintextCorpusReader
import numpy as np
import pandas as pd
import os
import nltk
from nltk.corpus import gutenberg
from nltk import bigrams, trigrams, ngrams

In [85]:
def alphabet_positions():
    """Build the letter <-> index tables for the lowercase Latin alphabet.

    Returns:
        A pair ``(alpha_num, num_alpha)``: `alpha_num` maps each letter
        'a'..'z' to its position 0..25, and `num_alpha` is the inverse map.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    alpha_num = {letter: index for index, letter in enumerate(letters)}
    num_alpha = dict(enumerate(letters))
    return alpha_num, num_alpha

In [86]:
# Function to encrypt using Vigenère cipher
def encrypt_viginere(X, key):
    """Encrypt `X` with the Vigenère cipher under `key`.

    Each plaintext letter is shifted forward (mod 26) by the key letter at the
    same position, with the key repeated cyclically over the text.

    Args:
        X: Plaintext made only of the lowercase letters 'a'..'z'.
        key: Non-empty key made only of the lowercase letters 'a'..'z'.

    Returns:
        The ciphertext, the same length as `X`.

    Raises:
        ValueError: If `key` is empty.
        KeyError: If `X` or `key` contains a character outside 'a'..'z'.
    """
    if not key:
        raise ValueError("key must be a non-empty string")

    # Find alphabets' numerical positions and its inverse
    alpha_num, num_alpha = alphabet_positions()

    # Convert key into numeric form
    k_num = [alpha_num[k] for k in key]
    nkey = len(k_num)

    # The key letter for position i is k_num[i % nkey], so the text needs no
    # padding to a multiple of the key length.
    return ''.join(
        num_alpha[(alpha_num[c] + k_num[i % nkey]) % 26]
        for i, c in enumerate(X)
    )

In [87]:
# Function to decrypt using Vigenère cipher
def decrypt_viginere(Y, key):
    """Decrypt `Y` with the Vigenère cipher under `key`.

    Each ciphertext letter is shifted backward (mod 26) by the key letter at
    the same position, with the key repeated cyclically over the text.

    Args:
        Y: Ciphertext made only of the lowercase letters 'a'..'z'.
        key: Non-empty key made only of the lowercase letters 'a'..'z'.

    Returns:
        The plaintext, the same length as `Y`.

    Raises:
        ValueError: If `key` is empty.
        KeyError: If `Y` or `key` contains a character outside 'a'..'z'.
    """
    if not key:
        raise ValueError("key must be a non-empty string")

    # Find alphabets' numerical positions and its inverse
    alpha_num, num_alpha = alphabet_positions()

    # Convert key into numeric form
    k_num = [alpha_num[k] for k in key]
    nkey = len(k_num)

    # The key letter for position i is k_num[i % nkey], so the text needs no
    # padding to a multiple of the key length.
    return ''.join(
        num_alpha[(alpha_num[c] - k_num[i % nkey]) % 26]
        for i, c in enumerate(Y)
    )

In [88]:
def key_schedule(nkey):
    """Enumerate every lowercase key of length `nkey`.

    Args:
        nkey: Key length.

    Returns:
        A list of all 26**nkey keys in lexicographic order ('aa…a' first).
        The list is fully materialised, so memory grows exponentially with
        `nkey`.
    """
    import itertools
    _, num_alpha = alphabet_positions()
    letters = [num_alpha[i] for i in range(26)]
    return [''.join(combo) for combo in itertools.product(letters, repeat=nkey)]

In [89]:
def load_digraph_lookup():
    """Load the digraph table from 'English_digraph_probabilities.csv'.

    Returns:
        A ``defaultdict(float)`` mapping each entry of the "Digraphs" column
        to its "Probability" value; digraphs absent from the file read as 0.0.
    """
    import pandas as pd
    from collections import defaultdict
    table = pd.read_csv('English_digraph_probabilities.csv')
    digraph_lookup = defaultdict(float)
    for digraph, probability in zip(table["Digraphs"], table["Probability"]):
        digraph_lookup[digraph] = probability
    return digraph_lookup

In [90]:
# Frequency analysis
def freq_anal(Y, nkey):
    """Score every key of length `nkey` by how English-like its decryption is.

    For each trial key, `Y` is decrypted and the reference values of the ten
    most frequent digraphs of the result are summed. Digraphs missing from the
    reference table contribute 0.

    Args:
        Y: Ciphertext of lowercase letters.
        nkey: Key length to search exhaustively.

    Returns:
        A dict mapping each trial key to its score.
    """
    import nltk
    from nltk import bigrams
    digraph_lookup = load_digraph_lookup()
    results = {}
    for trial_key in key_schedule(nkey):
        candidate = decrypt_viginere(Y, trial_key)
        top_pairs = nltk.FreqDist(bigrams(candidate)).most_common(10)
        results[trial_key] = sum(
            digraph_lookup[''.join(pair)] for pair, _count in top_pairs
        )
    return results

In [91]:
def find_gcf(list):
    """Return the greatest common factor of all integers in `list`.

    The values are folded pairwise with ``math.gcd``. A single-element input
    is returned unchanged, and an empty input raises ``TypeError``.
    """
    from functools import reduce
    from math import gcd
    return reduce(gcd, list)

In [92]:
def guess_keylength(n_ngram, text=None):
    """Kasiski step: derive key-length candidates from repeated n-grams.

    For each of the (up to) ten most frequent n-grams that occur more than
    once, the distances between consecutive occurrences are reduced to their
    greatest common factor, which becomes one candidate.

    Args:
        n_ngram: Length of the n-grams to look for.
        text: Ciphertext to analyse. Defaults to the notebook-level `Y`, so
            existing ``guess_keylength(n)`` calls keep working.

    Returns:
        A list of integer candidates, one per examined n-gram. The list is
        empty when no n-gram of this length repeats.
    """
    if text is None:
        text = Y  # fall back to the notebook-level ciphertext

    ngraphs = list(ngrams(text, n_ngram))
    fdist_n = nltk.FreqDist(ngraphs)
    repeated = [gram for gram, count in fdist_n.most_common() if count > 1]

    guesses = []
    for gram in repeated[:10]:
        positions = [i for i, candidate in enumerate(ngraphs) if candidate == gram]
        distances = [later - earlier
                     for earlier, later in zip(positions, positions[1:])]
        # A repeated n-gram has at least two positions, so `distances` is
        # never empty; find_gcf returns a lone distance unchanged.
        guesses.append(find_gcf(distances))
    return guesses

In [93]:
def find_factors(x):
    """Return all positive divisors of `x` in ascending order.

    Every integer from 1 to `x` is tested, so the result is empty when `x`
    is smaller than 1.
    """
    return [candidate for candidate in range(1, x + 1) if x % candidate == 0]

In [65]:
# Load plaintext 'X': read the first file found under corpus_root, lower-case
# it and keep only the 26 Latin letters so it can be fed to the encryption function.
latin = 'abcdefghijklmnopqrstuvwxyz'
corpus_root = 'C:/Users/Usuario/Python/Cryptanalysis/Corpus'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
fname = wordlists.fileids()[0]
even_tokens = wordlists.words(os.path.join(corpus_root, fname))
chars = ''.join(even_tokens).lower()
X_disjoint = [c for c in chars if c in latin]
X = ''.join(X_disjoint)
print(X)



In [83]:
len(X)

84136

In [80]:
# Given
key = 'romeo'

# Encrypt using Vigenère cipher
Y = encrypt_viginere(X, key)

In [81]:
# Collect key-length candidates for n-gram lengths 1, 2, 3, ... until a length
# is reached at which no n-gram repeats. Each length is analysed exactly once;
# the scan over the ciphertext is expensive, so its result is reused for both
# the stop test and the accumulation.
dist_gcfs = []
n_ngram = 1
while True:
    guesses = guess_keylength(n_ngram)
    if not guesses:
        break
    dist_gcfs.extend(guesses)
    n_ngram += 1
# Tally how often each candidate distance was proposed.
fdist_dist_gcfs = nltk.FreqDist(dist_gcfs)
fdist_dist_gcfs.most_common()

[(65, 207),
 (5, 144),
 (230, 81),
 (1, 28),
 (55, 6),
 (25, 3),
 (15, 3),
 (9525, 2),
 (10, 1),
 (50610, 1)]

In [97]:
# key_scores = freq_anal(Y[:300], 3)
# key_scores
# df = pd.read_csv("trial_key_scores.csv")

In [100]:
# key_trials = df["key_trial"]
# key_trial_scores = df["key_score"]
# key_scores = {}
# for i, key_trial in enumerate(key_trials):
#     key_scores[key_trial] = key_trial_scores[i]

In [None]:
# ranked_keys = {}
# for w in sorted(key_scores, key=key_scores.get, reverse=True):
#     ranked_keys[w] = key_scores[w]

In [None]:
# key_trials = list(ranked_keys.keys())[:200]
# key_trial_scores = []
# for w in key_trials:
#     key_trial_scores.append(ranked_keys[w])
# data = {"key_trial": key_trials,
#        "key_score": key_trial_scores}
# df = pd.DataFrame(data)
# df.to_csv("ranked_key_scores_200.csv")

In [111]:
# Keep the first 200 keys of `ranked_keys` (list() over a dict yields its keys
# in insertion order).
# NOTE(review): `ranked_keys` is only assigned in a commented-out cell above,
# so on a fresh run this line raises NameError unless that cell is restored.
top_keys = list(ranked_keys)[:200]
top_keys

['romoi',
 'romco',
 'roqee',
 'romoo',
 'romfc',
 'romjt',
 'romso',
 'roqeo',
 'romot',
 'romec',
 'romoc',
 'rodoi',
 'romeo',
 'romce',
 'romsn',
 'roqeu',
 'rodho',
 'romsi',
 'romwi',
 'romdi',
 'romoa',
 'rovfo',
 'roqro',
 'romjp',
 'romwc',
 'rodro',
 'romgv',
 'romne',
 'rommo',
 'roddo',
 'roqzo',
 'romom',
 'romwo',
 'romjb',
 'romze',
 'rocco',
 'romfp',
 'roxdp',
 'roqek',
 'romfo',
 'roqey',
 'rodlp',
 'rodeo',
 'romro',
 'romok',
 'rohuo',
 'rodjo',
 'romow',
 'romgt',
 'romsk',
 'rocwo',
 'rolfo',
 'romjf',
 'roqio',
 'romnt',
 'roijp',
 'rodco',
 'romus',
 'rodwo',
 'rozfo',
 'romsg',
 'rokiv',
 'rodio',
 'romfs',
 'roqyo',
 'rowfo',
 'rovzo',
 'romvi',
 'romob',
 'romyt',
 'roqso',
 'romna',
 'romsv',
 'roqle',
 'roqwo',
 'rodpo',
 'rozso',
 'roduo',
 'romlc',
 'roqco',
 'rodfo',
 'romek',
 'rodiu',
 'roqew',
 'romhc',
 'romwf',
 'romon',
 'romde',
 'romwl',
 'rodqo',
 'romjx',
 'romzo',
 'romte',
 'romge',
 'rojco',
 'romfx',
 'rodvo',
 'romto',
 'rodcc',
 'romui',


In [110]:
# Rebuild the Shakespeare vocabulary used to score candidate plaintexts.
files = gutenberg.fileids()
file_shakespeare = [name for name in files if 'shakespeare' in name]
for name in file_shakespeare:
    print(name)
words = gutenberg.words(file_shakespeare)
# NOTE(review): set(words) is case-sensitive, so differently-cased spellings of
# the same word each contribute an identical lower-case entry to vocab.
vocab = [token.lower() for token in set(words)]
len(vocab)

shakespeare-caesar.txt
shakespeare-hamlet.txt
shakespeare-macbeth.txt


8960

In [118]:
# Decrypt the first 500 ciphertext letters with each of the 20 leading
# candidate keys and count how many vocabulary entries occur as substrings
# of the result.
Y_sec = Y[:500]
performance = {}
for key in top_keys[:20]:
    X_key = decrypt_viginere(Y_sec, key)
    word_cnt = sum(1 for word in vocab if word in X_key)
    performance[key] = word_cnt
    print('For key = {}, Word count = {}'.format(key, word_cnt))

For key = romoi, Word count = 154
For key = romco, Word count = 195
For key = roqee, Word count = 133
For key = romoo, Word count = 179
For key = romfc, Word count = 150
For key = romjt, Word count = 139
For key = romso, Word count = 186
For key = roqeo, Word count = 194
For key = romot, Word count = 150
For key = romec, Word count = 189
For key = romoc, Word count = 152
For key = rodoi, Word count = 122
For key = romeo, Word count = 327
For key = romce, Word count = 166
For key = romsn, Word count = 150
For key = roqeu, Word count = 127
For key = rodho, Word count = 131
For key = romsi, Word count = 145
For key = romwi, Word count = 156
For key = romdi, Word count = 143


In [119]:
# Order the candidate keys from highest to lowest word count. The sort is
# stable, so keys with equal counts keep their original relative order.
ranked_key_perf = dict(
    sorted(performance.items(), key=lambda item: item[1], reverse=True)
)
ranked_key_perf

{'romeo': 327,
 'romco': 195,
 'roqeo': 194,
 'romec': 189,
 'romso': 186,
 'romoo': 179,
 'romce': 166,
 'romwi': 156,
 'romoi': 154,
 'romoc': 152,
 'romfc': 150,
 'romot': 150,
 'romsn': 150,
 'romsi': 145,
 'romdi': 143,
 'romjt': 139,
 'roqee': 133,
 'rodho': 131,
 'roqeu': 127,
 'rodoi': 122}