In [12]:
## Imports

import PyPDF2
from numpy import zeros
from os import listdir
from os.path import join
from tqdm import tqdm
from math import exp, log
from random import shuffle, uniform, sample, randint

In [2]:
## Methods

def clean_text(text, chars, header):
    '''Filters an input text to only contain lowercase letters and the space 
    character.'''
    text = text.lower()
    if header:
        text = ' '.join(text.split(':')[1:])
    text = text.replace('\n', ' ')
    text = text.replace(' .', '.')
    cleaned = filter(chars.__contains__, text)
    return ''.join(cleaned) + ' '

def get_text(path, char_dict, header=False):
    '''Reads in and cleans the text from a .pdf or .txt file.'''
    text_type = path[-3:]
    if text_type == 'pdf':
        doc = PyPDF2.PdfReader(path)
        text = ''
        for page in doc.pages:
            text += clean_text(page.extract_text(), \
                               list(char_dict.keys()), header)
    elif text_type == 'txt':
        with open(path, 'r') as file:
            text = clean_text(file.read(), list(char_dict.keys()), header)
    else:
        raise RuntimeError('Import file must be of type .pdf or .txt!')
    return text

def get_q(text, char_dict):
    '''Builds Q and P from a given text and char_dict using the digram model.'''
    char_len = len(list(char_dict.keys()))
    q = zeros((char_len, char_len))
    p = zeros((char_len))
    p[char_dict[text[0]]] += 1
    for i in range(1, len(text)):
        q[char_dict[text[i - 1]]][char_dict[text[i]]] += 1
        p[char_dict[text[i]]] += 1
    for i in range(len(q)):
        for j in range(len(q[i])):
            q[i][j] = max(q[i][j], 1)
        q[i] = q[i] / sum(q[i])
    return q, p / len(text)

def new_perm(permutation, permutations, var=0):
    '''Computes a new random permutation. Allows for some random variance in the
    number of changes made for the new permutations.'''
    changes = permutations
    if var != 0:
        rand = randint(0, 100)
        if rand >= 90:
            changes = permutations + var
        elif rand <= 10:
            changes = permutations - var
        else:
            changes = permutations
    for _ in range(changes):
        ab = sample(range(0, len(permutation) - 1), 2)
        a, b = ab[0], ab[1]
        permutation = list(permutation)
        permutation[a], permutation[b] = permutation[b], permutation[a]
        new_permutation = ''.join(permutation)
    return new_permutation

def transition(perm, char_dict, encoded, display_amount=None):
    '''Computes transition on a given text.'''
    data = ''
    display_amount = len(encoded) if display_amount == None else display_amount
    for i in range(display_amount):
        data += perm[char_dict[encoded[i]]]
    return data

def energy_func(perm1, perm2, char_dict, encoded, q, p):
    '''Computes the energy delta on a permuted texts.'''
    trans1 = transition(perm1, char_dict, encoded)
    trans2 = transition(perm2, char_dict, encoded)
    delta = log(p[char_dict[trans1[0]]]) - log(p[char_dict[trans2[0]]])
    for j in range(1, len(encoded)):
        delta -= log(q[char_dict[trans1[j-1]]][char_dict[trans1[j]]]) - \
                        log(q[char_dict[trans2[j-1]]][char_dict[trans2[j]]])
    return delta

In [3]:
## Get q and p
char_dict = {x: i for i, x in enumerate(' abcdefghijklmnopqrstuvwxyz')}
X0 = ''
text_dir = './text_data'
decode_dir = './decoded_text'
print('Building q and p...')
for filename in tqdm(listdir(text_dir)):
    try:
        X0 += get_text(join(text_dir, filename), char_dict)
    except RuntimeError as e:
        None
q, p = get_q(X0, char_dict)

Building q and p...


100%|██████████| 12/12 [01:21<00:00,  6.81s/it]


In [14]:
## Run

chars = list(char_dict.keys())
shuffle(chars)
perm = ''.join(chars)

# hyper parameters
beta = 0.65 # tunable hyperparameter (best for all = 0.63)
permutations = 2 # number of times text is permuted before scoring
var = 0
convergence_delta = 2000 # max number of worse iterations before stopping
max_epochs = 15000 # maximum number of iterations to run MCMC

# specific_text = None
specific_text='student_20_text1.txt'
# specific_text='student_219_text2.txt'
# specific_text='student_102_text3.txt'

verbose=True
save = True

print('Running mcmc on encoded texts...')
encoded_dir = './encoded_text'
for filename in tqdm(listdir(encoded_dir), desc='File:'):
    print(f'File: {filename}...')
    if specific_text != None and filename != specific_text:
        continue
    header = filename.split('.')[0]
    try:
        encoded = get_text(join(encoded_dir, filename), char_dict, True)
        convergence_counter = 0
        for i in range(max_epochs):
            curr = new_perm(perm, permutations)
            e_delta = energy_func(curr, perm, char_dict, encoded, q, p)
            if e_delta < 0 or uniform(0, 1) < exp((-beta) * e_delta):
                perm = curr
                if verbose:
                    print(f'{i}: ' + \
                            transition(perm, char_dict, encoded, 80))
                convergence_counter = 0
            else:
                convergence_counter += 1
                if convergence_counter >= convergence_delta:
                    break
        print(f'Permutation: {perm}')
        print('Decoded text: \n')
        print(transition(perm, char_dict, encoded, 80))
        # print('Enter any key to continue: ')
        # _ = input()
        ## Save
        if save:
            with open(join(decode_dir, f'{header}_decoded.txt'), 'w') as f:
                f.write(transition(perm, char_dict, encoded))
            print(f'Saved {header}_decoded.txt')
    except RuntimeError as e:
        None


Running mcmc on encoded texts...


File::   0%|          | 0/3 [00:00<?, ?it/s]

File: student_219_text2.txt...
File: student_20_text1.txt...
0: bfkpxbgkpdipnfkpecbnofvrptfbhkpofbofbvd pnqzehkgprjgpihdqjgkckgpredqnpbjpnfkpzqg
2: bfkoxbgkodionfkoecbnpfvrotfbhkopfbpfbvd onqzehkgorjgoihdqjgkckgoredqnobjonfkozqg
3: bfko bgkodionfkoexbnpfvrotfbhkopfbpfbvdconqzehkgorjgoihdqjgkxkgoredqnobjonfkozqg
7: bf okbg ohionf oexbnpfvrotfbd opfbpfbvhconqzed gorjgoidhqjg x gorehqnobjonf ozqg
12: bl okbg ohionl oexbnplvroclbd oplbplbvhtonqzed gorjgoidhqjg x gorehqnobjonl ozqg
16: wl oewg ohionl okxwnplvroclwd oplwplwvhtonqzkd gorjgoidhqjg x gorkhqnowjonl ozqg
19: wl oewg ohronl ouxwnplvioclwd oplwplwvhtonqzud goijgordhqjg x goiuhqnowjonl ozqg
22: wj oewg olronj ouxwnpjviocjwd opjwpjwvltonqzud goihgordlqhg x goiulqnowhonj ozqg
23: wj oewg olronj ouxwnpjviocjwd opjwpjwvltonbzud goihgordlbhg x goiulbnowhonj ozbg
24: wj oewg olronj ouxwnpjviocjwd opjwpjwvltonbsud goihgordlbhg x goiulbnowhonj osbg
35: wj oewg olronj ouswnpjdiocjwv opjwpjwdltonbxuv goihgorvlbhg s goiulbnowhonj oxbg
36: wj o

File::  33%|███▎      | 1/3 [00:06<00:12,  6.47s/it]


KeyboardInterrupt: 