In [42]:
import random
from glob import glob
from collections import defaultdict

## 1. Preparation

In [43]:
wiki_path = "lab1/norm_wiki_sample.txt"

In [44]:
# Read the normalised Wikipedia sample into a list with one entry per line.
with open(wiki_path, "r") as wiki_file:
    wiki = list(wiki_file)

In [45]:
len(wiki)

1

In [46]:
wiki = wiki[0]

In [47]:
wiki[:100], len(wiki)

(' albert of prussia 17 may 1490 20 march 1568 was the last grand master of the teutonic knights who a',
 10788941)

## 2. Zeroth-order approximation 

In [48]:
def generator(lang, weights, length=100):
    """Draw `length` symbols from `lang`, with replacement, using `weights`.

    `weights` holds one relative weight per entry of `lang`. The result is a
    list containing the sampled symbols in the order they were drawn.
    """
    sample = random.choices(population=lang, weights=weights, k=length)
    return sample

In [49]:
# Source alphabet: the 26 lowercase letters followed by the space character.
lang = [chr(code) for code in range(ord("a"), ord("z") + 1)]
lang.append(" ")
print(lang)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ']


In [50]:
# Uniform weights: each of the 27 symbols in `lang` gets probability 1/27.
weights = [1/27] * 27

In [51]:
print(*generator(lang, weights), sep="")

 flwtnpkcawoyakcgmewmaclcmlrjvzhkvntxsnqnwnovqv hyeyuifzafnqgahnywkwqpkdhqzgnrynfz u qgoczsbqj xdnnz


In [52]:
def avg_length_of_word(lang, weights, length=10000000):
    """Print the average word length of a random sequence drawn from `lang`.

    A sequence of `length` symbols is sampled with `generator(lang, weights)`.
    A "word" is a non-empty run of non-space symbols that is terminated by a
    space; a trailing run that is cut off by the end of the sequence is not
    counted. The mean length of these words is printed (nothing is returned).

    Parameters:
        lang: symbols to sample from.
        weights: one relative weight per symbol in `lang`.
        length: number of symbols to sample (defaults to the original
            hard-coded sample size).

    If the sample contains no complete word (for example when the space
    symbol has zero weight), `nan` is printed instead of raising
    ZeroDivisionError.
    """
    run = 0      # length of the word currently being read
    total = 0    # summed length of all completed words
    count = 0    # number of completed words
    for symbol in generator(lang, weights, length=length):
        if symbol == " ":
            # Consecutive spaces produce empty runs, which are not words.
            if run != 0:
                total += run
                count += 1
            run = 0
        else:
            run += 1

    print(total / count if count else float("nan"))

avg_length_of_word(lang, weights)

27.023805448060457


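As expected, the average word length is about 27 characters: with 27 equally likely symbols a space occurs with probability 1/27, so the length of a non-empty run of symbols ending at a space is geometrically distributed with mean 27.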

## 3. Frequency of letters

In [53]:
glob("lab1/*")

['lab1/norm_wiki_sample.txt',
 'lab1/norm_romeo_and_juliet.txt',
 'lab1/norm_hamlet.txt']

In [54]:
# Relative frequency of every tracked symbol (space, a-z, 0-9) in each corpus file.
# The file list is resolved once so that the printed labels always belong to
# the counts next to them (glob gives no ordering guarantee between calls).
files = glob("lab1/*")
stats = []
for file in files:
    di = {" ": 0}
    di.update({chr(i): 0 for i in range(97, 123)})
    di.update({str(i): 0 for i in range(10)})
    with open(file, "r") as f:
        for line in f:
            for ch in line:
                key = ch.lower()
                # Symbols outside the table (e.g. a newline) are skipped
                # instead of aborting the whole cell with a KeyError.
                if key in di:
                    di[key] += 1

    stats.append(di)

for file, stat in zip(files, stats):
    print(f"Stats for {file}")
    norm = sum(stat.values())
    for key in stat:
        # An empty file has norm == 0; leave its counts at zero rather than
        # dividing by zero.
        if norm:
            stat[key] /= norm
        print(f"{key}: {stat[key]:.6f}")

    print("\n")

Stats for lab1/norm_wiki_sample.txt
 : 0.170592
a: 0.072099
b: 0.013456
c: 0.027571
d: 0.031610
e: 0.093536
f: 0.017618
g: 0.016283
h: 0.036466
i: 0.060955
j: 0.002128
k: 0.006031
l: 0.035055
m: 0.021529
n: 0.059656
o: 0.058116
p: 0.017077
q: 0.000853
r: 0.054323
s: 0.053081
t: 0.066296
u: 0.021310
v: 0.008546
w: 0.012854
x: 0.001634
y: 0.012443
z: 0.001291
0: 0.004675
1: 0.005870
2: 0.003481
3: 0.001765
4: 0.001607
5: 0.001651
6: 0.001528
7: 0.001531
8: 0.001923
9: 0.003560


Stats for lab1/norm_romeo_and_juliet.txt
 : 0.204069
a: 0.062207
b: 0.012872
c: 0.016187
d: 0.030488
e: 0.094896
f: 0.015761
g: 0.014222
h: 0.053920
i: 0.051868
j: 0.001247
k: 0.006629
l: 0.036273
m: 0.024853
n: 0.049176
o: 0.065640
p: 0.010939
q: 0.000513
r: 0.046730
s: 0.050913
t: 0.074353
u: 0.026368
v: 0.008184
w: 0.020101
x: 0.001018
y: 0.020149
z: 0.000237
0: 0.000000
1: 0.000103
2: 0.000063
3: 0.000024
4: 0.000000
5: 0.000000
6: 0.000000
7: 0.000000
8: 0.000000
9: 0.000000


Stats for lab1/norm_hamlet.txt


In [55]:
# Rank the symbols of stats[0] (the first file listed by glob, the Wikipedia
# sample in the run shown) from most to least frequent as (probability, symbol).
wiki_stats = sorted(
    ((prob, symbol) for symbol, prob in stats[0].items()),
    reverse=True,
)
wiki_stats

[(0.17059199786151394, ' '),
 (0.0935363350304724, 'e'),
 (0.07209938398958711, 'a'),
 (0.06629621943432631, 't'),
 (0.06095500939341498, 'i'),
 (0.05965627210307295, 'n'),
 (0.05811617655523373, 'o'),
 (0.05432303318740922, 'r'),
 (0.053081113336332086, 's'),
 (0.03646613694522938, 'h'),
 (0.03505543315140939, 'l'),
 (0.03160977523187864, 'd'),
 (0.027571009981424498, 'c'),
 (0.02152852629372985, 'm'),
 (0.021310247224449554, 'u'),
 (0.017617762484751748, 'f'),
 (0.017076930905452165, 'p'),
 (0.016282506318275353, 'g'),
 (0.013455630167965513, 'b'),
 (0.012853532149262843, 'w'),
 (0.012442741136502646, 'y'),
 (0.008546343890470808, 'v'),
 (0.006031361187349157, 'k'),
 (0.005869806869830876, '1'),
 (0.004674786895210568, '0'),
 (0.0035601269855864443, '9'),
 (0.003480693795619051, '2'),
 (0.002127734316092747, 'j'),
 (0.0019228022472270448, '8'),
 (0.0017645846798124117, '3'),
 (0.0016506717387739908, '5'),
 (0.001634080675758631, 'x'),
 (0.0016072939874265694, '4'),
 (0.00153147561007

This ranking mirrors the design of Morse code, in which frequently used characters (such as `e` and `t`) are given the shortest codes.

## 4. First-order approximation

In [56]:
# First-order source: symbols are drawn independently of each other, weighted
# by the relative frequencies stored in stats[0].
lang = list(stats[0].keys())
weights = list(stats[0].values())

print(*generator(lang, weights, length=200), sep="")

fbiam actlgvmeaarmse12oeilnu f plm c ca mrico nnsar mooeaaccsmcabcr n0  r s eor hndveostveaamonct cinig0rku ar2 unitin6boru tusaataai nsaamnbh rgiahne ate e h8ctiutn ta ioha oiltootheydgb hd0 aonoqclc


In [57]:
avg_length_of_word(lang, weights)

5.861373858274839


## 5. Conditional probability of letters

In [58]:
# Conditional distribution of the next character given that the previous
# character is one of `chars`.
#
# The corpus is named explicitly. This cell used to open `file`, the loop
# variable left over from the letter-frequency cell, so it failed on a fresh
# kernel unless that cell had been run and silently used whichever path glob
# happened to list last (lab1/norm_hamlet.txt in the listing shown earlier).
cond_text_path = "lab1/norm_hamlet.txt"
chars = ["a", "e"]
successors = [" "] + [chr(i) for i in range(97, 123)] + [str(i) for i in range(10)]

# One counter table per conditioning letter, keyed by the two-character pair
# "<letter><next char>".
stats = [{letter + nxt: 0 for nxt in successors} for letter in chars]
tables = dict(zip(chars, stats))

# A single pass over the text fills the tables of all conditioning letters.
prev = None
with open(cond_text_path, "r") as f:
    for line in f:
        for ch in line:
            # Both sides of the pair are lowercased so they are treated alike.
            ch = ch.lower()
            table = tables.get(prev)
            # Successors outside the table (e.g. a newline) are skipped
            # instead of raising KeyError.
            if table is not None and prev + ch in table:
                table[prev + ch] += 1

            prev = ch

for letter, stat in zip(chars, stats):
    print(f"Stats for {letter}")
    norm = sum(stat.values())
    for key in stat:
        # If the letter never occurs, norm is 0: keep the zero counts rather
        # than dividing by zero.
        if norm:
            stat[key] /= norm
        print(f"{key}: {stat[key]:.6f}")

    print("\n")

Stats for a
a : 0.066905
aa: 0.000000
ab: 0.010398
ac: 0.026048
ad: 0.036761
ae: 0.005252
af: 0.005882
ag: 0.017750
ah: 0.000735
ai: 0.034030
aj: 0.002206
ak: 0.026153
al: 0.068480
am: 0.036236
an: 0.205126
ao: 0.000000
ap: 0.012079
aq: 0.000000
ar: 0.106396
as: 0.078878
at: 0.148409
au: 0.011448
av: 0.039177
aw: 0.011133
ax: 0.001050
ay: 0.048104
az: 0.001365
a0: 0.000000
a1: 0.000000
a2: 0.000000
a3: 0.000000
a4: 0.000000
a5: 0.000000
a6: 0.000000
a7: 0.000000
a8: 0.000000
a9: 0.000000


Stats for e
e : 0.381699
ea: 0.067987
eb: 0.001081
ec: 0.014665
ed: 0.028925
ee: 0.039603
ef: 0.007569
eg: 0.003379
eh: 0.001352
ei: 0.010340
ej: 0.000135
ek: 0.001081
el: 0.042373
em: 0.019531
en: 0.090356
eo: 0.001757
ep: 0.008988
eq: 0.001081
er: 0.127256
es: 0.068054
et: 0.040752
eu: 0.003176
ev: 0.011354
ew: 0.005339
ex: 0.009934
ey: 0.012029
ez: 0.000203
e0: 0.000000
e1: 0.000000
e2: 0.000000
e3: 0.000000
e4: 0.000000
e5: 0.000000
e6: 0.000000
e7: 0.000000
e8: 0.000000
e9: 0.000000




## 6. Approximations based on Markov sources 

In [59]:
def generate_input(order, text_path):
    """Count every n-gram of length `order + 1` in the text at `text_path`.

    The file is scanned character by character with a sliding window that is
    carried across line boundaries, so every character read from the file
    (including any line terminators) takes part in the n-grams.

    Parameters:
        order: context length of the Markov source; n-grams have
            `order + 1` characters (order 0 counts single characters).
        text_path: path of the text file to scan.

    Returns:
        A `defaultdict(int)` mapping each n-gram to its number of occurrences;
        looking up an unseen n-gram yields 0.

    Raises:
        ValueError: if `order` is negative. Without this check the window
            length could never be reached and the context string would grow
            to the size of the whole file.
    """
    if order < 0:
        raise ValueError(f"order must be non-negative, got {order}")

    window = order + 1
    stats = defaultdict(int)

    context = ""
    with open(text_path, "r") as f:
        for line in f:
            for ch in line:
                context += ch
                if len(context) == window:
                    stats[context] += 1
                    # Drop the oldest character so the window slides by one.
                    context = context[1:]

    return stats

In [60]:
def generator_v2(context, stats, order, length=250):
    """Generate text from a Markov source described by n-gram counts.

    Parameters:
        context: seed text; it is returned as the prefix of the output and
            its last `order` characters form the initial context.
        stats: mapping from n-gram to count. For a context of k characters the
            (k + 1)-character entries are consulted, so a seed shorter than
            `order` needs the lower-order counts to be present as well.
        order: number of preceding characters the next symbol depends on.
        length: number of characters to generate after the seed.

    Returns:
        The seed followed by `length` generated characters, as one string.

    Raises:
        ValueError: if no count in `stats` is positive for the current
            context nor for any shorter suffix of it.

    `stats` is never modified: lookups use `.get`, so a defaultdict is not
    filled with zero entries for every n-gram that was probed.
    """
    chars = [chr(i) for i in range(97, 123)] + [" "] + [str(i) for i in range(10)]
    pieces = [context]
    # context[-0:] is the whole string, so order 0 needs an explicit empty context.
    context = context[-order:] if order > 0 else ""
    for _ in range(length):
        while True:
            weights = [stats.get(context + ch, 0) for ch in chars]
            if any(weights):
                break
            if not context:
                raise ValueError(
                    "stats holds no positive count for any continuation of the current context"
                )
            # Unseen context: back off to a shorter one instead of handing
            # all-zero weights to random.choices.
            context = context[1:]

        symbol = random.choices(chars, weights=weights)[0]
        pieces.append(symbol)
        context += symbol
        if len(context) > order:
            context = context[1:]

    return "".join(pieces)

In [61]:
def avg_length_of_word_v2(context, stats, order, length=1000000):
    """Print the average word length of text produced by `generator_v2`.

    `generator_v2(context, stats, order, length=length)` is called once and its
    output (seed included) is scanned. A "word" is a non-empty run of non-space
    characters that is terminated by a space; a trailing run cut off by the end
    of the text is not counted. The mean length of these words is printed
    (nothing is returned).

    Parameters:
        context: seed text passed to `generator_v2`.
        stats: n-gram counts passed to `generator_v2`.
        order: Markov order passed to `generator_v2`.
        length: number of characters to generate (defaults to the original
            hard-coded sample size).

    If the text contains no complete word, `nan` is printed instead of raising
    ZeroDivisionError.
    """
    run = 0      # length of the word currently being read
    total = 0    # summed length of all completed words
    count = 0    # number of completed words
    for symbol in generator_v2(context, stats, order, length=length):
        if symbol == " ":
            # Consecutive spaces produce empty runs, which are not words.
            if run != 0:
                total += run
                count += 1
            run = 0
        else:
            run += 1

    print(total / count if count else float("nan"))

### First order

In [62]:
%%time
# Merge the order-0 and order-1 counts into one table: generation below starts
# from an empty seed, so the very first symbol is drawn from the
# single-character counts before the two-character counts take over.
stats = generate_input(0, wiki_path)
stats.update(generate_input(1, wiki_path))

CPU times: user 2.54 s, sys: 10.9 ms, total: 2.55 s
Wall time: 2.56 s


In [63]:
generator_v2("", stats, 1)

'ezia tucla arol nicte t n ongare berseion n tididiginan rigurly otir al wharavethas s trel te wh souc icud a ong 40angenl ork athy mathofathestad watang s finowheverctheld ionk he ol f ovin oeantizelakishealmus fuer r n w ve acasovinceel rigeampe d 3'

In [64]:
avg_length_of_word_v2("", stats, 1)

4.864689875199399


### Third order

In [65]:
%%time
# Collect the counts for orders 0 through 3 in one table: with an empty seed
# the generator's context grows by one character per step, so every shorter
# n-gram length is looked up before the context reaches three characters.
stats = generate_input(0, wiki_path)
for i in range(1, 4):
    stats.update(generate_input(i, wiki_path))

CPU times: user 5.55 s, sys: 14.1 ms, total: 5.57 s
Wall time: 5.58 s


In [66]:
generator_v2("", stats, 3)

'e coat to volve eparattlevict s s gamential publistraymentatestarry pers land cutiestude saarlia basilvyn the new definal premorg inval as to the conton funder carby harast s now the afterry vip ally gidanistart down will hough i in had in to had she'

In [67]:
avg_length_of_word_v2("", stats, 3)

4.8594029238566785


### Fifth order

In [68]:
%%time
# Only order-5 counts (6-character n-grams) are collected here; the seed used
# in the following cells is longer than five characters, so the generator
# starts with a full five-character context.
stats = generate_input(5, wiki_path)

CPU times: user 2.32 s, sys: 20.3 ms, total: 2.34 s
Wall time: 2.34 s


In [69]:
generator_v2("probability", stats, 5)

'probability graduated subgroup to be to keefe planta romania isotope and jackson greel one person of the greece 386 he albanian defend to preschool mixing and illing after the berth west suspected by his world which enabling border thomas name out on feed it pe'

In [70]:
avg_length_of_word_v2("probability", stats, 5)

4.861047128397189
