In [1]:
import numpy as np
from collections import defaultdict

In [2]:
# Names of the corpora behind the training and test/validation data; they are
# only used here to build the filename prefix of the pre-computed .npy files.
train_datasets = ['gutenberg']
test_datasets  = ['gutenberg']
prefix = '_'.join(train_datasets) + '-' + '_'.join(test_datasets)

# The vocabulary maps and n-gram counts are Python objects stored in 0-d
# object arrays (hence the .item() calls). NumPy >= 1.16.3 refuses to load
# object arrays unless allow_pickle=True is passed explicitly.
# SECURITY: allow_pickle=True unpickles arbitrary objects -- only load .npy
# files that you generated yourself.
word_to_id = np.load('%s_word_to_id.npy' % prefix, allow_pickle=True).item()
id_to_word = np.load('%s_id_to_word.npy' % prefix, allow_pickle=True).item()
n_grams = np.load('%s_n_grams.npy' % prefix, allow_pickle=True).item()
# presumably ragged arrays of token sequences (also object dtype) -- TODO confirm;
# allow_pickle=True is harmless if they turn out to be plain numeric arrays.
train_sentences = np.load('%s_train.npy' % prefix, allow_pickle=True)
valid_sentences = np.load('%s_valid.npy' % prefix, allow_pickle=True)

In [3]:
# Store conditional counts:
#   conditional_n_grams[n][context][last] = count
# where `context` is the n-gram without its final element and `last` is that
# final element. Keys of n_grams[n] must be sliceable sequences (presumably
# tuples of word ids -- confirm against the script that wrote the .npy file).
conditional_n_grams = dict()

for n in range(1, 2 + 1):
    conditional_n_grams[n] = defaultdict(dict)

    # .items() rather than the Python-2-only .iteritems(): same pairs, but the
    # cell also runs under Python 3.
    for n_gram, count in n_grams[n].items():
        conditional_n_grams[n][n_gram[:-1]][n_gram[-1]] = count

In [4]:
# Store continuation sets:
#   continuations[n][suffix] = set of first elements seen in front of `suffix`
# where `suffix` is the n-gram without its first element. Only n-grams with a
# positive count contribute.
continuations = dict()

for n in range(1, 2 + 1):
    continuations[n] = defaultdict(set)

    # .items() rather than the Python-2-only .iteritems(): same pairs, but the
    # cell also runs under Python 3.
    for n_gram, count in n_grams[n].items():
        if count > 0:
            continuations[n][n_gram[1:]].add(n_gram[0])

In [5]:
# Continuation probability of every word id:
#   continuation_prob[w] = |{v : count(v, w) > 0}| / |{(v, u) : count(v, u) > 0}|
# i.e. the number of distinct words seen in front of w, divided by the total
# number of distinct bigram types.
#
# The bigram order (2) is spelled out explicitly instead of reading a loop
# variable left behind by an earlier cell, so this cell no longer depends on
# hidden kernel state.
bigram_continuations = continuations[2]

# One slot per id in 0..len(word_to_id).
# NOTE(review): the extra slot presumably accounts for id 0 (used for
# out-of-vocabulary words) not being a key of word_to_id -- confirm.
vocab_size = len(word_to_id) + 1

continuation_prob = np.zeros((vocab_size,))
denominator = sum([len(x) for x in bigram_continuations.values()])

for i in range(vocab_size):
    # .get() avoids inserting an empty set into the defaultdict for ids that
    # never occur as the second word of a bigram.
    continuation_prob[i] = float(len(bigram_continuations.get((i,), ()))) / denominator

In [6]:
def get_distribution(context, d=0.75):
    """Return the interpolated Kneser-Ney bigram distribution for `context`.

    For every word id w the returned score is

        max(count(context, w) - d, 0) / count(context)
            + lambda(context) * continuation_prob[w]

    with lambda(context) = d / count(context) * (number of distinct words
    observed after `context` with a positive count).

    Parameters
    ----------
    context : tuple
        Conditioning n-gram. Only bigram models are supported, so it must
        contain exactly one element (enforced by the assertion below).
    d : float, optional
        Absolute discount subtracted from every observed bigram count.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as the global `continuation_prob`, indexed
        by word id. If `context` was never observed (missing or zero count),
        the continuation probabilities are returned unchanged instead of
        raising KeyError / ZeroDivisionError.

    Raises
    ------
    AssertionError
        If `context` does not have exactly one element.
    """
    n = len(context) + 1
    assert n == 2

    # Start from a copy of the continuation probabilities so the global array
    # is never modified.
    counts = np.array(continuation_prob)

    context_count = n_grams[n - 1].get(context, 0)
    if context_count <= 0:
        # Unseen context: there is no bigram evidence to interpolate with, so
        # back off completely to the continuation distribution.
        return counts

    # .get() so that an unseen context is not inserted into the defaultdict.
    followers = conditional_n_grams[n].get(context, {})

    # Back-off weight: the probability mass freed by discounting. Only word
    # types with a positive count are counted, matching the `count > 0`
    # filter used when the continuation sets are built.
    num_follower_types = sum(1 for count in followers.values() if count > 0)
    lambda_ = float(d) / context_count * num_follower_types
    counts *= lambda_

    # Add the discounted maximum-likelihood term for every observed follower.
    for i, count in followers.items():
        counts[i] += float(max(count - d, 0)) / context_count

    return counts

In [7]:
def get_probability(context, next_word):
    """Return the model score of `next_word` following `context`.

    The full distribution for `context` is computed with get_distribution
    (default discount) and then indexed at `next_word`.
    """
    return get_distribution(context)[next_word]

In [8]:
def compute_perplexity(sentence):
    """Return the bigram perplexity of one tokenised sentence.

    Every token from position 1 onwards is scored given the single token
    before it; tokens missing from `word_to_id` are mapped to id 0. The
    summed log-probability is negated, divided by len(sentence) and
    exponentiated.

    NOTE(review): the sum covers len(sentence) - 1 tokens but is normalised
    by len(sentence) -- confirm that this is the intended convention.
    """
    def to_id(token):
        # Out-of-vocabulary tokens all share id 0.
        return word_to_id.get(token, 0)

    running_prob = 0

    for pos in range(1, len(sentence)):
        # The context is always exactly the previous token (bigram model).
        context = (to_id(sentence[pos - 1]),)
        next_word = to_id(sentence[pos])

        running_prob += np.log(get_probability(context, next_word))

    return np.exp(-running_prob/len(sentence))

In [9]:
# Display the number of entries in valid_sentences.
len(valid_sentences)

9856

In [10]:
# Score every validation sentence. Each printed line is
#   <sentence index> <sentence perplexity> <mean perplexity so far>
ppls = []

for i, sentence in enumerate(valid_sentences):
    temp = compute_perplexity(sentence)
    ppls.append(temp)
    # A single pre-formatted string prints the same text under the Python 2
    # print statement and the Python 3 print function (each value goes
    # through str(), separated by single spaces).
    print('%s %s %s' % (i, temp, np.mean(ppls)))

0 292.402342273 292.402342273
1 32.6345676956 162.518454984
2 637.840716286 320.959208752
3 246.764771599 302.410599463
4 287.84997607 299.498474785
5 167.314997541 277.467895244
6 75.369049795 248.596631609
7 352.858405043 261.629353288
8 282.914301194 263.9943475
9 138.564060229 251.451318773
10 66.9508896443 234.678552488
11 266.962166777 237.368853679
12 81.7610070796 225.399019325
13 189.666674624 222.846708989
14 787.32748565 260.478760767
15 30.3019807325 246.092712015
16 171.121347271 241.682631736
17 27.7237870333 229.796029252
18 690.42664506 254.039745874
19 112.460433823 246.960780271
20 615.785121749 264.523844151
21 136.338266851 258.697227001
22 149.545555386 253.951502148
23 131.182996126 248.836147731
24 107.617530354 243.187403035
25 78.2236344927 236.842642707
26 387.102014879 242.407804639
27 412.2809526 248.474702781
28 198.473781531 246.750533082
29 124.347057713 242.670417237
30 214.136330047 241.749962811
31 131.622672164 238.308484979
32 200.324882145 237.15746

262 61.8757199083 188.616929432
263 202.533077993 188.669642116
264 176.376571522 188.623253171
265 7.20113707157 187.94121514
266 103.242816316 187.623992673
267 218.478795824 187.739122535
268 138.13571911 187.554723266
269 139.357857928 187.376216357
270 265.791569873 187.665571905
271 217.782192427 187.776294775
272 165.179349587 187.693522082
273 134.388606258 187.498978594
274 321.291341998 187.985496279
275 60.5730052328 187.523856818
276 131.017172634 187.319861569
277 39.5846620433 186.788439988
278 55.5507903407 186.318054147
279 160.206174255 186.224797433
280 142.535830431 186.069320682
281 51.1006769697 185.59070847
282 182.570093065 185.580034917
283 118.917472668 185.345307586
284 105.351110267 185.064626191
285 87.6034421948 184.723852821
286 65.6750129767 184.309048501
287 211.854923355 184.4046939
288 40.4091522484 183.90643943
289 145.197088309 183.772958909
290 99.5810706833 183.483639706
291 39.6121103884 182.990928989
292 72.9523966009 182.615370858
293 120.695369

527 134.631869042 174.394230107
528 240.106526164 174.518449948
529 66.8947954458 174.315386449
530 176.841787409 174.320144266
531 80.7857298952 174.144327698
532 134.428779102 174.069814474
533 235.943166037 174.185682173
534 248.637460473 174.324844376
535 127.581556926 174.23763675
536 100.801939616 174.100884986
537 163.829081239 174.081792414
538 221.089914097 174.169005998
539 111.753543515 174.053421808
540 226.994918703 174.151280398
541 592.802933321 174.923700421
542 213.260138905 174.994301597
543 765.407223229 176.079619468
544 115.968217256 175.969323317
545 25.9537151532 175.694569456
546 109.1572145 175.572928953
547 42.5104612004 175.330114231
548 179.870570033 175.338384642
549 66.0586113043 175.139694145
550 127.904686819 175.053968179
551 121.808689295 174.957509341
552 328.498764746 175.235160797
553 80.399267148 175.063976874
554 74.6890941859 174.883121229
555 145.129287586 174.82960714
556 91.3349699924 174.679706535
557 77.3463897794 174.505274067
558 196.74410

788 115.524082308 170.617561545
789 71.8607291514 170.492552896
790 83.724466306 170.382858729
791 63.3755266456 170.247748461
792 355.590110687 170.48147149
793 182.613694825 170.496751368
794 837.318388691 171.335520723
795 15.3347086965 171.139539804
796 41.256665151 170.976575093
797 434.139223401 171.306352848
798 41.4204095364 171.143792218
799 92.9249920444 171.046018717
800 119.681655998 170.981893421
801 72.9002918653 170.859597159
802 259.082788814 170.969464148
803 37.6387634725 170.803629943
804 72.0751451446 170.680985862
805 81.1615476486 170.569919562
806 114.029403284 170.499856964
807 55.4497853694 170.357468262
808 26.768860139 170.179979253
809 83.588432759 170.073076109
810 238.823290239 170.15784826
811 103.964025143 170.076328773
812 102.873449781 169.993668405
813 46.6399238237 169.842128179
814 131.06306521 169.794546506
815 129.141618173 169.744726741
816 7.20113707157 169.545774979
817 25.7067579438 169.36993266
818 42.6497338353 169.215207142
819 78.109815247

1049 53.1949617886 165.37165793
1050 56.8486435573 165.268401018
1051 231.249392074 165.331120592
1052 13.5126823423 165.186943537
1053 177.228360554 165.198368032
1054 14.395902926 165.055427306
1055 342.602457519 165.223558964
1056 85.6411024172 165.148268087
1057 63.8189681661 165.052493702
1058 146.72748862 165.035189636
1059 101.520907154 164.975270502
1060 333.730341953 165.13432335
1061 75.0142000526 165.049464477
1062 94.6863754888 164.983271542
1063 81.8929714178 164.905179155
1064 127.326530382 164.869894039
1065 158.793303748 164.864193673
1066 100.773071818 164.804127017
1067 163.623702529 164.80302175
1068 91.0924548678 164.734068928
1069 7.20113707157 164.586841889
1070 127.039644906 164.551783815
1071 146.221666289 164.534684825
1072 137.991168201 164.509947158
1073 219.697844204 164.561332537
1074 38.5248726226 164.444089319
1075 79.3914914883 164.365044153
1076 73.2106877027 164.280406868
1077 64.1832305168 164.187552344
1078 94.7832178609 164.123229514
1079 77.0345845

1302 143.696513362 158.059796319
1303 270.629198296 158.146122548
1304 127.446278116 158.122597763
1305 235.980798818 158.182213537
1306 40.7150891145 158.092338155
1307 11.5954260103 157.980337457
1308 105.382366749 157.940155662
1309 174.336904315 157.952672264
1310 293.05538471 158.055725439
1311 17.848720137 157.948860343
1312 429.510550404 158.155685697
1313 92.5076471839 158.105725242
1314 57.7953155759 158.029443562
1315 112.200562814 157.994619184
1316 346.995709282 158.138127984
1317 24.8008285128 158.036961596
1318 199.613231048 158.06848265
1319 91.3783047088 158.017959788
1320 249.71419982 158.08737405
1321 888.306031035 158.639733094
1322 229.61181501 158.693377903
1323 123.31685195 158.666658473
1324 182.088124487 158.684335051
1325 49.8753908845 158.602277023
1326 61.1810572325 158.528862389
1327 33.3674605515 158.434614345
1328 125.175980041 158.409589038
1329 333.458700339 158.541204911
1330 45.3940032547 158.456195743
1331 262.51082148 158.534314832
1332 161.256126637

1555 148.802160479 159.093093373
1556 159.32964446 159.093245301
1557 297.398206844 159.182016136
1558 44.0391430502 159.108159258
1559 61.2777478183 159.045447456
1560 37.570257883 158.967628628
1561 115.113323614 158.939552889
1562 1830.70615623 160.009141247
1563 61.6239890326 159.946235139
1564 54.5160906991 159.878867635
1565 130.328329979 159.859997559
1566 280.448287078 159.936952435
1567 122.335975474 159.91297222
1568 223.361594997 159.953411113
1569 385.213390242 160.096888806
1570 48.0310990318 160.025554758
1571 187.252007146 160.042874384
1572 59.2080424221 159.978770867
1573 151.488663773 159.973376898
1574 27.3668181032 159.889182258
1575 97.1707400263 159.849386292
1576 27.3668181032 159.765377054
1577 428.810123799 159.935874359
1578 53.5852464126 159.868521206
1579 71.4018117014 159.812529618
1580 163.389403092 159.81479203
1581 40.846317908 159.739590719
1582 248.01033383 159.795352401
1583 250.238765501 159.852450516
1584 123.864574831 159.829745231
1585 218.8376531

1806 227.708803613 159.550361982
1807 32.9192604977 159.480322656
1808 75.3850759652 159.43383551
1809 44.3298850075 159.370242167
1810 12.3269729084 159.289047651
1811 122.393247864 159.26868573
1812 79.0587057965 159.224444153
1813 313.452719331 159.309465253
1814 95.5528113326 159.27433762
1815 72.5731467336 159.226594673
1816 128.203605417 159.209520931
1817 76.0261705924 159.163765513
1818 206.711618366 159.189905069
1819 1708.60451072 160.041231776
1820 96.0434366376 160.006087462
1821 1149.23720875 160.549024411
1822 80.8004486298 160.505278621
1823 218.292752911 160.53696035
1824 56.3365449316 160.479864232
1825 28.2045452582 160.407424298
1826 21.8568830659 160.3315893
1827 35.259963436 160.263169374
1828 44.4527498363 160.199850391
1829 25.8288030328 160.126423589
1830 184.874902182 160.139939962
1831 48.2190413666 160.078847768
1832 31.9804880713 160.008963229
1833 55.4993303499 159.951978697
1834 166.57761425 159.955589397
1835 38.5185030311 159.889447193
1836 209.882551077

2060 112.30608533 160.107146579
2061 121.510482703 160.088428508
2062 204.198752588 160.109810148
2063 551.223778725 160.29930335
2064 441.564442642 160.435509228
2065 15.0425825655 160.365135111
2066 279.42068416 160.422733345
2067 116.424459519 160.401457584
2068 116.497864268 160.380237867
2069 596.759231395 160.591048975
2070 94.9802695314 160.559368251
2071 69.2058906022 160.515278735
2072 67.8104108042 160.470558586
2073 23.6016681691 160.404565872
2074 70.4737934876 160.36122574
2075 80.4144675346 160.322715741
2076 36.9897847317 160.263335418
2077 77.333685024 160.22342702
2078 30.6756482883 160.161114477
2079 226.088865874 160.192810511
2080 30.7754704141 160.130620535
2081 293.380251872 160.194621318
2082 160.148376168 160.194599117
2083 47.5124383032 160.140528982
2084 68.9436797655 160.096789486
2085 187.415394666 160.109885654
2086 70.6338391836 160.067012608
2087 172.726090152 160.073075385
2088 40.1035892277 160.015646239
2089 76.3987997138 159.975638178
2090 11.96325303

2312 23.3628127769 157.050541818
2313 58.3018920412 157.00786738
2314 43.4526834723 156.958815465
2315 206.174103244 156.980065589
2316 226.343779719 157.010002453
2317 131.287711302 156.998905693
2318 28.9058700762 156.943669369
2319 115.728663147 156.92590428
2320 81.5540712331 156.893430418
2321 62.783653284 156.852900798
2322 81.1404118503 156.82030825
2323 9.36620737091 156.756859842
2324 85.0888876805 156.726034908
2325 77.9547439652 156.692169348
2326 243.774724997 156.729592019
2327 60.9034672818 156.688429595
2328 44.674255716 156.6403342
2329 72.753528914 156.604331279
2330 67.5281028032 156.566117539
2331 102.330990361 156.542860624
2332 190.488139492 156.557410679
2333 82.5844562537 156.525717039
2334 104.128777483 156.503277237
2335 155.214822389 156.502725672
2336 242.143420375 156.539371241
2337 20.2492471484 156.481077775
2338 223.637968621 156.509789571
2339 117.438146515 156.493092287
2340 170.266721814 156.49897594
2341 68.5782810869 156.461435079
2342 112.493106418 

2564 575.936258818 156.912446028
2565 105.01052985 156.892219248
2566 23.487434073 156.840250107
2567 194.568139102 156.854941653
2568 55.4306392372 156.815461582
2569 266.810275323 156.85826112
2570 227.947259428 156.88591145
2571 853.634456795 157.156809018
2572 423.863286404 157.260464859
2573 107.247517969 157.24103481
2574 69.1062971756 157.206807727
2575 255.63766719 157.245018464
2576 53.7422962903 157.204854428
2577 123.792930086 157.191894023
2578 78.5470720658 157.161399714
2579 138.053456967 157.153993535
2580 80.462242621 157.124279567
2581 119.881998264 157.109855755
2582 196.137711054 157.124965262
2583 58.2370959385 157.086695963
2584 527.440307588 157.229966219
2585 144.088678404 157.224884514
2586 56.3698749329 157.185899199
2587 264.288993577 157.227283702
2588 89.0254991814 157.200940796
2589 218.266435354 157.224518207
2590 74.929736518 157.192756423
2591 189.092856928 157.205063561
2592 384.632672639 157.292771856
2593 308.055260189 157.350891551
2594 95.6965371697

2818 104.533189447 157.515674214
2819 46.4315501009 157.476282681
2820 57.1664021213 157.440724411
2821 30.1570304101 157.395620338
2822 211.028013639 157.414618706
2823 133.49676783 157.406149212
2824 249.587992904 157.438779953
2825 274.269956134 157.480121488
2826 240.872390257 157.509619991
2827 64.3374846172 157.476673691
2828 425.908891778 157.571559594
2829 73.2369583978 157.541759381
2830 205.076964151 157.55855034
2831 56.6823034832 157.522930196
2832 148.213180016 157.519644016
2833 110.469274781 157.503041909
2834 55.2363729223 157.46696901
2835 85.7787582158 157.44169108
2836 113.843319858 157.426323307
2837 309.653168742 157.479962083
2838 27.251624429 157.434090882
2839 17.8156600213 157.384929463
2840 42.4970419006 157.344490221
2841 48.4020690105 157.306157208
2842 90.0868251454 157.282513405
2843 81.4022455944 157.25583258
2844 28.0443102825 157.210415525
2845 42.2548251528 157.170023539
2846 257.788618358 157.205365511
2847 79.765570657 157.178174572
2848 69.872416306

3068 104.276754382 158.572253615
3069 84.3294270208 158.548070284
3070 451.306023954 158.643400129
3071 272.176574388 158.680357543
3072 828.915383501 158.898462009
3073 35.3032983848 158.858255385
3074 165.30992616 158.860353489
3075 52.7432526687 158.825855082
3076 77.4115966006 158.799396109
3077 69.7649940995 158.770470053
3078 352.275722734 158.833316838
3079 65.8778735806 158.8031365
3080 187.237604494 158.812365473
3081 242.116726263 158.839394792
3082 141.664525147 158.833823962
3083 260.566838955 158.866811321
3084 78.6519260443 158.840809737
3085 193.404683501 158.852009956
3086 120.049064511 158.839440164
3087 107.160372506 158.822704715
3088 41.2862215512 158.784654704
3089 408.251976817 158.865388466
3090 28.9454898525 158.823356793
3091 18.2648590734 158.77789803
3092 112.187870338 158.762834975
3093 108.571820504 158.746612928
3094 87.2976362624 158.723527636
3095 121.065403311 158.71136416
3096 26.2465343456 158.668592177
3097 108.306343611 158.652335802
3098 157.652600

3323 173.801230202 164.15996127
3324 180.161384645 164.164773728
3325 221.585994835 164.182038076
3326 46.6110234722 164.146699629
3327 115.354112297 164.132038395
3328 405.467669047 164.204533327
3329 42.9081691641 164.168107993
3330 8.50409316799 164.121376076
3331 30.6160271689 164.081308444
3332 66.4548108636 164.052017566
3333 186.311045873 164.058693939
3334 439.966485618 164.141424911
3335 209.704231568 164.155082827
3336 76.8109912059 164.128908391
3337 180.42215039 164.13378953
3338 179.77128154 164.138472816
3339 197.865370174 164.14857069
3340 175.465753639 164.151958054
3341 364.382344201 164.211871395
3342 192.715149045 164.220397652
3343 185.281296163 164.226695767
3344 62.0869473697 164.196160716
3345 104.484975158 164.178315173
3346 382.331220557 164.243493812
3347 161.213979993 164.24258894
3348 128.935736051 164.232046433
3349 346.183934982 164.28636043
3350 63.7798159954 164.25636743
3351 144.294280002 164.250412153
3352 179.98689982 164.255105409
3353 135.93544183 1

3576 102.805018324 168.704814939
3577 191.424925547 168.711164886
3578 225.455524897 168.727019694
3579 91.0522815055 168.70532284
3580 111.362707893 168.689309823
3581 89.4230540163 168.667180773
3582 1121.44785608 168.933097791
3583 398.454879552 168.997138467
3584 35.1133685057 168.959792924
3585 384.442060203 169.019882793
3586 174.419725509 169.021388185
3587 206.851810169 169.031931781
3588 55.4001737936 169.000270661
3589 91.2883030479 168.978623874
3590 355.982227925 169.030699508
3591 436.312756889 169.105109881
3592 278.369563024 169.135520249
3593 133.745847566 169.125673373
3594 94.5823144733 169.104938085
3595 285.708257453 169.137363925
3596 201.540005811 169.146372166
3597 445.999317749 169.22331851
3598 63.6892582892 169.193995347
3599 31.1664427722 169.155654361
3600 189.634567678 169.161341368
3601 155.848249763 169.15764534
3602 131.378060083 169.147159749
3603 38.8605290174 169.111009186
3604 606.607798103 169.232367518
3605 59.756255528 169.202008086
3606 154.94595

3828 395.834055024 172.135019246
3829 81.9584358352 172.111474446
3830 539.4240729 172.207353485
3831 110.471576929 172.191242896
3832 352.888361113 172.238385374
3833 309.786958259 172.274261371
3834 242.855038428 172.292665746
3835 832.945586892 172.464890178
3836 129.112814892 172.453591748
3837 362.258967434 172.503045989
3838 21.5397814257 172.463722398
3839 123.215925306 172.450897451
3840 1456.22772527 172.785127294
3841 62.8096406593 172.756502753
3842 119.377179853 172.74261274
3843 1404.27799738 173.062991352
3844 786.642803107 173.222569976
3845 33.6185284082 173.186271474
3846 691.851307348 173.321094722
3847 107.411063039 173.303966335
3848 426.378387891 173.369717029
3849 333.940364537 173.411423691
3850 138.999247597 173.402487784
3851 35.3036756058 173.366636587
3852 541.283370446 173.462124968
3853 165.245198015 173.459992917
3854 114.543172776 173.444709695
3855 117.768845912 173.430270934
3856 963.826613646 173.635196094
3857 121.791521927 173.621758127
3858 108.7343

4079 369.529521785 176.695631774
4080 114.929545973 176.680496737
4081 283.496137189 176.706664214
4082 105.848291264 176.689309726
4083 136.476751826 176.679463361
4084 330.582899319 176.71713862
4085 298.036036253 176.74682998
4086 254.870780647 176.765945212
4087 71.7391086383 176.740253716
4088 46.0862444251 176.708301158
4089 79.0646004675 176.684427392
4090 148.713810655 176.677590282
4091 805.593593283 176.83128432
4092 81.5698108185 176.808010078
4093 100.119576758 176.789278169
4094 59.7973055491 176.760708701
4095 315.308697331 176.794533894
4096 201.079155498 176.800461309
4097 26.5514577388 176.763797326
4098 214.023259196 176.772887217
4099 334.41953947 176.81133762
4100 90.6164641369 176.790319606
4101 408.245635935 176.846744598
4102 72.27401202 176.821257702
4103 385.620531017 176.872134718
4104 47.7997429939 176.840691992
4105 100.158795681 176.82201642
4106 168.551877218 176.820002751
4107 269.881395892 176.84265645
4108 124.893089363 176.830013576
4109 852.651171982 

4333 149.116589078 180.939201878
4334 108.516255085 180.922495316
4335 24.1678902676 180.886343424
4336 42.9286609101 180.854533951
4337 80.9197264922 180.831496881
4338 80.837619304 180.808451507
4339 136.197169662 180.79817241
4340 68.0619334862 180.772202302
4341 51.8313474855 180.742506113
4342 157.995214223 180.737268422
4343 500.533254874 180.810886282
4344 157.729708974 180.805574159
4345 196.86354842 180.809269045
4346 210.374144946 180.816070258
4347 65.4631558379 180.789540149
4348 72.2896381897 180.764591908
4349 327.126345467 180.798238288
4350 322.030801045 180.830698082
4351 191.369852888 180.833119763
4352 206.750167046 180.839073598
4353 54.3267901891 180.810017033
4354 222.554569927 180.819602465
4355 549.949468251 180.904343022
4356 69.4310274584 180.878758143
4357 56.5404378475 180.850227092
4358 46.4165373408 180.819386603
4359 40.7985461147 180.787271732
4360 64.1067577049 180.760516283
4361 772.390344485 180.89614898
4362 59.1220064469 180.868238336
4363 260.87991

4584 574.766467794 184.345225731
4585 1600.61658016 184.65405071
4586 145.870353614 184.645595577
4587 87.4567823295 184.624412313
4588 47.4390670185 184.594517925
4589 200.403827144 184.597962219
4590 64.5212250672 184.571807408
4591 383.628683059 184.615156031
4592 70.7081348115 184.590355896
4593 199.365061843 184.593571983
4594 39.5310974061 184.562002348
4595 189.109155111 184.56299172
4596 161.835618995 184.558047762
4597 305.288733326 184.58430498
4598 68.0552557532 184.558967069
4599 386.510209405 184.602869513
4600 237.11218988 184.614282102
4601 198.901210699 184.617386606
4602 177.991867055 184.615947215
4603 138.92259265 184.606022507
4604 40.7249134329 184.574777966
4605 1916.93707923 184.950887888
4606 169.153393529 184.947458869
4607 930.857284501 185.109331661
4608 324.17089671 185.139503404
4609 393.507835999 185.184702608
4610 304.81562545 185.210647289
4611 47.3801584281 185.180762101
4612 214.291682717 185.187072727
4613 194.622407994 185.189117663
4614 64.425713886

4836 45.0704113485 187.36804979
4837 32.2769616529 187.33599293
4838 879.647204955 187.479061997
4839 111.636008801 187.463391944
4840 95.5782074211 187.444411324
4841 326.507832853 187.473131568
4842 334.509119475 187.503492086
4843 153.025959796 187.496374511
4844 49.1318872398 187.467816309
4845 422.381642662 187.51629213
4846 119.529981809 187.502265658
4847 290.614320715 187.523534646
4848 378.540943757 187.562927801
4849 605.207495284 187.649040083
4850 363.444405613 187.685279078
4851 242.766942434 187.696631441
4852 69.1480739569 187.672203549
4853 162.900491076 187.667100189
4854 209.296607034 187.671555288
4855 189.305631132 187.671891794
4856 703.377845273 187.778069672
4857 141.155140057 187.768472527
4858 179.172038119 187.76670335
4859 92.5549812297 187.74711246
4860 528.158960039 187.817141641
4861 174.446874142 187.814391689
4862 332.263168531 187.844095324
4863 720.736857166 187.953653869
4864 33.3345209418 187.92187193
4865 226.695139588 187.929840131
4866 231.4472209

5088 65.1638586051 189.942920765
5089 91.3956071268 189.9235598
5090 327.290078274 189.950542027
5091 26.6980078257 189.918481435
5092 143.337498491 189.909335355
5093 189.921404778 189.909337725
5094 90.1135512179 189.889750721
5095 50.5681301695 189.862411313
5096 83.8553401174 189.841613379
5097 681.077294047 189.937971888
5098 304.158388719 189.960372441
5099 139.194465385 189.950418341
5100 201.921055911 189.952765065
5101 64.4987006717 189.928175872
5102 81.0467839238 189.90683913
5103 18.5560710243 189.873267271
5104 263.945987345 189.887777108
5105 61.2617512783 189.862585956
5106 89.0466039206 189.842845211
5107 268.082829184 189.858162358
5108 115.278362398 189.843564628
5109 87.2699871323 189.823491521
5110 1180.16192606 190.017257601
5111 133.110095351 190.006125527
5112 445.306665962 190.05605718
5113 75.0119135509 190.033561258
5114 418.868998979 190.078299369
5115 41.0839707031 190.049176162
5116 62.7011162036 190.024288911
5117 477.668771306 190.080491428
5118 83.144121

5339 60.9477751736 190.951812823
5340 113.126827844 190.937241584
5341 590.795494014 191.012093373
5342 151.285454082 191.004658104
5343 17.2137217697 190.972137345
5344 44.9126317601 190.944810964
5345 208.592436092 190.948112054
5346 118.416885467 190.934547209
5347 664.510306991 191.023099146
5348 88.3587909761 191.003905968
5349 135.700424318 190.993568869
5350 151.064272847 190.986106844
5351 51.7882855856 190.960098282
5352 190.50193625 190.960012692
5353 51.7459296704 190.934010809
5354 249.080088921 190.944869087
5355 36.5706390813 190.916046415
5356 417.682004394 190.95837719
5357 92.8256597672 190.940062013
5358 617.910847281 191.019735606
5359 199.701499734 191.021355338
5360 86.4640500568 191.001852017
5361 87.8407994053 190.98261273
5362 129.961966295 190.97123465
5363 479.891950523 191.025097572
5364 229.306171202 191.032232907
5365 246.140682101 191.042502838
5366 22.1478444578 191.011033739
5367 445.86244191 191.058509783
5368 45.4214899349 191.031384244
5369 101.969761

5591 184.756818054 190.174558826
5592 100.191718597 190.158470351
5593 129.251849644 190.147582503
5594 410.950645761 190.187046858
5595 27.4705923401 190.157969578
5596 46.3512577024 190.132276044
5597 24.836190486 190.10274834
5598 110.840618409 190.088591861
5599 51.2043253452 190.063791099
5600 76.4046885464 190.043498454
5601 81.7664940428 190.024170178
5602 59.8762542083 190.000941922
5603 43.3229640937 189.974768122
5604 309.093604365 189.996020367
5605 96.231738673 189.979294666
5606 146.648815607 189.97156674
5607 80.6214189677 189.952067784
5608 44.2915116418 189.926098706
5609 19.3315455887 189.895689695
5610 58.1697885134 189.872213327
5611 1026.48962223 190.021289843
5612 57.5048013086 189.997680991
5613 155.750388319 189.991580653
5614 1071.86592243 190.148637526
5615 69.5705152355 190.127167063
5616 429.9742278 190.16986727
5617 99.613754465 190.153748346
5618 1045.34802114 190.305945227
5619 17.2169740236 190.275146478
5620 422.825371854 190.31651816
5621 154.051209482 

5842 456.113289315 189.228668309
5843 81.9612877279 189.210313178
5844 55.8159140177 189.187491211
5845 58.1736102143 189.165080352
5846 268.279168544 189.178611067
5847 100.213791074 189.163398204
5848 73.8475804675 189.143682729
5849 104.187008731 189.12916022
5850 227.573352847 189.135730754
5851 34.2259370351 189.109259497
5852 130.685206808 189.099277599
5853 57.0314644975 189.07671733
5854 97.8350773699 189.061133788
5855 32.7402061837 189.03443964
5856 90.4050182008 189.01760006
5857 34.4178725994 188.991208847
5858 12.397020866 188.961068176
5859 2398.73815373 189.338163242
5860 396.245709209 189.373465673
5861 88.2226476121 189.35621033
5862 35.1849466641 189.329914703
5863 128.536854432 189.319547537
5864 94.807226088 189.303432904
5865 26.0121778804 189.275596004
5866 146.138934739 189.268243582
5867 72.8959884947 189.248411909
5868 19.5623940497 189.219499655
5869 69.1747384944 189.1990491
5870 102.204672947 189.184231458
5871 22.8889804876 189.155911422
5872 65.3278793832 

6095 1982.82731519 194.425277273
6096 40.8321375373 194.400085681
6097 165.327380964 194.395318101
6098 21.7969472149 194.367018646
6099 34.6364397716 194.340833306
6100 412.613923886 194.376609914
6101 86.2898728186 194.358896585
6102 58.7815371287 194.336681714
6103 106.352468972 194.322267524
6104 28.5558687974 194.295114961
6105 1392.09744333 194.491283046
6106 909.190770318 194.6083126
6107 415.270063977 194.644439279
6108 1188.93865243 194.807198194
6109 72.9453899056 194.787253544
6110 162.779348503 194.782015792
6111 449.122302796 194.823629059
6112 103.925071923 194.808759345
6113 195.842278383 194.808928387
6114 362.007643619 194.836270777
6115 254.035855979 194.845950238
6116 621.619039074 194.915718603
6117 66.4186616947 194.894715488
6118 38.0058393016 194.869075862
6119 41.5942586736 194.844030957
6120 990.229718993 194.973974706
6121 86.967392355 194.956332337
6122 327.153433896 194.977922587
6123 29.943145311 194.950973734
6124 203.026136538 194.952292128
6125 128.27374

6348 68.5340656329 199.061820541
6349 72.5344995097 199.041894979
6350 69.3134565145 199.02146852
6351 180.261598167 199.01851514
6352 74.3907293374 198.998897985
6353 40.2211230232 198.973909352
6354 179.778667984 198.970888858
6355 2296.87513214 199.300955605
6356 46.1674940871 199.276866654
6357 61.0475845801 199.255125653
6358 616.646887068 199.320763609
6359 202.64947856 199.321286992
6360 198.119153476 199.321098007
6361 84.3105437106 199.303020271
6362 842.516030788 199.40410671
6363 194.680954572 199.403364543
6364 164.946205861 199.397951006
6365 74.5019145419 199.378331773
6366 109.713251454 199.36424899
6367 38.59142119 199.339002001
6368 78.0521681712 199.319958692
6369 152.564051714 199.312618675
6370 6.73898408883 199.282392081
6371 74.9304066361 199.262876703
6372 491.718834503 199.308766544
6373 39.3510935045 199.283671208
6374 77.3715350604 199.264547736
6375 129.78389019 199.253650519
6376 74.7981296195 199.234134207
6377 25.9425084154 199.206963993
6378 209.082730929

6604 113.732124141 199.98288518
6605 256.186296574 199.991393114
6606 43.5302733641 199.967712
6607 110.090733951 199.954110763
6608 14.7575033119 199.926088882
6609 103.158145958 199.911449254
6610 509.91509411 199.95834135
6611 72.4790559982 199.939061361
6612 14.7575033119 199.911058706
6613 441.317072875 199.947557952
6614 78.6307693192 199.929218302
6615 293.467341175 199.94335647
6616 17.1606466302 199.915733271
6617 150.520054596 199.908269433
6618 330.069992955 199.927934295
6619 66.5034939442 199.907779546
6620 15.7440926274 199.87996446
6621 15.7440926274 199.852157774
6622 57.7597181945 199.830703382
6623 39.4512689722 199.806491511
6624 62.0603892064 199.785699646
6625 29.9219686427 199.760063707
6626 25.2790301384 199.733734896
6627 215.870216966 199.736169489
6628 97.6561076199 199.720770475
6629 83.7827133427 199.703283589
6630 87.2391963831 199.686323238
6631 75.4412825417 199.667589064
6632 81.3586492626 199.649752649
6633 110.300149519 199.636284213
6634 152.422440308

6854 126.993186268 201.071933806
6855 196.440665828 201.0712583
6856 116.556564962 201.058932984
6857 47.1605582017 201.036492276
6858 21.2369439169 201.010278608
6859 95.4905445773 200.994896723
6860 77.3529683338 200.976875745
6861 27.715547719 200.951626353
6862 1197.33181044 201.096807787
6863 50.6559308752 201.07489041
6864 94.5376005186 201.059371504
6865 145.783354044 201.051320817
6866 90.5237387237 201.035225349
6867 13.4935810741 201.007918761
6868 216.890435156 201.010230963
6869 145.543645516 201.002157224
6870 63.3395004508 200.982121908
6871 57.5832485578 200.961254785
6872 10.9994353089 200.933615934
6873 586.186530977 200.989660874
6874 45.9911971206 200.967115643
6875 243.520443246 200.973304317
6876 155.787262947 200.966733714
6877 327.230506108 200.985091343
6878 8.74693198366 200.957145688
6879 76.1295410667 200.939002141
6880 126.9813567 200.928254045
6881 8.74693198366 200.900328831
6882 33.3204216178 200.875981903
6883 140.737760899 200.867245962
6884 1086.814478

7105 299.667379401 200.652075969
7106 166.49845685 200.647270338
7107 168.225709735 200.642709061
7108 63.0312350302 200.6233517
7109 944.072916138 200.727915634
7110 77.1661926796 200.710539495
7111 118.985608491 200.699048363
7112 59.5004746404 200.679197586
7113 79.4996056587 200.662163626
7114 559.350654218 200.712576625
7115 254.083422303 200.720076744
7116 85.3129759187 200.703861049
7117 85.6431907771 200.687696302
7118 449.475955294 200.722643382
7119 166.284238609 200.717806527
7120 131.841948292 200.70813431
7121 17.4727862545 200.682406235
7122 440.76589789 200.716111625
7123 71.3256273158 200.697949008
7124 1772.56466818 200.918561881
7125 99.8359367253 200.904376837
7126 85.6916596345 200.888211168
7127 174.846358 200.88455771
7128 114.409247892 200.872427634
7129 162.797698834 200.86708756
7130 26.7894786904 200.842676172
7131 59.2043878032 200.822816625
7132 65.9364491891 200.803906437
7133 20.1212626524 200.778579462
7134 207.686180268 200.779547591
7135 14.8487182653 2

7356 352.986004396 200.142687648
7357 79.2188389618 200.126253311
7358 49.9350222641 200.105844121
7359 139.981905609 200.097675108
7360 53.8441871359 200.077806409
7361 782.281503149 200.156888683
7362 92.4865101213 200.142265516
7363 99.675463741 200.12862255
7364 48.0553778745 200.107974451
7365 69.0737817782 200.090185395
7366 41.1275721733 200.068607736
7367 480.805765653 200.106709956
7368 52.2648758873 200.086647283
7369 82.3153094463 200.070667454
7370 179.747187697 200.067910233
7371 792.839366581 200.148318732
7372 403.733049 200.175930929
7373 12.6751991436 200.150503654
7374 54.3864757207 200.13073904
7375 22.1591798903 200.106610574
7376 90.4658159748 200.091748057
7377 87.4596040455 200.076482111
7378 299.616701201 200.089971774
7379 136.3421624 200.081333859
7380 309.167603564 200.096113194
7381 908.685488425 200.192102001
7382 103.318101907 200.178980777
7383 12.6751991436 200.153587524
7384 135.372553523 200.144815549
7385 32.2457332467 200.122083477
7386 470.224485853

7607 338.099095886 199.655544987
7608 1100.08637637 199.773882591
7609 285.857333194 199.785194477
7610 41.4435201352 199.764390158
7611 504.677936098 199.804447113
7612 490.164556565 199.842587151
7613 333.934870927 199.860198431
7614 41.9184144809 199.839457553
7615 6.73898408883 199.814102974
7616 20.6575420661 199.790582354
7617 366.9261091 199.812521909
7618 85.7615370717 199.797552624
7619 25.0250089856 199.774616594
7620 12.397020866 199.750029585
7621 94.0408163538 199.736160625
7622 12.397020866 199.711585112
7623 382.276546059 199.735531198
7624 154.639181493 199.729616923
7625 38.5412135378 199.708480232
7626 67.9541154871 199.691205502
7627 421.956388617 199.72034357
7628 66.2894496145 199.702853612
7629 846.284893959 199.787595688
7630 391.46220766 199.812713577
7631 126.771047851 199.803143128
7632 65.2907496972 199.785520647
7633 300.395763879 199.798699878
7634 275.565790666 199.808623531
7635 12.2708638848 199.784063845
7636 281.112657803 199.79471313
7637 37.573275068

7859 59.2603885842 199.169961062
7860 51.6401699289 199.151193756
7861 58.8282079681 199.1333455
7862 108.391611664 199.121805155
7863 197.287876659 199.12157195
7864 551.537191451 199.166380039
7865 47.2026193922 199.147060974
7866 83.0445864417 199.13230281
7867 38.3680830373 199.111870144
7868 60.9191827431 199.094308486
7869 12.0611851366 199.070543159
7870 137.549532961 199.062726997
7871 370.271784065 199.084476115
7872 12.0611851366 199.060721093
7873 34.3376471929 199.03980122
7874 40.285117075 199.019641895
7875 117.230029184 199.009257232
7876 274.526523089 199.018844291
7877 67.8179650158 199.002190206
7878 23.1608528497 198.979872483
7879 186.176347609 198.978247671
7880 301.504468519 198.991256961
7881 56.8502346431 198.973223338
7882 71.4388177594 198.957044928
7883 570.52978256 199.004174905
7884 81.1493990894 198.989228199
7885 63.2584821078 198.97201659
7886 144.906196554 198.965161535
7887 487.101188188 199.001689936
7888 304.680155002 199.015085609
7889 619.067497812

8110 182.277976767 199.792555153
8111 48.753998251 199.773936002
8112 231.844258096 199.777888957
8113 9.02804206183 199.754380225
8114 20.7984034049 199.732327733
8115 250.741813547 199.738612785
8116 948.511919092 199.830860328
8117 370.136489621 199.851839095
8118 333.855298219 199.868344017
8119 1416.64422786 200.018193263
8120 58.0943662604 200.000717112
8121 162.745814975 199.996130199
8122 179.138415932 199.993562464
8123 863.852529735 200.07527824
8124 74.6133098929 200.059836767
8125 226.335051663 200.063070242
8126 646.41683737 200.117992571
8127 158.988199056 200.112932311
8128 71.1058089436 200.097062324
8129 412.379630226 200.123173341
8130 9.02804206183 200.099671296
8131 104.244551018 200.087883898
8132 417.420669747 200.114606237
8133 85.6437948897 200.10053311
8134 169.222381545 200.096737394
8135 408.497757358 200.122352072
8136 784.093930482 200.194119502
8137 183.627395202 200.192083778
8138 92.2174938112 200.178817456
8139 4355.87009405 200.68934464
8140 174.462655

8363 65.2367186447 200.413921265
8364 70.1086945522 200.398343832
8365 62.23873787 200.381829415
8366 46.5651493058 200.363445684
8367 75.8841632226 200.348570053
8368 103.515188862 200.336999569
8369 401.250563371 200.361003579
8370 558.498136234 200.403786655
8371 166.185544029 200.399699431
8372 164.506574697 200.395412661
8373 379.569700173 200.41680916
8374 1135.05355386 200.528407578
8375 194.809640483 200.527724821
8376 335.966512427 200.543892756
8377 34.1718531503 200.524034551
8378 52.2907025458 200.506343498
8379 270.723931727 200.514722685
8380 53.8771306564 200.497226254
8381 205.178874947 200.49778479
8382 226.338866246 200.500867348
8383 91.1623065836 200.487826012
8384 33.6856792962 200.46793309
8385 16.4366662848 200.445988031
8386 96.8711167867 200.433638577
8387 357.999611493 200.452423266
8388 45.6831662237 200.433974195
8389 408.202471991 200.458738021
8390 1068.69022335 200.562209774
8391 140.157422544 200.555011873
8392 42.94997958 200.536233721
8393 23.451253006

8613 113.343544767 200.740761582
8614 9.02804206183 200.718508219
8615 47.5376869597 200.700729572
8616 64.319365916 200.68490256
8617 161.857396732 200.680397164
8618 67.0441154414 200.664892316
8619 1123.61788473 200.771963429
8620 209.15646607 200.772935996
8621 581.309664659 200.817071548
8622 944.867558182 200.90335828
8623 85.691065381 200.889998784
8624 90.3950439055 200.877187775
8625 37.4248522283 200.858238976
8626 836.689434034 200.931941444
8627 531.432808695 200.970247062
8628 347.820973436 200.987265341
8629 93.8585063845 200.974851811
8630 127.965979173 200.966392899
8631 16.0267490316 200.94496801
8632 16.0267490316 200.923548084
8633 261.44132207 200.930557323
8634 15.1036086956 200.909037121
8635 131.783936512 200.901032825
8636 585.980053635 200.945617637
8637 286.210788539 200.955488576
8638 332.587092052 200.970725479
8639 229.76727419 200.974058412
8640 412.479492874 200.998535375
8641 49.802086322 200.981039836
8642 650.646644228 201.033066401
8643 198.102467682 

8866 63.5475805557 202.125720509
8867 113.34268279 202.11570889
8868 132.095751484 202.10781398
8869 157.891636997 202.102829067
8870 148.044060552 202.096735191
8871 39.5367272592 202.078412377
8872 74.3317789296 202.064015146
8873 120.930199394 202.054872277
8874 100.70743262 202.043452847
8875 44.5358288224 202.025707509
8876 10.6794046075 202.00415222
8877 531.386559124 202.041253189
8878 213.776097243 202.04257483
8879 220.201670957 202.044619773
8880 260.2768587 202.051176719
8881 370.625852102 202.070156079
8882 685.348097527 202.124560891
8883 120.661512654 202.115391254
8884 70.3948802815 202.100566211
8885 57.0586184699 202.084243687
8886 1168.49522621 202.192988031
8887 345.234652788 202.209081827
8888 167.165656652 202.205139491
8889 157.151549435 202.200071596
8890 225.93670556 202.202741333
8891 161.948964281 202.198214368
8892 78.4142361224 202.184295108
8893 279.487318929 202.1929867
8894 30.9268186734 202.173732494
8895 662.595862565 202.225488578
8896 46.9274021502 20

9120 50.4329507025 202.297087166
9121 469.975054598 202.326431385
9122 240.20726569 202.33058362
9123 292.600419565 202.340477289
9124 115.226739286 202.330930578
9125 41.8260665383 202.313342931
9126 70.2274719428 202.298870939
9127 40.5741914008 202.281153511
9128 688.412014662 202.334404783
9129 324.374068527 202.347771669
9130 40.8422349122 202.330084062
9131 65.6595802122 202.315117953
9132 618.648480193 202.360703562
9133 377.928268779 202.379924885
9134 407.945907747 202.402428003
9135 79.8800140525 202.389017056
9136 50.0432171842 202.372343552
9137 357.154715827 202.389281873
9138 261.344726352 202.395732846
9139 95.7660575431 202.384066579
9140 1997.87642509 202.580488454
9141 94.2367353377 202.568637245
9142 284.403958588 202.577587844
9143 310.005192352 202.589336269
9144 30.6383372945 202.570533536
9145 1678.49648364 202.731907464
9146 57.9307064212 202.716077006
9147 181.490030835 202.713756713
9148 746.087474769 202.773148309
9149 57.6387631581 202.757286628
9150 1091.97

9374 86.279928422 203.186472652
9375 1003.30661882 203.271809698
9376 80.2588448761 203.258691114
9377 852.689508245 203.327941574
9378 345.606105731 203.343111439
9379 219.626970495 203.344847458
9380 139.215457114 203.338011365
9381 66.7471279826 203.323452541
9382 184.324130004 203.321427675
9383 128.787969151 203.313485064
9384 244.112662001 203.317832339
9385 359.206916619 203.334441021
9386 66.7049352714 203.319885837
9387 187.101177011 203.318158237
9388 37.9658672753 203.300546959
9389 123.351881939 203.292032724
9390 146.600796874 203.285995962
9391 62.7906668528 203.271036919
9392 24.913382592 203.25204856
9393 1221.84645368 203.360478878
9394 168.360862029 203.356753533
9395 66.4737512606 203.342185312
9396 99.682119993 203.331154125
9397 164.668180582 203.327040168
9398 93.4616515999 203.315351117
9399 26.295539687 203.296519222
9400 119.637905086 203.287620316
9401 284.399448469 203.296247398
9402 292.609092033 203.305745733
9403 39.3201909217 203.288307882
9404 56.6292675

9624 42.548250142 204.324691857
9625 61.8701931892 204.309892927
9626 232.844363812 204.312856932
9627 100.474430671 204.302071885
9628 1566.09150857 204.443497728
9629 80.9107207045 204.430669817
9630 83.2959402208 204.418092231
9631 96.9687936472 204.406936781
9632 833.461546774 204.472238827
9633 22.6493327989 204.453365783
9634 87.9690882532 204.441276081
9635 195.68156716 204.440367021
9636 85.0614177196 204.427979457
9637 210.254738045 204.428584018
9638 392.121851622 204.448056294
9639 73.8898630586 204.434512913
9640 137.890132439 204.427610685
9641 461.113772716 204.454232357
9642 74.3829073135 204.440743679
9643 132.865409261 204.433321931
9644 154.817130049 204.428177691
9645 164.087622431 204.423995589
9646 64.3800250113 204.409478748
9647 470.765367544 204.437086116
9648 66.2959740879 204.422769491
9649 241.247425359 204.426585518
9650 45.1566155457 204.410082568
9651 373.335021027 204.427584116
9652 96.0439805444 204.416356145
9653 500.949643961 204.447072251
9654 52.0459

In [11]:
# Summary score for the run: arithmetic mean of the values collected in ppls.
# NOTE(review): ppls is built in an earlier cell not visible here; presumably it
# holds one perplexity per evaluated sentence -- confirm against that cell.
np.mean(ppls)

205.50842274089328

In [12]:
def sample_from_distrib(context):
    """Draw one word id at random from the model's distribution for `context`.

    Args:
        context: tuple of preceding word ids; passed unchanged to
            `get_distribution`.

    Returns:
        An index into the probability vector returned by `get_distribution`
        (that vector is indexed by word id), chosen by `np.random.choice`.

    Raises:
        Whatever `get_distribution` raises for a context it cannot handle, and
        ValueError from `np.random.choice` if the probabilities do not sum to 1.
    """
    # get_distribution already returns probabilities; nothing is renormalized here.
    probabilities = get_distribution(context)

    # Diagnostic: the total mass is expected to print as 1.0. The line is built
    # as a single string so it is valid (and prints identically) under both the
    # Python 2 print statement and the Python 3 print function.
    print('%s %s' % (context, sum(probabilities)))

    # Size the draw from the vector itself: np.random.choice requires the number
    # of candidates and the length of `p` to agree.
    return np.random.choice(len(probabilities), p=probabilities)

In [115]:
# Sample one sentence from the bigram model: start at '@@start@@' and keep
# drawing the next word id until '@@end@@' is produced or the sequence holds
# max_tokens ids (start symbol included).
max_tokens = 10
max_attempts = 100  # bound on restarts so a persistent KeyError cannot loop forever
start_id = word_to_id['@@start@@']

for attempt in range(max_attempts):
    # Every attempt starts from a fresh sequence: a partially built sentence
    # that hit a KeyError is discarded instead of being resumed from a bad state.
    tokens = [start_id]
    try:
        while len(tokens) < max_tokens:
            # get_distribution asserts a one-word context (n == 2), so only the
            # previous token can serve as context; longer or empty contexts
            # would always fail that assertion.
            tokens.append(sample_from_distrib((tokens[-1],)))

            if id_to_word[tokens[-1]] == '@@end@@':
                break
    except KeyError:
        # A context or word id was missing from the lookup tables: resample.
        continue
    break
else:
    raise RuntimeError('no sentence could be sampled in %d attempts' % max_attempts)

(7332,) 1.0
(22598,) 1.0
(7999,) 1.0
(318,) 1.0
(4402,) 1.0
(11050,) 1.0
(559,) 1.0
(15944,) 1.0
(11853,) 1.0


In [116]:
# Show the sampled sentence, one word per line.
for token_id in tokens:
    print(id_to_word[token_id])

@@start@@
@@number@@
they
had
reached
but
one
side
of
death
