|<h2>Substack post:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/zipfs-law-in-famous-fiction-characters" target="_blank">Zipf's law in famous fiction: characters and GPT4 tokens</a></h1>|
|-|:-:|
|<h2>Teacher:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the post may lead to confusion or errors.</i>

In [None]:
# all the libraries
import requests
import numpy as np
import matplotlib.pyplot as plt

# svg plots (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

!pip install tiktoken
import tiktoken

In [None]:
### Run this cell only if you're using "dark mode"

# svg plots (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark backgrounds with light axes, ticks, and text
# (swap '#383838' and '#020617' for the alternative background scheme)
dark_colors = {
    'figure.facecolor': '#383838',
    'figure.edgecolor': '#020617',
    'axes.facecolor':   '#020617',
    'axes.edgecolor':   '#DDE2F4',
    'axes.labelcolor':  '#DDE2F4',
    'xtick.color':      '#DDE2F4',
    'ytick.color':      '#DDE2F4',
    'text.color':       '#DDE2F4',
}

# hide the right/top spines; bold titles and axis labels
dark_layout = {
    'axes.spines.right': False,
    'axes.spines.top':   False,
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
}

# apply all settings in one call
plt.rcParams.update({**dark_colors, **dark_layout})

# Getting text data

In [None]:
# Project Gutenberg plain-text file for book code 829
url = 'https://www.gutenberg.org/cache/epub/829/pg829.txt'

# download the book; the timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops on an HTTP error instead of
# silently analyzing an error page as if it were the book
response = requests.get(url,timeout=30)
response.raise_for_status()
text = response.text

# preview a 2000-character excerpt starting 10,000 characters in
print(text[10000:12000])

In [None]:
# list every distinct character in the text, in sorted order
import textwrap
uniq_chars = sorted(set(text))

# space-separate the characters and wrap the line at 58 columns for display
char_line = ' '.join(uniq_chars)
print(textwrap.fill(char_line,width=58))

In [None]:
# number of appearances of each unique character, sorted from most to least frequent
counts = np.sort([ text.count(u) for u in set(text) ])[::-1]

# ranks start at 1: on a log-scaled x-axis a point at x=0 cannot be drawn,
# so plotting against the implicit 0-based index would hide the most frequent character
ranks = np.arange(1,len(counts)+1)

# visualization!
plt.figure(figsize=(10,4))
plt.plot(ranks,counts,'ko',markerfacecolor=[.7,.7,.9,.5],markersize=10)

# log-log axes
plt.gca().set(xscale='log',yscale='log',xlabel='Sorted character index',ylabel='Character frequency')
plt.show()

In [None]:
# GPT-4's tokenizer
encoding_name = 'cl100k_base'
tokenizer = tiktoken.get_encoding(encoding_name)

# encode one example word into its token indices
example_word = 'pomegranate'
toks = tokenizer.encode(example_word)

# print each token index next to the text it decodes back to
for t in toks:
  print(f'Token index {t:>5} is "{tokenizer.decode([t])}"')

# Tokenize the book

In [None]:
# convert the full text into a list of token indices
tokens = tokenizer.encode(text)

# total vs. unique counts, for tokens and for characters
print(f'There are {len(tokens):,} tokens in the text, {len(set(tokens)):,} of which are unique.')

# the unique-character count is taken from `text` itself (not a list built in an
# earlier cell) so both numbers on this line always describe the same text
print(f'There are {len(text):,} characters in the text, {len(set(text)):,} of which are unique.')

In [None]:
# unique token indices and the number of times each one appears
unitokens,counts = np.unique(tokens,return_counts=True)

# ranks start at 1: on a log-scaled x-axis a point at x=0 cannot be drawn,
# so plotting against the implicit 0-based index would hide the most frequent token
ranks = np.arange(1,len(counts)+1)

# visualization! (counts sorted from most to least frequent)
plt.figure(figsize=(10,4))
plt.plot(ranks,np.sort(counts)[::-1],'ko',markerfacecolor=[.9,.7,.7,.5],markersize=10)

# log-log axes
plt.gca().set(xscale='log',yscale='log',xlabel='Sorted token index',ylabel='Token frequency')
plt.show()

# Repeat the analysis for many books

In [None]:
# all books have the same url format; they are unique by numerical code
baseurl = 'https://www.gutenberg.org/cache/epub/'

# each entry is [code, title]: the code (a string) is inserted into the url,
# and the title is a short label used in figure legends
bookurls = [
    # code       title
    ['84',    'Frankenstein'    ],
    ['64317', 'GreatGatsby'     ],
    ['11',    'AliceWonderland' ],
    ['1513',  'RomeoJuliet'     ],
    ['76',    'HuckFinn'        ],
    ['219',   'HeartDarkness'   ],
    ['2591',  'GrimmsTales'     ],
    ['2148',  'EdgarAllanPoe'   ],
    ['36',    'WarOfTheWorlds'  ],
    ['829',   'GulliversTravels']
]

In [None]:
# one figure with two panels: characters (left) and tokens (right)
_,axs = plt.subplots(1,2,figsize=(12,4))

# note: this loop reassigns the notebook-level variables text, tokens, unitokens, and counts
for code,title in bookurls:

  # get the text; the timeout prevents hanging on a stalled connection, and
  # raise_for_status() stops on an HTTP error instead of analyzing an error page
  fullurl = baseurl + code + '/pg' + code + '.txt'
  response = requests.get(fullurl,timeout=30)
  response.raise_for_status()
  text = response.text

  # Zipf's law for characters
  # (ranks start at 1 because a point at x=0 cannot be drawn on a log-scaled axis)
  counts = [ text.count(u) for u in set(text) ]
  axs[0].plot(np.arange(1,len(counts)+1),np.sort(counts)[::-1],'.',markersize=4,alpha=.6,label=title)

  # and for tokens
  tokens = tokenizer.encode(text)
  unitokens,counts = np.unique(tokens,return_counts=True)
  axs[1].plot(np.arange(1,len(counts)+1),np.sort(counts)[::-1],'.',markersize=4,alpha=.3,label=title)


# axis adjustments shared by both panels
for a in axs:
  a.legend(fontsize=9)
  a.set(xscale='log',yscale='log',ylabel='Frequency in text (log)')

# panel-specific labels: the left panel shows characters, the right panel shows tokens
axs[0].set_xlabel('Sorted character index (log)')
axs[1].set_xlabel('Sorted token index (log)')

axs[0].set_title('Frequency of characters')
axs[1].set_title('Frequency of GPT tokens')

plt.tight_layout()
plt.show()