In [19]:
import re

In [20]:
def extractWordsInParagraph(paragraph: str) -> list[str]:
    return re.split(r'[\s,\.\(\)\[\]\{\}:;"]', paragraph)

def extractCodexPages(f) -> list[str]:
    END_PAGE = '79'
    paragraphs = []
    current_paragraph = ''
    for line in f:
        if line.startswith('# page '):
            page_number = line[6:].strip()

            current_paragraph = current_paragraph.strip()

            if current_paragraph != '':
                paragraphs.append(current_paragraph)
            current_paragraph = ''

            if page_number == END_PAGE:
                break
        elif not line.startswith('#'):
            last_character = line[-2]

            if last_character in ['-', '=']:
                current_paragraph += line[:-2]
            else:
                current_paragraph += line[:-1]

            if last_character == '=':
                paragraphs.append(current_paragraph.strip())
                current_paragraph = ''

    return paragraphs


def readCodex(path: str) -> list[str]:
    with open(path, 'r') as f:
        return map(extractWordsInParagraph, extractCodexPages(f))

In [24]:
def countNGrams(text: list[str], n: int) -> dict[str, int]:
    ngrams = {}

    for paragraph in text:
        for i in range(len(paragraph) - n + 1):
            ngram = ' '.join(paragraph[i:i+n])
            if ngram not in ngrams:
                ngrams[ngram] = 1
            ngrams[ngram] += 1

    return ngrams

In [26]:
def getSortedNGrams(ngrams: dict[str, int]) -> list[tuple[str, int]]:
    return list(sorted(ngrams.items(), key=lambda x: x[1], reverse=True))

In [27]:
for p in readCodex('FSG.txt'):
    print(p)

['FGAG2', 'GDAE', 'AR', 'GHAM', 'SOE', 'SORG', '0D0RC2', 'GDOR', 'SOE8G2ORG', 'DZAR', 'ORG', 'DAIR', 'THAM', 'SOR', 'AR', 'HZAR', 'HZAR', '8ALA2GAIIR', 'SCDG', 'OR', 'GDAM', 'SO8', 'HZOARG', 'HZC2', '8ARAM', '2G0OM', 'OHCCG', 'OHCAR', 'ROEOHG', 'HZAAR', '8AM', 'ODAM', 'OR', 'ODAL2AIRG', 'TCAR', 'HZAM', 'PZAR', 'FZAM']
['G8ARAISG']
['O8AR', 'SG', 'SOE', 'PZOG', 'OG8AR', 'S', '2', 'FZOAM', 'SO8ARGGSCG', 'SO8G', 'ODTO', 'G', 'OHTOE', 'TOHZG', 'O2TG', '8AN', 'TOR', 'DO28AM', 'SOR', 'FZOE', 'SO8G']
['8AN', '02', 'HCO8G']
['G8AN', 'PZC2AM', 'OE2', 'PZCG', 'GHAN', 'SOSG', 'PZO8AEC2ODSO', 'DSOG', 'OHAIRN', 'OHCOE', 'ODAL', 'SO8AN', '2DZCG', '8AMSOG', 'DZCG', 'DO8AM', 'PZG', 'PZ08AIIE2', 'HZCG', 'SC', 'OE8AN', '88AN', 'OM', 'TOE', 'O8AM', 'TO8AN', 'T8G', 'ODON', '80R', 'HZG', 'DO88AM', 'SDZCG', 'DZOR', 'TOR', 'SCG', 'DOE', 'TOE', 'TOE', 'DOR', 'TOESO', 'TOE', 'SO8AL', 'DSG', 'DTG', '8OR', 'TO8AM', 'SO', 'DOCAKGTO', 'HTCG', 'TCDAN', 'SCO', 'PSOE', '8G8G8', 'HZG', '8AIHZGGHO', 'SOE', 'SC', 'DO8SC

In [28]:
print(countNGrams(readCodex('FSG.txt'), 1))

{'FGAG2': 2, 'GDAE': 4, 'AR': 27, 'GHAM': 8, 'SOE': 58, 'SORG': 3, '0D0RC2': 2, 'GDOR': 5, 'SOE8G2ORG': 2, 'DZAR': 2, 'ORG': 3, 'DAIR': 3, 'THAM': 2, 'SOR': 40, 'HZAR': 11, '8ALA2GAIIR': 2, 'SCDG': 6, 'OR': 46, 'GDAM': 13, 'SO8': 7, 'HZOARG': 2, 'HZC2': 2, '8ARAM': 2, '2G0OM': 2, 'OHCCG': 4, 'OHCAR': 2, 'ROEOHG': 2, 'HZAAR': 2, '8AM': 241, 'ODAM': 32, 'ODAL2AIRG': 2, 'TCAR': 14, 'HZAM': 9, 'PZAR': 5, 'FZAM': 3, 'G8ARAISG': 2, 'O8AR': 5, 'SG': 48, 'PZOG': 3, 'OG8AR': 2, 'S': 5, '2': 39, 'FZOAM': 2, 'SO8ARGGSCG': 2, 'SO8G': 15, 'ODTO': 3, 'G': 9, 'OHTOE': 10, 'TOHZG': 13, 'O2TG': 2, '8AN': 40, 'TOR': 120, 'DO28AM': 3, 'FZOE': 4, '02': 2, 'HCO8G': 2, 'G8AN': 5, 'PZC2AM': 2, 'OE2': 3, 'PZCG': 3, 'GHAN': 3, 'SOSG': 2, 'PZO8AEC2ODSO': 2, 'DSOG': 2, 'OHAIRN': 2, 'OHCOE': 5, 'ODAL': 2, 'SO8AN': 2, '2DZCG': 2, '8AMSOG': 2, 'DZCG': 6, 'DO8AM': 2, 'PZG': 6, 'PZ08AIIE2': 2, 'HZCG': 18, 'SC': 10, 'OE8AN': 2, '88AN': 2, 'OM': 7, 'TOE': 146, 'O8AM': 13, 'TO8AN': 3, 'T8G': 21, 'ODON': 2, '80R': 2, 'HZ

In [31]:
getSortedNGrams(countNGrams(readCodex('FSG.txt'), 1))

[('8AM', 241),
 ('TOE', 146),
 ('TOR', 120),
 ('TG', 66),
 ('SOE', 58),
 ('HZG', 55),
 ('8AR', 53),
 ('8G', 51),
 ('SG', 48),
 ('OR', 46),
 ('TCG', 44),
 ('SO', 43),
 ('SOR', 40),
 ('8AN', 40),
 ('2', 39),
 ('HZOR', 36),
 ('HZOE', 34),
 ('ODAM', 32),
 ('8OE', 30),
 ('OE', 29),
 ('SCG', 28),
 ('AM', 28),
 ('4OHTG', 28),
 ('AR', 27),
 ('DTG', 27),
 ('8AE', 27),
 ('TCOR', 27),
 ('4ODTG', 26),
 ('TC8G', 25),
 ('ODG', 24),
 ('8OR', 23),
 ('TDZG', 23),
 ('OHTG', 22),
 ('T8G', 21),
 ('TAR', 21),
 ('TO', 21),
 ('4ODC8G', 21),
 ('TAM', 20),
 ('HTG', 20),
 ('2AM', 19),
 ('OHAM', 19),
 ('HZCG', 18),
 ('4ODAM', 18),
 ('DZG', 17),
 ('4OHG', 17),
 ('OHG', 17),
 ('8AIR', 16),
 ('SCCG', 16),
 ('4ODG', 16),
 ('SC8G', 16),
 ('SO8G', 15),
 ('TO8AM', 15),
 ('TOHG', 15),
 ('8TG', 15),
 ('4ODOE', 15),
 ('TCAR', 14),
 ('TCOE', 14),
 ('TO8G', 14),
 ('THZG', 14),
 ('DTOR', 14),
 ('GDAM', 13),
 ('TOHZG', 13),
 ('O8AM', 13),
 ('DOE', 13),
 ('OHOE', 13),
 ('SAM', 13),
 ('ODAE', 13),
 ('ODTG', 13),
 ('TCDG', 13),
