---
title: Torah Recurrent Neural Network Demo
author: mad0perator
date: 2025-02-16
---

In [1]:
import numpy as np
import pandas as pd
import unicodedata2 as ucd
from fastai.text.all import *

In [2]:
# Components of the repository address; joined with "/" to form `url`.
HOST = "https://github.com"
ORG = "ETCBC"
REPO = "bhsa"
# Components of the feature-file location inside the clone:
# REPO/APP/VERSION/FEATURE, with APP reused as the file extension.
APP = "tf"
VERSION = "2021"
FEATURE = "g_cons_utf8"

In [3]:
# Repository address in the form <HOST>/<ORG>/<REPO>.
url = f"{HOST}/{ORG}/{REPO}"

In [4]:
# IPython shell escape: clone the repository at `url`.  The feature file is
# later opened through a relative path that starts with REPO.
!git clone {url}

In [5]:
# Relative location of the feature file: <REPO>/<APP>/<VERSION>/<FEATURE>.<ext>
ext = APP
prefix = f"{REPO}/{APP}/{VERSION}/{FEATURE}"
path = f"{prefix}.{ext}"

In [6]:
# Read the whole feature file into memory.  The encoding is spelled out so the
# Hebrew text decodes identically on every platform instead of depending on
# the locale's default codec (which is not UTF-8 everywhere).
with open(path, "r", encoding="utf-8") as file:
    doc = file.read()

In [7]:
# Break the file into lines and peel off the first one, which is expected to
# be the "@node" marker.
lines = doc.splitlines()
header, lines = lines[0], lines[1:]
assert header == "@node"

In [8]:
# Keep only the non-empty lines that do not begin with "@", then wrap them in
# a string-typed Series named "token".
tokens = [line for line in lines if line and not line.startswith("@")]
s = pd.Series(tokens, name="token", dtype="string")

In [9]:
def remove_punct(token: str) -> str:
    """Return *token* with every non-letter character removed.

    A character survives only when its Unicode general category starts with
    ``"L"``.  Despite the name, this drops not just punctuation but also
    marks, digits, whitespace and symbols.
    """
    letters = []
    for ch in token:
        if ucd.category(ch)[:1] == "L":
            letters.append(ch)
    return "".join(letters)

In [10]:
def tfm_no_punct(s: pd.Series) -> pd.Series:
    """Apply ``remove_punct`` to every element of *s* and return the result."""
    cleaned = s.apply(remove_punct)
    return cleaned

In [11]:
# Replace the token Series with its letters-only version.
s = tfm_no_punct(s)

In [12]:
# Rebuild a single string from the tokens, separated by " . "; this rebinds
# `doc`, which previously held the raw file contents.
doc = " . ".join(s)

In [13]:
# Split on single spaces, so every " . " separator contributes its own "."
# entry between two neighbouring tokens.
tokenss = pd.Series(doc.split(" "))

In [14]:
# Assign each distinct token an integer id, numbered in the order in which
# unique() yields them.
vocab = {w: i for i, w in enumerate(tokenss.unique())}
# The full token stream re-expressed as ids, in the original order.
idx = tuple(tokenss.apply(vocab.get))

In [15]:
# Non-overlapping samples: three consecutive ids as the input and the id that
# follows them as the target.  A window starting at i needs idx[i + 3] to
# exist, i.e. i <= len(idx) - 4, so the exclusive stop is len(idx) - 3.
# (A stop of len(idx) - 4 dropped the final sample whenever it fit exactly.)
seqs = [
    (tensor(idx[i:i + 3]), idx[i + 3])
    for i in range(0, len(idx) - 3, 3)
]

In [16]:
class LMModel3(Module):
    """Three-step language model that keeps its hidden state between calls.

    ``forward`` reads columns 0, 1 and 2 of its input in turn, adds each
    embedded column to the running state, passes the state through the
    hidden-to-hidden layer with a ReLU, and finally projects the state to one
    score per vocabulary entry.  The state is stored on the instance (detached)
    so the next call continues from it until ``reset`` is called.
    """

    def __init__(self, vocab_sz, n_hidden):
        """Build the layers for ``vocab_sz`` ids and a state of width ``n_hidden``."""
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  # id -> hidden
        self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
        self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> output scores
        self.h = 0  # running state; plain 0 until the first forward pass

    def forward(self, x):
        """Return output scores for ``x`` and remember the final hidden state.

        ``x`` is indexed as ``x[:, step]`` for steps 0..2, so it must have at
        least three columns.
        """
        state = self.h
        for step in range(3):
            state = state + self.i_h(x[:, step])
            state = F.relu(self.h_h(state))
        out = self.h_o(state)
        # Store a detached copy so later backward passes stop at this call.
        self.h = state.detach()
        return out

    def reset(self):
        """Forget the carried hidden state."""
        self.h = 0

In [19]:
def group_chunks(ds, bs):
    """Interleave ``bs`` contiguous chunks of ``ds`` into one new list.

    With ``m = len(ds) // bs``, the first ``m * bs`` items are viewed as
    ``bs`` back-to-back chunks of ``m`` items each.  The result lists item 0
    of every chunk, then item 1 of every chunk, and so on; any items beyond
    ``m * bs`` are left out.
    """
    chunk_len = len(ds) // bs
    return [
        ds[pos + chunk_len * chunk]
        for pos in range(chunk_len)
        for chunk in range(bs)
    ]

In [20]:
# Batch size, and the index at which the samples are split 80/20 into a
# training part and a validation part.
bs = 64
cut = int(0.8 * len(seqs))
# Each part is reordered by group_chunks and then served in that fixed order
# (shuffle=False) with drop_last=True.
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    shuffle=False,
    drop_last=True,
    bs=bs,
)

In [None]:
# Width of the model's hidden state.  The batch size `bs` used to be passed in
# this position; the two only coincided by value, so a dedicated name keeps a
# change to the batch size from silently resizing the model.
n_hidden = 64
learn = Learner(dls, LMModel3(len(vocab), n_hidden), loss_func=F.cross_entropy,
                metrics=accuracy, cbs=ModelResetter)
# Train for 10 epochs with a maximum learning rate of 3e-3.
learn.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
