In [4]:
import random
import re
from collections import defaultdict
from pathlib import Path
import networkx as nx
from manim import *
from selma import BACKGROUND
from selma.graph import MGraph, gvlayout_factory, test_draw
from selma.mklm import MarkovChainTextGenerator
from pathlib import Path

config.background_color = BACKGROUND

In [5]:
corpora = {
  'mc': """
Markov chains are mathematical systems that undergo transitions from one state to another 
on a state space. A sequence of possible events in which the probability of each event 
depends only on the state attained in the previous event is called a Markov chain.
""",
  'ttls': """
Twinkle, twinkle, little star,
How I wonder what you are!
Up above the world so high,
Like a diamond in the sky.
""",
  'gadda': 'Come poco pepe a coppia cuoce e scoppia',  # Gadda
  'giovanni': 'In principio era il Verbo, e il Verbo era presso Dio, e il Verbo era Dio',  # dal Vangelo di Giovanni, 1:1
  'commedia': Path('../../data/commedia.txt').read_text(),
  'promessi': Path('../../data/papini.txt').read_text(),
}

In [6]:
def clean_text(corpus):
  res = [' ']
  for c in corpus:
    l = c.lower()
    if l.isalpha():
      res.append(l)
    elif l == ' ' and res[-1] != ' ':
      res.append(' ')
  return ''.join(res[1:])

In [7]:
CORPUS = 'giovanni'

ORDER = 1
generator = MarkovChainTextGenerator(
  order=ORDER, token_type='word', smoothing_alpha=0, interpolation=False
)
generator.train(clean_text(corpora[CORPUS]))

In [8]:
generator.generate('dio', length=100)

'dio e il verbo era presso dio e il verbo era presso dio e il verbo era il verbo e il verbo era il verbo era il verbo era dio e il verbo e il verbo era presso dio e il verbo e il verbo era il verbo era il verbo e il verbo era dio e il verbo e il verbo era presso dio e il verbo e il verbo e il verbo era dio e il verbo era presso dio e il verbo era dio e il verbo era dio e il verbo era il verbo era dio'

In [9]:
def build_markov_chain(scene, corpus, by_char, layout, node_scale, order=1):
  data = (
    [c for c in clean_text(corpus) if c != ' ']
    if by_char
    else clean_text(corpus).split()
  )

  T = Tex(*[ts for t in data for ts in (t, ' ')], color=BLACK).scale(0.9)
  T.to_edge(UL)
  scene.add(T)

  tokens = (
    data
    if order == 1
    else [
      ('' if by_char else ' ').join(data[i : i + order]) for i in range(len(data) - 1)
    ]
  )

  edges = list(zip(tokens, tokens[1:]))
  G = nx.DiGraph(set(edges))
  weight = {e: 0 for e in G.edges()}
  G.remove_edges_from(nx.selfloop_edges(G))

  MG = MGraph(G, layout=layout, node_scale=node_scale)

  def highlight(t):
    mt = MG.mnode(t)
    mt.z_index = 1
    mt.set_stroke(color=PURE_GREEN)
    tg = VGroup(*[T[i] for i in range(highlight.step, highlight.step + 1 + order)])
    tg.set_color(PURE_GREEN)
    scene.add(mt)
    scene.wait(0.5)
    mt.set_stroke(color=DARK_BROWN)
    tg.set_color(BLACK)
    highlight.step += 2

  highlight.step = 0

  highlight(edges[0][0])
  for s, t in edges:
    weight[(s, t)] += 1
    if s != t:
      me = MG.medge(s, t)
      me.z_index = 0
      me.set_stroke(width=weight[(s, t)] * 2, color=PURE_GREEN)
      scene.add(me)
    else:
      me = None
    highlight(t)
    if me:
      me.set_stroke(color=BLACK)
      
  return weight

In [49]:
%%manim --hide-splash -qm -v WARNING MarkovByChar

results = {}

class MarkovByChar(Scene):
  def construct(self):
    results['gadda3'] = build_markov_chain(self, corpora['gadda'], by_char=True, layout = gvlayout_factory('neato', heightscale=.5), node_scale=.8, order = 3)


In [41]:
%%manim --hide-splash -qm -v WARNING MarkovByWord

class MarkovByWord(Scene):
  def construct(self):
    results['giovanni2'] = build_markov_chain(self, corpora['giovanni'], by_char=False, layout = gvlayout_factory('neato', heightscale=.6), node_scale=.6, order = 2)


In [75]:
w = results['gadda3']
w

{('cop', 'ope'): 1,
 ('cop', 'opp'): 2,
 ('ope', 'pep'): 1,
 ('ome', 'mep'): 1,
 ('mep', 'epo'): 1,
 ('epe', 'pea'): 1,
 ('pea', 'eac'): 1,
 ('ppi', 'pia'): 2,
 ('pia', 'iac'): 1,
 ('pia', 'ia'): 1,
 ('oco', 'cop'): 1,
 ('acu', 'cuo'): 1,
 ('cuo', 'uoc'): 1,
 ('eac', 'aco'): 1,
 ('aco', 'cop'): 1,
 ('epo', 'poc'): 1,
 ('iac', 'acu'): 1,
 ('sco', 'cop'): 1,
 ('poc', 'oco'): 1,
 ('ees', 'esc'): 1,
 ('esc', 'sco'): 1,
 ('opp', 'ppi'): 2,
 ('pep', 'epe'): 1,
 ('uoc', 'oce'): 1,
 ('oce', 'cee'): 1,
 ('cee', 'ees'): 1,
 ('com', 'ome'): 1}

In [99]:
import random

def next_weight(weight):
  return {s: dict(sorted({t: weight[(s, t)] for ss, t in w.keys() if ss == s}.items())) for s, _ in weight}

def mk_rnd_next(next_weight, seed = None):
  rng = random.Random(seed)
  def rnd_next(s):
    if s not in next_weight: return None
    total = sum(next_weight[s].values())
    r = rng.uniform(0, total)
    upto = 0
    for t, w in next_weight[s].items():
      if upto + w >= r:
        return t
      upto += w
  return rnd_next



In [109]:
nw = next_weight(w)

In [110]:
rn = mk_rnd_next(nw, 42)

In [111]:
def generate(rn, s, max_len=100):
  res = [s]
  while len(res) <= max_len:
    n = rn(res[-1])
    if n is None:
      break
    res.append(n)
  return res

In [112]:
''.join(g[0] for g in generate(rn, 'cop', 30))

'coppi'

In [113]:
nw0 = {'.': {c: 1 for c in clean_text(corpora['gadda']) if c != ' '}}

In [114]:
rn = mk_rnd_next(nw0, 42)

In [115]:
''.join(rn('.') for _ in range(10))

'acmmiiscec'