In [1]:
import random
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import pandas as pd

In [2]:
def is_balanced(text):
    text = text.strip()
    n_open, n_closed = 0, 0
    for s in text:
        if s == '(':
            n_open += 1
        if s == ')':
            n_closed += 1
        if n_closed > n_open:
            return False
    if n_open > n_closed:
        return False
    return True

In [3]:
assert is_balanced('(())()')
assert is_balanced('()')
assert is_balanced('') # arbitrary choice to say empty string is balanced
assert not is_balanced(')(')
assert not is_balanced('(())(')
assert not is_balanced('(()()')
assert not is_balanced('(()))')

In [4]:
def generate_example(length=10):
    assert length % 2 == 0
    n_open, n_closed = 0, 0
    result = ''
    for _ in range(length):
        if n_open >= length // 2:
            result += ')'
            n_closed += 1
        elif n_open > n_closed:
            s = random.choice('()')
            if s == '(':
                n_open += 1
            if s == ')':
                n_closed += 1
            result += s
        else:
            result += '('
            n_open += 1
    return result

In [5]:
for i in range(3):
    ex = generate_example(10 + i * 2)
    assert is_balanced(ex)
    print(ex)

((((()))))
(((()()())))
()((()()()()))


In [6]:
with open('input.txt', 'w') as f:
    f.write('\n'.join([generate_example(random.randint(5, 10) * 2) for _ in range(50000)]))

In [7]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
text[:100]

'()()((())()(()))\n()()()(())((()))\n(((())(()(()))))\n((((((((()))))))))\n(((()(()))(())))\n(())((((((()('

In [8]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

In [9]:
# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [10]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [11]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [12]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

In [13]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [14]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [15]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [16]:
class GPT(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [17]:
# # hyperparameters
# batch_size = 64 # how many independent sequences will we process in parallel?
# block_size = 256 # what is the maximum context length for predictions?
# max_iters = 5000
# eval_interval = 500
# learning_rate = 3e-4
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# eval_iters = 200
# n_embd = 384
# n_head = 6
# n_layer = 6
# dropout = 0.2

reduce the block size to be a bit larger than the largest sample we train/test with - this means that the model can see the complete examle

reduce embedding size, number of heads and layers to make the model smaller - faster to train

large batch size also makes training faster

In [18]:
# hyperparameters
f = 10 # increase batch size and reduce iterations by this ammount
batch_size = int(64*f) # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions? 
max_iters = int(5000/f)
eval_interval = int(500/f)
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = int(200/f)
n_embd = 32
n_head = 4
n_layer = 4
dropout = 0.2
model = GPT()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

0.051715 M parameters


In [19]:
model = GPT()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

torch.save(model.state_dict(), 'gpt_0.pt')

0.051715 M parameters
step 0: train loss 1.2791, val loss 1.2784
step 50: train loss 0.8281, val loss 0.8280
step 100: train loss 0.8006, val loss 0.8004
step 150: train loss 0.7837, val loss 0.7836
step 200: train loss 0.7597, val loss 0.7598
step 250: train loss 0.7250, val loss 0.7245
step 300: train loss 0.6958, val loss 0.6959
step 350: train loss 0.6767, val loss 0.6781
step 400: train loss 0.6624, val loss 0.6614
step 450: train loss 0.6481, val loss 0.6489
step 499: train loss 0.6363, val loss 0.6357


In [20]:
# plt.plot(torch.tensor(lossi).view(-1, 10).mean(1));

In [45]:
def generate_sample(f='gpt_0.pt', max_new_tokens=500):
    model = GPT()
    model.load_state_dict(torch.load(f))
    model.eval().to(device)
    context = torch.zeros((1, 1), dtype=torch.long, device=device)
    # print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))
    with torch.no_grad():
        return decode(model.generate(context, max_new_tokens=max_new_tokens)[0].tolist())

In [40]:
generated_sample = generate_sample()
generated_sample

'\n(((()())()))()()\n()(((((())(()))\n(()))()((()))))\n()(()((()))()\n((())()))()\n(()((())()(((())))\n((()))((()))\n()()(\n())(()((()((()))))\n(()()()))((((()))))\n())\n()()(()())((())))()(()\n()()(()())()()\n\n()(()(((())(()((\n((())(()()(()))))))))\n()(()(((()(()))\n()(())((())))\n((()()()()((()(())()))()\n)(()())\n(()()(()())((((())))())\n(((())))))\n()()((((())()\n()()(()))()(())\n()(())()()(())((()(())))()\n()((())(((()))))\n(()))()()()()()\n(()()()\n\n()()(()())())(()(())(())\n((((()))))\n()(()(()(((())()))\n()((()())(())('

In [22]:
def _is_balanced(text):
    text = text.strip()
    n_open, n_closed = 0, 0
    for i, s in enumerate(text):
        if s == '(':
            n_open += 1
        if s == ')':
            n_closed += 1
        if n_closed > n_open:
            return False, i, 'early close'
    if n_open > n_closed:
        return False, f'{n_open=} {n_closed=}'
    return True

In [23]:
for sample in generated_sample.split('\n'):
    print(_is_balanced(sample), sample)

True 
True ()()()(())
(False, 'n_open=8 n_closed=7') (()(((())(())))
(False, 6, 'early close') (())()))(()
(False, 22, 'early close') ((()(())(()))(()())()))
(False, 'n_open=9 n_closed=7') ()(()(()))()((()
(False, 2, 'early close') ()))()()()(()())
(False, 20, 'early close') ()(()()()(()(()()))))
True ((((()))()))
True ()((()())((((())))))
True (())()()((())(()()))
True ((())(())())
True (()(()))()()
(False, 16, 'early close') ((()()((())))()))
True (((())()()))
(False, 14, 'early close') ()()()(())()())((()
(False, 8, 'early close') (())()()))
(False, 16, 'early close') ()((()(()))(())))
(False, 'n_open=8 n_closed=7') ()((()))((()())
True ()(()())()((()))()
True ()(())()(())
True (((()()((()))))())
(False, 'n_open=8 n_closed=7') (()(()(()(())))
(False, 14, 'early close') ()((((())))()))
True ((()(()())(())))
True ()
(False, 'n_open=10 n_closed=9') ()(()(()()(())()())
(False, 'n_open=7 n_closed=5') ()()((()(())
(False, 4, 'early close') ()()))()(()())((((()))
(False, 6, 'early close') 

In [43]:
def generate_sample_as_df(f='gpt_0.pt', max_new_tokens=50):
    model = GPT()
    model.load_state_dict(torch.load(f))
    model.eval().to(device)
    idx = torch.zeros((1, 1), dtype=torch.long, device=device)
    data = []
    text = ''
    for _ in range(max_new_tokens):
        # crop idx to the last block_size tokens
        idx_cond = idx[:, -block_size:]
        # get the predictions
        with torch.no_grad():
            logits, loss = model(idx_cond)
        # focus only on the last time step
        logits = logits[:, -1, :] # becomes (B, C)
        # apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1) # (B, C)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
        char_next = decode([idx_next.item()])
        text += char_next
        if '\n' == char_next:
            text = ''
            balanced = ''
        else:
            balanced = _is_balanced(text)
        data.append({
            'end': probs[0][0].item(),
            '(': probs[0][1].item(),
            ')': probs[0][2].item(),
            'char': char_next, 
            'balanced': balanced,
            'text': text,
            'len': len(text),
    #         'idx_next': idx_next.item()
        })
        # append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

    return pd.DataFrame(data)

In [44]:
generate_sample_as_df()

Unnamed: 0,end,(,),char,balanced,text,len
0,0.002552,0.978812,0.018636,),"(False, 0, early close)",),1
1,0.010967,0.934429,0.054604,(,"(False, 0, early close)",)(,2
2,0.002438,0.62702,0.370542,),"(False, 0, early close)",)(),3
3,0.016744,0.909389,0.073867,(,"(False, 0, early close)",)()(,4
4,0.002459,0.55246,0.44508,(,"(False, 0, early close)",)()((,5
5,0.002783,0.471748,0.525469,),"(False, 0, early close)",)()((),6
6,0.00434,0.679378,0.316282,(,"(False, 0, early close)",)()(()(,7
7,0.002565,0.42983,0.567605,(,"(False, 0, early close)",)()(()((,8
8,0.002647,0.411337,0.586016,(,"(False, 0, early close)",)()(()(((,9
9,0.002554,0.39046,0.606986,),"(False, 0, early close)",)()(()(((),10


In [64]:
def print_probs(context='\n', f='gpt_0.pt', max_new_tokens=50):
    model = GPT()
    model.load_state_dict(torch.load(f))
    model.eval().to(device)
    @torch.no_grad()
    def _dump(c):
        print(c.replace('\n', '\\n'), encode(c))
        c = torch.tensor([encode(c)], dtype=torch.long, device=device)
        logits, _ = model(c)
    #     print(logits)
        probs = F.softmax(logits[:, -1, :], dim=-1)
        print(probs[0], '\n')
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    _dump(context)
    for i in ['\n', '(', ')']:
        _dump(i + context)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

In [68]:
for context in ['\n', '\n(', '\n()', '\n(())()(())']:
    print_probs(context)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
\n [0]
tensor([0.0026, 0.9788, 0.0186], device='cuda:0') 

\n\n [0, 0]
tensor([0.0024, 0.9807, 0.0169], device='cuda:0') 

(\n [1, 0]
tensor([0.0020, 0.8932, 0.1047], device='cuda:0') 

)\n [2, 0]
tensor([0.0037, 0.9748, 0.0215], device='cuda:0') 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
\n( [0, 1]
tensor([0.0024, 0.6156, 0.3819], device='cuda:0') 

\n\n( [0, 0, 1]
tensor([0.0023, 0.7020, 0.2957], device='cuda:0') 

(\n( [1, 0, 1]
tensor([0.0024, 0.5320, 0.4656], device='cuda:0') 

)\n( [2, 0, 1]
tensor([0.0025, 0.5969, 0.4006], device='cuda:0') 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
\n() [0, 1, 2]
tensor([0.0053, 0.8960, 0.0987], device='cuda:0') 

\n\n() [0, 0, 1, 2]
tensor([0.0045, 0.9399, 0.0556], device='cuda:0') 

(\n() [1, 0, 1, 2]
tensor([0.0056, 0.7697, 0.2247], device='cuda:0') 

)\n() [2, 0, 1, 2]
tensor(

# policy gradient ...

In [27]:
def generate_partial_example():
    "Generate an example that can be terminated as-is or made valid by adding more parens"
    length = random.randint(5, 10) * 2
    stop = random.randint(0, length)
    n_open, n_closed = 0, 0
    result = [0]
    for i in range(length):
        if i == stop:
            return result
        if n_open >= length // 2:
            n_closed += 1
            result.append(2)
        elif n_open > n_closed:
            s = random.choice('()')
            if s == '(':
                n_open += 1
                result.append(1)
            if s == ')':
                n_closed += 1
                result.append(2)
        else:
            n_open += 1
            result.append(1)
    return result

In [28]:
decode(generate_partial_example())

'\n(('

In [73]:
class Environment:
    def _context_as_tensor(self):
        return torch.tensor([self.context[-block_size:]], dtype=torch.long, device=device)
    
    def reset(self):
        self.context = generate_partial_example() # [0] use partial examples so we're not always starting from nothing
        return self._context_as_tensor()

    def step(self, action):
        self.context.append(action)
        if action == 0:
            done = True
            rew = 10 if is_balanced(decode(self.context)) else 0
        else:
            done = False
            rew = 0
#         return obs, rew, done, _
        return self._context_as_tensor(), rew, done, None

In [80]:
# make loss function whose gradient, for the right data, is policy gradient
def compute_loss(logits, act, weights):
    policy = torch.distributions.categorical.Categorical(logits=logits)
    logp = policy.log_prob(act)
    return -(logp * weights).mean()

In [84]:
model = GPT()
model.load_state_dict(torch.load('gpt_0.pt'))
model.to(device)
# batch_size=5000
batch_size=2000 # TODO: 5000 is really slow - but we need large batch size to reduce variance
env = Environment()
# make optimizer
# optimizer = Adam(logits_net.parameters(), lr=lr)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# for training policy
for i in range(5):
    # make some empty lists for logging.
    batch_obs = []          # for observations
    batch_logits = []
    batch_acts = []         # for actions
    batch_weights = []      # for R(tau) weighting in policy gradient
    batch_rets = []         # for measuring episode returns
    batch_lens = []         # for measuring episode lengths

    # reset episode-specific variables
    obs = env.reset()       # first obs comes from starting distribution
    done = False            # signal from environment that episode is over
    ep_rews = []            # list for rewards accrued throughout ep

    # collect experience by acting in the environment with current policy
    while True:
        # act in the environment
        logits, _ = model(obs)
        logits = logits[:, -1, :][0]
        policy = torch.distributions.categorical.Categorical(logits=logits)
        act = policy.sample().item()
        obs, rew, done, _ = env.step(act)

        # save observation, logits, action, reward
        batch_obs.append(obs)
        batch_logits.append(logits)
        batch_acts.append(act)
        ep_rews.append(rew)

        if done:
            # if episode is over, record info about episode
            ep_ret, ep_len = sum(ep_rews), len(ep_rews)
            batch_rets.append(ep_ret)
            batch_lens.append(ep_len)

            # the weight for each logprob(a|s) is R(tau)
            batch_weights += [ep_ret] * ep_len

            # reset episode-specific variables
            obs, done, ep_rews = env.reset(), False, []

            # end experience loop if we have enough of it
            if len(batch_obs) > batch_size:
                break

    # take a single policy gradient update step
    optimizer.zero_grad()
    batch_loss = compute_loss(
            logits=torch.stack(batch_logits),
            act=torch.as_tensor(batch_acts, dtype=torch.int32, device=device),
            weights=torch.as_tensor(batch_weights, dtype=torch.float32, device=device))
    batch_loss.backward()
    optimizer.step()
    print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f' %
            (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
    
torch.save(model.state_dict(), 'gpt_1.pt')

epoch:   0 	 loss: 2.527 	 return: 3.920 	 ep_len: 10.060
epoch:   1 	 loss: 2.468 	 return: 4.200 	 ep_len: 10.070
epoch:   2 	 loss: 2.132 	 return: 3.886 	 ep_len: 9.507
epoch:   3 	 loss: 2.251 	 return: 4.123 	 ep_len: 8.794
epoch:   4 	 loss: 2.861 	 return: 5.145 	 ep_len: 8.324


In [85]:
generate_sample_as_df('gpt_1.pt')

Unnamed: 0,end,(,),char,balanced,text,len
0,0.002948,0.981665,0.015387,(,"(False, n_open=1 n_closed=0)",(,1
1,0.002446,0.580757,0.416797,(,"(False, n_open=2 n_closed=0)",((,2
2,0.002437,0.484173,0.51339,),"(False, n_open=2 n_closed=1)",((),3
3,0.006088,0.769993,0.223919,),True,(()),4
4,0.023373,0.89737,0.079256,(,"(False, n_open=3 n_closed=2)",(())(,5
5,0.002698,0.439528,0.557773,),True,(())(),6
6,0.040663,0.87936,0.079977,(,"(False, n_open=4 n_closed=3)",(())()(,7
7,0.002475,0.386458,0.611067,(,"(False, n_open=5 n_closed=3)",(())()((,8
8,0.002579,0.369717,0.627704,),"(False, n_open=5 n_closed=4)",(())()((),9
9,0.015082,0.470775,0.514142,),True,(())()(()),10


In [86]:
for sample in generate_sample('gpt_1.pt').split('\n'):
    print(_is_balanced(sample), sample)

True 
(False, 2, 'early close') ())()
True (((())))(())
(False, 'n_open=9 n_closed=8') (()((()))()((()))
(False, 16, 'early close') ()(()()(((())))))
True ()()()()(()())
(False, 'n_open=3 n_closed=2') ((())
(False, 12, 'early close') ()(((())))()))
(False, 'n_open=9 n_closed=8') ()(()()()()(()())
True ()()(((())))
True ((((()()))()))
(False, 2, 'early close') ())()((()()
(False, 2, 'early close') ())())()((((()))
(False, 4, 'early close') (()))()())()
(False, 'n_open=6 n_closed=5') ()((())()()
(False, 16, 'early close') ((()((()))())()))
(False, 'n_open=3 n_closed=2') (()()
True ()((()))(()())
True (()((()())))()((()))
True ()()(((())))
True ()()()()()()
(False, 'n_open=9 n_closed=8') (()()()()(((())))
(False, 6, 'early close') (()()))(()
(False, 14, 'early close') ()(()((())())))))
(False, 'n_open=3 n_closed=2') ()(()
(False, 4, 'early close') ()())(()()()(())()
(False, 8, 'early close') ()((())))()
True ()()()()(()())()()
(False, 10, 'early close') ((()()())))
(False, 'n_open=5 n_clo