In [1]:
import sqlite3
import os
from pprint import pprint
from pathlib import Path
# Directory containing this file. Notebooks do not define __file__, so the
# NameError branch falls back to the current working directory there.
try:
    script_dir = Path(__file__).resolve().parent
except NameError:
    script_dir = Path('').resolve()

import torch
import torch.nn as nn
from torch.nn import functional as F

import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
class SQLite3Wrapper:
    """Small convenience wrapper around one sqlite3 connection and one cursor.

    Table names and column definitions given to `ensure_create`, `count`,
    `select` and `print` are formatted straight into the SQL text (SQL
    identifiers cannot be bound as parameters), so they must come from
    trusted code rather than from user input.
    """

    def __init__(self, path='database.db'):
        """Open (or create) the database file at `path` and create the shared cursor."""
        self.connection = sqlite3.connect(path)
        self.cursor = self.connection.cursor()

    def table_exists(self, table_name):
        """Return True if a table named `table_name` is listed in sqlite_master."""
        # Here the name is a value, not an identifier, so it is bound as a
        # parameter; names containing quotes no longer break the statement.
        result = self.cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        )
        return result.fetchone() is not None

    def execute(self, *args, **kwargs):
        """Forward all arguments to the shared cursor's `execute` and return its result."""
        return self.cursor.execute(*args, **kwargs)

    def ensure_create(self, table_name, attributes):
        """Create `table_name` with the column definitions `attributes` unless it already exists."""
        if not self.table_exists(table_name):
            print('Creating table', table_name, 'with attributes', attributes)
            self.execute(f'CREATE TABLE {table_name}({attributes})')

    def count(self, table_name):
        """Return the number of rows in `table_name`."""
        return self.execute(f'SELECT COUNT(*) FROM {table_name}').fetchone()[0]

    def select(self, table_name):
        """Return every row of `table_name` as a list of tuples."""
        return self.execute(f'SELECT * FROM {table_name}').fetchall()

    def get(self, query, params=(), default=None):
        """Run `query` with `params` and return the first column of the first row.

        Returns `default` when the query yields no rows.
        """
        row = self.execute(query, params).fetchone()
        if row is None:
            return default
        return row[0]

    # An earlier `print(self, *args, **kwargs)` definition was removed: it was
    # shadowed by this later definition and therefore could never be called.
    def print(self, table_name):
        """Print every row of `table_name`, one row per line."""
        for row in self.select(table_name):
            print(row)


In [3]:
def initialize_data(w, csv_path='sources/dataset/Gungor_2018_VictorianAuthorAttribution_data-train.csv'):
    """Read the raw CSV and bulk-insert its (text, author) pairs into the `data` table.

    Args:
        w: wrapper object exposing a sqlite3 `connection` attribute.
        csv_path: CSV file with `text` and `author` columns, read as latin-1.
            Defaults to the training file this notebook was written for.

    All rows are inserted with a single `executemany` call and committed once
    at the end. The per-row debug print of the author id was dropped because
    it produced one output line per CSV row.
    """
    print('Loading raw data from csv file.')
    data = pd.read_csv(csv_path, encoding='latin-1')
    print('Putting raw data into sqlite table.')
    # .tolist() turns numpy scalars into plain Python values that sqlite3 can bind
    records = zip(data['text'].tolist(), data['author'].tolist())
    w.connection.executemany('INSERT INTO data VALUES (?, ?)', records)
    w.connection.commit()
    print('sqlite data table ready.')

def initialize_word_counts(w):
    """Tally how often each space-separated word occurs across the `data` table.

    Counts are accumulated in the `word_counts` table (one SELECT plus one
    INSERT or UPDATE per word occurrence), committed once at the end, then
    the table and its row count are printed.

    Existing rows in `word_counts` are not cleared first, so re-running adds
    to the previous counts.
    """
    print('Counting words.')
    for text, author in w.execute('SELECT * FROM data').fetchall():
        print('Author ', author, end='\r')
        for token in text.rstrip(' ').split(' '):
            existing = w.execute('SELECT count FROM word_counts WHERE word = ?', (token,)).fetchone()
            if existing is not None:
                # Word already present: bump its stored count by one
                w.execute('UPDATE word_counts SET count = ? WHERE word = ?', (existing[0] + 1, token))
                continue
            # First occurrence of this word
            w.execute('INSERT INTO word_counts (word, count) VALUES (?, ?)', (token, 1))

    w.connection.commit()

    w.print('word_counts')
    print(w.count('word_counts'))

def initialize_id_word(w):
    """Fill the `id_word` table with every distinct word found in the `data` table.

    Words are the space-separated tokens of each text (trailing spaces
    stripped). They are de-duplicated in Python in first-seen order before
    inserting, so only one INSERT per distinct word is issued instead of one
    per occurrence; the insertion order of new words is unchanged. Words
    already present in the table are skipped by `INSERT OR IGNORE`.

    The inserts are committed before the table is printed, so an interrupted
    dump cannot lose them.
    """
    rows = w.execute('SELECT * FROM data').fetchall()
    # dict keys keep insertion order, giving an ordered set of words
    unique_words = {}
    for text, author in rows:
        print('Author ', author, end='\r')
        unique_words.update(dict.fromkeys(text.rstrip(' ').split(' ')))
    print()

    w.connection.executemany(
        'INSERT OR IGNORE INTO id_word (word) VALUES (?)',
        ((word,) for word in unique_words),
    )
    w.connection.commit()

    w.print('id_word')


In [4]:
# Open the database and make sure the tables this notebook relies on exist.
w = SQLite3Wrapper()
w.ensure_create('data', 'words TEXT, author INT')
w.ensure_create('id_word', 'id INTEGER PRIMARY KEY, word VARCHAR(255) UNIQUE')
w.execute('CREATE UNIQUE INDEX IF NOT EXISTS index_word ON id_word (word)')

# The optional word_counts table is intentionally not created; to use
# initialize_word_counts, first run:
# w.ensure_create('word_counts', 'word VARCHAR(255), count INT')
# w.execute('CREATE UNIQUE INDEX IF NOT EXISTS index_word ON word_counts (word)')
# To rebuild the vocabulary from scratch:
# w.execute('DROP TABLE IF EXISTS id_word')

# Populate the tables only when they hold fewer rows than these thresholds.
if w.count('data') < 53678:
    initialize_data(w)
if False:
    initialize_word_counts(w)
if w.count('id_word') < 10000:
    initialize_id_word(w)

# word -> id, and the inverse as a list indexed by id (slot 0 is a placeholder).
word_to_id = {word: word_id for word_id, word in w.select('id_word')}
id_to_word = [None] + sorted(word_to_id, key=word_to_id.get)

# Author names indexed by author id (slot 0 is a placeholder).
id_to_author = [
    None,
    'Arthur Conan Doyle', 'Charles Darwin', 'Charles Dickens', 'Edith Wharton', 'George Eliot',
    'Horace Greeley', 'Jack London', 'James Baldwin', 'Jane Austen', 'John Muir',
    'Joseph Conrad', 'Mark Twain', 'Nathaniel Hawthorne', 'Ralph Emerson', 'Robert Louis Stevenson',
    'Rudyard Kipling', 'Sinclair Lewis', 'Theodore Dreiser', 'Thomas Hardy', 'Walt Whitman',
    'Washington Irving', 'William Carleton', 'Albert Ross', 'Anne Manning', 'Arlo Bates',
    'Bret Harte', 'Catharine Maria Sedgwick', 'Charles Reade', 'Edward Eggleston', 'Fergus Hume',
    'Frances Hodgson Burnett', 'George Moore', 'George William Curtis', 'Helen Mathers', 'Henry Rider Haggard',
    'Isabella Lucy Bird', 'Jacob Abbott', 'James Grant', 'James Payn', 'John Kendrick Bangs',
    'John Pendleton Kennedy', 'John Strange Winter', 'Lucas Malet', 'Marie Corelli', 'Oliver Optic',
    'Sarah Orne Jewett', 'Sarah Stickney Ellis', 'Thomas Anstey Guthrie', 'Thomas Nelson Page', 'William Black',
]
author_to_id = {author: author_id for author_id, author in enumerate(id_to_author)}

# Encode every stored text as a list of word ids followed by its author id.
rows = []
print('Loading rows into Python list')
for text, author in w.select('data'):
    words = text.rstrip(' ').split(' ')
    row = [word_to_id[word] for word in words] + [author]
    rows.append(row)
    print('Author:', author, end='\r')
print('\nDone loading rows')

# Columns 0..999 hold word ids; the final column holds the author id.
print('Loading into dataframe')
data = pd.DataFrame(rows, columns=list(range(1000)) + ['author'])
print('Ok')

# Authors that actually occur in the data, as {author_id: author_name}.
# The original filter compared the (id, name) tuple with None, which is always
# true; the name itself is checked here so the placeholder in slot 0 can never
# be selected. The set of present ids is computed once, not once per candidate.
present_author_ids = set(data['author'].unique().tolist())
valid_authors = {
    author_id: author_name
    for author_id, author_name in enumerate(id_to_author)
    if author_name is not None and author_id in present_author_ids
}
train_data, test_data = train_test_split(data, train_size=0.8, random_state=6128)


Loading rows into Python list
Author: 5014 1822 26 41
Done loading rows
Loading into dataframe
Ok


In [5]:
def pandas_row_to_words(pandas_row):
    """Decode one DataFrame row back into a space-separated string of words.

    Only the integer-labelled entries of the row are decoded, in ascending
    label order; other labels (such as 'author') are ignored.
    """
    word_columns = [label for label in pandas_row.index if isinstance(label, int)]
    word_columns.sort()
    decoded = []
    for column in word_columns:
        decoded.append(id_to_word[pandas_row[column]])
    return ' '.join(decoded)

def get_author_rows(author_id):
    """Return the rows of the global `data` frame whose 'author' column equals `author_id`."""
    is_author = data['author'] == author_id
    return data[is_author]

# Cache of per-author tensors; position i holds the tensor for author id i.
author_tensors = []
def get_author_tensor(author_id):
    """Return the int64 tensor of word ids for `author_id`, building it on first use.

    The cache is filled sequentially, so requesting an id also builds and
    stores the tensors of every lower id not cached yet.
    """
    for missing_id in range(len(author_tensors), author_id + 1):
        word_columns = get_author_rows(missing_id).drop(['author'], axis=1)
        author_tensors.append(torch.tensor(word_columns.values, dtype=torch.int64))
    return author_tensors[author_id]

def tensor_to_words(t):
    """Decode a tensor of word ids into text using `id_to_word`.

    A 0-d tensor yields a single word, a 1-d tensor yields its words joined
    by spaces, and higher ranks are decoded recursively along the first
    dimension with the pieces joined by newlines.
    """
    rank = len(t.shape)
    if rank == 0:
        return id_to_word[t.tolist()]
    if rank == 1:
        return ' '.join(id_to_word[word_id] for word_id in t.tolist())
    decoded_parts = [tensor_to_words(part) for part in t]
    return '\n'.join(decoded_parts)


In [33]:
# --- hyperparameters ---

# Batching
batch_size = 24      # number of independent sequences processed in parallel
block_size = 128     # maximum context length used for predictions

# Optimisation schedule
max_iters = 160_000
eval_interval = 20_000
eval_iters = 200
learning_rate = 1e-3

# Model size and regularisation
n_embd = 64
n_head = 8
n_layer = 8
dropout = 0.3

# Run on the GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

torch.manual_seed(2965)
print('device is', device)

device is cuda


In [7]:
def get_batch(author_id=12):
    """Sample a random training batch from one author's excerpts.

    For each of the `batch_size` samples, a random excerpt (row) of the
    author's tensor is chosen, then a random start position inside it.

    Returns:
        (x, y): two (batch_size, block_size) tensors on `device`, where `y`
        is `x` shifted one token to the right (the next-token targets).
    """
    data = get_author_tensor(author_id)
    selected_excerpts = torch.randint(data.shape[0], (batch_size,))
    # Starts stop at len - block_size - 1 so the shifted target window still fits
    selected_starts = torch.randint(data.shape[1] - block_size, (batch_size,))

    # (batch_size, block_size) grid of column positions: start, start+1, ...
    positions = selected_starts.unsqueeze(1) + torch.arange(block_size)
    excerpt_rows = selected_excerpts.unsqueeze(1)

    # Move to the configured device instead of calling .cuda(), so the
    # CPU fallback selected for `device` actually works.
    x = data[excerpt_rows, positions].to(device)
    y = data[excerpt_rows, positions + 1].to(device)
    return x, y

# xb, yb = get_batch()
# print('inputs:')
# print(xb.shape)
# print(xb)
# print('targets:')
# print(yb.shape)
# print(yb)
# for b in range(batch_size): # batch dimension
#     for t in range(block_size): # time dimension
#         context = xb[b, :t+1]
#         target = yb[b,t]
#         print(f"when input is {repr(tensor_to_words(context))} the target is {repr(tensor_to_words(target))}")



In [17]:
@torch.no_grad()
def estimate_loss(model, author):
    """Return the mean loss of `model` over `eval_iters` random batches for `author`.

    The model is put in eval mode for the measurement and switched to train
    mode afterwards. Batches come from `get_batch`, the same sampler used for
    training.
    """
    model.eval()
    batch_losses = torch.zeros(eval_iters)
    for step in range(eval_iters):
        inputs, targets = get_batch(author)
        _, batch_loss = model(inputs, targets)
        batch_losses[step] = batch_loss.item()
    model.train()
    return batch_losses.mean()



In [9]:
class Head(nn.Module):
    """A single causal self-attention head.

    Projects the input from `n_embd` to `head_size` features for keys,
    queries and values, masks attention so a position only attends to itself
    and earlier positions, and applies dropout to the attention weights.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask, stored as a buffer so it moves with the module
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x of shape (B, T, n_embd) to attended values of shape (B, T, head_size)."""
        _, T, C = x.shape
        keys = self.key(x)        # (B, T, head_size)
        queries = self.query(x)   # (B, T, head_size)

        # Attention scores ("affinities") between every pair of positions.
        # NOTE(review): the scale uses C, the input embedding width, not head_size
        # as in conventional scaled dot-product attention. Left as-is because
        # changing it would alter the behaviour of already-trained checkpoints.
        scores = queries @ keys.transpose(-2, -1) * C**-0.5   # (B, T, T)
        causal_mask = self.tril[:T, :T] == 0
        scores = scores.masked_fill(causal_mask, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))     # (B, T, T)

        # Weighted aggregation of the values
        values = self.value(x)    # (B, T, head_size)
        return weights @ values   # (B, T, head_size)

class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, then projected and regularised.

    The head outputs are concatenated along the feature dimension and passed
    through an `n_embd -> n_embd` linear projection followed by dropout.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to `x`, concatenate the results, project and apply dropout."""
        head_outputs = [head(x) for head in self.heads]
        concatenated = torch.cat(head_outputs, dim=-1)
        projected = self.proj(concatenated)
        return self.dropout(projected)

class FeedFoward(nn.Module):
    """Position-wise feed-forward network.

    Expands to four times the embedding width, applies ReLU, projects back
    to the embedding width, then applies dropout.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the feed-forward stack."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: self-attention followed by a feed-forward network.

    Both sub-layers use a residual connection, with layer normalisation
    applied to the sub-layer input.
    """

    def __init__(self, n_embd, n_head):
        """Build a block for embedding width `n_embd` using `n_head` attention heads."""
        super().__init__()
        # Each head receives an equal share of the embedding width
        per_head_width = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head_width)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers with residual connections."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

class BigramLanguageModel(nn.Module):
    """Word-level transformer language model.

    Token and position embeddings are summed, passed through `n_layer`
    transformer blocks and a final layer norm, then projected to logits over
    the vocabulary.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when `targets` is given, the loss.

        Args:
            idx: (B, T) tensor of token ids, with T at most `block_size`.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is the cross-entropy.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build positions on the input's device rather than the global `device`,
        # so the model also works when it lives on a different device.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens one at a time.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor containing the context followed by
            the sampled tokens.

        Sampling runs in eval mode (dropout disabled) and without gradient
        tracking; the previous train/eval mode is restored afterwards.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop idx to the last block_size tokens
                idx_cond = idx[:, -block_size:]
                # get the predictions
                logits, _ = self(idx_cond)
                # focus only on the last time step
                logits = logits[:, -1, :] # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                # append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return idx


In [31]:
# Reuse the cached checkpoint 'models/model{author}.pth' when it exists; otherwise train the model and save it there
def load_or_train(author):
    """Return a language model for `author`, loaded from disk or freshly trained.

    Checkpoints live in a `models` directory next to `script_dir`, one file
    per author id (`model{author}.pth`). If the file exists, its weights are
    loaded; otherwise the model is trained for `max_iters` iterations on
    batches from `get_batch(author)` and its state dict is saved.

    Args:
        author: author id; also indexes `id_to_author` for log messages.

    Returns:
        The model, placed on `device`.
    """
    model = BigramLanguageModel(vocab_size=len(id_to_word))
    model.to(device)

    models_dir = script_dir.joinpath('models')
    models_dir.mkdir(exist_ok=True)
    expected_path = models_dir.joinpath(f'model{author}.pth')
    if expected_path.exists():
        print(f"Loading model {author} '{id_to_author[author]}'...")
        # map_location lets a checkpoint saved on a GPU load on a CPU-only machine
        model.load_state_dict(torch.load(expected_path, map_location=device))
    else:
        # create a PyTorch optimizer
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
        print(f"Training model for author {author} '{id_to_author[author]}',  {sum(p.numel() for p in model.parameters())/1e6:.2f} M params, {max_iters} iterations...")
        # max_iters + 1 steps, so the loss is also reported at the final iteration
        for step in range(max_iters + 1):
            print('Iteration', step, '           ', end='\r')
            # every once in a while estimate the loss on this author's batches
            if step % eval_interval == 0:
                loss = estimate_loss(model, author)
                print()
                print(f"step {step}: loss {loss:.4f}")

            xb, yb = get_batch(author)

            # compute the loss on this batch and take one optimizer step
            logits, loss = model(xb, yb)
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

        # save the model
        print('Saving model')
        torch.save(model.state_dict(), expected_path)
    print("Ok")

    return model



In [34]:
# Train (or load from disk) one model per author present in the data.
for author_id in valid_authors.keys():
    load_or_train(author_id)
    


Training model for author 1 'Arthur Conan Doyle',  1.70 M params, 160000 iterations...
Iteration 0            
step 0: loss 9.3648
Iteration 20000            
step 20000: loss 4.2312
Iteration 40000            
step 40000: loss 3.9808
Iteration 60000            
step 60000: loss 3.8338
Iteration 80000            
step 80000: loss 3.7388
Iteration 100000            
step 100000: loss 3.6836
Iteration 120000            
step 120000: loss 3.6456
Iteration 140000            
step 140000: loss 3.6068
Iteration 160000            
step 160000: loss 3.5883
Saving model
Ok
Training model for author 2 'Charles Darwin',  1.70 M params, 160000 iterations...
Iteration 0            
step 0: loss 9.3652
Iteration 20000            
step 20000: loss 3.8359
Iteration 40000            
step 40000: loss 3.3376
Iteration 60000            
step 60000: loss 3.0668
Iteration 80000            
step 80000: loss 2.9082
Iteration 100000            
step 100000: loss 2.8127
Iteration 120000            
step 120000

KeyboardInterrupt: 

In [13]:

# generate from the model
def generate_sample(model):
    """Generate 200 tokens from `model` and return them as a string of words.

    Generation starts from a single token with id 0, which is dropped from
    the returned text.
    """
    seed_tokens = torch.zeros((1, 1), dtype=torch.long, device=device)
    generated = model.generate(seed_tokens, max_new_tokens=200)
    # Skip position 0, the seed token
    return tensor_to_words(generated[0, 1:])

print(generate_sample(load_or_train(12)))


Loading model {author} '{id_to_author[author]}'...
Ok
he but his would make himself disagreeable what a well you are going to do said mrs with a lunatic a direct in the i have been mad said mrs when my mother was out of the house that lady seems to know i don t but she said nothing at one time had i served this â a â came on the tree and found her placed it over by the dagger on the mused and see ct io it must have been from the author of the sea ware believed that he had come here safe without in this somewhat the lost she did not wish to turn her attention as could forbear a spirit of her own she now and deprived of him to confide his chance words which might wish to the of them in he would have take it secretly from a and let me about it on a stick a little into that is missing and if any other man and the less has been related in all not likely to afford the chance of his descent to the royal cause but only were exact in his possession of s will he wished to


In [14]:
# Model for author id 12, kept in a global for the prediction code that follows.
working_model = load_or_train(12)


Loading model {author} '{id_to_author[author]}'...
Ok


In [None]:
def predict_probabilities(model, tensor):
    """Return the model's probability distribution for the token following `tensor`.

    `tensor` is a 1-D sequence of token ids; it is given a batch dimension
    and moved to `device`. The model is switched to eval mode and left there.

    Returns:
        1-D tensor of probabilities over the vocabulary for the next token.
    """
    model.eval()
    batch = tensor.unsqueeze(0).to(device)
    with torch.no_grad():
        batch_logits, _ = model(batch)
    # First (only) sequence in the batch, last position in that sequence
    last_position_logits = batch_logits[0][-1]
    return F.softmax(last_position_logits, dim=0)

def predict_word_ids(model, tensor, top_k=5, cumulative_threshold=0.95):
    """Return the most likely next word ids together with their probabilities.

    Candidates are taken in order of decreasing probability until either
    `top_k` of them have been collected or their summed probability reaches
    `cumulative_threshold`.

    Args:
        model: language model passed on to `predict_probabilities`.
        tensor: 1-D tensor of context token ids.
        top_k: maximum number of candidates to return (default 5).
        cumulative_threshold: stop once the collected probability mass
            reaches this value (default 0.95).

    Returns:
        List of (word_id, probability) tuples, most probable first.
    """
    t_probabilities = predict_probabilities(model, tensor)
    # topk raises if k exceeds the number of entries, so clamp to the vocabulary size
    candidate_count = min(top_k, t_probabilities.shape[0])
    top_candidates = torch.topk(t_probabilities, candidate_count)

    best = []
    cumulative_probability = 0
    for word_id, word_probability in zip(top_candidates.indices.tolist(), top_candidates.values.tolist()):
        best.append((word_id, word_probability))
        cumulative_probability += word_probability
        if cumulative_probability >= cumulative_threshold:
            break

    return best

def predict_words(model, tensor):
    """Return the most likely next words after `tensor`, most probable first."""
    candidates = predict_word_ids(model, tensor)
    return [id_to_word[word_id] for word_id, _ in candidates]

# Demo: predict the 45th word of excerpt 10 by author 12 from the 44 words before it.
working_tensor = get_author_tensor(12)[10][:45]
context_tensor = working_tensor[:-1]
print('Seeing',
      repr(tensor_to_words(context_tensor)),
      '\nwe expect',
      predict_words(working_model, context_tensor),
      'when the real value is',
      tensor_to_words(working_tensor[-1])
     )

In [None]:

# get the probability averaged over the input text
def get_prob_avg(model, tensor):
    """Return the average probability `model` assigns to each actual next token of `tensor`.

    The previous body was a debugging stub (it printed loop indices and
    returned the constant 12) followed by unreachable code that referenced
    undefined names; this version computes the quantity the header comment
    describes.

    The sequence is split into consecutive windows of at most `block_size`
    input tokens. Within each window the model predicts every next token
    from the tokens before it in that window, and the probability given to
    the true next token is collected. Every token except the first is
    scored exactly once.

    Args:
        model: language model returning (logits, loss) when called.
        tensor: 1-D int64 tensor of token ids.

    Returns:
        Mean probability (a float between 0 and 1) over all scored tokens.

    Raises:
        ValueError: if `tensor` holds fewer than two tokens.
    """
    if tensor.shape[0] < 2:
        raise ValueError('get_prob_avg needs at least two tokens to score a next-token prediction')

    model.eval()
    tokens = tensor.to(device)

    probability_sum = 0.0
    scored_tokens = 0
    with torch.no_grad():
        for start in range(0, tokens.shape[0] - 1, block_size):
            # block_size inputs plus one extra token as the final target
            window = tokens[start:start + block_size + 1]
            inputs, targets = window[:-1], window[1:]

            logits, _ = model(inputs.unsqueeze(0))            # (1, T, vocab_size)
            probabilities = F.softmax(logits[0], dim=-1)      # (T, vocab_size)

            # Probability assigned to the token that actually came next
            target_probabilities = probabilities.gather(1, targets.unsqueeze(1))
            probability_sum += target_probabilities.sum().item()
            scored_tokens += target_probabilities.numel()

    return probability_sum / scored_tokens

# Score excerpt 10 of author 12 with the author-12 model.
author_12_excerpt = get_author_tensor(12)[10]
print(f"Average probability of author 12 being author {12}: {get_prob_avg(working_model, author_12_excerpt):.4f}")
# print(f"Average probability of author 17 being author {12}: {get_prob_avg(working_model, get_author_tensor(17)[10]):.4f}")


In [None]:

# Score excerpt 10 of authors 12 and 17 with the author-12 model.
for sample_author in (12, 17):
    print(f"Average probability of author {sample_author} being author {12}: {get_prob_avg(working_model, get_author_tensor(sample_author)[10]):.4f}")

