## Drug Name Generator

This project uses the FDA Orange Book as its training data set; the data files are available [here](https://www.fda.gov/drugs/drug-approvals-and-databases/orange-book-data-files).

The model follows the steps outlined in Andrej Karpathy's YouTube series, [Neural Networks: Zero to Hero](https://www.youtube.com/playlist?list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ).

In [5]:
import re
import torch
import torch.nn.functional as F 

# Load the training names: take the first space-separated word of each line,
# lowercase it, and strip everything that is not a-z.
# The file is read inside a `with` block so the handle is closed deterministically.
with open('fda_orangebook.txt', 'r') as f:
    raw_lines = f.read().splitlines()

drug_names = [re.sub(r'[^a-z]', '', line.split(' ')[0].lower()) for line in raw_lines]

# Remove duplicates while keeping first-seen order, and drop empty strings:
# a blank line (or a first word with no letters) cleans down to '' and would
# otherwise become a zero-length "name" in the training set.
drug_names = [name for name in dict.fromkeys(drug_names) if name]

# drug_names[:10]

In [26]:
# Vocabulary: every distinct character that occurs in the names gets an integer
# code starting at 1 (in sorted order); code 0 is reserved for '.', the marker
# used to pad the context and to terminate a name.
chars = sorted(set(''.join(drug_names)))
char2int = dict(zip(chars, range(1, len(chars) + 1)))
char2int['.'] = 0

# Inverse lookup (integer code -> character), used when decoding samples.
int2char = dict(zip(char2int.values(), char2int.keys()))
# print(int2char)
# len(int2char)

In [19]:
#####################################################
# Build the Dataset
#
# Every name (followed by the '.' terminator) contributes one example per
# character: the `block_size` preceding character codes are the input and the
# current character code is the label. Positions before the start of the name
# are filled with 0 (the code of '.').

block_size = 3                              # context length: how many previous characters are used to predict the next one
X, Y = [], []

for name in drug_names:
    encoded = [char2int[symbol] for symbol in name + '.']
    padded = [0] * block_size + encoded     # left-pad so the first character has a full-length context
    for pos, target in enumerate(encoded):
        X.append(padded[pos:pos + block_size])   # the block_size codes immediately before `target`
        Y.append(target)

X = torch.tensor(X)
Y = torch.tensor(Y)

# X.shape, Y.shape

In [20]:
#####################################################
# Define Functions
# make our code a lil easier to read and our model easier to optimize later (eventually)

def forwardPass(C, X, W1, b1, W2, b2, dims, block_size, batch, targets=None):
    """Run the network on one mini-batch and return its cross-entropy loss.

    Args:
        C: embedding lookup table, indexed by character code.
        X: tensor of input contexts (integer character codes).
        W1, b1: weights and biases of the tanh hidden layer.
        W2, b2: weights and biases of the output layer.
        dims: embedding size per character.
        block_size: number of characters in each context.
        batch: index tensor selecting which rows of ``X`` / ``targets`` to use.
        targets: tensor of labels aligned with ``X``. When omitted, the
            module-level ``Y`` is used so existing callers keep working.

    Returns:
        The scalar loss tensor from ``F.cross_entropy`` for the selected rows.
    """
    if targets is None:
        targets = Y                                                     # backward-compatible fallback to the global labels

    emb = C[X[batch]]                                                   # look up embeddings only for the rows in this mini-batch
    h = torch.tanh(emb.view(-1, dims * block_size) @ W1 + b1)           # flatten each context's embeddings into one row per example
    logits = h @ W2 + b2
    return F.cross_entropy(logits, targets[batch])

def backwardPass(loss, parameters):
    """Clear the gradient of every tensor in ``parameters``, then backpropagate ``loss``.

    Resetting ``.grad`` to None first means the gradients left on the
    parameters afterwards come from this ``loss`` alone rather than being
    added to those of an earlier backward pass.
    """
    for tensor in parameters:
        tensor.grad = None
    loss.backward()

def updateParams(parameters, learning_rate=0.01):
    """Apply one step of plain gradient descent, in place, to each parameter.

    Args:
        parameters: iterable of tensors whose ``.grad`` has been populated.
        learning_rate: step size multiplied into each gradient (default 0.01).

    Parameters whose ``.grad`` is None (no gradient was computed for them)
    are left untouched instead of raising a TypeError.
    """
    with torch.no_grad():                                               # the update itself must not be recorded by autograd
        for p in parameters:
            if p.grad is None:
                continue
            p.add_(-learning_rate * p.grad)


In [21]:
#####################################################
# (Re)Set Parameters
# re-run this cell to throw away any training and start from fresh random values

dims = 10                                           # the number of dimensions for our embeddings
neurons = 200                                       # the number of neurons in our hidden layer
vocab = len(int2char)                               # the number of unique characters in our vocabulary

C = torch.randn(vocab, dims)                        # the lookup table for embeddings, initially populated with random values
# W1 is drawn from a normal distribution like every other parameter.
# torch.rand (uniform on [0, 1)) would make every hidden weight positive.
W1 = torch.randn((dims * block_size), neurons)      # our first set of weights
b1 = torch.randn(neurons)                           # our first set of biases, one for each neuron
W2 = torch.randn(neurons, vocab)                    # our second set of weights, one column for each character in our vocabulary
b2 = torch.randn(vocab)                             # our second set of biases, one for each output neuron (one per vocabulary character)

parameters = [C, W1, b1, W2, b2]

In [22]:
####################################################
# Train the Network (Update Parameters)
# Re-run this cell to keep training; watch the printed loss to check it is going down.
# If the loss goes up instead, try a different learning rate or number of steps.
# Note: too many runs risks overfitting; re-run the parameter cell above to start fresh.

for tensor in parameters:
    tensor.requires_grad = True                     # autograd must track these so backward() can fill in .grad

n_steps = 30000                                     # number of training loops (30k runs in ~45 secs on a MacBook Air)
batch_size = 100                                    # examples drawn per step
n_examples = X.shape[0]

for step in range(n_steps):
    # draw a random mini-batch of example indices (sampled with replacement)
    batch = torch.randint(0, n_examples, (batch_size,))

    loss = forwardPass(C, X, W1, b1, W2, b2, dims, block_size, batch)
    backwardPass(loss, parameters)
    updateParams(parameters, learning_rate=0.1)     # learning rate == .1 (chosen by hand, could be tuned)

# this is the loss of the last mini-batch only, not of the whole data set (lower is better)
print(f'loss = {loss.item()}')


loss = 2.4172518253326416


In [29]:
####################################################
# Sampling the Model ---> Generate Some New Drug Names!
# run this cell as many times as you like to generate new drug names
#
# Each name starts from an all-'.' context; one character is sampled at a time
# from the model's predicted distribution until the '.' terminator (code 0) is drawn.

drugs = []

with torch.no_grad():                               # sampling needs no gradients
    for _ in range(10):
        letters = []
        context = [0] * block_size
        while True:
            emb = C[torch.tensor([context])]
            h = torch.tanh(emb.view(1, -1) @ W1 + b1)
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=1)
            idx = torch.multinomial(probs, num_samples=1).item()
            if idx == 0:                            # terminator drawn: the name is complete
                break
            letters.append(int2char[idx])           # keep every sampled character, including the first one
            context = context[1:] + [idx]           # slide the window forward by one character

        output = ''.join(letters)                   # may be '' if the terminator is drawn immediately
        drugs.append(output)

print(drugs)

# my faves:
# janarta, starigo, iginabine, omiderex, iperidazole, ipronazole, emcotrol, traelela, odopa, robinamide, isopril, eofulphenicopofuega

['hlorechnacidolexys',
 'id',
 'imale',
 'enlalmin',
 'lurandysozide',
 'eine',
 'iptrone',
 'etasytospil',
 'thy',
 'hine']