In [1]:
#load "llm.fsx"

open System
open TorchSharp

/// Compute device exposed by the loaded `llm.fsx` script; every tensor and model below is placed on it.
let device = Llm.device
device.ToString() |> printfn "Using device: %s"

Using CUDA / GPU
Using device: cuda


In [2]:
// Show the native-library search path and the .NET runtime this notebook is running on.
System.Environment.GetEnvironmentVariable("LD_LIBRARY_PATH") |> printfn "%s"
System.Runtime.InteropServices.RuntimeInformation.FrameworkDescription |> printfn "%s"

/usr/local/cuda-11.7/lib64:
.NET 7.0.13


In [3]:
/// Entire training corpus, read as a single string.
let text = IO.File.ReadAllText("shakespeare.txt")
/// Sorted array of the distinct characters in the corpus; a character's position is its token index.
let chars = text |> Seq.distinct |> Seq.sort |> Array.ofSeq
/// Number of distinct characters, i.e. the size of the model's vocabulary.
let vocabSize = Array.length chars
vocabSize |> printfn "Vocab size: %d"
System.String(chars) |> printfn "Chars: %s"

Vocab size: 65
Chars: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [4]:
/// Hyper-parameters shared by the cells below (anonymous record).
let cfg = {|
    // number of distinct characters found in the corpus
    vocabSize = vocabSize
    // length (in tokens) of each training window and of the generation context
    blockSize = 32
    // NOTE(review): not read by any cell visible here (training uses 384, evaluation 256) — confirm whether still needed
    batchSize = 4L
    // size of the last dimension of the activations passed through Llm.Block / Llm.LanguageModel
    encodingSize = 80L
    // passed to Llm.LanguageModel; presumably the number of attention heads — confirm in llm.fsx
    nHeads = 4L
    // passed to Llm.Block / Llm.LanguageModel; presumably the dropout probability — confirm in llm.fsx
    dropout = 0.1
    // passed to Llm.Block / Llm.LanguageModel; presumably toggles bias terms — confirm in llm.fsx
    bias = false
|}        


In [5]:
/// `encode` maps a string to an array of vocabulary indices (positions in `chars`);
/// `decode` maps an array of indices back to the corresponding string.
///
/// `encode` raises ArgumentException when the string contains a character that is not
/// part of the vocabulary (previously such characters were silently encoded as index 0
/// or surfaced as a bare IndexOutOfRangeException).
/// `decode` raises IndexOutOfRangeException for indices outside 0 .. vocabSize-1.
let encode, decode =
    let maxCharCode = chars |> Seq.map int |> Seq.max
    // Dense lookup table keyed by character code; -1 marks "not in the vocabulary".
    let char2index = Array.create (maxCharCode + 1) -1
    chars |> Array.iteri (fun i ch -> char2index[int ch] <- i)
    let encode (s:string) =
        s.ToCharArray()
        |> Array.map (fun ch ->
            let code = int ch
            let idx = if code <= maxCharCode then char2index[code] else -1
            if idx < 0 then
                invalidArg "s" (sprintf "Character %A is not in the vocabulary" ch)
            idx)
    let decode (xs: int[]) =
        // `chars` itself is the index -> character table, so no separate copy is needed.
        xs |> Array.map (fun i -> chars[i]) |> String.Concat
    encode, decode

// Round-trip demo: decoding the encoded string should print the original text again.
"hello world"
|> fun s -> printfn "hello world -> %A -> %A" (encode s) (decode (encode s))

hello world -> [|46; 43; 50; 50; 53; 1; 61; 53; 56; 50; 42|] -> "hello world"


In [6]:
/// Encoded corpus split by position: the first 90% of the characters are used for training,
/// the remaining 10% for validation.
let trainData, validData =
    let splitIdx = text.Length * 9 / 10
    let head = text.Substring(0, splitIdx)
    let tail = text.Substring(splitIdx)
    (head |> encode), (tail |> encode)
printfn "Shapes: train=%d, validation=%d" (Array.length trainData) (Array.length validData)


Shapes: train=1003854, validation=111540


In [7]:
/// Serves random (input, target) batches cut from an encoded token array.
///
/// Each batch row is a window of `blockSize` consecutive tokens starting at a random
/// position; the matching target row is the same window shifted right by one token.
///
/// data      : encoded tokens to sample from; must contain more than `blockSize` elements.
/// blockSize : number of tokens per row.
/// batchSize : number of rows per batch.
/// device    : device the returned tensors are created on.
///
/// Raises ArgumentException at construction time when `data` is too short to hold one
/// input window plus its shifted target window.
type BatchGenerator (data: int[], blockSize, batchSize, device) =
    let nChars = data.Length
    // Exclusive upper bound for a window start: the target window ends one element
    // later than the input window, so start + blockSize must stay below nChars.
    let maxRnd = nChars - blockSize
    do
        if maxRnd <= 0 then
            invalidArg "data" (sprintf "data must contain more than %d elements, got %d" blockSize nChars)
    let mutable random = new Random()
    // Staging buffers reused by every GetBatch call (row-major, batchSize rows of blockSize).
    let xs = Array.zeroCreate (batchSize * blockSize)
    let ys = Array.zeroCreate (batchSize * blockSize)
    let tensorShape = [|int64 batchSize; int64 blockSize|]
    // 64-bit counter: a 32-bit one would overflow after roughly 2^31 served tokens.
    let mutable charsServed = 0L

    /// Replaces the random source with one using a fixed seed so that the following
    /// batches are reproducible.
    member this.StartReproducibleRandom() =
        random <- new Random(1337)

    /// Returns a fresh (inputs, targets) pair of Int64 tensors shaped [batchSize; blockSize].
    member this.GetBatch() =
        for i in 0..batchSize-1 do
            let ix = random.Next(maxRnd)
            let xdst = xs.AsSpan(i*blockSize, blockSize)
            let xsrc = data.AsSpan(ix, blockSize)
            xsrc.CopyTo(xdst)
            let ydst = ys.AsSpan(i*blockSize, blockSize)
            let ysrc = data.AsSpan(ix+1, blockSize)
            ysrc.CopyTo(ydst)

        charsServed <- charsServed + int64 batchSize * int64 blockSize

        let xb = torch.tensor(xs, dimensions=tensorShape, dtype=torch.int64, device=device, requires_grad=false)
        let yb = torch.tensor(ys, dimensions=tensorShape, dtype=torch.int64, device=device, requires_grad=false)
        xb, yb

    /// Tokens served so far divided by the size of the data set.
    member this.Epoch = float charsServed / float nChars

/// Smoke test for BatchGenerator: draws one batch from the integers 0..12
/// (block size 5, batch size 4) and returns the first row of inputs and of targets.
let test() =
    let generator = BatchGenerator(Array.init 13 id, 5, 4, device)
    let inputs, targets = generator.GetBatch()
    let firstRow (t: torch.Tensor) = t.slice(0L, 0L, 1L, 1L)
    firstRow inputs, firstRow targets
test()


Unnamed: 0,Unnamed: 1
Item1,"[1x5], type = Int64, device = cuda:0  1 2 3 4 5"
Item2,"[1x5], type = Int64, device = cuda:0  2 3 4 5 6"


In [8]:
// Smoke test: push one random [1; 8; encodingSize] tensor through a single Llm.Block on the device.
// (A bare CausalSelfAttention(4) was the earlier candidate for this check.)
let tr =
    let block = new Llm.Block(4, cfg.encodingSize, cfg.bias, cfg.dropout)
    block.``to``(device)
let t = torch.rand([| 1L; 8L; int64 cfg.encodingSize |], dtype=torch.float32, device=device)
tr.forward(t)


In [9]:
/// Cross-entropy loss (library default reduction) between model output and targets.
///
/// ys   : target token indices; flattened to one dimension here.
/// yHat : model output; reshaped to rows of `vocabSize` scores each.
/// Returns the loss tensor produced by `torch.nn.functional.cross_entropy`.
let loss (ys:torch.Tensor) (yHat:torch.Tensor) =
    let logits = yHat.reshape(-1, vocabSize)
    // Let reshape infer the length (-1) instead of assuming batch * cfg.blockSize, so
    // batches whose sequence length differs from cfg.blockSize are handled as well.
    let target = ys.reshape([|-1L|])
    torch.nn.functional.cross_entropy(logits, target)

/// Estimates the (train, validation) loss of a forward function.
///
/// For each data split, the batch generator is reset to its fixed seed, `nLoops` batches
/// of `batchSize` rows are drawn, and the mean of the per-batch losses is returned as a float.
/// The two generators are created once, when this value is initialised, and reused on every call.
let evalLosses : (torch.Tensor->torch.Tensor) -> float*float =
    let nLoops = 16
    let batchSize = 256
    let trainBatchGen = BatchGenerator(trainData, cfg.blockSize, batchSize, device)
    let validBatchGen = BatchGenerator(validData, cfg.blockSize, batchSize, device)
    fun (forward : torch.Tensor -> torch.Tensor) ->
        let getLoss (bg:BatchGenerator) =
            // Release every tensor created during the evaluation when this function returns.
            use _scope = torch.NewDisposeScope()
            // Evaluation never calls backward(), so skip autograd bookkeeping entirely.
            use _noGrad = torch.no_grad()
            bg.StartReproducibleRandom()
            torch.stack([
                for _ in 1..nLoops do
                    let xb,yb = bg.GetBatch()
                    yield loss yb (forward xb)
            ]).mean().ToDouble()

        getLoss trainBatchGen, getLoss validBatchGen

### Select which model to train
... either the locally defined one or the one loaded from a script (both should have the same number of parameters)

In [10]:
/// The model trained and sampled by the cells below.
/// NOTE(review): the first constructor argument (2) is presumably the number of blocks — confirm in llm.fsx.
let model =
    new Llm.LanguageModel(2, cfg.nHeads, cfg.encodingSize, cfg.vocabSize, cfg.blockSize, cfg.bias, cfg.dropout)
    // Alternative: load a scripted model from disk instead of building it here.
    //torch.jit.load<torch.Tensor, torch.Tensor>("shakespeare.pt.zip")
// Move the model to the selected device; the returned reference is not needed.
model.``to``(device) |> ignore


// Count the elements of every parameter tensor that requires gradients.
model.parameters()
|> Seq.filter (fun p -> p.requires_grad)
|> Seq.sumBy (fun p -> p.numel())
|> printfn "Number of parameters: %d"
//for n,p in model.named_parameters() do
//    printfn "%s: %A on %A" n p.shape p.device

// Disabled debugging aid: flip the condition to `true` to run one batch of 16 rows
// through the model and print the input and output shapes.
if false then
    let xb, yb = BatchGenerator(trainData, cfg.blockSize, 16, device).GetBatch()
    printfn "xb: %A on %A" xb.shape xb.device
    // unbox is only needed when `forward` returns an untyped result — presumably for the
    // jit-loaded model variant; confirm before removing.
    let yHat = model.forward(xb) |> unbox<torch.Tensor>
    printfn "%A" yHat.shape


//loss yb yHat, evalLosses model
//loss yb yHat
//printfn "%s" <| model.generate 100 "       w"
//model.named_parameters()

Number of parameters: 154465


In [11]:
/// Runs inference to generate text from the model.
///
/// forward : function mapping a batch of token indices to per-position scores over the vocabulary.
/// nTokens : number of tokens to sample.
/// input   : prompt; it is left-padded with spaces to exactly `cfg.blockSize` characters.
/// Returns the padded prompt followed by the `nTokens` sampled characters.
/// Fails (via `failwithf`) when `input` is longer than `cfg.blockSize`.
let generate (forward:torch.Tensor -> torch.Tensor) nTokens (input:string) =
    let blockSize = cfg.blockSize

    let generateIxs (ixs:torch.Tensor) nTokens =
        // NOTE(review): this switches the notebook-level `model` to eval mode regardless of
        // which `forward` was passed in; `train` switches it back to training mode.
        model.eval()
        // Sampling never back-propagates, so skip autograd bookkeeping.
        use _noGrad = torch.no_grad()
        let mutable ixs = ixs
        let result = [|
            // The (padded) prompt is the first piece of the output.
            yield ixs
            for _ in 1..nTokens do
                // Use the supplied forward function (the parameter used to be ignored in
                // favour of the global model.forward).
                use yHat = forward ixs
                // Scores of the last position only.
                use yHat' = yHat.select(1L,-1L)
                use probs = torch.nn.functional.softmax(yHat', 1)
                let yIdx = torch.multinomial(probs, num_samples=1)
                // Keep the context at most blockSize tokens long: drop the oldest token
                // once the window is full, otherwise keep everything.
                use ixs' =
                    if ixs.shape[1] >= int64 blockSize then
                        ixs.slice(1L, 1L, int64 blockSize, 1L)
                    else
                        ixs.clone()
                ixs <- torch.cat([|ixs'; yIdx|], dim=1)
                yield yIdx
        |]
        torch.cat(result, dim=1)


    let input = input.PadLeft(blockSize, ' ')
    if input.Length <> blockSize then
        failwithf "Expected input of length %d, got %d" blockSize input.Length
    let idx = torch.tensor(encode input, dimensions=[|1L;int64 blockSize|], dtype=torch.int64, device=device, requires_grad=false)
    // NOTE(review): the prompt is duplicated into a batch of two rows and only row 0 is
    // decoded below; the reason for the second row is not visible here — confirm.
    let idx = idx.expand([|2L;-1L|])
    let idx' = generateIxs idx nTokens
    let idx' = idx'.select(0L,0L).cpu()
    let ords = Array.init (int idx'.NumberOfElements) (fun i -> idx'[int64 i].ToInt32())
    decode ords

generate model.forward 100 "w" // before any training this should render gibberish

                               wPH?FhXPk;!tHPP$a&aPE!YOTCcpfCtg a&ipltMJJb.at,J:g!$Ju?F.PEhLPfWT!?SIyKkPTM?PkPaV3tQkXbD&sOT'OO&BHP'.

### Select optimization features

In [12]:
// Report whether the flash scaled-dot-product-attention backend is enabled.
printfn "flash sdp enabled = %b" (torch.backends.cuda.flash_sdp_enabled())

// TF32 for matmul: currently switched OFF. Swap the two lines below to switch it on.

//torch.backends.cuda.matmul.allow_tf32 <- true
torch.backends.cuda.matmul.allow_tf32 <- false

flash sdp enabled = true


In [13]:
/// Adam optimizer over all model parameters, learning rate 1e-4.
let optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

/// Number of windows per training batch.
let batchSize = 384
/// Source of training batches: `batchSize` windows of `cfg.blockSize` tokens from the training split.
let batchGen = BatchGenerator(trainData, cfg.blockSize, batchSize, device)

/// Trains `model` on batches from `batchGen` for steps 0..nSteps inclusive
/// (i.e. nSteps + 1 optimizer updates) and prints the elapsed time at the end.
/// Every 5000th step, and on the final step, the train/validation losses from
/// `evalLosses` are printed together with the current step loss and epoch.
let train nSteps =
    model.train()
    let timer = Diagnostics.Stopwatch.StartNew()
    for step in 0..nSteps do
        // Tensors created in this iteration are released when the iteration ends.
        use _ = torch.NewDisposeScope()
        let inputs, targets = batchGen.GetBatch()
        let stepLoss = model.forward(inputs) |> loss targets
        optimizer.zero_grad()
        stepLoss.backward()
        optimizer.step() |> ignore
        let isReportStep = (step = nSteps) || (step % 5000 = 0)
        if isReportStep then
            // Evaluate in eval mode, then switch back to training mode.
            model.eval()
            let trainLoss, validLoss = evalLosses model.forward
            printfn "Step %5d: loss=%6f train=%6f valid=%6f epoch=%.2f" step (stepLoss.ToDouble()) trainLoss validLoss batchGen.Epoch
            model.train()
    timer.Stop()
    printfn "Elapsed: %A" timer.Elapsed

In [14]:
// Pretrain briefly to warm up the dotnet optimizer before the measured training run.
train 2000

// Force a garbage collection once the warm-up has finished.
GC.Collect()

Step     0: loss=4.479452 train=4.467664 valid=4.462698 epoch=0.01
Elapsed: 00:00:11.7987403


In [None]:
// Do the actual training; `train` prints the elapsed time when it finishes.
train 50000

Step     0: loss=2.204164 train=2.160022 valid=2.198765 epoch=24.51
Step  5000: loss=1.870674 train=1.812406 valid=1.934276 epoch=85.71


In [None]:
// Sample 500 tokens from the trained model, starting from the prompt "M".
generate model.forward 500 "M" |> printfn "%s"

Error: input.fsx (1,17)-(1,25) typecheck error The value or constructor 'generate' is not defined. Maybe you want one of the following:
   Delegate

In [None]:
// List every named parameter of the model, with the name padded to 36 columns.
for struct (name, param) in model.named_parameters() do
    printfn "%s: %s" (name.PadRight(36)) (param.ToString())

layers.embed.tok_emb.weight         : [65x80], type = Float32, device = cuda:0
layers.embed.pos_emb.weight         : [32x80], type = Float32, device = cuda:0
layers.block1.ln1.weight            : [80], type = Float32, device = cuda:0
layers.block1.ln1.bias              : [80], type = Float32, device = cuda:0
layers.block1.attn.c_attn.weight    : [240x80], type = Float32, device = cuda:0
layers.block1.ln2.weight            : [80], type = Float32, device = cuda:0
layers.block1.ln2.bias              : [80], type = Float32, device = cuda:0
layers.block1.mlp.net.c_fc.weight   : [320x80], type = Float32, device = cuda:0
layers.block1.mlp.net.c_proj.weight : [80x320], type = Float32, device = cuda:0
layers.block2.ln1.weight            : [80], type = Float32, device = cuda:0
layers.block2.ln1.bias              : [80], type = Float32, device = cuda:0
layers.block2.attn.c_attn.weight    : [240x80], type = Float32, device = cuda:0
layers.block2.ln2.weight            : [80], type = Float32, device