# Homework 10
#### Course Notes
**Language Models:** https://github.com/rjenki/BIOS512/tree/main/lecture17  
**Unix:** https://github.com/rjenki/BIOS512/tree/main/lecture18  
**Docker:** https://github.com/rjenki/BIOS512/tree/main/lecture19

## Question 1
#### Make a language model that uses ngrams and allows the user to specify start words, but uses a random start if one is not specified.

#### a) Make a function to tokenize the text.

In [4]:
library(httr)
library(stringr)


# Split raw text into lowercase word tokens.
#
# Args:
#   text: character vector of raw text. Every element is tokenized and the
#         tokens are concatenated in input order.
# Returns:
#   Character vector of tokens (runs of word characters between word
#   boundaries); punctuation and whitespace are dropped. character(0) when
#   nothing matches.
tokenize <- function(text) {
  # Lowercase first so "The" and "the" become the same token
  text <- tolower(text)

  # str_extract_all() returns one vector of matches per input element.
  # Flatten all of them (rather than keeping only the first) so that
  # multi-element input, e.g. a vector of lines, is not silently truncated.
  matches <- str_extract_all(text, "\\b\\w+\\b")

  # as.character() keeps the return type stable when there is no input at all
  as.character(unlist(matches, use.names = FALSE))
}

#### b) Make a function to generate keys for ngrams.

In [5]:
# Build every n-gram (window of n consecutive tokens) from a token vector.
#
# Args:
#   tokens: vector of tokens in document order.
#   n: window size; a single whole number >= 1.
# Returns:
#   Unnamed list with one element per window, each a vector of n tokens.
#   An empty list when there are fewer than n tokens.
make_ngrams <- function(tokens, n) {
  if (!is.numeric(n) || length(n) != 1 || is.na(n) || n < 1) {
    stop("n must be a single number >= 1", call. = FALSE)
  }

  # Number of complete windows. Clamp at zero: 1:(length - n + 1) would count
  # downwards for short input and produce NA-padded or invalid windows.
  n_windows <- max(length(tokens) - n + 1, 0)

  lapply(seq_len(n_windows), function(i) tokens[i:(i + n - 1)])
}

#### c) Make a function to build an ngram table.

In [6]:
# Index n-grams by prefix.
#
# Args:
#   ngrams: list of n-grams as produced by make_ngrams(); each element is a
#           vector of at least 2 tokens.
# Returns:
#   Environment used as a dictionary. Each key is the n-gram's leading tokens
#   joined by single spaces; each value is the vector of tokens that followed
#   that prefix, in order of occurrence and with repeats kept (so sampling
#   from it is frequency-weighted).
build_ngram_table <- function(ngrams) {
  # Parent is the empty environment so that lookups can never fall through to
  # unrelated objects (e.g. base functions such as `c` or `sum`) that happen
  # to share a name with a prefix.
  table <- new.env(hash = TRUE, parent = emptyenv())

  if (length(ngrams) == 0) {
    return(table)
  }

  # A 1-token gram has no prefix, so it cannot be indexed.
  if (any(lengths(ngrams) < 2)) {
    stop("Each n-gram needs at least 2 tokens (use n >= 2)", call. = FALSE)
  }

  # Key: everything except the last token
  prefixes <- vapply(
    ngrams,
    function(gram) paste(gram[-length(gram)], collapse = " "),
    character(1),
    USE.NAMES = FALSE
  )

  # Value: the last token
  next_words <- unlist(
    lapply(ngrams, function(gram) gram[length(gram)]),
    use.names = FALSE
  )

  # split() groups the followers by prefix while preserving their order,
  # avoiding repeated c() growth of each entry.
  followers <- split(next_words, prefixes)
  list2env(followers, envir = table)

  table
}

#### d) Function to digest the text.

In [7]:
# Turn raw text into an n-gram lookup table.
#
# Args:
#   text: raw text to model.
#   n: n-gram size (default 2).
# Returns:
#   The table built by build_ngram_table() from the text's n-grams.
digest_text <- function(text, n = 2) {
  # tokenize -> sliding windows -> prefix index
  words <- tokenize(text)
  build_ngram_table(make_ngrams(words, n))
}

#### e) Function to digest the url.

In [8]:
# Download a text document and turn it into an n-gram lookup table.
#
# Args:
#   url: address of the document to fetch.
#   n: n-gram size (default 2).
# Returns:
#   The table produced by digest_text() for the downloaded text.
# Raises:
#   An error if the HTTP request does not succeed.
digest_url <- function(url, n = 2) {
  response <- GET(url)

  # Fail loudly on 4xx/5xx responses instead of silently building a model
  # from the body of an error page.
  stop_for_status(response)

  # Decode the body as UTF-8 text
  text <- content(response, "text", encoding = "UTF-8")

  digest_text(text, n)
}

#### f) Function that gives random start.

In [9]:
# Pick one prefix at random to seed text generation.
#
# Args:
#   table: n-gram table (environment keyed by prefix).
# Returns:
#   A single key of `table`, drawn uniformly from ls(table).
random_start <- function(table) {
  sample(ls(table), 1)
}

#### g) Function to predict the next word.

In [10]:
# Sample the word that follows a prefix.
#
# Args:
#   prefix: single string, the leading tokens joined by spaces.
#   table: n-gram table (environment keyed by prefix).
# Returns:
#   One element drawn at random from the followers stored under `prefix`,
#   or NULL when the prefix is not a key of the table.
predict_next <- function(prefix, table) {
  # inherits = FALSE: only look inside the table itself. With the default,
  # a prefix such as "sum" or "c" would be "found" in an enclosing
  # environment even though the table has no such key.
  if (!exists(prefix, envir = table, inherits = FALSE)) {
    return(NULL)
  }

  options <- table[[prefix]]

  # Draw by index so a length-one numeric entry is never expanded to 1:x;
  # for character followers this consumes the RNG exactly like sample(x, 1).
  options[sample.int(length(options), 1)]
}

#### h) Function that puts everything together. Specify that if the user does not give a start word, then the random start will be used.

In [11]:
# Generate text from an n-gram model of a document.
#
# Args:
#   text_or_url: either the raw text to model, or an http(s):// URL to
#                download it from.
#   n: n-gram size, at least 2 (default 2). Prefixes are n - 1 words long.
#   start: optional string of start words. Its last n - 1 words seed the
#          output. When NULL, a random prefix from the model is used.
#   length: maximum number of words to generate after the seed (default 50).
# Returns:
#   A single string: the seed words followed by the generated words, separated
#   by spaces. Generation stops early if the current prefix has no known
#   follower.
# Raises:
#   An error if n < 2, if `start` has fewer than n - 1 words, or if no start
#   is given and the input is too short to yield any n-gram.
generate <- function(text_or_url, n = 2, start = NULL, length = 50) {
  # A prefix needs at least one word, so unigram models cannot be generated
  if (!is.numeric(n) || length(n) != 1 || is.na(n) || n < 2) {
    stop("n must be a single number >= 2", call. = FALSE)
  }

  # Treat the input as a URL only when it really looks like one; plain text
  # that merely begins with "http" must not trigger a download.
  is_url <- length(text_or_url) == 1 && grepl("^https?://", text_or_url)
  if (is_url) {
    table <- digest_url(text_or_url, n)
  } else {
    table <- digest_text(text_or_url, n)
  }

  # Prepare start prefix
  if (is.null(start)) {
    if (length(ls(table)) == 0) {
      stop("Input is too short to build any n-grams", call. = FALSE)
    }
    prefix <- random_start(table)
  } else {
    t <- tokenize(start)

    if (length(t) < (n - 1)) {
      stop("Not enough start words")
    }

    # Only the last n - 1 start words can be used as a lookup key
    prefix <- paste(tail(t, n - 1), collapse = " ")
  }

  result <- unlist(strsplit(prefix, " "))

  # seq_len() so that length = 0 generates nothing (1:0 would run twice)
  for (i in seq_len(length)) {
    # The key is always the most recent n - 1 words
    current_prefix <- paste(tail(result, n - 1), collapse = " ")

    next_word <- predict_next(current_prefix, table)

    # Dead end: this prefix only occurred at the very end of the text
    if (is.null(next_word)) {
      break
    }

    result <- c(result, next_word)
  }

  paste(result, collapse = " ")
}

## Question 2
#### For this question, set `seed=2025`.
#### a) Test your model using a text file of [Grimm's Fairy Tales](https://www.gutenberg.org/cache/epub/2591/pg2591.txt)
#### i) Using n=3, with the start word(s) "the king", with length=15. 
#### ii) Using n=3, with no start word, with length=15.

In [12]:
# Question 2a - Grimm's Fairy Tales
# Source text (Project Gutenberg plain-text edition)
grimm_url <- "https://www.gutenberg.org/cache/epub/2591/pg2591.txt"

# i) Trigram model seeded with "the king", 15 generated words.
# Seed the RNG immediately before each run so each result is reproducible
# on its own.
set.seed(2025)
cat("2a(i) Grimm's Fairy Tales - Start: 'the king', n=3, length=15:\n")
result_2a_i <- generate(grimm_url, start = "the king", n = 3, length = 15)
print(result_2a_i)

# ii) Trigram model with a randomly chosen start prefix, 15 generated words.
set.seed(2025)
cat("\n2a(ii) Grimm's Fairy Tales - Random start, n=3, length=15:\n")
result_2a_ii <- generate(grimm_url, n = 3, length = 15)
print(result_2a_ii)


2a(i) Grimm's Fairy Tales - Start: 'the king', n=3, length=15:
[1] "the king said to the king and told him the silken cord for then the peasant in"

2a(ii) Grimm's Fairy Tales - Random start, n=3, length=15:
[1] "sprinkled salt upon it i cannot and dare not take pity on her dress blackened her face"


#### b) Test your model using a text file of [Ancient Armour and Weapons in Europe](https://www.gutenberg.org/cache/epub/46342/pg46342.txt)
#### i) Using n=3, with the start word(s) "the king", with length=15. 
#### ii) Using n=3, with no start word, with length=15.

In [13]:
# Question 2b - Ancient Armour and Weapons in Europe
# Source text (Project Gutenberg plain-text edition)
armour_url <- "https://www.gutenberg.org/cache/epub/46342/pg46342.txt"

# i) Trigram model seeded with "the king", 15 generated words.
# Seed the RNG immediately before each run so each result is reproducible
# on its own.
set.seed(2025)
cat("\n2b(i) Ancient Armour - Start: 'the king', n=3, length=15:\n")
result_2b_i <- generate(armour_url, start = "the king", n = 3, length = 15)
print(result_2b_i)

# ii) Trigram model with a randomly chosen start prefix, 15 generated words.
set.seed(2025)
cat("\n2b(ii) Ancient Armour - Random start, n=3, length=15:\n")
result_2b_ii <- generate(armour_url, n = 3, length = 15)
print(result_2b_ii)


2b(i) Ancient Armour - Start: 'the king', n=3, length=15:
[1] "the king and many fell in death among their children 244 the last item we see from"

2b(ii) Ancient Armour - Random start, n=3, length=15:
[1] "l estoc stabbing sword is named in the form of this century it is vain to inquire"


#### c) Explain in 1-2 sentences the difference in content generated from each source.

In [None]:
Grimm's Fairy Tales generates narrative text with story elements like characters and 
emotions, while Ancient Armour produces technical, historical language focused on weapons and medieval warfare.

## Question 3
#### a) What is a language learning model? 

A language model (the phrase "language learning model" is a common misnomer for LLM, which stands for Large Language Model) is a type of machine learning model that assigns probabilities to sequences of words so that it can model and generate human language. At its core, a language model is a probability distribution over sequences of words. It takes input text as context and predicts a likely next word (or token) based on patterns learned from training data. Modern language models like ChatGPT, Claude, and Gemini use this principle: they analyze previous words in a sequence and generate the next most appropriate word, continuing this process to produce coherent text.

Language models range from simple statistical models (like the n-gram models I built in Questions 1 and 2) to complex neural network-based transformers that use attention mechanisms to understand context across long sequences of text. The key difference is that advanced models can "pay attention" to relevant parts of the input text contextually, rather than just looking at a fixed window of previous words.

#### b) Imagine the internet goes down and you can't run to your favorite language model for help. How do you run one locally?

To run a language model locally (offline, on your own computer), you need to use local-first LLM tools that download and execute models without requiring an internet connection. Popular tools include:

1. **Ollama** - Simple command-line tool where you download models with `ollama pull <model_name>` and run them locally
2. **LM Studio** - User-friendly desktop application with a graphical interface for downloading and chatting with models offline
3. **GPT4All** - Downloadable application that runs over 1,000 open-source models locally without internet
4. **Llamafile** - Converts language models into executable files that run on any platform with no installation

The process involves downloading a model file (usually in GGUF format) to your computer, then using one of these tools to load and run it. All processing happens on your machine, so no data leaves your computer and it works completely offline once the model is downloaded. 

## Question 4

Explain what the following vocab words mean in the context of typing `mkdir project` into the command line. If the term doesn't apply to this command, give the definition and/or an example.

| Term | Meaning |  
|------|---------|
| **Shell** | The shell is the program that interprets and executes the commands we type. When I type `mkdir project`, the shell reads that command, figures out what program needs to run (the mkdir program), and tells the operating system to execute it. Basically it's the intermediary between me typing commands and the computer actually doing something. |
| **Terminal emulator** | A terminal emulator is the application window where I'm actually typing the command. It's called an emulator because back in the day people used physical terminals (like hardware devices), but now we just have software that emulates those old terminals. When I open Terminal on my computer and type `mkdir project`, I'm using a terminal emulator. |
| **Process** | A process is a running program. When I type `mkdir project` and hit enter, the shell creates a new process to run the mkdir program. That process does its job (creating the directory) and then terminates. Every time a command runs, it becomes a process. |
| **Signal** | A signal is a way to send messages to processes, usually to interrupt or stop them. This doesn't really apply to `mkdir project` since that command runs so quickly, but an example would be if I was running a long command and pressed Ctrl+C to stop it - that sends the interrupt signal (SIGINT) to the process, which by default terminates it. |
| **Standard input** | Standard input (stdin) is where a program reads input from, usually the keyboard. For `mkdir project`, this doesn't really apply because mkdir doesn't read any input - it just takes the directory name as a command line argument and creates it. An example where stdin matters would be something like `cat`, which reads from stdin if you don't give it a file. |
| **Standard output** | Standard output (stdout) is where a program sends its normal output, usually to the screen. For `mkdir project`, there's actually no output to stdout if it succeeds - it just silently creates the directory. But if I ran `ls`, the list of files would be sent to stdout and displayed on my screen. |
| **Command line argument** | Command line arguments are the extra information we give to a command. In `mkdir project`, "project" is the command line argument - it's telling mkdir what directory name to create. The shell passes this argument to the mkdir program when it runs. |
| **The environment** | The environment is a set of variables that programs can access to get information about the system and user settings. For `mkdir project`, the environment might include things like the PATH variable (which tells the shell where to find the mkdir program) or the current working directory. These environment variables affect how commands run. |


## Question 5
#### Consider the following command `find . -iname "*.R" | xargs grep read_csv`.
#### a) What are the programs?

The programs are:
- **find** - searches for files and directories
- **xargs** - takes input from one command and converts it into arguments for another command
- **grep** - searches for text patterns within files

#### b) Explain what this command is doing, part by part.

`find .` - This searches for files starting in the current directory (the dot means current directory). It will look through all subdirectories too.

`-iname "*.R"` - This tells find to look for files with names that match the pattern `*.R`. The `-iname` flag means it's case-insensitive, so it will find files ending in `.R`, `.r`, or any combination of uppercase and lowercase letters.

`|` - This is a pipe. It takes the output from the find command (the list of R files that were found) and sends it to the next command.

`xargs grep read_csv` - This takes the list of R files from find and passes them as arguments to the grep command. grep then searches through each of those R files looking for lines that contain the text "read_csv". So basically, it's searching all R files in the current directory and its subdirectories for any occurrences of the function "read_csv".

In plain English, this command finds all R files and then searches through them to see which ones contain the text "read_csv". This is useful if you want to know which R scripts in a project are reading in data using the `read_csv` function.

## Question 6
#### Install Docker on your machine. See [here](https://github.com/rjenki/BIOS512/blob/main/lecture18/docker_install.md) for instructions. 
#### a) Show the response when you run `docker run hello-world`.

Unable to find image 'hello-world:latest' locally
latest: Pulling from library/hello-world
17eec7bbc9d7: Pull complete
Digest: sha256:f7931603f70e13dbd844253370742c4fc4202d290c80442b2e68706d8f33ce26
Status: Downloaded newer image for hello-world:latest

Hello from Docker!
This message shows that your installation appears to be working correctly.

To generate this message, Docker took the following steps:
 1. The Docker client contacted the Docker daemon.
 2. The Docker daemon pulled the "hello-world" image from the Docker Hub.
    (amd64)
 3. The Docker daemon created a new container from that image which runs the
    executable that produces the output you are currently reading.
 4. The Docker daemon streamed that output to the Docker client, which sent it
    to your terminal.

To try something more ambitious, you can run an Ubuntu container with:
 $ docker run -it ubuntu bash

Share images, automate workflows, and more with a free Docker ID:
 https://hub.docker.com/

For more examples and ideas, visit:
 https://docs.docker.com/get-started/

#### b) Access Rstudio through a Docker container. Set your password and make sure your files show up on the Rstudio server. Type the command and the output you get below.

C:\Users\mckap>docker run --rm -p 8787:8787 -e PASSWORD=bios512 -v C:\dockerwork:/home/rstudio rocker/rstudio
Unable to find image 'rocker/rstudio:latest' locally
latest: Pulling from rocker/rstudio
2c9ba66d5dbe: Pull complete
e4b9e87bb831: Pull complete
3c7cdccc4be7: Pull complete
39038e16d1ba: Pull complete
3665120d345d: Pull complete
971ba7cf0d8a: Pull complete
5d246ec925db: Pull complete
664fb1818bbb: Pull complete
b71e78fefbbb: Pull complete
890065c4c99d: Pull complete
d923cf803a12: Pull complete
2a63ed8b2250: Pull complete
4b3ffd8ccb52: Pull complete
999e4b8f7ed8: Pull complete
9c1a4a0706b7: Pull complete
191985778909: Pull complete
08e74fd5985d: Pull complete
62f215ca34c6: Pull complete
Digest: sha256:9f85211a666fb426081a6f5a01f9f9f51655262258419fa21e0ce38a5afc78d8
Status: Downloaded newer image for rocker/rstudio:latest
[s6-init] making user provided files available at /var/run/s6/etc...exited 0.
[s6-init] ensuring user provided files have correct perms...exited 0.
[fix-attrs.d] applying ownership & permissions fixes...
[fix-attrs.d] done.
[cont-init.d] executing container initialization scripts...
[cont-init.d] 01_set_env: executing...
skipping /var/run/s6/container_environment/HOME
skipping /var/run/s6/container_environment/PASSWORD
skipping /var/run/s6/container_environment/RSTUDIO_VERSION
[cont-init.d] 01_set_env: exited 0.
[cont-init.d] 02_userconf: executing...
[cont-init.d] 02_userconf: exited 0.
[cont-init.d] done.
[services.d] starting services
[services.d] done.

#### c) How do you log in to the RStudio server?

To log in to the RStudio server, I opened my web browser and navigated to `http://localhost:8787`. This displayed the RStudio Server login page. I entered "rstudio" as the username (the default username for the rocker/rstudio container) and "bios512" as the password (which I set using the `-e PASSWORD=bios512` flag in the docker run command). After clicking the Sign In button, RStudio Server loaded in my browser and I could access my mounted files in the Files pane.
