In [1]:
# This sample code reads in a text corpus, builds a bigram model by counting adjacent word pairs, and then generates text based on the computed probabilities.

using Random
using StatsBase  # Import StatsBase to use the sample function

# The line `using StatsBase` makes the `sample` function and the `Weights` type available.
# If you haven't installed StatsBase, install it by running `using Pkg; Pkg.add("StatsBase")` in the Julia REPL.

"""
    build_bigram_model(corpus::AbstractString) -> Dict{String, Dict{String, Int64}}

Build a bigram count table from `corpus`.

The text is lowercased and split on whitespace. Punctuation is left attached to its
token, so `"model"` and `"model."` are counted as different words.

# Returns
A dictionary mapping each word to a dictionary of the words observed immediately after
it, with the number of times each such pair occurred. A corpus with fewer than two
tokens yields an empty dictionary.
"""
function build_bigram_model(corpus::AbstractString)
    tokens = split(lowercase(corpus))
    model = Dict{String, Dict{String, Int64}}()

    # Pair every token with its successor. `zip` stops at the shorter iterator, so the
    # final token (which has no successor) is skipped and short inputs yield no pairs.
    for (current, next_word) in zip(tokens, Iterators.drop(tokens, 1))
        followers = get!(() -> Dict{String, Int64}(), model, current)
        followers[next_word] = get(followers, next_word, 0) + 1
    end
    return model
end

# The build_bigram_model function tokenizes the input corpus and builds a dictionary where each word maps to another dictionary containing the counts of words that follow it.

"""
    generate_text(model, start_word, num_words; rng=Random.default_rng()) -> String

Generate up to `num_words` space-separated words by walking the bigram `model`.

# Arguments
- `model`: bigram counts, mapping each word to a dictionary of follower counts.
- `start_word`: first word of the output. It is used as given (not lowercased).
- `num_words`: maximum number of words to emit, including `start_word`.

# Keywords
- `rng`: random number generator used for sampling; pass a seeded generator to get
  reproducible output.

# Returns
The generated words joined by single spaces. Generation stops early when the current
word has no recorded followers. An empty string is returned when `num_words < 1`.
"""
function generate_text(
    model::Dict{String, Dict{String, Int64}},
    start_word::AbstractString,
    num_words::Integer;
    rng::AbstractRNG=Random.default_rng()
)
    # Asking for zero (or fewer) words must not emit the start word.
    num_words < 1 && return ""

    current_word = String(start_word)
    output = String[current_word]

    for _ in 2:num_words
        next_options = get(model, current_word, nothing)
        # Stop if no continuation is found.
        (next_options === nothing || isempty(next_options)) && break

        # `keys` and `values` iterate a Dict in the same order, so the two vectors align.
        words = collect(keys(next_options))
        counts = collect(values(next_options))

        # Raw counts are valid weights; normalising them to probabilities is unnecessary.
        index = sample(rng, eachindex(words), Weights(counts))
        current_word = words[index]
        push!(output, current_word)
    end

    return join(output, " ")
end

# The generate_text function uses the sample function from StatsBase to select the next word based on the weighted probabilities computed from the bigram frequencies.

"""
    predict_next_words(model, word; io=stdout)

Print the words that follow `word` in the bigram `model`, with their probabilities.

Each probability is the follower's count divided by the total count of all followers of
`word`, rounded to two decimal places. Followers are listed from most to least frequent,
with ties broken alphabetically, so the output order is deterministic.

# Keywords
- `io`: stream to write to (defaults to `stdout`).

# Returns
`nothing`. If `word` is not a key of `model`, a notice is printed instead of a list.
"""
function predict_next_words(
    model::Dict{String, Dict{String, Int64}}, word::AbstractString; io::IO=stdout
)
    next_options = get(model, word, nothing)
    if next_options === nothing
        println(io, "No predictions available for the word \"$word\".")
        return nothing
    end

    total = sum(values(next_options))
    println(io, "Predicted next words for \"$word\":")

    # Dict iteration order is arbitrary, so rank explicitly before printing.
    ranked = sort!(collect(next_options); by=p -> (-p.second, p.first))
    for (w, n) in ranked
        prob = round(n / total; digits=2)
        println(io, "  $w: $prob")
    end
    return nothing
end

# This function checks if a given word exists in the model. 
# If it does, it computes the probability for each subsequent word based on their counts and prints them.

predict_next_words (generic function with 1 method)

In [2]:
# Example usage: train on a tiny two-sentence corpus, then sample from the result.
corpus = """
This is a sample text for our bigram language model. This language model will help generate text based on bigram frequencies.
"""

model = build_bigram_model(corpus)

# Request up to 20 words, seeded with "this" (the model's tokens are lowercased).
generated = generate_text(model, "this", 20)
println("Generated text:")
println(generated)

Generated text:
this language model will help generate text based on bigram frequencies.


In [3]:
# Below is an extended example showing how you can not only generate text but also inspect the model’s learned probabilities for a given word.
# In this second example, we use the `predict_next_words` helper defined above to print the predicted next words along with their probabilities.

# ----- Example Usage 2 -----

# A second corpus, used to train an independent model
corpus = """
In data science, models are essential. The bigram language model is a simple model. 
It helps in understanding word pair frequencies. Data science projects can involve large text corpora.
"""

# Train model2 on the corpus above
model2 = build_bigram_model(corpus)

# Sample up to 15 words, seeded with "data"
sampled_text = generate_text(model2, "data", 15)
println("Generating text with model2 starting with 'data':")
println(sampled_text)

# Show the next-word distribution recorded for "model"
println("\nPrediction of next words for 'model':")
predict_next_words(model2, "model")

# A different corpus is used to build a second model (model2). 
# We generate text starting with the word "data" and then print out the probabilities of possible next words following "model".

Generating text with model2 starting with 'data':
data science, models are essential. the bigram language model is a simple model. it helps

Prediction of next words for 'model':
Predicted next words for "model":
  is: 1.0


In [4]:
# The output looks as expected for a demonstration with a small corpus:

    # Generated Text:
    # The text generated starting with "data" reflects the structure of your input corpus. 
    # The sequence "data science, models are essential. the bigram language model is a simple model. it helps" shows that the model is chaining words based on observed bigram frequencies. 
    # In this case, the text is coherent given the limited training data, largely because most tokens in this corpus have only one observed successor, so the output replays the corpus almost verbatim; with a larger and more varied corpus, you’d likely see more diversity and less repetition.

    # Prediction for "model":
    # The fact that the predicted next word for "model" is "is" with a probability of 1.0 means that every occurrence of the token "model" in your corpus was followed by "is".
    # Note that tokens are split on whitespace only, so the sentence-final "model." (with the period attached) is a separate token and is not counted here.
    # This deterministic result is expected with a small dataset and is consistent with the model correctly capturing the relationships between word pairs.