# Question 5.3: Confusion Matrix Calculations - Recall, Precision


In [None]:
import numpy as np

# Confusion matrix exactly as stated in the question.
# Axis 0 (rows) is the system's prediction; axis 1 (columns) is the gold label.
# Both axes use the class order Cat, Dog, Rabbit.
classes = ['Cat', 'Dog', 'Rabbit']
confusion_matrix = np.array([
    [5, 10, 5],    # system predicted Cat
    [15, 20, 10],  # system predicted Dog
    [0, 15, 10],   # system predicted Rabbit
])

# --- Per-class Precision and Recall ---
print("--- Per-class Metrics ---")
true_positives = np.diag(confusion_matrix)
predicted_totals = confusion_matrix.sum(axis=1)  # row sums    = TP + FP per class
gold_totals = confusion_matrix.sum(axis=0)       # column sums = TP + FN per class

# A class that is never predicted (or never occurs in gold) scores 0 instead of
# dividing by zero.
per_class_precision = [
    hits / predicted if predicted > 0 else 0
    for hits, predicted in zip(true_positives, predicted_totals)
]
per_class_recall = [
    hits / gold if gold > 0 else 0
    for hits, gold in zip(true_positives, gold_totals)
]

for label, class_precision, class_recall in zip(classes, per_class_precision, per_class_recall):
    print(f"Class: {label}")
    print(f"  Precision: {class_precision:.4f}")
    print(f"  Recall:    {class_recall:.4f}\n")

# --- Macro-averaged Precision and Recall ---
# Macro average: unweighted mean of the per-class scores.
print("--- Macro-averaged Metrics ---")
macro_precision = np.mean(per_class_precision)
macro_recall = np.mean(per_class_recall)

print(f"Macro-averaged Precision: {macro_precision:.4f}")
print(f"Macro-averaged Recall:    {macro_recall:.4f}\n")

# --- Micro-averaged Precision and Recall ---
# Micro average: pool the counts over all classes before dividing.
print("--- Micro-averaged Metrics ---")
total_tp = confusion_matrix.trace()  # diagonal sum = all correct predictions
# Each off-diagonal cell is one FP (for its row class) and one FN (for its
# column class), so the pooled FP and pooled FN are the same number.
total_fp_fn = confusion_matrix.sum() - total_tp

pooled_decisions = total_tp + total_fp_fn
micro_precision = total_tp / pooled_decisions
micro_recall = total_tp / pooled_decisions

print(f"Micro-averaged Precision: {micro_precision:.4f}")
print(f"Micro-averaged Recall:    {micro_recall:.4f}")



--- Per-class Metrics ---
Class: Cat
  Precision: 0.2500
  Recall:    0.2500

Class: Dog
  Precision: 0.4444
  Recall:    0.4444

Class: Rabbit
  Precision: 0.4000
  Recall:    0.4000

--- Macro-averaged Metrics ---
Macro-averaged Precision: 0.3648
Macro-averaged Recall:    0.3648

--- Micro-averaged Metrics ---
Micro-averaged Precision: 0.3889
Micro-averaged Recall:    0.3889


### Q5.3 Summary of Results

**Per-class Metrics:**
- **Cat:** Precision = 0.2500, Recall = 0.2500
- **Dog:** Precision = 0.4444, Recall = 0.4444
- **Rabbit:** Precision = 0.4000, Recall = 0.4000

**Averaged Metrics:**
- **Macro-averaged Precision:** 0.3648
- **Macro-averaged Recall:** 0.3648
- **Micro-averaged Precision:** 0.3889
- **Micro-averaged Recall:** 0.3889

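The micro-averaged precision and recall are equal, which is expected: in single-label multi-class classification every misclassified example counts as exactly one false positive (for the predicted class) and one false negative (for the gold class), so the total FP and total FN are the same. Both values therefore equal the overall accuracy of the model, 35/90 ≈ 0.3889. The per-class precision and recall also coincide here, but only because each class's row total (20, 45, 25) happens to equal its column total in this particular matrix; this is not true in general.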


# Question 8: Bigram Language Model


In [None]:
!pip install nltk
# Install the nltk package

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1


In [None]:
import nltk
from nltk.util import ngrams
from collections import defaultdict, Counter

# Training corpus: three sentences that already carry <s> / </s> boundary markers.
corpus_raw = [
    "<s> I love NLP </s>",
    "<s> I love deep learning </s>",
    "<s> deep learning is fun </s>"
]

# Whitespace tokenization, one token list per sentence.
tokenized_corpus = list(map(str.split, corpus_raw))

# --- Compute unigram and bigram counts ---

# Walk the corpus once, collecting every token and every within-sentence bigram.
# Bigrams are built per sentence, so no pair ever spans a sentence boundary.
all_tokens = []
all_bigrams = []
for sentence_tokens in tokenized_corpus:
    all_tokens.extend(sentence_tokens)
    all_bigrams.extend(ngrams(sentence_tokens, 2))

unigram_counts = Counter(all_tokens)
bigram_counts = Counter(all_bigrams)

print("--- Counts ---")
print("Unigram Counts:", dict(unigram_counts))
print("\nBigram Counts:", dict(bigram_counts))


# --- Estimate bigram probabilities using Maximum Likelihood Estimation (MLE) ---
# P(word | context) = count(context, word) / count(context)
bigram_probabilities = defaultdict(float)
bigram_probabilities.update(
    (pair, occurrences / unigram_counts[pair[0]])
    for pair, occurrences in bigram_counts.items()
    if unigram_counts[pair[0]] > 0
)

print("\n--- Bigram Probabilities (MLE) ---")
for (context, word), mle in bigram_probabilities.items():
    print(f"P({word:<10} | {context:<10}) = {mle:.4f}")


# --- Implement a function to calculate sentence probability ---
def calculate_sentence_probability(sentence, bigram_probs):
    """Return the probability a bigram model assigns to ``sentence``.

    The sentence is split on whitespace and scored as the product of the
    conditional probabilities of its consecutive token pairs.

    Args:
        sentence: Whitespace-delimited sentence string, including whatever
            boundary markers (e.g. ``<s>`` / ``</s>``) the model expects.
        bigram_probs: Mapping from ``(previous_token, token)`` tuples to
            conditional probabilities.

    Returns:
        The product of the bigram probabilities as a float. The result is 0.0
        if any bigram is absent from ``bigram_probs``, and also if the sentence
        has fewer than two tokens: such input contains no bigram to score, and
        returning the empty product (1.0) would rank it above every real
        sentence.
    """
    tokens = sentence.split()
    if len(tokens) < 2:
        return 0.0

    probability = 1.0
    # zip(tokens, tokens[1:]) yields the consecutive (previous, current) pairs.
    for bigram in zip(tokens, tokens[1:]):
        # A bigram never seen in training has probability 0 (no smoothing).
        probability *= bigram_probs.get(bigram, 0)
        if probability == 0:
            # No later factor can raise a zero product; stop early.
            break
    return probability

# --- Test on given sentences ---
sentence1 = "<s> I love NLP </s>"
sentence2 = "<s> I love deep learning </s>"

# Score both candidates with the trained bigram model.
prob1, prob2 = (
    calculate_sentence_probability(text, bigram_probabilities)
    for text in (sentence1, sentence2)
)

print("\n--- Sentence Probabilities ---")
for text, score in ((sentence1, prob1), (sentence2, prob2)):
    print(f"P('{text}') = {score:.4f}")

# --- Print which sentence the model prefers and why ---
print("\n--- Model Preference ---")
# Decide the winner first, then report it; a tie (or otherwise incomparable
# scores) leaves no winner.
if prob1 > prob2:
    outcome = (sentence1, prob1, prob2)
elif prob2 > prob1:
    outcome = (sentence2, prob2, prob1)
else:
    outcome = None

if outcome is None:
    print("The model has no preference; both sentences have the same probability.")
else:
    preferred, preferred_prob, other_prob = outcome
    print(f"The model prefers '{preferred}'.")
    print(f"Reason: Its calculated probability ({preferred_prob:.4f}) is higher than for the other sentence ({other_prob:.4f}).")



--- Counts ---
Unigram Counts: {'<s>': 3, 'I': 2, 'love': 2, 'NLP': 1, '</s>': 3, 'deep': 2, 'learning': 2, 'is': 1, 'fun': 1}

Bigram Counts: {('<s>', 'I'): 2, ('I', 'love'): 2, ('love', 'NLP'): 1, ('NLP', '</s>'): 1, ('love', 'deep'): 1, ('deep', 'learning'): 2, ('learning', '</s>'): 1, ('<s>', 'deep'): 1, ('learning', 'is'): 1, ('is', 'fun'): 1, ('fun', '</s>'): 1}

--- Bigram Probabilities (MLE) ---
P(I          | <s>       ) = 0.6667
P(love       | I         ) = 1.0000
P(NLP        | love      ) = 0.5000
P(</s>       | NLP       ) = 1.0000
P(deep       | love      ) = 0.5000
P(learning   | deep      ) = 1.0000
P(</s>       | learning  ) = 0.5000
P(deep       | <s>       ) = 0.3333
P(is         | learning  ) = 0.5000
P(fun        | is        ) = 1.0000
P(</s>       | fun       ) = 1.0000

--- Sentence Probabilities ---
P('<s> I love NLP </s>') = 0.3333
P('<s> I love deep learning </s>') = 0.1667

--- Model Preference ---
The model prefers '<s> I love NLP </s>'.
Reason: Its calculated probability (0.3333) is higher than for the other sentence (0.1667).

### Q8 Summary of Results

The bigram language model was trained on the provided corpus. The probabilities for the two test sentences were calculated as follows:

- **P('&lt;s&gt; I love NLP &lt;/s&gt;')**: 0.3333
- **P('&lt;s&gt; I love deep learning &lt;/s&gt;')**: 0.1667

Based on these probabilities, the model **prefers the sentence ('&lt;s&gt; I love NLP &lt;/s&gt;')**.

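The reason for this preference is not the choice of word after `love`: the bigrams `('love', 'NLP')` and `('love', 'deep')` have the same conditional probability (0.5 each) in this training corpus. Both sentences share the common prefix `<s> I love`, and `NLP` is always followed by `</s>` (probability 1.0), so the first sentence scores 2/3 × 1 × 0.5 × 1 ≈ 0.3333. The second sentence is one bigram longer and must also pay for ending after `learning`: because `learning` is followed by `</s>` in only one of its two occurrences (the other time it is followed by `is`), P(`</s>` | `learning`) = 0.5, giving 2/3 × 1 × 0.5 × 1 × 0.5 ≈ 0.1667. More generally, every additional bigram multiplies in a factor of at most 1, so an unsmoothed bigram model of this kind tends to favor shorter sentences.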
