<a href="https://colab.research.google.com/github/mobraine/Natural-Language-Understanding/blob/main/COMP599_A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from typing import Union, Iterable, Callable
import random

import torch.nn as nn
import torch


def load_datasets(data_directory: str) -> "tuple[dict, dict]":
    """
    Read the training and validation splits from disk and load them
    into memory.

    Parameters
    ----------
    data_directory: str
        The directory that contains ``train.json`` and ``validation.json``.

    Returns
    -------
    train: dict
        The parsed content of ``train.json``. It is expected to hold the
        keys 'premise', 'hypothesis', 'label' (not checked here).
    validation: dict
        The parsed content of ``validation.json``, with the same expected
        keys.

    Raises
    ------
    FileNotFoundError
        If one of the two files does not exist in ``data_directory``.
    json.JSONDecodeError
        If one of the two files is not valid JSON.
    """
    import json
    import os

    def read_split(file_name):
        # JSON is UTF-8 text; pin the encoding so that the result does not
        # depend on the locale of the machine running the notebook.
        path = os.path.join(data_directory, file_name)
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    train = read_split("train.json")
    valid = read_split("validation.json")

    return train, valid


def tokenize(
    text: "list[str]", max_length: int = None, normalize: bool = True
) -> "list[list[str]]":
    """
    Tokenize the text into individual words (nested list of string),
    where the inner list represent a single example.

    Parameters
    ----------
    text: list of strings
        Your cleaned text data (either premise or hypothesis).
    max_length: int, optional
        The maximum number of tokens kept per example. If None, the
        examples are not truncated. Examples are never padded.
    normalize: bool, default True
        Whether to normalize the text before tokenizing: the text is
        lower-cased and every character that is neither an ASCII letter
        nor whitespace (punctuation, digits, accented letters) is removed.

    Returns
    -------
    list of list of strings
        The same text data, but tokenized by whitespace.

    Examples
    --------
    >>> tokenize(['Hello, world!', 'This is a test.'], normalize=True)
    [['hello', 'world'], ['this', 'is', 'a', 'test']]
    >>> tokenize(['tab\\tseparated'], normalize=True)
    [['tab', 'separated']]
    """
    import re

    if normalize:
        # Keep every whitespace character (not only the plain space): tabs
        # and newlines separate words for str.split() below, so deleting
        # them would silently glue neighbouring words together.
        unwanted = re.compile(r"[^a-zA-Z\s]+")
        text = [unwanted.sub("", t.lower()) for t in text]

    # str.split() without argument splits on any run of whitespace and never
    # yields empty tokens; slicing with None keeps the full sequence.
    return [t.split()[:max_length] for t in text]


def build_word_counts(token_list: "list[list[str]]") -> "dict[str, int]":
    """
    Count how many times each word occurs across all tokenized examples.

    Parameters
    ----------
    token_list: list of list of strings
        Tokenized examples, e.g. the output of tokenize().

    Returns
    -------
    dict of {str: int}
        One entry per distinct word, mapping it to its number of
        occurrences. Entries appear in order of first occurrence.

    Notes
    -----
    To count over several token lists at once, concatenate them first,
    e.g. build_word_counts(list1 + list2 + list3).
    """
    tally = {}

    # Walk over every token of every example as one flat stream.
    for token in (tok for sentence in token_list for tok in sentence):
        if token in tally:
            tally[token] += 1
        else:
            tally[token] = 1

    return tally


def build_index_map(
    word_counts: "dict[str, int]", max_words: int = None
) -> "dict[str, int]":
    """
    Assign an integer index to each word, ordered by decreasing frequency.

    Parameters
    ----------
    word_counts: dict of {str: int}
        Number of occurrences of every word, e.g. the output of
        build_word_counts().
    max_words: int, optional
        Keep only this many of the most frequent words. None (the default)
        keeps every word.

    Returns
    -------
    dict of {str: int}
        The kept words mapped to 0, 1, 2, ... where 0 is the most frequent
        word. Words with equal counts keep their order from `word_counts`.
    """
    # The sort is stable, so ties stay in the dictionary's iteration order.
    ranked_words = sorted(word_counts, key=word_counts.get, reverse=True)
    kept_words = ranked_words[:max_words]

    index_map = {}
    for position, word in enumerate(kept_words):
        index_map[word] = position
    return index_map


def tokens_to_ix(
    tokens: "list[list[str]]", index_map: "dict[str, int]"
) -> "list[list[int]]":
    """
    Replace every known token by its index from the index map.

    Parameters
    ----------
    tokens: list of list of strings
        Tokenized examples, e.g. the output of tokenize().
    index_map: dict of {str: int}
        Word-to-index mapping, e.g. the output of build_index_map().

    Returns
    -------
    list of list of int
        One list of indices per example, in the original token order.

    Notes
    -----
    Tokens missing from `index_map` are dropped, so an output example may
    be shorter than its input (possibly empty).
    """
    converted = []
    for sentence in tokens:
        ids = []
        for token in sentence:
            # Out-of-vocabulary tokens are skipped rather than mapped.
            if token in index_map:
                ids.append(index_map[token])
        converted.append(ids)
    return converted

# Notebook smoke test: load both splits from the Colab working directory and
# print the validation premises.
# NOTE(review): these statements run as soon as the cell/module is executed and
# raise FileNotFoundError when /content/train.json or /content/validation.json
# is missing.
train, valid=load_datasets("/content")
print(valid['premise'])
# Disabled walkthrough of the preprocessing helpers on the validation premises
# (tokenize -> count words -> build the index map -> convert tokens to indices):
# valid=tokenize(valid['premise'])
# print(valid)
# valid_counts=build_word_counts(valid)
# print(valid_counts)
# valid_index=build_index_map(valid_counts)
# print(valid_index)
# tokens_final=tokens_to_ix(valid, valid_index)
# print(tokens_final)


FileNotFoundError: ignored

In [None]:
### 1.1 Batching, shuffling, iteration
def build_loader(
    data_dict: dict, batch_size: int = 64, shuffle: bool = False
) -> Callable[[], Iterable[dict]]:
    """
    Build a nested function. build_loader(...) specifies what type of loader
    you want, and the output is itself a function that, when called, returns
    a generator. You can iterate over the generator to get a batch of data
    (which is a dictionary with the same keys).

    Parameters
    ----------
    data_dict: dict
        A dictionary with keys 'premise', 'hypothesis', and potentially
        'label', all of which are lists of same length. Any other set of
        keys is accepted as long as all values have the same length.
    batch_size: int, optional
        The size of the batch. The last batch is smaller when the number of
        examples is not a multiple of `batch_size`.
    shuffle: bool, optional
        Whether to shuffle the dataset. A new order is drawn with the
        `random` module every time the loader is called (i.e. every epoch);
        `data_dict` itself is never modified.

    Returns
    -------
    function
        A loader function with no input and returns an iterator yielding a
        dictionary with the same keys as data_dict, but with length
        corresponding to batch_size.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1 or if the values of `data_dict`
        do not all have the same length.

    Examples
    --------
    >>> loader = build_loader(data)
    >>> for batch in loader():
    ...     premise = batch['premise']
    ...     label_batch = batch['label']
    ...     # do something with batch here
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    keys = list(data_dict)
    lengths = {len(data_dict[key]) for key in keys}
    if len(lengths) > 1:
        raise ValueError("all values of data_dict must have the same length")
    n_examples = lengths.pop() if lengths else 0

    def loader():
        # Shuffle a list of positions instead of the data so that all keys
        # stay aligned and the caller's lists are left untouched.
        order = list(range(n_examples))
        if shuffle:
            random.shuffle(order)

        for start in range(0, n_examples, batch_size):
            positions = order[start:start + batch_size]
            yield {key: [data_dict[key][i] for i in positions] for key in keys}

    return loader


### 1.2 Converting a batch into inputs
def convert_to_tensors(text_indices: "list[list[int]]") -> torch.Tensor:
    """
    Given a list of lists of indices, convert it to a tensor of shape (N, L).

    N is the number of sequences and L is the length of the longest one.
    Shorter sequences are padded on the right with the integer 0.

    Parameters
    ----------
    text_indices: list of list of int
        The token indices of each example, e.g. from tokens_to_ix().

    Returns
    -------
    torch.Tensor[N, L]
        A tensor of dtype torch.long. L is 0 when every sequence is empty.

    Notes
    -----
    NOTE(review): build_index_map() also gives index 0 to the most frequent
    word, so padding cannot be told apart from that word downstream.
    """
    longest = max((len(seq) for seq in text_indices), default=0)

    # Pad in plain Python and build the tensor once; this is much cheaper
    # than creating one small tensor per sequence.
    rows = [list(seq) + [0] * (longest - len(seq)) for seq in text_indices]

    # The reshape keeps the result two-dimensional even when there are no
    # sequences at all (torch.tensor([]) would otherwise be 1-D).
    return torch.tensor(rows, dtype=torch.long).reshape(len(rows), longest)





In [None]:
### 2.1 Design a logistic model with embedding and pooling
def max_pool(x: torch.Tensor) -> torch.Tensor:
    """
    Take the pooling over the second dimension, i.e. a
    (N, L, D) -> (N, D) transformation where D is the `hidden_size`,
    N is the batch size, L is the sequence length.

    Parameters
    ----------
    x: torch.Tensor[N, L, D]
        The tensor to pool. L must be at least 1.

    Returns
    -------
    torch.Tensor[N, D]
        For every example and feature, the largest value found along the
        sequence dimension.
    """
    # torch.max with `dim` returns (values, indices); only the values are
    # needed for pooling.
    return torch.max(x, dim=1).values


class PooledLogisticRegression(nn.Module):
    def __init__(self, embedding: nn.Embedding):
        """
        When called this simple linear model will do the following:
            1. Individually embed a batch of premise and hypothesis (token indices)
            2. Individually apply max pooling along the sequence length (L_p and L_h)
            3. Concatenate the pooled tensors into a single tensor
            4. Apply the logistic regression to obtain prediction

        Parameters
        ----------
        embedding: nn.Embedding
            The embedding layer you created using the size of the word index.
            You can create it outside of this module. The transformation is
            (N, L) -> (N, L, E) where E is the initial embedding dimension, and L is
            the sequence length.
        """
        super().__init__()

        self.embedding = embedding
        # Premise and hypothesis are each pooled to E features and then
        # concatenated, hence 2 * E inputs for the single output score.
        self.layer_pred = nn.Linear(2 * embedding.embedding_dim, 1)
        self.sigmoid = nn.Sigmoid()

    # DO NOT CHANGE THE SECTION BELOW! ###########################
    # # This is to force you to initialize certain things in __init__
    def get_layer_pred(self):
        return self.layer_pred

    def get_embedding(self):
        return self.embedding

    def get_sigmoid(self):
        return self.sigmoid

    # DO NOT CHANGE THE SECTION ABOVE! ###########################

    def forward(self, premise: torch.Tensor, hypothesis: torch.Tensor) -> torch.Tensor:
        """
        Parameters
        ----------
        premise: torch.Tensor[N, L_p]
            The premise tensor, where L_p is the premise sequence length and
            N is the batch size.
        hypothesis: torch.Tensor[N, L_h]
            The hypothesis tensor, where L_h is the hypothesis sequence length.

        Returns
        -------
        torch.Tensor[N]
            The predicted score for each example in the batch, in (0, 1).

        Notes
        -----
        The returned tensor is of shape N, not (N, 1).
        """

        emb = self.get_embedding()
        layer_pred = self.get_layer_pred()
        sigmoid = self.get_sigmoid()

        # (N, L) -> (N, L, E) -> (N, E): max over the sequence dimension.
        # The reduction is written inline so that this module works on its
        # own, without relying on a helper defined elsewhere.
        premise_pooled = torch.max(emb(premise), dim=1).values
        hypothesis_pooled = torch.max(emb(hypothesis), dim=1).values

        # (N, E) + (N, E) -> (N, 2E) -> (N, 1)
        features = torch.cat([premise_pooled, hypothesis_pooled], dim=1)
        scores = sigmoid(layer_pred(features))

        # Drop the trailing singleton dimension: (N, 1) -> (N,)
        return scores.reshape(-1)


### 2.2 Choose an optimizer and a loss function
def assign_optimizer(model: nn.Module, **kwargs) -> torch.optim.Optimizer:
    """
    Create the optimizer that updates all parameters of `model`.

    Parameters
    ----------
    model: nn.Module
        The model to optimize.
    kwargs: dict
        The arguments to pass to the optimizer. This will vary depending on the
        optimizer, but the most common one is `lr`.

    Returns
    -------
    torch.optim.Optimizer
        An Adam optimizer over `model.parameters()`, configured with `kwargs`.

    Notes
    -----
    Adam is used because it has a default learning rate, so the function
    also works when no keyword argument is given. Other optimizers are
    listed at https://pytorch.org/docs/stable/optim.html#algorithms
    """
    return torch.optim.Adam(model.parameters(), **kwargs)


def bce_loss(y: torch.Tensor, y_pred: torch.Tensor) -> torch.Tensor:
    """
    The binary cross entropy loss, implemented from scratch using torch
    (without torch.nn).

    Parameters
    ----------
    y: torch.Tensor[N]
        The true labels (0 or 1). Integer labels are accepted.
    y_pred: torch.Tensor[N]
        The predicted probabilities, as a floating point tensor with
        values in [0, 1].

    Returns
    -------
    torch.Tensor
        The binary cross entropy loss (averaged over N), as a 0-dim tensor:
        mean(-(y * log(p) + (1 - y) * log(1 - p))).
    """
    # Keep probabilities strictly inside (0, 1): log(0) is -inf and would
    # turn the loss and its gradients into inf/nan.
    eps = 1e-7
    prob = torch.clamp(y_pred, min=eps, max=1 - eps)

    # Labels often arrive as integers; match the dtype of the predictions.
    target = y.to(prob.dtype)

    per_example = -(target * torch.log(prob) + (1 - target) * torch.log(1 - prob))
    return per_example.mean()


### 2.3 Forward and backward pass
def forward_pass(model: nn.Module, batch: dict, device="cpu"):
    """
    Run the model on one batch. Given a batch and a model, this function
    handles the text to tensor conversion and passes the tensors to the model.

    Parameters
    ----------
    model: nn.Module
        The model you will use to perform the forward pass. It is called as
        `model(premise, hypothesis)`.
    batch: dict of list
        A dictionary with 'premise' and 'hypothesis' keys (lists of same size),
        each holding lists of token indices.
    device: str
        The device you want to run the model on. This is usually 'cpu' or 'cuda'.

    Returns
    -------
    torch.Tensor
        The predicted y value returned by the model.

    Notes
    -----
    Gradients are neither enabled nor disabled here; the caller decides
    (e.g. wrap the call in `torch.no_grad()` for evaluation).
    """
    # nn.Module.to() moves the parameters in place and does nothing when the
    # model already lives on `device`, so it is safe to call for every batch.
    model.to(device)

    premise = convert_to_tensors(batch["premise"]).to(device)
    hypothesis = convert_to_tensors(batch["hypothesis"]).to(device)

    return model(premise, hypothesis)


def backward_pass(
    optimizer: torch.optim.Optimizer, y: torch.Tensor, y_pred: torch.Tensor
) -> torch.Tensor:
    """
    This function takes in the optimizer, the true labels, and the predicted labels,
    then computes the loss and performs a backward pass before updating the weights.

    Parameters
    ----------
    optimizer: torch.optim.Optimizer
        The optimizer you will use to perform the backward pass.
    y: torch.Tensor[N]
        The true labels.
    y_pred: torch.Tensor[N]
        The predicted labels. They must still be attached to the autograd
        graph of the model being optimized.

    Returns
    -------
    torch.Tensor
        The loss value computed with bce_loss()
    """
    # Labels are usually built on the CPU as integers; align them with the
    # predictions so the loss can be computed on the model's device.
    y = y.to(device=y_pred.device, dtype=y_pred.dtype)

    loss = bce_loss(y, y_pred)

    # Clear gradients left over from the previous step before accumulating
    # the new ones, then update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss


### 2.4 Evaluation
def f1_score(y: torch.Tensor, y_pred: torch.Tensor, threshold=0.5) -> torch.Tensor:
    """
    Compute the F1 score from scratch (without using external libraries).

    Parameters
    ----------
    y: torch.Tensor[N]
        The true labels (0 or 1).
    y_pred: torch.Tensor[N]
        The predicted labels.
    threshold: float, default 0.5
        The threshold to use to convert the predicted labels to binary: a
        prediction strictly greater than the threshold counts as positive.
        If set to None, y_pred will not be thresholded and is expected to
        already contain 0/1 values.

    Returns
    -------
    torch.Tensor[1]
        The F1 score, 2 * TP / (2 * TP + FP + FN). It is 0 when there is no
        true positive, including when the denominator is 0.

    """
    if threshold is not None:
        y_pred = y_pred > threshold

    # Work in floating point on a single device so that counts can be
    # multiplied and divided regardless of the input dtypes.
    predicted = y_pred.to(torch.float32)
    actual = y.to(device=predicted.device, dtype=torch.float32)

    true_pos = (actual * predicted).sum()
    false_pos = ((1 - actual) * predicted).sum()
    false_neg = (actual * (1 - predicted)).sum()

    denominator = 2 * true_pos + false_pos + false_neg

    # Guard the division: with no positive label and no positive prediction
    # the denominator is 0 and the score is defined as 0 here.
    safe_denominator = torch.where(
        denominator > 0, denominator, torch.ones_like(denominator)
    )
    score = torch.where(
        denominator > 0,
        2 * true_pos / safe_denominator,
        torch.zeros_like(denominator),
    )

    return score.reshape(1)


### 2.5 Train loop
def eval_run(
    model: nn.Module, loader: Callable[[], Iterable[dict]], device: str = "cpu"
):
    """
    Iterate through a loader and predict the labels for each example, all while
    collecting the original labels.

    Parameters
    ----------
    model: nn.Module
        The model you will use to perform the forward pass.
    loader: Callable[[], Iterable[dict]]
        The loader function that will yield batches. Every batch must have
        a 'label' key in addition to the keys needed by forward_pass().
    device: str
        The device you want to run the model on. This is usually 'cpu' or 'cuda'.

    Returns
    -------
    y_true: torch.Tensor[N]
        The true labels, extracted from the loader.
    y_pred: torch.Tensor[N]
        The labels predicted by the model (output of forward_pass).

    Notes
    -----
    The model is switched to evaluation mode and left in that mode; call
    `model.train()` before resuming training. Gradients are disabled while
    predicting. Both returned tensors are empty when the loader yields no
    batch.
    """
    model.eval()

    true_batches = []
    pred_batches = []

    with torch.no_grad():
        for batch in loader():
            predictions = forward_pass(model, batch, device)
            pred_batches.append(predictions)
            # Keep the labels next to the predictions so that both outputs
            # can be compared without another device transfer.
            true_batches.append(
                torch.as_tensor(batch["label"], dtype=torch.float32).to(
                    predictions.device
                )
            )

    if not pred_batches:
        # torch.cat() rejects an empty list.
        return torch.empty(0), torch.empty(0)

    return torch.cat(true_batches), torch.cat(pred_batches)


def train_loop(
    model: nn.Module,
    train_loader,
    valid_loader,
    optimizer,
    n_epochs: int = 3,
    device: str = "cpu",
):
    """
    Train a model for a given number of epochs.

    Parameters
    ----------
    model: nn.Module
        The model you will use to perform the forward pass.
    train_loader: Callable[[], Iterable[dict]]
        The loader function that will yield shuffled batches of training data.
        Every batch must contain a 'label' key.
    valid_loader: Callable[[], Iterable[dict]]
        The loader function that will yield non-shuffled batches of validation data.
    optimizer: torch.optim.Optimizer
        The optimizer you will use to perform the backward pass.
    n_epochs: int
        The number of epochs you want to train your model
    device: str
        The device you want to run the model on. This is usually 'cpu' or 'cuda'.

    Returns
    -------
    list
        A list of f1 scores evaluated on the valid_loader at the end of each epoch
        (one entry per epoch, as returned by f1_score()).

    Notes
    -----
    Each epoch makes one pass over `train_loader()` using forward_pass() and
    backward_pass(), then evaluates on `valid_loader()` with eval_run().
    """
    # Moving the module is done in place, so an optimizer created before
    # this call keeps pointing at the same parameters.
    model.to(device)

    scores = []

    for _ in range(n_epochs):
        # Evaluation at the end of the previous epoch may have left the
        # model in eval mode.
        model.train()

        for batch in train_loader():
            y_pred = forward_pass(model, batch, device)
            y = torch.as_tensor(batch["label"], dtype=torch.float32).to(y_pred.device)
            backward_pass(optimizer, y, y_pred)

        y_true, y_valid_pred = eval_run(model, valid_loader, device)
        scores.append(f1_score(y_true, y_valid_pred))

    return scores



In [None]:
### 3.1
class ShallowNeuralNetwork(nn.Module):
    def __init__(self, embedding: nn.Embedding, hidden_size: int):
        """
        When called this shallow network will do the following:
            1. Individually embed a batch of premise and hypothesis (token indices)
            2. Individually apply max pooling along the sequence length (L_p and L_h)
            3. Individually apply one feedforward layer to your pooled tensors
            4. Use the ReLU on the outputs of your layer
            5. Concatenate the activated tensors into a single tensor
            6. Apply the prediction layer and the sigmoid to obtain prediction

        Parameters
        ----------
        embedding: nn.Embedding
            The embedding layer you created using the size of the word index.
        hidden_size: int
            The size of the hidden layer.

        Notes
        -----
        The same feedforward layer (E -> hidden_size) is shared by premise
        and hypothesis; the prediction layer maps the concatenated
        2 * hidden_size features to a single score.
        """
        super().__init__()

        self.embedding = embedding
        self.ff_layer = nn.Linear(embedding.embedding_dim, hidden_size)
        self.activation = nn.ReLU()
        # Two activated vectors of size hidden_size are concatenated.
        self.layer_pred = nn.Linear(2 * hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    # DO NOT CHANGE THE SECTION BELOW! ###########################
    # # This is to force you to initialize certain things in __init__
    def get_ff_layer(self):
        return self.ff_layer

    def get_layer_pred(self):
        return self.layer_pred

    def get_embedding(self):
        return self.embedding

    def get_sigmoid(self):
        return self.sigmoid

    def get_activation(self):
        return self.activation

    # DO NOT CHANGE THE SECTION ABOVE! ###########################

    def forward(self, premise: torch.Tensor, hypothesis: torch.Tensor) -> torch.Tensor:
        """
        Parameters
        ----------
        premise: torch.Tensor[N, L_p]
            The premise tensor, where N is the batch size and L_p is the premise
            sequence length.
        hypothesis: torch.Tensor[N, L_h]
            The hypothesis tensor, where L_h is the hypothesis sequence length.

        Returns
        -------
        torch.Tensor[N]
            The scores for each example in the batch, in (0, 1).
        """

        emb = self.get_embedding()
        layer_pred = self.get_layer_pred()
        sigmoid = self.get_sigmoid()
        ff_layer = self.get_ff_layer()
        act = self.get_activation()

        # (N, L) -> (N, L, E) -> (N, E): max over the sequence dimension,
        # written inline so the module does not rely on an external helper.
        premise_pooled = torch.max(emb(premise), dim=1).values
        hypothesis_pooled = torch.max(emb(hypothesis), dim=1).values

        # (N, E) -> (N, H) for each side, then (N, 2H) after concatenation.
        premise_hidden = act(ff_layer(premise_pooled))
        hypothesis_hidden = act(ff_layer(hypothesis_pooled))
        features = torch.cat([premise_hidden, hypothesis_hidden], dim=1)

        # (N, 2H) -> (N, 1) -> (N,)
        return sigmoid(layer_pred(features)).reshape(-1)


### 3.2
class DeepNeuralNetwork(nn.Module):
    def __init__(self, embedding: nn.Embedding, hidden_size: int, num_layers: int = 2):
        """
        When called this deep network will do the following:
            1. Individually embed a batch of premise and hypothesis (token indices)
            2. Individually apply max pooling along the sequence length (L_p and L_h)
            3. Individually apply one feedforward layer to your pooled tensors
            4. Use the ReLU on the outputs of your layer, repeat (3) for `num_layers` times.
            5. Concatenate the activated tensors into a single tensor
            6. Apply the prediction layer and the sigmoid to obtain prediction

        Parameters
        ----------
        embedding: nn.Embedding
            The embedding layer you created using the size of the word index. You can
            create it outside of this module. The transformation is (N, L) -> (N, L, E) where
            E is the initial embedding dimension, and L is the sequence length.
        hidden_size: int
            The size of the hidden layer.
        num_layers: int, default 2
            The number of hidden layers in your deep network. Each layer is
            activated with ReLU.

        Notes
        -----
        The layers are stored in an nn.ModuleList so that their parameters
        are registered with the module. The first layer maps E to
        hidden_size, every following layer maps hidden_size to hidden_size.
        The same stack is shared by premise and hypothesis.
        """
        super().__init__()

        self.embedding = embedding

        # Feature sizes before and after each hidden layer: E, H, H, ..., H
        sizes = [embedding.embedding_dim] + [hidden_size] * num_layers
        self.ff_layers = nn.ModuleList(
            nn.Linear(n_in, n_out) for n_in, n_out in zip(sizes[:-1], sizes[1:])
        )
        self.activation = nn.ReLU()
        # Two encoded vectors are concatenated; sizes[-1] also covers the
        # degenerate case num_layers == 0, where the pooled embeddings are
        # used directly.
        self.layer_pred = nn.Linear(2 * sizes[-1], 1)
        self.sigmoid = nn.Sigmoid()

    # DO NOT CHANGE THE SECTION BELOW! ###########################
    # # This is to force you to initialize certain things in __init__
    def get_ff_layers(self):
        return self.ff_layers

    def get_layer_pred(self):
        return self.layer_pred

    def get_embedding(self):
        return self.embedding

    def get_sigmoid(self):
        return self.sigmoid

    def get_activation(self):
        return self.activation

    # DO NOT CHANGE THE SECTION ABOVE! ###########################

    def forward(self, premise: torch.Tensor, hypothesis: torch.Tensor) -> torch.Tensor:
        """
        Parameters
        ----------
        premise: torch.Tensor[N, L_p]
            The premise tensor, where N is the batch size and L_p is the premise
            sequence length.
        hypothesis: torch.Tensor[N, L_h]
            The hypothesis tensor, where L_h is the hypothesis sequence length.

        Returns
        -------
        torch.Tensor[N]
            The scores for each example in the batch, in (0, 1).
        """

        emb = self.get_embedding()
        layer_pred = self.get_layer_pred()
        sigmoid = self.get_sigmoid()
        ff_layers = self.get_ff_layers()
        act = self.get_activation()

        def encode(token_indices):
            # (N, L) -> (N, L, E) -> (N, E): max over the sequence dimension,
            # written inline so the module does not rely on an external helper.
            hidden = torch.max(emb(token_indices), dim=1).values
            # Every hidden layer is followed by the activation.
            for layer in ff_layers:
                hidden = act(layer(hidden))
            return hidden

        features = torch.cat([encode(premise), encode(hypothesis)], dim=1)

        # (N, 2H) -> (N, 1) -> (N,)
        return sigmoid(layer_pred(features)).reshape(-1)


In [None]:
# Script entry point: preprocesses the data with the helper functions above and
# lists, section by section, the objects that still have to be created.
# NOTE(review): every "your code here" value is a placeholder string. Until
# they are replaced, the script stops at `next(train_loader())` because a
# string cannot be called.
if __name__ == "__main__":
    # If you have any code to test or train your model, do it BELOW!

    # Seeds to ensure reproducibility
    random.seed(2022)
    torch.manual_seed(2022)

    # If you use GPUs, use the code below:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Prefilled code showing you how to use the helper functions
    # Expects data/train.json and data/validation.json relative to the
    # current working directory.
    train_raw, valid_raw = load_datasets("data")

    # Normalized tokens, truncated to at most 64 tokens per example.
    train_tokens = {
        "premise": tokenize(train_raw["premise"], max_length=64),
        "hypothesis": tokenize(train_raw["hypothesis"], max_length=64),
    }

    valid_tokens = {
        "premise": tokenize(valid_raw["premise"], max_length=64),
        "hypothesis": tokenize(valid_raw["hypothesis"], max_length=64),
    }

    # NOTE(review): the vocabulary is counted over the training AND the
    # validation tokens, then limited to the 10000 most frequent words.
    word_counts = build_word_counts(
        train_tokens["premise"]
        + train_tokens["hypothesis"]
        + valid_tokens["premise"]
        + valid_tokens["hypothesis"]
    )
    index_map = build_index_map(word_counts, max_words=10000)

    # Same structure as the raw data, with tokens replaced by vocabulary
    # indices; words outside the index map are dropped by tokens_to_ix().
    train_indices = {
        "label": train_raw["label"],
        "premise": tokens_to_ix(train_tokens["premise"], index_map),
        "hypothesis": tokens_to_ix(train_tokens["hypothesis"], index_map)
    }

    valid_indices = {
        "label": valid_raw["label"],
        "premise": tokens_to_ix(valid_tokens["premise"], index_map),
        "hypothesis": tokens_to_ix(valid_tokens["hypothesis"], index_map)
    }

    # 1.1
    # Loader functions, see build_loader().
    train_loader = "your code here"
    valid_loader = "your code here"

    # 1.2
    # Takes the first batch produced by the training loader.
    batch = next(train_loader())
    y = "your code here"

    # 2.1
    embedding = "your code here"
    model = "your code here"

    # 2.2
    optimizer = "your code here"

    # 2.3
    y_pred = "your code here"
    loss = "your code here"

    # 2.4
    score = "your code here"

    # 2.5
    n_epochs = 2

    embedding = "your code here"
    model = "your code here"
    optimizer = "your code here"

    scores = "your code here"

    # 3.1
    embedding = "your code here"
    model = "your code here"
    optimizer = "your code here"

    scores = "your code here"

    # 3.2
    embedding = "your code here"
    model = "your code here"
    optimizer = "your code here"

    scores = "your code here"