# Notes
1. error handling

# Regex Tokenizer

In [None]:
import re
from typing import List, Dict
import numpy as np

# from a1_p1_murugan_116745378 import wordTokenizer

In [None]:
def getConllTags(filename: str) -> List[List]:
    """Read a CoNLL-style part-of-speech tagged file into per-sentence lists.

    Every non-blank line must hold ``word<TAB>tag``; blank lines separate
    sentences.

    Args:
        filename: path to the tagged file (read as UTF-8).

    Returns:
        A list of sentences, each a list of ``(word, tag)`` tuples, e.g.
        ``[[(word1, tag1), (word2, tag2)], [(word3, tag3)]]``. Leading,
        repeated, or trailing blank lines never produce empty sentences.

    Raises:
        ValueError: if a non-blank line is not exactly ``word<TAB>tag``.
    """
    wordTagsPerSent = []
    currentSent = []
    with open(filename, encoding="utf8") as f:
        for lineNum, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Sentence boundary: only close a sentence that has content so
                # runs of blank lines do not emit empty sentences.
                if currentSent:
                    wordTagsPerSent.append(currentSent)
                    currentSent = []
                continue

            fields = line.split("\t")
            if len(fields) != 2:
                raise ValueError(
                    f"{filename}:{lineNum}: expected 'word<TAB>tag', got {line!r}"
                )
            currentSent.append((fields[0], fields[1]))

    # The file may end without a trailing blank line.
    if currentSent:
        wordTagsPerSent.append(currentSent)
    return wordTagsPerSent


In [None]:
# Parse the tagged corpus into a list of sentences of (word, tag) tuples.
data = getConllTags("data/daily547_3pos.txt")

# Read the raw tweet text and split it on newlines
# (presumably one tweet per line — confirm against the data file).
with open("data/daily547_tweets.txt", encoding="utf8") as f:
    tweets = f.read().split("\n")

In [None]:
# Token grammar, compiled with re.VERBOSE. Alternatives are tried left to
# right at every position, so the specific shapes (URLs, emoticons,
# abbreviations, ...) must stay ahead of the generic word and catch-all rules.
# NOTE(review): inside the punctuation rule the "?!" alternative can never win
# because the run-of-question-marks alternative before it matches first.
REGEX_PATTERN = r"""
https?://\S+\.\S+\w\/?|                              # URLs with http or https
\w+\.com\b|                                          # URLs with .com
[:;]-?[\)D\(P/]|                                     # Emoticons 1
[DP][:;]|                                            # Emoticons 2
(?:[A-Z]\.){2,}|                                     # Abbreviations: 2+ letter-period pairs (U.S.A.)
[A-Za-z]+[`'][A-Za-z]+|                              # Contractions
\d+\.\d+|                                            # Numbers with decimal
\d+:\d+|                                             # Time
# [$£]?(?:\d{,3},)*\d+(?:\.\d+)?|                    # Money
\w+[\/]\w+|                                          # Words with slashes
(?:\.+|,+|!+|\?+|\(+|\)+|\?\!|[:;"'`~\{\}\[\]])|     # Punctuation
[@#]?[\w\-]+|                                        # Words with optional @ or #
\S                                                   # Any other non-whitespace character
"""

# Compiled once at import time instead of on every call.
_TOKEN_RE = re.compile(REGEX_PATTERN, re.VERBOSE)


def wordTokenizer(sent: str) -> List[str]:
    """Split a string into the list of tokens matched by ``REGEX_PATTERN``.

    Whitespace is dropped; every other character ends up in exactly one token
    (the final catch-all rule guarantees this).

    A single capital followed by a period ("A.") is tokenized as
    ``["A", "."]``; only runs of two or more letter-period pairs ("U.S.A.")
    are kept together as an abbreviation.

    Args:
        sent: the text to tokenize.

    Returns:
        The tokens in order of appearance.
    """
    # TODO: Need to check if the regex is accurate enough
    tokens = _TOKEN_RE.findall(sent)

    # Sanity check: the tokens must reproduce the input minus whitespace.
    assert "".join(tokens) == "".join(
        sent.split()
    ), f"Tokens don't add up to original sentence\nTokens: {tokens}\nSentence: {sent}"
    return tokens


In [None]:
# Compare the tokenizer against the tagged corpus:
#   a - the word sequence of every sentence in the tagged data
#   b - wordTokenizer output for the first 150 raw tweets
a = [[word for word, _ in sentence] for sentence in data]
b = [wordTokenizer(tweet) for tweet in tweets[:150]]

In [None]:
# Spot-check: a multi-letter abbreviation followed by a single capital and a period.
wordTokenizer("U.S.A. A.")

In [None]:
# Count (and print) the tweets among the first 150 whose set of tokens differs
# between the tagged corpus (a) and the tokenizer output (b).
counter = 0
for i in range(150):
    if set(a[i]) == set(b[i]):
        continue  # same token set, nothing to report
    counter += 1
    print("Error in tweet", i)
    print("a:", a[i])
    print("b:", b[i])

In [None]:
# Report the current value of the mismatch counter.
print("Number of errors:", counter)

# Logistic Regression

In [None]:
# Vocabularies of tags and tokens seen anywhere in ``data``
# (set comprehensions also work when ``data`` is empty).
unique_postags = {postag for sentence in data for _, postag in sentence}
unique_tokens = {token for sentence in data for token, _ in sentence}

# Number the entries in sorted order: sets of strings iterate in an order that
# can change between interpreter runs, which would make the ids (and anything
# trained on them) irreproducible.
postag_index = {postag: idx for idx, postag in enumerate(sorted(unique_postags))}
token_index = {token: idx for idx, token in enumerate(sorted(unique_tokens))}

In [None]:
def getFeaturesForTarget(tokens: List[str], targetI: int, wordToIndex: Dict[str, int]) -> np.ndarray:
    """Build the feature vector for one token of a sentence.

    Layout of the returned 1-D array (V = ``len(wordToIndex)``):
        [0]            1 if the token starts with an ASCII capital A-Z, else 0
        [1 : 258]      one-hot of the first character's code point; every
                       code point >= 256 shares the last slot
        [258]          token length in characters
        [259 : 259+V]  one-hot of the previous word (all zeros at sentence start)
        [.. +V]        one-hot of the current word
        [.. +V]        one-hot of the next word (all zeros at sentence end)

    Words missing from ``wordToIndex`` are encoded as an all-zero one-hot
    block instead of raising, so unseen sentences can be featurized.

    Args:
        tokens: the tokens of one sentence.
        targetI: index of the target token within ``tokens``.
        wordToIndex: maps each known word to its position in the one-hot blocks.

    Returns:
        A float array of length ``259 + 3 * len(wordToIndex)``.

    Raises:
        IndexError: if ``targetI`` is not a valid non-negative index into
            ``tokens`` (or the target token is an empty string).
    """
    if not 0 <= targetI < len(tokens):
        raise IndexError("list index out of range")

    target = tokens[targetI]
    vocabSize = len(wordToIndex)

    def oneHotWord(word):
        # All-zero vector for a missing neighbour (None) or an unknown word.
        vec = np.zeros(vocabSize)
        idx = wordToIndex.get(word) if word is not None else None
        if idx is not None:
            vec[idx] = 1
        return vec

    # feature 1: starts with an ASCII capital letter
    firstCharCode = ord(target[0])
    capital = np.array([int(64 < firstCharCode < 91)])

    # feature 2: one-hot of the first character (slot 256 = everything non-Latin-1)
    first_letter_f = np.zeros(257)
    first_letter_f[min(firstCharCode, 256)] = 1

    # feature 3: token length
    length = np.array([len(target)])

    # features 4-6: previous / current / next word one-hots
    previous_word = oneHotWord(tokens[targetI - 1] if targetI > 0 else None)
    current_word = oneHotWord(target)
    next_word = oneHotWord(tokens[targetI + 1] if targetI < len(tokens) - 1 else None)

    return np.concatenate([
        capital,
        first_letter_f,
        length,
        previous_word,
        current_word,
        next_word,
    ])

In [None]:
# One feature row per token, over all sentences in order. The token list is
# built once per sentence (inner generator) instead of once per token.
X = np.array([
    getFeaturesForTarget(tokens, idx, token_index)
    for tokens in ([word for word, _ in sentence] for sentence in data)
    for idx, _ in enumerate(tokens)
])

# Matching tag id per token, in the same order as the rows of X.
y = np.array([
    postag_index[postag]
    for sentence in data
    for _, postag in sentence
])


In [None]:
# Inspect the last parsed sentence.
data[-1]

In [None]:
# Sanity check: sum of feature values for the first row and for each of the last five rows.
X[0, :].sum(), *X[-5:, :].sum(1)

In [None]:
import torch
from torch import nn
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import DataLoader, TensorDataset

class MulticlassLogisticRegression(nn.Module):
    """Single linear layer followed by log-softmax over dimension 1.

    ``dim`` is the number of input features and ``nclass`` the number of
    output classes; ``forward`` returns log-probabilities.
    """

    def __init__(self, dim, nclass):
        super().__init__()
        self.linear = nn.Linear(dim, nclass, dtype=torch.float32)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        # Linear scores -> log-probabilities over dimension 1.
        logits = self.linear(x)
        return self.log_softmax(logits)

# NLLLoss pairs with the log-probabilities the model outputs.
loss_fn = nn.NLLLoss()
# Size the output layer from the tag vocabulary rather than a hard-coded 3.
model = MulticlassLogisticRegression(X.shape[1], len(postag_index))
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=0.01)

# 70/30 train/dev split.
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.3)


Xt = torch.tensor(X_train, dtype=torch.float32)
yt = torch.tensor(y_train, dtype=torch.long)

Xd = torch.tensor(X_dev, dtype=torch.float32)
yd = torch.tensor(y_dev, dtype=torch.long)

train_dataset = TensorDataset(Xt, yt)
train_dataloader = DataLoader(train_dataset, batch_size=200)

# Per-epoch metrics, one entry appended after every epoch.
train_loss = []
dev_loss = []
train_accuracy = []
dev_accuracy = []
for epoch in range(200):
    # One pass of mini-batch SGD over the training split.
    for batch_X, batch_y in train_dataloader:
        optimizer.zero_grad()
        log_prob_pred = model(batch_X)
        loss = loss_fn(log_prob_pred, batch_y)
        loss.backward()
        optimizer.step()

    # Evaluate on the full train and dev splits. No gradients are needed here,
    # so skip building the autograd graph for these full-size forward passes.
    with torch.no_grad():
        train_log_prob_pred = model(Xt)
        dev_log_prob_pred = model(Xd)
        train_y_pred = train_log_prob_pred.argmax(1)
        dev_y_pred = dev_log_prob_pred.argmax(1)

        train_loss.append(loss_fn(train_log_prob_pred, yt).item())
        dev_loss.append(loss_fn(dev_log_prob_pred, yd).item())
    train_accuracy.append(accuracy_score(yt.numpy(), train_y_pred.numpy()))
    dev_accuracy.append(accuracy_score(yd.numpy(), dev_y_pred.numpy()))

# Inference

In [None]:
from a1_p1_murugan_116745378 import wordTokenizer
from a1_p2_murugan_116745378 import getFeaturesForTarget

In [None]:
# Example sentences to run through the tokenizer and the trained model.
sampleSentences = [
    'The horse raced past the barn fell.',
    'For 3 years, we attended S.B.U. in the CS program.',
    'Did you hear Sam tell me to "chill out" yesterday? #rude'
]

In [None]:
# Tokenize the third sample sentence and build one feature row per token.
s = sampleSentences[2]
t = wordTokenizer(s)
Xi = np.array([
    getFeaturesForTarget(t, i, token_index)
    for i, _ in enumerate(t)
])

In [None]:
# Predicted class index for each token: argmax over dimension 1 of the model output.
model(torch.tensor(Xi, dtype=torch.float32)).argmax(1)

# Byte Pair Encoding

In [None]:

import heapq


In [None]:
def spacelessBPETokenize(text, vocab):
    """Placeholder for the spaceless BPE tokenizer (not implemented yet).

    Intended contract, per the original notes:
        text:  a single string to be word tokenized.
        vocab: a set of valid vocabulary words.
        returns: a list of strings of all word tokens, in order, from the string.

    Currently always returns ``None``.
    """
    # TODO: implement the tokenization.
    return None

In [None]:
def convert_set_to_prefix_tree(vocab):
    """Placeholder for the vocabulary-to-prefix-tree conversion (not implemented yet).

    Intended contract, per the original notes:
        vocab: a set of strings.
        returns: a dict of dicts representing a prefix tree.

    Currently always returns ``None``.
    """
    # TODO: build the nested-dict prefix tree.
    return None

In [None]:
# Small scratch vocabulary for experimenting with heapq.
vocab = ["i", "n", "a", "b", "in"]

In [None]:
# Rearrange vocab in place into a max-heap.
# NOTE(review): _heapify_max is an underscore-prefixed (private) heapq helper —
# confirm it exists in the Python version in use before relying on it.
heapq._heapify_max(vocab)

In [None]:
# Rearrange vocab in place into a min-heap (smallest string at index 0).
heapq.heapify(vocab)

In [None]:
# Display the list after the in-place heap operations.
vocab

In [None]:
heapq.