<a href="https://colab.research.google.com/github/luliuzh/Lu_LI/blob/main/CL_UZH_22_ESSENTIALS_SOLUTIONS_3_MATHIAS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Code I have shown on the slides

In [None]:
from nltk.util import ngrams

from collections import Counter

def sorted_bigrams(s):
  """
  Count the bigrams of whitespace-separated symbols in `s`.

  Args:
    s: A string whose symbols are separated by whitespace.

  Returns:
    A list of ((symbol_1, symbol_2), count) pairs, most frequent first.
    Bigrams with equal counts keep the order of their first occurrence.
  """
  # split() without an argument ignores leading/trailing whitespace and
  # collapses runs of whitespace, so no empty-string "symbols" are produced.
  tokens = s.split()

  # Pair every token with its successor and count the pairs directly.
  c = Counter(zip(tokens, tokens[1:]))

  return c.most_common()

In [None]:
# Symbols are separated by single spaces; "ane" is already one merged symbol.
s = "T h e m e t h ane l ane i s s ane"

# Every bigram occurs exactly once in this string (note that "T" and "t" differ).
sorted_bigrams(s)

[(('T', 'h'), 1),
 (('h', 'e'), 1),
 (('e', 'm'), 1),
 (('m', 'e'), 1),
 (('e', 't'), 1),
 (('t', 'h'), 1),
 (('h', 'ane'), 1),
 (('ane', 'l'), 1),
 (('l', 'ane'), 1),
 (('ane', 'i'), 1),
 (('i', 's'), 1),
 (('s', 's'), 1),
 (('s', 'ane'), 1)]

## Exercise 3.5

In [None]:
# Download the pipeline under the exact name that spacy.load() uses below;
# the short "en" alias is a legacy shortcut link that spaCy v3 no longer supports.
! python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
import spacy
from spacy import displacy

# Load the English pipeline once; the helper functions in this notebook
# all use this module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [None]:
def print_named_entities(s: str) -> None:
  """
  Print one line per named entity that the pipeline finds in `s`.

  Each line holds the entity text, its start and end character offsets
  and its label, separated by single spaces.
  """
  for entity in nlp(s).ents:
    fields = (entity.text, entity.start_char, entity.end_char, entity.label_)
    print(*fields)

In [None]:
print_named_entities("My name is Hulk Hogan and I shot an elephant on the Bahamas and sold it for $30.")

Hulk Hogan 11 21 PERSON
Bahamas 52 59 GPE
30 77 79 MONEY


In [None]:
def display_named_entities(s: str) -> None:
  """
  Show `s` in the notebook with its named entities highlighted by displaCy.
  """
  displacy.render(nlp(s), style="ent", jupyter=True)

In [None]:
display_named_entities("My name is Hulk Hogan and I shot an elephant on the Bahamas and sold it for $30.")

In [None]:
display_named_entities("This is a sentence.")

  "__main__", mod_spec)


## Exercise 3.6

In [None]:
def print_pos_tags(s: str) -> None:
  """
  Print every token of `s` together with its coarse part-of-speech tag,
  one token per line.
  """
  for word in nlp(s):
    print(word.text, word.pos_)

In [None]:
print_pos_tags("We climbed the north face of Mount Everest.")

We PRON
climbed VERB
the DET
north ADJ
face NOUN
of ADP
Mount PROPN
Everest PROPN
. PUNCT


In [None]:
def get_pos_of_face(s: str) -> str:
  """
  Return the part-of-speech tag of the first token "face" in `s`.

  Args:
    s: A sentence containing the lowercase word "face".

  Returns:
    The `pos_` value of the first token whose text is exactly "face".

  Raises:
    AssertionError: If "face" does not occur in `s`, or occurs only as
      part of a longer token.
  """
  # Cheap pre-check so the pipeline is not run on clearly unsuitable input.
  assert "face" in s, "'face' not found in the input string!"

  for token in nlp(s):
    if token.text == "face":
      return token.pos_

  # The substring check passed, but no token is exactly "face". Fail loudly
  # instead of silently returning None from a function declared to return str.
  raise AssertionError("'face' does not occur as a separate token in the input string!")

In [None]:
get_pos_of_face("We climbed the north face of Mount Everest.")

'NOUN'

In [None]:
# (expected part-of-speech tag of "face", sentence) pairs used by
# test_face_pos_is_correct; "face" occurs both as a noun and as a verb.
TEST_CASES = [("NOUN", "We climbed the north face of Mount Everest."),
              ("VERB", "You seem to face some real difficulties."),
              ("NOUN", "My face was glowing red from the heat."),
              ("VERB", "Let’s face it, How I Met your Mother is dumb.")]

In [None]:
def test_face_pos_is_correct() -> bool:
  """
  Check get_pos_of_face against every (expected tag, sentence) pair in TEST_CASES.

  Returns:
    True if all cases pass.

  Raises:
    AssertionError: On the first mismatch; the message names the sentence
      as well as the expected and the actual tag.
  """
  for expected_pos, sentence in TEST_CASES:
    actual_pos = get_pos_of_face(sentence)
    assert actual_pos == expected_pos, (
        "expected %s but got %s for sentence: %r" % (expected_pos, actual_pos, sentence))

  return True

In [None]:
test_face_pos_is_correct()

In [None]:
def test_fails_if_face_not_in_sentence():
  """
  Check that get_pos_of_face rejects input that does not contain "face".

  Returns:
    True if get_pos_of_face raised the expected AssertionError.

  Raises:
    AssertionError: If get_pos_of_face accepted the input.
  """
  try:
    # this should raise an AssertionError
    get_pos_of_face("any input without lowercase FACE")
  except AssertionError:
    # Expected outcome: the test passes instead of crashing the cell.
    return True

  raise AssertionError("get_pos_of_face accepted an input that does not contain 'face'")

In [None]:
test_fails_if_face_not_in_sentence()

AssertionError: ignored

## Exercise 3.9

### First implementation

In [None]:
from typing import List, Set, Tuple, Any
from nltk.util import ngrams
from collections import Counter

In [None]:
def find_most_frequent_bigram(symbols: List[str]) -> Tuple:
  """
  Return the most frequent pair of adjacent symbols.

  Args:
    symbols: The data as a list of symbols.

  Returns:
    The (symbol_1, symbol_2) tuple with the highest count. If several
    bigrams share the highest count, the one seen first is returned.

  Raises:
    ValueError: If `symbols` has fewer than two elements, i.e. contains
      no bigram at all.
  """
  if len(symbols) < 2:
    raise ValueError("Need at least two symbols to form a bigram, got %d." % len(symbols))

  # Count pairs straight from the zip iterator instead of building an
  # intermediate list of all bigrams first.
  c = Counter(zip(symbols, symbols[1:]))

  return c.most_common(1)[0][0]

def split_string_into_characters(s: str) -> List[str]:
  """
  Turn `s` into a list of its characters, leaving out every space character.
  """
  return [character for character in s if character != " "]

In [None]:
class BPE:
  """
  Byte pair encoding learned on text from which all spaces are removed.

  After `train` has run:
    vocabulary holds every symbol (single characters plus merged symbols),
    current_data holds the training data as a list of symbols,
    merges holds the learned merge operations in the order they were learned.
  """

  def __init__(self, desired_vocab_size: int) -> None:
    """
    Args:
      desired_vocab_size: Training stops once the vocabulary has this many symbols.
    """
    self.desired_vocab_size = desired_vocab_size
    self.vocabulary = None  # type: Set[str]

    self.current_data = None  # type: List[str]

    # Each entry is ("sym1 sym2", "sym1sym2").
    self.merges = []  # type: List[Tuple[str, str]]

  def _initialize_vocab(self) -> Set[str]:
    """
    Builds an initial vocabulary of characters.

    Return: Returns the set of initial characters found in the data.
    """
    self.vocabulary = set(self.current_data)

    return self.vocabulary

  @staticmethod
  def _merge_pair(symbols: List[str], sym1: str, sym2: str) -> List[str]:
    """
    Return a copy of `symbols` in which every adjacent pair sym1, sym2 is
    replaced by the single symbol sym1 + sym2, scanning left to right.
    """
    merged = []
    position = 0

    while position < len(symbols):
      is_pair = (position + 1 < len(symbols)
                 and symbols[position] == sym1
                 and symbols[position + 1] == sym2)
      if is_pair:
        merged.append(sym1 + sym2)
        position += 2
      else:
        merged.append(symbols[position])
        position += 1

    return merged

  def train(self, training_data: str) -> None:
    """
    Learn merge operations from `training_data` until the vocabulary has
    `desired_vocab_size` symbols or no bigram is left to merge.

    Any merges learned by an earlier call are discarded.
    """
    self.current_data = split_string_into_characters(training_data)
    self.merges = []

    self._initialize_vocab()

    # With fewer than two symbols there is no bigram left to merge.
    while len(self.vocabulary) < self.desired_vocab_size and len(self.current_data) > 1:

      # find the most frequent bigram of symbols sym1 and sym2
      sym1, sym2 = find_most_frequent_bigram(self.current_data)

      # add the concatenation of sym1, sym2 to the vocabulary
      new_symbol = sym1 + sym2
      self.vocabulary.add(new_symbol)

      # record merge operations
      self.merges.append(("%s %s" % (sym1, sym2), new_symbol))

      # In the data, merge all occurrences of sym1, sym2. Working on the
      # list of symbols keeps merged symbols intact and cannot match a
      # pattern that straddles the boundary of a longer symbol.
      self.current_data = self._merge_pair(self.current_data, sym1, sym2)

  def apply(self, input_text: str) -> str:
    """
    Segment `input_text` with the merges learned during training.

    Returns:
      The resulting symbols joined by single spaces.
    """
    # first split into individual characters
    symbols = split_string_into_characters(input_text)

    # apply all merges recorded during training
    for (individual_symbols_with_whitespace, _) in self.merges:
      # Symbols never contain a space (spaces are stripped before
      # training), so the recorded pattern splits back into exactly two.
      sym1, sym2 = individual_symbols_with_whitespace.split(" ")
      symbols = self._merge_pair(symbols, sym1, sym2)

    return " ".join(symbols)

Train a model on a small example:

In [None]:
# Spaces are stripped before training, which leaves 9 distinct characters;
# a target size of 12 therefore asks for three merged symbols.
data = "the methane lane is sane"
desired_size = 12

b = BPE(desired_vocab_size=desired_size)

b.train(data)

In [None]:
b.vocabulary

{'  ', ' e', 'a', 'an', 'e', 'h', 'i', 'l', 'm', 'n', 's', 't'}

Apply the trained model to a new text:

In [None]:
b.apply("methane")

'm e t h an e'

### Second implementation (hopefully better)

In [None]:
def find_most_frequent_bigram(symbols: List[str]) -> Tuple:
  """
  Return the most frequent pair of adjacent symbols.

  Args:
    symbols: The data as a list of symbols.

  Returns:
    The (symbol_1, symbol_2) tuple with the highest count. If several
    bigrams share the highest count, the one seen first is returned.

  Raises:
    ValueError: If `symbols` has fewer than two elements, i.e. contains
      no bigram at all.
  """
  if len(symbols) < 2:
    raise ValueError("Need at least two symbols to form a bigram, got %d." % len(symbols))

  # Count pairs straight from the zip iterator instead of building an
  # intermediate list of all bigrams first.
  c = Counter(zip(symbols, symbols[1:]))

  return c.most_common(1)[0][0]

In [None]:
def find_sub_list_indexes(sl,l):
    """
    Locate every occurrence of the list `sl` inside the list `l`.

    Returns a list of (first_index, last_index) tuples, both inclusive,
    in ascending order. Overlapping occurrences are all reported.

    Source: https://stackoverflow.com/a/17870684/1987598
    """
    width = len(sl)

    return [
        (start, start + width - 1)
        for start, item in enumerate(l)
        if item == sl[0] and l[start:start + width] == sl
    ]

In [None]:
def concatenate_most_frequent_bigram_in_data(most_frequent_bigram: Tuple[str, str],
                                             current_data: List[str]) -> List[str]:
    """
    Merge every adjacent occurrence of the bigram in `current_data` into one symbol.

    The data is scanned once from left to right and matches never overlap,
    so with the bigram ("a", "a") the data ["a", "a", "a"] becomes
    ["aa", "a"] rather than a single symbol "aaa" that was never learned.

    Args:
      most_frequent_bigram: The pair (symbol_1, symbol_2) to merge.
      current_data: The list of symbols; it is modified in place.

    Returns:
      `current_data` itself, after the merge.
    """
    sym_1, sym_2 = most_frequent_bigram

    merged = []
    position = 0
    last_position = len(current_data) - 1

    while position <= last_position:
        is_bigram = (position < last_position
                     and current_data[position] == sym_1
                     and current_data[position + 1] == sym_2)
        if is_bigram:
            merged.append(sym_1 + sym_2)
            position += 2
        else:
            merged.append(current_data[position])
            position += 1

    # Write back into the caller's list so the in-place contract is kept,
    # but with a single assignment instead of one list splice per match.
    current_data[:] = merged

    return current_data

In [None]:
class BPEBetter:
  """
  Byte pair encoding that works on a list of symbols throughout.

  Unlike the string-based approach, spaces are kept as ordinary symbols
  and merges are applied to the symbol list directly.

  After `train` has run:
    vocabulary holds every symbol (single characters plus merged symbols),
    current_data holds the training data as a list of symbols,
    merges holds the merged bigrams in the order they were learned.
  """

  def __init__(self, desired_vocab_size: int) -> None:
    """
    Args:
      desired_vocab_size: Training stops once the vocabulary has this many symbols.
    """
    self.desired_vocab_size = desired_vocab_size
    self.vocabulary = None  # type: Set[str]

    self.current_data = None  # type: List[str]

    self.merges = []  # type: List[Tuple[str, str]]

  def _initialize_vocab(self) -> Set[str]:
    """
    Builds an initial vocabulary of characters.

    Return: Returns the set of initial characters found in the data.
    """
    self.vocabulary = set(self.current_data)

    return self.vocabulary


  def train(self, training_data: str) -> None:
    """
    Learn merge operations from `training_data` until the vocabulary has
    `desired_vocab_size` symbols or no bigram is left to merge.

    Progress is printed for every step. Any merges learned by an earlier
    call are discarded.
    """

    # split training data into characters
    self.current_data = list(training_data)
    self.merges = []

    self._initialize_vocab()

    # Never report a negative number when the initial vocabulary is
    # already at least as large as requested. The estimate can be exceeded
    # if a merge produces a symbol that is already in the vocabulary.
    expected_number_of_steps = max(0, self.desired_vocab_size - len(self.vocabulary))

    print("Expected number of steps: %d" % expected_number_of_steps)

    num_steps = 0

    # With fewer than two symbols there is no bigram left to merge.
    while len(self.vocabulary) < self.desired_vocab_size and len(self.current_data) > 1:

      num_steps += 1

      print("Step %d of %d ..." % (num_steps, expected_number_of_steps))

      # find the most frequent bigram of symbols sym1 and sym2
      most_frequent_bigram = find_most_frequent_bigram(self.current_data)

      # add the concatenation of sym1, sym2 to the vocabulary
      new_symbol = "".join(most_frequent_bigram)
      self.vocabulary.add(new_symbol)

      # In the data, merge all occurrences of sym1, sym2
      self.current_data = concatenate_most_frequent_bigram_in_data(most_frequent_bigram, self.current_data)

      # record merge operations (for applying the model later)
      self.merges.append(most_frequent_bigram)


  def apply(self, input_text: str) -> List[str]:
    """
    Segment `input_text` with the merges learned during training.

    Returns:
      The resulting list of symbols.
    """
    # first split into individual characters
    symbols = list(input_text)

    # apply all merges recorded during training
    for most_frequent_bigram in self.merges:
        symbols = concatenate_most_frequent_bigram_in_data(most_frequent_bigram, symbols)

    return symbols

In [None]:
# BPEBetter keeps the space as a symbol, so this text starts with 10 distinct
# characters; a target size of 15 therefore asks for five merges.
data = "the methane lane is sane"
desired_size = 15

b3 = BPEBetter(desired_vocab_size=desired_size)

b3.train(data)

Expected number of steps: 5
Step 1 of 5 ...
Step 2 of 5 ...
Step 3 of 5 ...
Step 4 of 5 ...
Step 5 of 5 ...


In [None]:
b3.vocabulary

{' ',
 'a',
 'an',
 'ane ',
 'e',
 'e ',
 'h',
 'i',
 'l',
 'm',
 'n',
 's',
 't',
 'th',
 'the '}

In [None]:
b3.apply("methane")

['m', 'e', 'th', 'an', 'e']

Now try on some larger text:

In [None]:
# -nc (no-clobber) skips the download when 2554-0.txt already exists, so
# re-running this cell does not pile up copies such as 2554-0.txt.1.
! wget -nc https://www.gutenberg.org/files/2554/2554-0.txt

--2023-09-13 17:05:47--  https://www.gutenberg.org/files/2554/2554-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... failed: Connection timed out.
Connecting to www.gutenberg.org (www.gutenberg.org)|2610:28:3090:3000:0:bad:cafe:47|:443... failed: Cannot assign requested address.
Retrying.

--2023-09-13 17:07:58--  (try: 2)  https://www.gutenberg.org/files/2554/2554-0.txt
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1201520 (1.1M) [text/plain]
Saving to: ‘2554-0.txt.1’


2023-09-13 17:08:06 (2.80 MB/s) - ‘2554-0.txt.1’ saved [1201520/1201520]



In [None]:
from typing import Optional

def read_training_data(filepath: str,
                       max_num_lines: Optional[int] = None,
                       encoding: str = "utf-8") -> str:
  """
  Read a text file and return its lines joined into one string.

  Every line is stripped of leading and trailing whitespace; the stripped
  lines (including empty ones) are joined with single spaces.

  Args:
    filepath: Path of the text file to read.
    max_num_lines: Read at most this many lines from the start of the
      file. None reads the whole file; 0 reads nothing.
    encoding: Text encoding of the file. Passed explicitly so the result
      does not depend on the platform's default encoding.

  Returns:
    The joined text.

  Raises:
    ValueError: If `max_num_lines` is negative.
  """
  if max_num_lines is not None and max_num_lines < 0:
    raise ValueError("max_num_lines must be non-negative, got %d" % max_num_lines)

  training_lines = []

  with open(filepath, encoding=encoding) as infile:
    for line_number, line in enumerate(infile):
      # Check the limit before consuming the line so that a limit of 0
      # really yields no lines.
      if max_num_lines is not None and line_number >= max_num_lines:
        break

      training_lines.append(line.strip())

  return " ".join(training_lines)

In [None]:
training_data = read_training_data("2554-0.txt", max_num_lines=10000)

See how many distinct characters this data contains:

In [None]:
len(set(training_data))

87

In [None]:
# With the 87 distinct characters counted above, a vocabulary of 200
# symbols requires 113 merge steps.
desired_size = 200

b4 = BPEBetter(desired_vocab_size=desired_size)

b4.train(training_data)

Expected number of steps: 113
Step 1 of 113 ...
Step 2 of 113 ...
Step 3 of 113 ...
Step 4 of 113 ...
Step 5 of 113 ...
Step 6 of 113 ...
Step 7 of 113 ...
Step 8 of 113 ...
Step 9 of 113 ...
Step 10 of 113 ...
Step 11 of 113 ...
Step 12 of 113 ...
Step 13 of 113 ...
Step 14 of 113 ...
Step 15 of 113 ...
Step 16 of 113 ...
Step 17 of 113 ...
Step 18 of 113 ...
Step 19 of 113 ...
Step 20 of 113 ...
Step 21 of 113 ...
Step 22 of 113 ...
Step 23 of 113 ...
Step 24 of 113 ...
Step 25 of 113 ...
Step 26 of 113 ...
Step 27 of 113 ...
Step 28 of 113 ...
Step 29 of 113 ...
Step 30 of 113 ...
Step 31 of 113 ...
Step 32 of 113 ...
Step 33 of 113 ...
Step 34 of 113 ...
Step 35 of 113 ...
Step 36 of 113 ...
Step 37 of 113 ...
Step 38 of 113 ...
Step 39 of 113 ...
Step 40 of 113 ...
Step 41 of 113 ...
Step 42 of 113 ...
Step 43 of 113 ...
Step 44 of 113 ...
Step 45 of 113 ...
Step 46 of 113 ...
Step 47 of 113 ...
Step 48 of 113 ...
Step 49 of 113 ...
Step 50 of 113 ...
Step 51 of 113 ...
Step 52 of

In [None]:
b4.apply("My English text is nice")

['M',
 'y ',
 'E',
 'n',
 'g',
 'l',
 'is',
 'h',
 ' t',
 'e',
 'x',
 't ',
 'is ',
 'n',
 'ic',
 'e']