### import packages

In [48]:
import collections
from collections import Counter, OrderedDict
from typing import List, Tuple, Dict, Set, Union, Optional
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
import spacy
import pandas as pd

pd.set_option('display.max_rows', None)

### NLTK BoW

In [2]:
text = """
BlackRock, Inc. is an American multinational investment management corporation based in New York City. Founded in 1988, initially as a risk management and fixed income institutional asset manager, BlackRock is the world's largest asset manager, with US$10 trillion in assets under management as of January 2022. BlackRock operates globally with 70 offices in 30 countries and clients in 100 countries.
"""

In [3]:
# text tokenization
tkns = TreebankWordTokenizer().tokenize(text)

In [5]:
# get a bag-of-word
bow = Counter(tkns)

In [13]:
# get a bag-of-words as term frequency
bow_tf = {k: np.round(v / len(tkns), 4) for k, v in bow_tf.items()} # k = term, v = frequency

In [15]:
# get first paragraph of Nike wikipedia
text_0 = """ 
The company was founded on January 25, 1964, as "Blue Ribbon Sports", by Bill Bowerman and Phil Knight, and officially became Nike, Inc. on May 30, 1971. The company takes its name from Nike, the Greek goddess of victory. Nike markets its products under its own brand, as well as Nike Golf, Nike Pro, Nike+, Air Jordan, Nike Blazers, Air Force 1, Nike Dunk, Air Max, Foamposite, Nike Skateboarding, Nike CR7, and subsidiaries including Jordan Brand and Converse. Nike also owned Bauer Hockey from 1995 to 2008, and previously owned Cole Haan, Umbro, and Hurley International. In addition to manufacturing sportswear and equipment, the company operates retail stores under the Niketown name. Nike sponsors many high-profile athletes and sports teams around the world, with the highly recognized trademarks of "Just Do It" and the Swoosh logo.
"""

text_1 = """
The company was started by Adolf Dassler in his mother's house; he was joined by his elder brother Rudolf in 1924 under the name Gebrüder Dassler Schuhfabrik ("Dassler Brothers Shoe Factory"). Dassler assisted in the development of spiked running shoes (spikes) for multiple athletic events. To enhance the quality of spiked athletic footwear, he transitioned from a previous model of heavy metal spikes to utilising canvas and rubber. Dassler persuaded U.S. sprinter Jesse Owens to use his handmade spikes at the 1936 Summer Olympics. In 1949, following a breakdown in the relationship between the brothers, Adolf created Adidas, and Rudolf established Puma, which became Adidas' business rival.
"""

text_2 = """
Puma SE, branded as Puma, is a German multinational corporation that designs and manufactures athletic and casual footwear, apparel and accessories, which is headquartered in Herzogenaurach, Bavaria, Germany. Puma is the third largest sportswear manufacturer in the world. The company was founded in 1948 by Rudolf Dassler. In 1924, Rudolf and his brother Adolf "Adi" Dassler had jointly formed the company Gebrüder Dassler Schuhfabrik (Dassler Brothers Shoe Factory). The relationship between the two brothers deteriorated until the two agreed to split in 1948, forming two separate entities, Adidas and Puma. Both companies are currently based in Herzogenaurach, Germany.
"""

# list of docs
docs = [text_0, text_1, text_2]

# tokenization - list of lists to preserve the length of docs
docs_tkns = [sorted(TreebankWordTokenizer().tokenize(doc)) for doc in docs]

In [16]:
docs_tkns?

[1;31mType:[0m        list
[1;31mString form:[0m [["''", "''", ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',' <...> he', 'the', 'the', 'the', 'third', 'to', 'two', 'two', 'two', 'until', 'was', 'which', 'world.']]
[1;31mLength:[0m      3
[1;31mDocstring:[0m  
Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list.
The argument must be an iterable if specified.


In [17]:
docs?

[1;31mType:[0m        list
[1;31mString form:[0m [' \nThe company was founded on January 25, 1964, as "Blue Ribbon Sports", by Bill Bowerman and P <...> ate entities, Adidas and Puma. Both companies are currently based in Herzogenaurach, Germany.\n']
[1;31mLength:[0m      3
[1;31mDocstring:[0m  
Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list.
The argument must be an iterable if specified.


In [26]:
# vocabulary (dictionary or lexicon) - sort unique tokens
vocab = sorted(set(sum(docs_tkns, [])))

# iterate over tokenized docs and project them onto the vocabulary
vector_space = []
for doc in docs_tkns:
    # preserving order of tokens
    vector = OrderedDict((token, 0) for token in vocab)
    tkns_count = Counter(doc)

    # iterate over tokens in doc and update the vector
    for k, v in tkns_count.items():
        vector[k] = v # / len(doc) # standarise the frequency
    
    # append the updated vector to the vector space
    vector_space.append(vector)
    del vector

In [27]:
vector_space?

[1;31mType:[0m        list
[1;31mString form:[0m [OrderedDict([("'", 0), ("''", 2), ("'s", 0), ('(', 0), (')', 0), (',', 24), ('.', 1), ('1', 1),  <...> victory.', 0), ('was', 1), ('well', 0), ('which', 1), ('with', 0), ('world', 0), ('world.', 1)])]
[1;31mLength:[0m      3
[1;31mDocstring:[0m  
Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list.
The argument must be an iterable if specified.


In [29]:
# adidas vector space
vector_space[1]

OrderedDict([("'", 1),
             ("''", 1),
             ("'s", 1),
             ('(', 2),
             (')', 2),
             (',', 5),
             ('.', 2),
             ('1', 0),
             ('1924', 1),
             ('1936', 1),
             ('1948', 0),
             ('1949', 1),
             ('1964', 0),
             ('1971.', 0),
             ('1995', 0),
             ('2008', 0),
             ('25', 0),
             ('30', 0),
             (';', 1),
             ('Adi', 0),
             ('Adidas', 2),
             ('Adolf', 2),
             ('Air', 0),
             ('Bauer', 0),
             ('Bavaria', 0),
             ('Bill', 0),
             ('Blazers', 0),
             ('Blue', 0),
             ('Both', 0),
             ('Bowerman', 0),
             ('Brand', 0),
             ('Brothers', 1),
             ('CR7', 0),
             ('Cole', 0),
             ('Converse.', 0),
             ('Dassler', 5),
             ('Dassler.', 0),
             ('Do', 0),
             (

### spaCy BoW

In [33]:
nlp = spacy.load('en_core_web_sm')
docs_tkns_spacy = []
for doc in docs:
    tmp = [
        token for token in nlp(doc)
        if (not token.is_punct) and (not token.is_stop) and (not token.is_alpha)
    ]
    docs_tkns_spacy.append(tmp)
    del tmp

# vocabulary (dictionary or lexicon) - sort unique tokens
vocab = sorted(set(sum(docs_tkns, [])))

# iterate over tokenized docs and project them onto the vocabulary
vector_space_spacy = []
for doc in docs_tkns:
    # preserving order of tokens
    vector = OrderedDict((token, 0) for token in vocab)
    tkns_count = Counter(doc)

    # iterate over tokens in doc and update the vector
    for k, v in tkns_count.items():
        vector[k] = v # / len(doc) # standarise the frequency
    
    # append the updated vector to the vector space
    vector_space_spacy.append(vector)
    del vector

In [37]:
vector_space_spacy?

[1;31mType:[0m        list
[1;31mString form:[0m [OrderedDict([("'", 0), ("''", 2), ("'s", 0), ('(', 0), (')', 0), (',', 24), ('.', 1), ('1', 1),  <...> victory.', 0), ('was', 1), ('well', 0), ('which', 1), ('with', 0), ('world', 0), ('world.', 1)])]
[1;31mLength:[0m      3
[1;31mDocstring:[0m  
Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list.
The argument must be an iterable if specified.


In [42]:
# for loop for bow and dictionary creation
def doc2bow(tkns_: List[str], voc_: Dict[str, int]) -> List[Tuple[int, int]]:
    """_summary_

    Args:
        tkns_ (List[str]): tokenized document
        voc_ (List[str, int]): dictionary tokens included in a corpus

    Returns:
        List[uple[int, int]]: bow representation for individual docs belonging to a corpus

    """
    tkns_count = collections.defaultdict(int)
    for tkn in tkns_:
        if tkn not in voc_:
            voc_[tkn] = len(voc_)
        tkns_count[tkn] += 1
    
    return list(tkns_count.items())

# create an empty vocabulary
vocab = {}

doc2bow(tkns_ = TreebankWordTokenizer().tokenize(text_0), voc_ = vocab)
doc2bow(tkns_ = TreebankWordTokenizer().tokenize(text_1), voc_ = vocab)
doc2bow(tkns_ = TreebankWordTokenizer().tokenize(text_2), voc_ = vocab)

[('Puma', 3),
 ('SE', 1),
 (',', 10),
 ('branded', 1),
 ('as', 1),
 ('is', 3),
 ('a', 1),
 ('German', 1),
 ('multinational', 1),
 ('corporation', 1),
 ('that', 1),
 ('designs', 1),
 ('and', 5),
 ('manufactures', 1),
 ('athletic', 1),
 ('casual', 1),
 ('footwear', 1),
 ('apparel', 1),
 ('accessories', 1),
 ('which', 1),
 ('headquartered', 1),
 ('in', 5),
 ('Herzogenaurach', 2),
 ('Bavaria', 1),
 ('Germany.', 1),
 ('the', 5),
 ('third', 1),
 ('largest', 1),
 ('sportswear', 1),
 ('manufacturer', 1),
 ('world.', 1),
 ('The', 2),
 ('company', 2),
 ('was', 1),
 ('founded', 1),
 ('1948', 2),
 ('by', 1),
 ('Rudolf', 2),
 ('Dassler.', 1),
 ('In', 1),
 ('1924', 1),
 ('his', 1),
 ('brother', 1),
 ('Adolf', 1),
 ('``', 1),
 ('Adi', 1),
 ("''", 1),
 ('Dassler', 3),
 ('had', 1),
 ('jointly', 1),
 ('formed', 1),
 ('Gebrüder', 1),
 ('Schuhfabrik', 1),
 ('(', 1),
 ('Brothers', 1),
 ('Shoe', 1),
 ('Factory', 1),
 (')', 1),
 ('.', 2),
 ('relationship', 1),
 ('between', 1),
 ('two', 3),
 ('brothers', 1),


### One-Hot Encoding

In [50]:
# one-hot encoding with Pandas
oh = pd.DataFrame()

for i, doc in enumerate(docs):
    # tokenize the text
    tkns = TreebankWordTokenizer().tokenize(doc)

    # get the unique tokens
    vocab = sorted(set(tkns))
    print(vocab)

    # one-hot encoding
    corpus = pd.DataFrame({k: 1 for k in vocab}, index=[i])

    # append the one-hot encoding to the dataframe
    oh = pd.concat([oh, corpus], axis=1)
    oh.fillna(0, inplace=True)

["''", ',', '.', '1', '1964', '1971.', '1995', '2008', '25', '30', 'Air', 'Bauer', 'Bill', 'Blazers', 'Blue', 'Bowerman', 'Brand', 'CR7', 'Cole', 'Converse.', 'Do', 'Dunk', 'Foamposite', 'Force', 'Golf', 'Greek', 'Haan', 'Hockey', 'Hurley', 'In', 'Inc.', 'International.', 'It', 'January', 'Jordan', 'Just', 'Knight', 'Max', 'May', 'Nike', 'Nike+', 'Niketown', 'Phil', 'Pro', 'Ribbon', 'Skateboarding', 'Sports', 'Swoosh', 'The', 'Umbro', '``', 'addition', 'also', 'and', 'around', 'as', 'athletes', 'became', 'brand', 'by', 'company', 'equipment', 'founded', 'from', 'goddess', 'high-profile', 'highly', 'including', 'its', 'logo', 'manufacturing', 'many', 'markets', 'name', 'name.', 'of', 'officially', 'on', 'operates', 'own', 'owned', 'previously', 'products', 'recognized', 'retail', 'sponsors', 'sports', 'sportswear', 'stores', 'subsidiaries', 'takes', 'teams', 'the', 'to', 'trademarks', 'under', 'victory.', 'was', 'well', 'with', 'world']
["'", "''", "'s", '(', ')', ',', '.', '1924', '193

In [49]:
oh.T

Unnamed: 0,0,1,2
'',1.0,0.0,0.0
",",1.0,0.0,0.0
.,1.0,0.0,0.0
1,1.0,0.0,0.0
1964,1.0,0.0,0.0
1971.,1.0,0.0,0.0
1995,1.0,0.0,0.0
2008,1.0,0.0,0.0
25,1.0,0.0,0.0
30,1.0,0.0,0.0
