In [1]:
import spacy

from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## 1 | Bag of words (BOW)

In [7]:
# Toy corpus: four short Formula 1 headlines, one document per string.
corpus = [
  "Red Bull drops hint on F1 engine.",
  "Honda exits F1, leaving F1 partner Red Bull.",
  "Hamilton eyes record eighth F1 title.",
  "Aston Martin announces sponsor."
]

# Learn the vocabulary and build the document-term count matrix in one step.
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(corpus)

# The learned features (tokens).
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

# The token -> column-index mapping; as the last expression it is displayed
# as the cell's output.
vectorizer.vocabulary_

['announces' 'aston' 'bull' 'drops' 'eighth' 'engine' 'exits' 'eyes' 'f1'
 'hamilton' 'hint' 'honda' 'leaving' 'martin' 'on' 'partner' 'record'
 'red' 'sponsor' 'title']


{'red': 17,
 'bull': 2,
 'drops': 3,
 'hint': 10,
 'on': 14,
 'f1': 8,
 'engine': 5,
 'honda': 11,
 'exits': 6,
 'leaving': 12,
 'partner': 15,
 'hamilton': 9,
 'eyes': 7,
 'record': 16,
 'eighth': 4,
 'title': 19,
 'aston': 1,
 'martin': 13,
 'announces': 0,
 'sponsor': 18}

In [3]:
# fit_transform does not return a dense array: as the output shows, `bow` is a
# SciPy sparse matrix in CSR format.
print(type(bow))

<class 'scipy.sparse.csr.csr_matrix'>


In [10]:
print(bow) # only the entry at (1, 8) has a count of 2; every other entry is 1.


# If we look at the raw structure, we'll see tuples where the first element is the document index
# and the second element is a token ID. Each tuple is followed by the count of that token.
# So in the second document (index 1), token 8 ("f1") occurs twice.



# Before we explore further, we want to make a few modifications.
# 1. What if we want to use another tokenizer, like spaCy's?
# 2. Instead of frequencies, what if we want a binary BOW?


  (0, 17)	1
  (0, 2)	1
  (0, 3)	1
  (0, 10)	1
  (0, 14)	1
  (0, 8)	1
  (0, 5)	1
  (1, 17)	1
  (1, 2)	1
  (1, 8)	2
  (1, 11)	1
  (1, 6)	1
  (1, 12)	1
  (1, 15)	1
  (2, 8)	1
  (2, 9)	1
  (2, 7)	1
  (2, 16)	1
  (2, 4)	1
  (2, 19)	1
  (3, 1)	1
  (3, 13)	1
  (3, 0)	1
  (3, 18)	1


## 2 | Binary BOW

In [12]:
# Load the spaCy "en_core_web_sm" pipeline once at cell level; the tokenizer
# callback defined below reuses this `nlp` object on every call.
nlp = spacy.load("en_core_web_sm")

def spacy_tokenizer(doc):
  """Tokenizer callback for CountVectorizer, backed by spaCy.

  Args:
    doc: The raw text of a single document.

  Returns:
    A list with the text of every token in ``doc``, in order, with
    punctuation tokens removed.
  """
  # Only tokenization is needed here: `text` and `is_punct` are available on
  # tokenizer output, so use make_doc() instead of nlp(doc), which would also
  # run every other pipeline component on each document.
  return [t.text for t in nlp.make_doc(doc) if not t.is_punct]



# Use the custom spaCy tokenizer with CountVectorizer.
# - lowercase=False keeps the original casing (e.g. "F1", "Red").
# - binary=True records presence/absence (0/1) instead of counts, which makes
#   this a binary BOW.
# - token_pattern=None: the default regex is not used when a custom tokenizer
#   is supplied, and leaving it set makes scikit-learn warn about the unused
#   parameter.
vectorizer = CountVectorizer(
  tokenizer=spacy_tokenizer,
  token_pattern=None,
  lowercase=False,
  binary=True,
)
bow = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names_out())
vectorizer.vocabulary_

['Aston' 'Bull' 'F1' 'Hamilton' 'Honda' 'Martin' 'Red' 'announces' 'drops'
 'eighth' 'engine' 'exits' 'eyes' 'hint' 'leaving' 'on' 'partner' 'record'
 'sponsor' 'title']




{'Red': 6,
 'Bull': 1,
 'drops': 8,
 'hint': 13,
 'on': 15,
 'F1': 2,
 'engine': 10,
 'Honda': 4,
 'exits': 11,
 'leaving': 14,
 'partner': 16,
 'Hamilton': 3,
 'eyes': 12,
 'record': 17,
 'eighth': 9,
 'title': 19,
 'Aston': 0,
 'Martin': 5,
 'announces': 7,
 'sponsor': 18}

In [13]:
# Same sparse (document, token ID) listing as before, but for the binary BOW:
# every stored value is 1, including "F1" in document 1 even though it occurs
# twice in that sentence.
print(bow)

  (0, 6)	1
  (0, 1)	1
  (0, 8)	1
  (0, 13)	1
  (0, 15)	1
  (0, 2)	1
  (0, 10)	1
  (1, 6)	1
  (1, 1)	1
  (1, 2)	1
  (1, 4)	1
  (1, 11)	1
  (1, 14)	1
  (1, 16)	1
  (2, 2)	1
  (2, 3)	1
  (2, 12)	1
  (2, 17)	1
  (2, 9)	1
  (2, 19)	1
  (3, 0)	1
  (3, 5)	1
  (3, 7)	1
  (3, 18)	1


In [6]:
# Dense view: one row per document, one column per vocabulary entry.
dense_bow = bow.toarray()
print('A dense representation like we saw in the slides.')
print(dense_bow)
print()

# The sparse matrix supports row indexing and slicing directly.
first_doc = bow[0]
first_two_docs = bow[0:2]
print('Indexing and slicing.')
print(first_doc)
print()
print(first_two_docs)

A dense representation like we saw in the slides.
[[0 1 1 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0]
 [0 1 1 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0]
 [0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1]
 [1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0]]

Indexing and slicing.
  (0, 6)	1
  (0, 1)	1
  (0, 8)	1
  (0, 13)	1
  (0, 15)	1
  (0, 2)	1
  (0, 10)	1

  (0, 6)	1
  (0, 1)	1
  (0, 8)	1
  (0, 13)	1
  (0, 15)	1
  (0, 2)	1
  (0, 10)	1
  (1, 6)	1
  (1, 1)	1
  (1, 2)	1
  (1, 4)	1
  (1, 11)	1
  (1, 14)	1
  (1, 16)	1


## 3 | Cosine similarity

In [14]:
# cosine_similarity accepts array-likes as well as sparse matrices. Given a
# single matrix it compares every row with every other row, so our 4 documents
# produce a 4 x 4 pairwise similarity matrix.
similarity_matrix = cosine_similarity(bow)
print(similarity_matrix)

[[1.         0.42857143 0.15430335 0.        ]
 [0.42857143 1.         0.15430335 0.        ]
 [0.15430335 0.15430335 1.         0.        ]
 [0.         0.         0.         1.        ]]


## 4 | N-grams

In [15]:
# CountVectorizer can also generate n-grams via ngram_range=(min_n, max_n):
#   (1, 1) -> unigrams only (the default)
#   (1, 2) -> unigrams and bigrams
#   (2, 2) -> bigrams only
# NOTE: despite its name, `bigrams` below holds unigram AND bigram features,
# because ngram_range=(1, 2).
vectorizer = CountVectorizer(
  tokenizer=spacy_tokenizer,
  token_pattern=None,  # unused with a custom tokenizer; None avoids the warning
  lowercase=False,
  binary=True,
  ngram_range=(1, 2),
)
bigrams = vectorizer.fit_transform(corpus)

# Fetch the feature names once and reuse them for both printouts.
feature_names = vectorizer.get_feature_names_out()
print(feature_names)
print('Number of features: {}'.format(len(feature_names)))
print(vectorizer.vocabulary_)

['Aston' 'Aston Martin' 'Bull' 'Bull drops' 'F1' 'F1 engine' 'F1 leaving'
 'F1 partner' 'F1 title' 'Hamilton' 'Hamilton eyes' 'Honda' 'Honda exits'
 'Martin' 'Martin announces' 'Red' 'Red Bull' 'announces'
 'announces sponsor' 'drops' 'drops hint' 'eighth' 'eighth F1' 'engine'
 'exits' 'exits F1' 'eyes' 'eyes record' 'hint' 'hint on' 'leaving'
 'leaving F1' 'on' 'on F1' 'partner' 'partner Red' 'record'
 'record eighth' 'sponsor' 'title']
Number of features: 40
{'Red': 15, 'Bull': 2, 'drops': 19, 'hint': 28, 'on': 32, 'F1': 4, 'engine': 23, 'Red Bull': 16, 'Bull drops': 3, 'drops hint': 20, 'hint on': 29, 'on F1': 33, 'F1 engine': 5, 'Honda': 11, 'exits': 24, 'leaving': 30, 'partner': 34, 'Honda exits': 12, 'exits F1': 25, 'F1 leaving': 6, 'leaving F1': 31, 'F1 partner': 7, 'partner Red': 35, 'Hamilton': 9, 'eyes': 26, 'record': 36, 'eighth': 21, 'title': 39, 'Hamilton eyes': 10, 'eyes record': 27, 'record eighth': 37, 'eighth F1': 22, 'F1 title': 8, 'Aston': 0, 'Martin': 13, 'announces



In [16]:
# Dense view of the n-gram matrix: one row per document and one column per
# unigram/bigram feature (40 columns for this corpus).
print(bigrams.toarray())

[[0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 1 1 0 0
  0 0 0 0]
 [0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 1 1
  0 0 0 0]
 [0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0
  1 1 0 1]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 0]]
