# Text Vectorization
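This notebook introduces several ways to convert text into numeric vectors: bag of words, n-grams, TF-IDF (with scikit-learn and Gensim), padded integer sequences, and hashed one-hot encoding.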

## BOW

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: five short sentences that all contain the word "Age".
texts = [
    'There used to be Stone Age',
    'There used to be Bronze Age bronze',
    'There used to be Iron Age',
    'There was Age of Revolution',
    'Now it is Digital Age'
]
# Word-level bag of words: learn the vocabulary and count how often each
# term occurs in each document, in a single fit_transform pass.
vectorizer = CountVectorizer(analyzer='word')
vec = vectorizer.fit_transform(texts)
# vocabulary_ maps every (lowercased) token to its column index in `vec`.
print(vectorizer.vocabulary_)

{'there': 11, 'used': 13, 'to': 12, 'be': 1, 'stone': 10, 'age': 0, 'bronze': 2, 'iron': 4, 'was': 14, 'of': 8, 'revolution': 9, 'now': 7, 'it': 6, 'is': 5, 'digital': 3}


In [30]:
# Printing the sparse matrix lists only the non-zero entries, one per line,
# as "(document row, vocabulary column)  count".
print(vec)

  (0, 0)	1
  (0, 10)	1
  (0, 1)	1
  (0, 12)	1
  (0, 13)	1
  (0, 11)	1
  (1, 2)	2
  (1, 0)	1
  (1, 1)	1
  (1, 12)	1
  (1, 13)	1
  (1, 11)	1
  (2, 4)	1
  (2, 0)	1
  (2, 1)	1
  (2, 12)	1
  (2, 13)	1
  (2, 11)	1
  (3, 9)	1
  (3, 8)	1
  (3, 14)	1
  (3, 0)	1
  (3, 11)	1
  (4, 3)	1
  (4, 5)	1
  (4, 6)	1
  (4, 7)	1
  (4, 0)	1


In [31]:
# Dense view of the same counts: one row per document, one column per
# vocabulary entry.
print(vec.toarray())

[[1 1 0 0 0 0 0 0 0 0 1 1 1 1 0]
 [1 1 2 0 0 0 0 0 0 0 0 1 1 1 0]
 [1 1 0 0 1 0 0 0 0 0 0 1 1 1 0]
 [1 0 0 0 0 0 0 0 1 1 0 1 0 0 1]
 [1 0 0 1 0 1 1 1 0 0 0 0 0 0 0]]


In [32]:
# Vectorize an unseen sentence with the vocabulary learned by the fitted
# CountVectorizer. The result is stored under its own name so the training
# matrix `vec` is not overwritten and the earlier print cells still show the
# training data if they are re-run.
query_vec = vectorizer.transform(['There was Stone Age'])
print(query_vec.toarray())

[[1 0 0 0 0 0 0 0 0 0 1 1 0 0 1]]


## N-Grams

In [45]:
# Same toy corpus as in the BOW section.
texts = [
    'There used to be Stone Age',
    'There used to be Bronze Age bronze',
    'There used to be Iron Age',
    'There was Age of Revolution',
    'Now it is Digital Age'
]
# ngram_range=(2, 2) makes every feature a bigram (two adjacent words)
# instead of a single word.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
vec = vectorizer.fit_transform(texts)
# vocabulary_ maps each bigram to its column index in `vec`.
print(vectorizer.vocabulary_)

{'there used': 13, 'used to': 16, 'to be': 15, 'be stone': 4, 'stone age': 12, 'be bronze': 2, 'bronze age': 5, 'age bronze': 0, 'be iron': 3, 'iron age': 7, 'there was': 14, 'was age': 17, 'age of': 1, 'of revolution': 11, 'now it': 10, 'it is': 9, 'is digital': 8, 'digital age': 6}


In [46]:
# Non-zero bigram counts as "(document row, bigram column)  count".
print(vec)

  (0, 12)	1
  (0, 4)	1
  (0, 15)	1
  (0, 16)	1
  (0, 13)	1
  (1, 0)	1
  (1, 5)	1
  (1, 2)	1
  (1, 15)	1
  (1, 16)	1
  (1, 13)	1
  (2, 7)	1
  (2, 3)	1
  (2, 15)	1
  (2, 16)	1
  (2, 13)	1
  (3, 11)	1
  (3, 1)	1
  (3, 17)	1
  (3, 14)	1
  (4, 6)	1
  (4, 8)	1
  (4, 9)	1
  (4, 10)	1


In [47]:
# Dense view: one row per document, one column per bigram in the vocabulary.
print(vec.toarray())

[[0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 1 0]
 [1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0]
 [0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 1 0]
 [0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1]
 [0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0]]


In [48]:
# Vectorize an unseen sentence with the fitted bigram vocabulary. Bigrams
# that were not seen during fitting (here 'was stone') have no column and
# are simply dropped. A separate name keeps the training matrix `vec` intact
# so the earlier print cells stay valid when re-run.
query_vec = vectorizer.transform(['There was Stone Age'])
print(query_vec.toarray())

[[0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0]]


## TF-IDF

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same toy corpus as in the BOW section.
texts = [
    'There used to be Stone Age',
    'There used to be Bronze Age bronze',
    'There used to be Iron Age',
    'There was Age of Revolution',
    'Now it is Digital Age'
]
# Word-level TF-IDF: terms that occur in many documents (e.g. 'age', which is
# in all five) receive lower weights than rare ones, as the printed weights
# in the following cells show.
# NOTE(review): smooth_idf=True is, per the scikit-learn docs, the default
# setting; it is spelled out here for clarity -- confirm against the
# installed version.
vectorizer = TfidfVectorizer(analyzer='word', smooth_idf=True)
vec = vectorizer.fit_transform(texts)
# The vocabulary (token -> column index) is identical to the BOW one.
print(vectorizer.vocabulary_)

{'there': 11, 'used': 13, 'to': 12, 'be': 1, 'stone': 10, 'age': 0, 'bronze': 2, 'iron': 4, 'was': 14, 'of': 8, 'revolution': 9, 'now': 7, 'it': 6, 'is': 5, 'digital': 3}


In [50]:
# Non-zero TF-IDF weights as "(document row, vocabulary column)  weight".
print(vec)

  (0, 11)	0.33140159786840845
  (0, 13)	0.3939481437168047
  (0, 12)	0.3939481437168047
  (0, 1)	0.3939481437168047
  (0, 10)	0.5882354607969754
  (0, 0)	0.28029734885918384
  (1, 11)	0.23213777065833785
  (1, 13)	0.27594991824306836
  (1, 12)	0.27594991824306836
  (1, 1)	0.27594991824306836
  (1, 0)	0.1963406395869283
  (1, 2)	0.8240857580041683
  (2, 11)	0.33140159786840845
  (2, 13)	0.3939481437168047
  (2, 12)	0.3939481437168047
  (2, 1)	0.3939481437168047
  (2, 0)	0.28029734885918384
  (2, 4)	0.5882354607969754
  (3, 11)	0.2992461174212536
  (3, 0)	0.25310044945192844
  (3, 14)	0.5311597134872388
  (3, 8)	0.5311597134872388
  (3, 9)	0.5311597134872388
  (4, 0)	0.2317654623904255
  (4, 7)	0.48638584746139363
  (4, 6)	0.48638584746139363
  (4, 5)	0.48638584746139363
  (4, 3)	0.48638584746139363


In [51]:
# Dense view: one row of TF-IDF weights per document. In the recorded output
# each row has unit Euclidean length (e.g. the last row: 0.2318^2 + 4 * 0.4864^2 ~= 1).
print(vec.toarray())

[[0.28029735 0.39394814 0.         0.         0.         0.
  0.         0.         0.         0.         0.58823546 0.3314016
  0.39394814 0.39394814 0.        ]
 [0.19634064 0.27594992 0.82408576 0.         0.         0.
  0.         0.         0.         0.         0.         0.23213777
  0.27594992 0.27594992 0.        ]
 [0.28029735 0.39394814 0.         0.         0.58823546 0.
  0.         0.         0.         0.         0.         0.3314016
  0.39394814 0.39394814 0.        ]
 [0.25310045 0.         0.         0.         0.         0.
  0.         0.         0.53115971 0.53115971 0.         0.29924612
  0.         0.         0.53115971]
 [0.23176546 0.         0.         0.48638585 0.         0.48638585
  0.48638585 0.48638585 0.         0.         0.         0.
  0.         0.         0.        ]]


In [52]:
# Weight an unseen sentence with the IDF statistics learned from the training
# corpus. A separate name keeps the training matrix `vec` intact so the
# earlier print cells stay valid when re-run.
query_vec = vectorizer.transform(['There was Stone Age'])
print(query_vec.toarray())

[[0.29872406 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.62690599 0.3531879
  0.         0.         0.62690599]]


## Gensim

In [53]:
from gensim import corpora, models

# Raw documents: the same toy corpus used in the scikit-learn sections.
corpus = [
    'There used to be Stone Age',
    'There used to be Bronze Age bronze',
    'There used to be Iron Age',
    'There was Age of Revolution',
    'Now it is Digital Age'
]
# Tokenize: lowercase each sentence and split it on whitespace, giving one
# list of tokens per document.
texts = [sentence.lower().split() for sentence in corpus]
# Assign an integer id to every distinct token across the corpus.
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)

{'age': 0, 'be': 1, 'stone': 2, 'there': 3, 'to': 4, 'used': 5, 'bronze': 6, 'iron': 7, 'of': 8, 'revolution': 9, 'was': 10, 'digital': 11, 'is': 12, 'it': 13, 'now': 14}


In [54]:
# Turn every tokenized document into a sparse bag of words: a list of
# (token id, count) pairs. Note that `corpus` is rebound here, from the raw
# strings to their bag-of-words form; later cells use the new meaning.
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)], [(0, 1), (1, 1), (3, 1), (4, 1), (5, 1), (6, 2)], [(0, 1), (1, 1), (3, 1), (4, 1), (5, 1), (7, 1)], [(0, 1), (3, 1), (8, 1), (9, 1), (10, 1)], [(0, 1), (11, 1), (12, 1), (13, 1), (14, 1)]]


In [55]:
# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same
# corpus via indexing.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
# Each document prints as a list of (token id, weight) pairs. In the recorded
# output token id 0 ('age') never appears: it occurs in all five documents.
for doc in corpus_tfidf:
    print(doc)

[(1, 0.27610534488362876), (2, 0.8699140943739595), (3, 0.12061087840494013), (4, 0.27610534488362876), (5, 0.27610534488362876)]
[(1, 0.1526807310725103), (3, 0.06669540243027648), (4, 0.1526807310725103), (5, 0.1526807310725103), (6, 0.9620901758006651)]
[(1, 0.27610534488362876), (3, 0.12061087840494013), (4, 0.27610534488362876), (5, 0.27610534488362876), (7, 0.8699140943739595)]
[(3, 0.07979258234193365), (8, 0.5755093812740171), (9, 0.5755093812740171), (10, 0.5755093812740171)]
[(11, 0.5), (12, 0.5), (13, 0.5), (14, 0.5)]


In [56]:
# Vectorize an unseen sentence with the existing dictionary and TF-IDF model.
query = 'There was Stone Age'
# Tokenize exactly as the training documents were (lowercase + whitespace
# split) before mapping tokens to (token id, count) pairs.
query_bow = dictionary.doc2bow(query.lower().split())
print(query_bow)
# Re-weight the query counts with the fitted TF-IDF model.
query_tfidf = tfidf[query_bow]
print(query_tfidf)

[(0, 1), (2, 1), (3, 1), (10, 1)]
[(2, 0.7037329116471254), (3, 0.09757037526482304), (10, 0.7037329116471254)]


## Sequence

In [71]:
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing import sequence

# Same toy corpus as in the earlier sections.
train_texts = [
    'There used to be Stone Age',
    'There used to be Bronze Age bronze',
    'There used to be Iron Age',
    'There was Age of Revolution',
    'Now it is Digital Age'
]

# Build a word -> integer index from the training texts.
# NOTE(review): num_words=20000 is far larger than this 15-word vocabulary,
# so it has no visible effect here -- presumably carried over from a real
# dataset; confirm.
tokenizer = text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(train_texts)
# NOTE(review): this prints only the object's repr (class name and memory
# address); printing tokenizer.word_index would show the learned mapping.
print(tokenizer)

# Replace every word by its integer index. In the recorded output 'age' maps
# to 1, and 'Bronze'/'bronze' share the index 6.
x_train = tokenizer.texts_to_sequences(train_texts)
print(x_train)

<keras_preprocessing.text.Tokenizer object at 0x7fba046334a8>
[[2, 3, 4, 5, 7, 1], [2, 3, 4, 5, 6, 1, 6], [2, 3, 4, 5, 8, 1], [2, 9, 1, 10, 11], [12, 13, 14, 15, 1]]


In [72]:
# Hard upper bound on the length of a padded sequence.
MAX_SEQUENCE_LENGTH = 500
# Length of the longest tokenized training text, capped at the bound above.
max_len = min(max(map(len, x_train)), MAX_SEQUENCE_LENGTH)
print(max_len)

7


In [62]:
# Right-pad ('post') every training sequence with zeros up to `max_len`, the
# capped longest-sequence length computed above. Using the variable instead
# of the literal 7 keeps the padding correct if the corpus or
# MAX_SEQUENCE_LENGTH changes.
pad_x_train = sequence.pad_sequences(x_train, maxlen=max_len, padding='post')
print(pad_x_train)

[[ 2  3  4  5  7  1  0]
 [ 2  3  4  5  6  1  6]
 [ 2  3  4  5  8  1  0]
 [ 2  9  1 10 11  0  0]
 [12 13 14 15  1  0  0]]


In [65]:
# Encode an unseen sentence with the fitted tokenizer's word index.
seq = tokenizer.texts_to_sequences(['There was Stone Age'])
print(seq)
# Pad to the same length as the training sequences (`max_len`, computed
# above) rather than a hard-coded 7, so query and training shapes always
# agree.
pad_seq = sequence.pad_sequences(seq, maxlen=max_len, padding='post')
print(pad_seq)

[[2, 9, 7, 1]]
[[2 9 7 1 0 0 0]]


## One-hot

In [69]:
# Encode each word of the sentence as an integer, with n=12 passed as the
# size of the index space.
# NOTE(review): per the Keras docs this encoding is hash-based, so different
# words may collide and results may vary between Python processes -- confirm.
# NOTE(review): the recorded output has 3 values for a 4-word input, so it
# looks stale; re-run the cell. 'Join' may be a typo for 'John'.
one_hot = text.one_hot('my name is Join', n=12)
print(one_hot)

[5, 1, 4]
