#### Text Vectorization Techniques

In [98]:
import re
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim

In [90]:
# Toy corpus: three sentences about machine learning held in a single string.
# Adjacent string literals are concatenated by Python, so the value contains
# no line breaks.
text = (
    "Machine learning is a powerful technology that helps computers learn from data. "
    "Deep learning is a subset of machine learning that uses neural networks. "
    "Computer vision uses machine learning to analyze images and videos."
)

# Split the corpus into sentences; each sentence is treated as one document
# by the vectorizers used in the following cells.
data = nltk.sent_tokenize(text)
data

['Machine learning is a powerful technology that helps computers learn from data.',
 'Deep learning is a subset of machine learning that uses neural networks.',
 'Computer vision uses machine learning to analyze images and videos.']

Bag of Words

In [91]:
# Bag of words: lowercase the text, drop English stop words, then count how
# often each remaining term occurs in each sentence.
cv = CountVectorizer(stop_words='english', lowercase=True)
bow = cv.fit_transform(data)

# Vocabulary size, followed by the term -> column-index mapping.
print(len(cv.vocabulary_), cv.vocabulary_, sep='\n')

# Dense count vectors of the first two sentences, one per line group.
print(*(bow[row].toarray() for row in (0, 1)), sep='\n')

18
{'machine': 9, 'learning': 8, 'powerful': 12, 'technology': 14, 'helps': 5, 'computers': 2, 'learn': 7, 'data': 3, 'deep': 4, 'subset': 13, 'uses': 15, 'neural': 11, 'networks': 10, 'computer': 1, 'vision': 17, 'analyze': 0, 'images': 6, 'videos': 16}
[[0 0 1 1 0 1 0 1 1 1 0 0 1 0 1 0 0 0]]
[[0 0 0 0 1 0 0 0 2 1 1 1 0 1 0 1 0 0]]


In [92]:
# Encode an unseen query with the vocabulary fitted on `data`. Terms that are
# not in that vocabulary (here "algorithms") have no column and are ignored.
cv_transform = (
    cv.transform(["machine learning algorithms neural networks"])
    .toarray()[0]  # one query -> take its single row as a 1-D vector
)
cv_transform

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0])

N-Grams

In [93]:
# N-gram bag of words: same preprocessing as the plain bag of words, but the
# features are unigrams, bigrams and trigrams (ngram_range=(1, 3)).
ng_cv = CountVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    lowercase=True,
)
ng_transform_data = ng_cv.fit_transform(data)

# Vocabulary size, followed by the n-gram -> column-index mapping.
print(len(ng_cv.vocabulary_), ng_cv.vocabulary_, sep='\n')

# Dense count vectors of the first two sentences.
print(*(ng_transform_data[row].toarray() for row in (0, 1)), sep='\n')

55
{'machine': 29, 'learning': 20, 'powerful': 37, 'technology': 43, 'helps': 13, 'computers': 6, 'learn': 18, 'data': 9, 'machine learning': 30, 'learning powerful': 23, 'powerful technology': 38, 'technology helps': 44, 'helps computers': 14, 'computers learn': 7, 'learn data': 19, 'machine learning powerful': 32, 'learning powerful technology': 24, 'powerful technology helps': 39, 'technology helps computers': 45, 'helps computers learn': 15, 'computers learn data': 8, 'deep': 10, 'subset': 40, 'uses': 46, 'neural': 35, 'networks': 34, 'deep learning': 11, 'learning subset': 25, 'subset machine': 41, 'learning uses': 27, 'uses neural': 49, 'neural networks': 36, 'deep learning subset': 12, 'learning subset machine': 26, 'subset machine learning': 42, 'machine learning uses': 33, 'learning uses neural': 28, 'uses neural networks': 50, 'computer': 3, 'vision': 52, 'analyze': 0, 'images': 16, 'videos': 51, 'computer vision': 4, 'vision uses': 53, 'uses machine': 47, 'learning analyze':

In [94]:
# Encode the same unseen query with the fitted n-gram vocabulary; only
# n-grams that were seen while fitting on `data` can receive a count.
ng_cv_transform = (
    ng_cv.transform(["machine learning algorithms neural networks"])
    .toarray()[0]  # one query -> take its single row as a 1-D vector
)
ng_cv_transform

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

TF-IDF Vectorization

In [95]:
# TF-IDF: same tokenisation settings as the bag-of-words model, but each
# count is re-weighted so that terms occurring in fewer sentences score higher.
tfidf = TfidfVectorizer(stop_words='english', lowercase=True)
tfidf_data = tfidf.fit_transform(data)

# Vocabulary size, followed by the term -> column-index mapping.
print(len(tfidf.vocabulary_), tfidf.vocabulary_, sep='\n')

# Dense TF-IDF vectors of the first two sentences.
print(*(tfidf_data[row].toarray() for row in (0, 1)), sep='\n')

18
{'machine': 9, 'learning': 8, 'powerful': 12, 'technology': 14, 'helps': 5, 'computers': 2, 'learn': 7, 'data': 3, 'deep': 4, 'subset': 13, 'uses': 15, 'neural': 11, 'networks': 10, 'computer': 1, 'vision': 17, 'analyze': 0, 'images': 6, 'videos': 16}
[[0.         0.         0.38640134 0.38640134 0.         0.38640134
  0.         0.38640134 0.22821485 0.22821485 0.         0.
  0.38640134 0.         0.38640134 0.         0.         0.        ]]
[[0.         0.         0.         0.         0.39769885 0.
  0.         0.         0.46977469 0.23488735 0.39769885 0.39769885
  0.         0.39769885 0.         0.30246022 0.         0.        ]]


In [96]:
# TF-IDF vector of the unseen query, using the vocabulary and idf weights
# fitted on `data`. "algorithms" is out of vocabulary and contributes nothing.
(
    tfidf.transform(["machine learning algorithms neural networks"])
    .toarray()[0]  # one query -> take its single row as a 1-D vector
)

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.35959372, 0.35959372,
       0.6088451 , 0.6088451 , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        ])

In [97]:
# Fitted idf weight of every feature, then the feature names in the same
# column order, so position i of the first array belongs to name i of the second.
print(tfidf.idf_, tfidf.get_feature_names_out(), sep='\n')

[1.69314718 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.         1.         1.69314718 1.69314718
 1.69314718 1.69314718 1.69314718 1.28768207 1.69314718 1.69314718]
['analyze' 'computer' 'computers' 'data' 'deep' 'helps' 'images' 'learn'
 'learning' 'machine' 'networks' 'neural' 'powerful' 'subset' 'technology'
 'uses' 'videos' 'vision']


Word2Vec
###### Note: Word2Vec does not work well on small datasets

In [112]:
# Lowercase and tokenise the whole corpus into one flat list of tokens.
# The keyword values below are gensim's documented defaults, written out to
# make the filtering visible.
# NOTE(review): the variable name is misspelled ("proccessed"); it is kept
# as-is because later cells reference it.
proccessed_text = gensim.utils.simple_preprocess(
    text,
    deacc=False,
    min_len=2,   # single-character tokens such as "a" are discarded
    max_len=15,
)
proccessed_text

['machine',
 'learning',
 'is',
 'powerful',
 'technology',
 'that',
 'helps',
 'computers',
 'learn',
 'from',
 'data',
 'deep',
 'learning',
 'is',
 'subset',
 'of',
 'machine',
 'learning',
 'that',
 'uses',
 'neural',
 'networks',
 'computer',
 'vision',
 'uses',
 'machine',
 'learning',
 'to',
 'analyze',
 'images',
 'and',
 'videos']

In [119]:
# Create an untrained Word2Vec model and register its vocabulary.
#   window=4     maximum distance between a target word and its context words
#   min_count=1  keep every token; the corpus is tiny, so nothing is pruned
#   seed=1       gensim's default seed, stated explicitly
#   workers=1    a single training thread, so thread scheduling cannot change
#                the result between runs
# gensim's documentation notes that fully deterministic runs additionally
# require a fixed PYTHONHASHSEED for the Python process.
word2vec_model = gensim.models.Word2Vec(window=4, min_count=1, workers=1, seed=1)

# The whole corpus is supplied as one "sentence" (a single list of tokens).
word2vec_model.build_vocab([proccessed_text], progress_per=1)

In [120]:
# Train on the single token list, reusing the example count recorded by
# build_vocab() and the epoch count stored on the model. The displayed value
# is whatever train() returns (a pair of word counts).
word2vec_model.train(
    [proccessed_text],
    total_examples=word2vec_model.corpus_count,
    epochs=word2vec_model.epochs,
)

(21, 160)

In [124]:
# Ten nearest neighbours of "deep" in the learned vector space, with their
# similarity scores (topn=10 is the library default, written out here).
word2vec_model.wv.most_similar(positive=['deep'], topn=10)

[('learning', 0.21921981871128082),
 ('images', 0.17476163804531097),
 ('and', 0.1637784093618393),
 ('is', 0.10880494117736816),
 ('computers', 0.10782014578580856),
 ('subset', 0.06551405042409897),
 ('from', 0.05962836369872093),
 ('of', 0.04915107786655426),
 ('videos', 0.0477207787334919),
 ('machine', 0.022456886246800423)]