In [None]:
import os
import re
import pandas as pd
import numpy as np
from collections import Counter

Let's take an example of five sentences. We will first build a vocabulary of the words they contain and assign a number to each word (its position in the vocabulary list).

In [None]:
# Toy corpus: five short sentences, lower-cased so that tokens such as
# "They" and "they" map to the same vocabulary entry.
all_text = [
    sentence.lower()
    for sentence in (
        "He is playing in the field",
        "He is running around with his friends",
        "They are playing football",
        "It started raining while they were playing",
        "They stopped playing football and are now playing ludo",
    )
]
# Keep the individual sentence names available alongside the list.
sent1, sent2, sent3, sent4, sent5 = all_text

In [None]:
# Corpus-wide frequency of every whitespace-separated token.
vocab_dict = Counter(token for sent in all_text for token in sent.split())
# Counter keys are already unique, so no set() is needed. Sorting gives the
# vocabulary a stable order: iterating a set of strings depends on per-process
# hash randomization, which made the column order change between kernel
# restarts.
vocab = sorted(vocab_dict)
vocab_len = len(vocab)

In [None]:
# Show the vocabulary and its size, one per line.
print(vocab, vocab_len, sep="\n")

['he', 'it', 'running', 'with', 'and', 'while', 'in', 'started', 'they', 'raining', 'ludo', 'field', 'are', 'stopped', 'playing', 'football', 'the', 'is', 'now', 'his', 'were', 'friends', 'around']
23


## Count Vectorizer
Taking each sentence one at a time, we'll read each word and count its total occurrences in that sentence. Each sentence starts as a vector of zeros with one slot per vocabulary word. Once we have a word's count, we find the position of that word in the vocabulary list above and replace the zero at that position with the count. This is repeated for all words and for all sentences.

In [None]:
def get_count_vectors(texts=None, vocabulary=None):
  """Build one bag-of-words count vector per sentence.

  Args:
    texts: Iterable of sentences (strings). Each sentence is tokenized by
      splitting on whitespace. Defaults to the notebook-level `all_text`.
    vocabulary: Sequence of terms that fixes the column order of every
      vector. Defaults to the notebook-level `vocab`.

  Returns:
    A list with one 1-D float NumPy array per sentence. Element `j` of a
    vector is the number of times `vocabulary[j]` occurs in that sentence.
  """
  if texts is None:
    texts = all_text
  if vocabulary is None:
    vocabulary = vocab

  all_vectors = []
  for sent in texts:
    sent_counter = Counter(sent.split())
    # A Counter returns 0 for missing keys, so no membership test is needed.
    all_vectors.append(
        np.array([sent_counter[term] for term in vocabulary], dtype=float)
    )
  # Return only after every sentence is processed. The previous version
  # returned from inside the loop, leaving all vectors but the first as zeros.
  return all_vectors

In [None]:
# Vectorize the corpus, then show the vectors followed by the vocabulary so
# each column can be matched to its word.
all_vectors = get_count_vectors()
print(all_vectors, vocab, sep="\n")

[array([1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1.,
       1., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0.])]
['he', 'it', 'running', 'with', 'and', 'while', 'in', 'started', 'they', 'raining', 'ludo', 'field', 'are', 'stopped', 'playing', 'football', 'the', 'is', 'now', 'his', 'were', 'friends', 'around']


Creating Count Vectorizer Using Sklearn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus and encode every sentence as a row of
# per-term counts.
vectorizer = CountVectorizer()
sentence_vectors = vectorizer.fit_transform(all_text)

# The result is densified with .toarray() for display.
print(sentence_vectors.toarray())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. .tolist() keeps the printed
# form a plain Python list.
print(vectorizer.get_feature_names_out().tolist())

# TF-IDF

TF (Term Frequency) — It is defined as the number of times a word appears in the given sentence.

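IDF (Inverse Document Frequency) — It is defined as the natural logarithm (base $e$) of the total number of documents divided by the number of documents in which the word appears: $\mathrm{idf}(t) = \ln\frac{N}{\mathrm{df}(t)}$. Note that scikit-learn's `TfidfVectorizer` uses, by default, a smoothed variant, $\mathrm{idf}(t) = \ln\frac{1 + N}{1 + \mathrm{df}(t)} + 1$, and then L2-normalizes each sentence vector, so the numbers it produces differ from the plain textbook formula.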

For each word in each sentence, we'll calculate the TF-IDF value (the product TF × IDF) and update the corresponding entry in that sentence's vector.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF model on the corpus and encode every sentence as a row of
# TF-IDF weights.
tfidfvectorizer = TfidfVectorizer()
sentence_vectors = tfidfvectorizer.fit_transform(all_text)
print(sentence_vectors.toarray())
# Read the column names from the TF-IDF vectorizer itself. The previous code
# used `vectorizer` (the CountVectorizer instance), which raises NameError
# when that cell has not been run. It also called get_feature_names(), which
# was removed in scikit-learn 1.2.
print(tfidfvectorizer.get_feature_names_out().tolist())

[[0.         0.         0.         0.46528078 0.         0.
  0.3753856  0.         0.46528078 0.3753856  0.         0.
  0.         0.26213107 0.         0.         0.         0.
  0.46528078 0.         0.         0.         0.        ]
 [0.         0.         0.39835162 0.         0.         0.39835162
  0.32138758 0.39835162 0.         0.32138758 0.         0.
  0.         0.         0.         0.39835162 0.         0.
  0.         0.         0.         0.         0.39835162]
 [0.         0.56106597 0.         0.         0.56106597 0.
  0.         0.         0.         0.         0.         0.
  0.         0.39179133 0.         0.         0.         0.
  0.         0.46573544 0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.41645294 0.
  0.         0.23462232 0.41645294 0.         0.41645294 0.
  0.         0.27890339 0.41645294 0.41645294 0.        ]
 [0.37742714 0.30450584 0.         0.   

NameError: ignored