### Term Frequency - Inverse Document Frequency (TF-IDF) Using sklearn

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
# Single-document corpus for the bag-of-words (CountVectorizer) demo below.
text = [
    "Try Checking the network cables, Try Checking the modem, and Try Checking the router. Reconnecting to WiFi",
]

In [2]:
# Learn the vocabulary and build the token-count matrix in a single pass.
vector = CountVectorizer()
vector2 = vector.fit_transform(text)  # document-term count matrix
vector1 = vector  # the fitted vectorizer (fit() hands back the estimator itself)
# vocabulary_ maps each lower-cased token to its column index in vector2.
print(vector1.vocabulary_)

{'try': 9, 'checking': 2, 'the': 7, 'network': 4, 'cables': 1, 'modem': 3, 'and': 0, 'router': 6, 'reconnecting': 5, 'to': 8, 'wifi': 10}


In [3]:
# Show the count matrix as a dense array: one row for the single document,
# one column per vocabulary token (column order follows the vocabulary indices).
print(vector2.toarray())

[[1 1 3 1 1 1 1 3 1 3 1]]


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings; it is fitted on `test` in a later cell.
vector_tfidf = TfidfVectorizer()

In [5]:
# The troubleshooting steps split into four short documents for the TF-IDF demo.
test = [
    "Try Checking the network cables",
    "Try Checking the modem",
    "and Try Checking the router",
    "Reconnecting to WiFi",
]

In [6]:
# Learn the vocabulary and IDF weights from the four short documents.
# NOTE(review): per scikit-learn's estimator convention fit() returns the
# vectorizer itself, so `vector3` should be an alias of `vector_tfidf`.
vector3 = vector_tfidf.fit(test)
# vocabulary_ maps each token to its column index in the transformed matrix.
print(vector3.vocabulary_)

{'try': 9, 'checking': 2, 'the': 7, 'network': 4, 'cables': 1, 'modem': 3, 'and': 0, 'router': 6, 'reconnecting': 5, 'to': 8, 'wifi': 10}


In [7]:
# Encode every document in `test` as a row of TF-IDF weights using the
# vectorizer fitted in the previous cell.
vector4 = vector_tfidf.transform(test)
# Dense view: one row per document, one column per vocabulary token.
print(vector4.toarray())

[[0.         0.55708525 0.3555803  0.         0.55708525 0.
  0.         0.3555803  0.         0.3555803  0.        ]
 [0.         0.         0.42817512 0.67081906 0.         0.
  0.         0.42817512 0.         0.42817512 0.        ]
 [0.55708525 0.         0.3555803  0.         0.         0.
  0.55708525 0.3555803  0.         0.3555803  0.        ]
 [0.         0.         0.         0.         0.         0.57735027
  0.         0.         0.57735027 0.         0.57735027]]


In [8]:
# Transform only the first document with the already-fitted vectorizer; the
# printed row matches the first row of the full TF-IDF matrix shown earlier.
test1 = test[0]
# transform() is given a one-element list so the string is treated as one document.
test1_tfidf = vector_tfidf.transform([test1])
print(test1_tfidf.toarray())

[[0.         0.55708525 0.3555803  0.         0.55708525 0.
  0.         0.3555803  0.         0.3555803  0.        ]]


In [9]:
# Fit a *separate* vectorizer on the single document. Calling
# vector_tfidf.fit([test1]) would refit the shared `vector_tfidf` in place,
# discarding the vocabulary/IDF learned from all of `test` and silently
# changing the result of any vector_tfidf.transform(...) call that runs (or is
# re-run) after this cell.
# NOTE: the name `test1_tfidf` is kept for compatibility; after this cell it
# refers to the fitted vectorizer, not to a TF-IDF matrix.
test1_tfidf = TfidfVectorizer().fit([test1])
# Vocabulary learned from the first document only.
print(test1_tfidf.vocabulary_)

{'try': 4, 'checking': 1, 'the': 3, 'network': 2, 'cables': 0}


### TF-IDF Using gensim

In [10]:
import gensim
import pprint
from gensim import corpora
from gensim.utils import simple_preprocess

In [11]:
# Small six-document corpus about NLP used for the gensim TF-IDF demo.
# Some entries end with a trailing space; the strings are kept verbatim.
doc_list = [
    "This is a book about Natural Language Processing.",
    'By "natural language" we mean a language that is used for everyday communication by humans; languages like English, Hindi or Portuguese.',
    "In contrast to artificial languages such as programming languages and mathematical notations,natural languages have evolved as they pass from generation to generation, and are hard to pin down with explicit rules. ",
    "We will take Natural Language Processing — or NLP for short — in a wide sense to cover any kind of computer manipulation of natural language. ",
    "At one extreme, it could be as simple as counting word frequencies to compare different writing styles. ",
    "At the other extreme, NLP involves understanding complete human utterances,at least to the extent of being able to give useful responses to them.",
]

# A second, smaller toy corpus (not referenced by the cells visible here).
doc_list2 = [
    "Hello, how are you ?",
    "How do you do ?",
    "Hey what are you doing ?",
    "yes you what are you doing ?",
]

In [12]:
# Tokenise every document with gensim's simple_preprocess (default settings).
doc_token = list(map(simple_preprocess, doc_list))
# Start with an empty token <-> id mapping; it is populated when the
# bag-of-words corpus is built.
dictionary = corpora.Dictionary()

In [13]:
# Convert each tokenised document into (token_id, count) pairs.
# allow_update=True lets doc2bow register previously unseen tokens in
# `dictionary` while it converts.
bow_corpus = [
    dictionary.doc2bow(tokens, allow_update=True)
    for tokens in doc_token
]
bow_corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(2, 1),
  (3, 2),
  (4, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1)],
 [(4, 1),
  (14, 3),
  (22, 2),
  (23, 1),
  (24, 1),
  (25, 2),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 2),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 3),
  (44, 1)],
 [(3, 2),
  (4, 2),
  (5, 1),
  (11, 1),
  (17, 1),
  (21, 1),
  (34, 1),
  (43, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 2),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1)],
 [(25, 2),
  (43, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1)],
 [(43, 3),
  (50, 1),
  (51, 1),
  (57, 2),
  (63, 1),
  (71, 1),
  (72, 1),
 

In [14]:
# Print each document's bag-of-words with the integer ids resolved back to
# their tokens via `dictionary`.
for doc in bow_corpus:
    print([
        [dictionary[token_id], count]
        for token_id, count in doc
    ])

[['about', 1], ['book', 1], ['is', 1], ['language', 1], ['natural', 1], ['processing', 1], ['this', 1]]
[['is', 1], ['language', 2], ['natural', 1], ['by', 2], ['communication', 1], ['english', 1], ['everyday', 1], ['for', 1], ['hindi', 1], ['humans', 1], ['languages', 1], ['like', 1], ['mean', 1], ['or', 1], ['portuguese', 1], ['that', 1], ['used', 1], ['we', 1]]
[['natural', 1], ['languages', 3], ['and', 2], ['are', 1], ['artificial', 1], ['as', 2], ['contrast', 1], ['down', 1], ['evolved', 1], ['explicit', 1], ['from', 1], ['generation', 2], ['hard', 1], ['have', 1], ['in', 1], ['mathematical', 1], ['notations', 1], ['pass', 1], ['pin', 1], ['programming', 1], ['rules', 1], ['such', 1], ['they', 1], ['to', 3], ['with', 1]]
[['language', 2], ['natural', 2], ['processing', 1], ['for', 1], ['or', 1], ['we', 1], ['in', 1], ['to', 1], ['any', 1], ['computer', 1], ['cover', 1], ['kind', 1], ['manipulation', 1], ['nlp', 1], ['of', 2], ['sense', 1], ['short', 1], ['take', 1], ['wide', 1], [

In [15]:
# Display the bag-of-words of the last document explicitly. Reading the bare
# loop variable `doc` here would depend on which loop ran most recently, so
# the result would change with cell execution order.
bow_corpus[-1]

[(43, 3),
 (50, 1),
 (51, 1),
 (57, 2),
 (63, 1),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 1),
 (76, 1),
 (77, 1),
 (78, 1),
 (79, 1),
 (80, 1),
 (81, 2),
 (82, 1),
 (83, 1),
 (84, 1),
 (85, 1)]

In [16]:
import numpy as np
from gensim import models
from gensim.models import TfidfModel

# Fit a TF-IDF model on the bag-of-words corpus. `smartirs='ntc'` is gensim's
# SMART notation (see the TfidfModel docs): raw term frequency, idf document
# weighting, cosine normalisation.
tfidf = TfidfModel(bow_corpus, smartirs='ntc')

# Print every document as [token, weight] pairs, weights rounded to 2 decimals.
for doc in tfidf[bow_corpus]:
    # float(...) turns the NumPy scalar returned by np.around into a plain
    # Python float, so the list prints as 0.49 rather than np.float64(0.49)
    # on NumPy >= 2. `token_id` avoids shadowing the builtin `id`.
    print([
        [dictionary[token_id], float(np.around(weight, decimals=2))]
        for token_id, weight in doc
    ])

[['about', 0.49], ['book', 0.49], ['is', 0.32], ['language', 0.22], ['natural', 0.14], ['processing', 0.32], ['this', 0.49]]
[['is', 0.16], ['language', 0.21], ['natural', 0.07], ['by', 0.49], ['communication', 0.24], ['english', 0.24], ['everyday', 0.24], ['for', 0.16], ['hindi', 0.24], ['humans', 0.24], ['languages', 0.16], ['like', 0.24], ['mean', 0.24], ['or', 0.16], ['portuguese', 0.24], ['that', 0.24], ['used', 0.24], ['we', 0.16]]
[['natural', 0.05], ['languages', 0.34], ['and', 0.35], ['are', 0.18], ['artificial', 0.18], ['as', 0.23], ['contrast', 0.18], ['down', 0.18], ['evolved', 0.18], ['explicit', 0.18], ['from', 0.18], ['generation', 0.35], ['hard', 0.18], ['have', 0.18], ['in', 0.11], ['mathematical', 0.18], ['notations', 0.18], ['pass', 0.18], ['pin', 0.18], ['programming', 0.18], ['rules', 0.18], ['such', 0.18], ['they', 0.18], ['to', 0.15], ['with', 0.18]]
[['language', 0.22], ['natural', 0.15], ['processing', 0.16], ['for', 0.16], ['or', 0.16], ['we', 0.16], ['in', 0.