# TF-IDF stands for "term frequency-inverse document frequency." It scores how important a word (or "term") is to a document by weighing how often the word appears in that document against how many documents in the collection contain it.

# As a result, common words like "the" and "for," which appear in many documents, are scaled down, while words that appear frequently in one document but in few others are scaled up.

In [1]:
import math
from textblob import TextBlob as tb

def tf(word, blob):
    """Return the term frequency of ``word`` in ``blob``.

    The frequency is the number of times ``word`` is counted in
    ``blob.words`` divided by the total number of words in the document.
    How matches are counted (for example, case handling) is whatever
    ``blob.words.count`` implements.

    Args:
        word: The term to look up.
        blob: A document object exposing a ``words`` sequence that supports
            ``len()`` and ``count()``.

    Returns:
        The relative frequency as a float, or ``0.0`` when the document
        contains no words.
    """
    words = blob.words
    total = len(words)
    if not total:
        # An empty document has no terms; avoid dividing by zero.
        return 0.0
    return words.count(word) / total

def n_containing(word, bloblist):
    """Count the documents in ``bloblist`` whose ``words`` contain ``word``.

    Args:
        word: The term to look for.
        bloblist: An iterable of document objects exposing a ``words``
            container that supports the ``in`` operator.

    Returns:
        The number of documents containing ``word`` (``0`` if none do).
    """
    matching_documents = 0
    for document in bloblist:
        if word in document.words:
            matching_documents += 1
    return matching_documents

def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` over ``bloblist``.

    Computed as ``log(N / (1 + n))`` where ``N`` is the number of documents
    and ``n`` is the number of documents containing ``word``.  The ``1 +``
    keeps the denominator non-zero for unseen words; as a consequence a word
    found in every document gets a slightly negative score.

    Args:
        word: The term to score.
        bloblist: A sequence of documents accepted by ``n_containing``.

    Returns:
        The natural-log inverse document frequency as a float.

    Raises:
        ValueError: If ``bloblist`` is empty (``math.log`` of zero).
    """
    total_documents = len(bloblist)
    smoothed_matches = 1 + n_containing(word, bloblist)
    return math.log(total_documents / smoothed_matches)

def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word`` for ``blob`` within ``bloblist``.

    Args:
        word: The term to score.
        blob: The document in which the term frequency is measured.
        bloblist: The collection of documents used for the inverse
            document frequency.

    Returns:
        The product of ``tf(word, blob)`` and ``idf(word, bloblist)``.
    """
    term_frequency = tf(word, blob)
    inverse_document_frequency = idf(word, bloblist)
    return term_frequency * inverse_document_frequency

In [6]:
# Sample document 1: a passage about the 2000 made-for-TV horror movie "Python".
document1 = tb("""Python is a 2000 made-for-TV horror movie directed by Richard
Clabaugh. The film features several cult favorite actors, including William
Zabka of The Karate Kid fame, Wil Wheaton, Casper Van Dien, Jenny McCarthy,
Keith Coogan, Robert Englund (best known for his role as Freddy Krueger in the
A Nightmare on Elm Street series of films), Dana Barron, David Bowe, and Sean
Whalen. The film concerns a genetically engineered snake, a python, that
escapes and unleashes itself on a small town. It includes the classic final
girl scenario evident in films like Friday the 13th. It was filmed in Los Angeles,
 California and Malibu, California. Python was followed by two sequels: Python
 II (2002) and Boa vs. Python (2004), both also made-for-TV films.""")

# Sample document 2: a passage about the snake genus Python.
document2 = tb("""Python, from the Greek word (πύθων/πύθωνας), is a genus of
nonvenomous pythons[2] found in Africa and Asia. Currently, 7 species are
recognised.[2] A member of this genus, P. reticulatus, is among the longest
snakes known.""")

# Sample document 3: a passage about the Colt Python revolver.
# NOTE(review): this text contains an HTML-escaped ampersand, presumably an
# export artifact; it is part of the runtime data, so confirm before changing.
document3 = tb("""The Colt Python is a .357 Magnum caliber revolver formerly
manufactured by Colt's Manufacturing Company of Hartford, Connecticut.
It is sometimes referred to as a "Combat Magnum".[1] It was first introduced
in 1955, the same year as Smith &amp; Wesson's M29 .44 Magnum. The now discontinued
Colt Python targeted the premium revolver market segment. Some firearm
collectors and writers such as Jeff Cooper, Ian V. Hogg, Chuck Hawks, Leroy
Thompson, Renee Smeets and Martin Dougherty have described the Python as the
finest production revolver ever made.""")

# The document collection over which inverse document frequency is computed.
bloblist = [document1, document2, document3]
# For every document, print all TF-IDF scores and then the three terms with
# the highest score.
for i, blob in enumerate(bloblist):
    print("Top words in document {}------------------------------------->>>>>".format(i + 1))
    # Score each distinct token only once: every tfidf() call rescans all
    # documents, so recomputing it for repeated tokens is wasted work.  The
    # resulting word -> score mapping is the same as scoring every token.
    scores = {}
    for word in blob.words:
        if word not in scores:
            scores[word] = tfidf(word, blob, bloblist)
    print(scores.items())
    # Rank (word, score) pairs from highest to lowest score.
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in document 1------------------------------------->>>>>
dict_items([('for', 0.0033234844926898722), ('two', 0.0033234844926898722), ('is', -0.0023580497741949257), ('film', 0.0066469689853797444), ('followed', 0.0033234844926898722), ('The', 0.0), ('Dana', 0.0033234844926898722), ('California', 0.0066469689853797444), ('girl', 0.0033234844926898722), ('role', 0.0033234844926898722), ('Robert', 0.0033234844926898722), ('Freddy', 0.0033234844926898722), ('genetically', 0.0033234844926898722), ('his', 0.0033234844926898722), ('including', 0.0033234844926898722), ('actors', 0.0033234844926898722), ('2004', 0.0033234844926898722), ('Clabaugh', 0.0033234844926898722), ('Zabka', 0.0033234844926898722), ('series', 0.0033234844926898722), ('small', 0.0033234844926898722), ('itself', 0.0033234844926898722), ('2002', 0.0033234844926898722), ('was', 0.0), ('Dien', 0.0033234844926898722), ('Barron', 0.0033234844926898722), ('Street', 0.0033234844926898722), ('python', 0.01661742246344936)

# USING GENSIM

In [9]:
# Import Dictionary
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize

# Raw input: my_documents is a list of short review strings, one per document
my_documents = [
    'The movie was about a spaceship and The aliens.',
    'I really liked the movie!',
    'Awesome action scenes, but boring characters.',
    'The movie was awful! I hate alien films.',
    'Space is cool! I liked the movie.',
    'More space films, please!',
]

# Lowercase every document and split it into tokens: articles
articles = [word_tokenize(text.lower()) for text in my_documents]

# Build the token vocabulary from the tokenized documents: dictionary
dictionary = Dictionary(articles)

# Print the token -> id mapping held by the dictionary
print(dictionary.token2id)

# Convert each tokenized document into (token_id, count) pairs: corpus
corpus = list(map(dictionary.doc2bow, articles))

# Print the bag-of-words representation of every document
print(corpus)

{'awful': 21, 'boring': 16, 'and': 4, 'hate': 23, 'i': 10, 'awesome': 15, 'spaceship': 6, 'space': 26, '.': 0, 'a': 1, 'scenes': 19, 'alien': 20, 'liked': 11, ',': 13, 'action': 14, 'the': 7, 'characters': 18, 'but': 17, 'about': 2, 'please': 28, 'was': 8, '!': 9, 'films': 22, 'movie': 5, 'cool': 24, 'really': 12, 'more': 27, 'aliens': 3, 'is': 25}
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1)], [(5, 1), (7, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(0, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)], [(0, 1), (5, 1), (7, 1), (8, 1), (9, 1), (10, 1), (20, 1), (21, 1), (22, 1), (23, 1)], [(0, 1), (5, 1), (7, 1), (9, 1), (10, 1), (11, 1), (24, 1), (25, 1), (26, 1)], [(9, 1), (13, 1), (22, 1), (26, 1), (27, 1), (28, 1)]]


In [16]:
from collections import defaultdict
import itertools
from itertools import *

# Pick the fifth document (index 4) of the corpus: doc
doc = corpus[4]

# Order its (word_id, count) pairs from most to least frequent: bow_doc
bow_doc = list(doc)
bow_doc.sort(key=lambda pair: pair[1], reverse=True)

# Show the five most frequent words of that document with their counts
for word_id, word_count in bow_doc[:5]:
    print("----->>>>",dictionary.get(word_id), word_count)

# Accumulate each word id's count over all documents: total_word_count
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count

# Rank the (word_id, total) pairs from highest to lowest total: sorted_word_count
sorted_word_count = list(total_word_count.items())
sorted_word_count.sort(key=lambda pair: pair[1], reverse=True)

# Show the five most frequent words across the whole corpus with their totals
for word_id, word_count in sorted_word_count[:5]:
    print(dictionary.get(word_id), word_count)


----->>>> . 1
----->>>> movie 1
----->>>> the 1
----->>>> ! 1
----->>>> i 1
the 5
. 4
movie 4
! 4
i 3


In [17]:
# Import TfidfModel
from gensim.models.tfidfmodel import TfidfModel

# Create a new TfidfModel using the corpus: tfidf_model
# The model is deliberately not named `tfidf`: that name is already bound to
# the tfidf() scoring function defined earlier in this file, and rebinding it
# would break any later call to that function.
tfidf_model = TfidfModel(corpus)

# Calculate the tfidf weights of doc: tfidf_weights
tfidf_weights = tfidf_model[doc]

# Print the first five weights
print(tfidf_weights[:5])

# Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

# Print the top 5 weighted words
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)

[(0, 0.12839429999391858), (5, 0.12839429999391858), (7, 0.12839429999391858), (9, 0.12839429999391858), (10, 0.21949150558476985)]
cool 0.5673773111634582
is 0.5673773111634582
liked 0.3478858055786885
space 0.3478858055786885
i 0.21949150558476985
