# [anabranch/tfpdf.py](https://gist.github.com/anabranch/48c5c0124ba4e162b2e3) 

TF-IDF Explained in Python, Along with a Scikit-Learn Implementation

- [Basic Statistical NLP Part 1 - Jaccard Similarity and TF-IDF](http://billchambers.me/tutorials/2014/12/21/tf-idf-explained-in-python.html)
- [Basic Statistical NLP Part 2 - TF-IDF And Cosine Similarity](http://billchambers.me/tutorials/2014/12/22/cosine-similarity-explained-in-python.html)

In [1]:
from __future__ import division
import string
import math

In [2]:
def tokenize(doc):
    """Lower-case ``doc`` and split it on single space characters.

    The split is on the literal ``" "`` separator, so punctuation stays
    attached to its word and consecutive spaces produce empty-string tokens.

    Args:
        doc: The text to tokenize.

    Returns:
        A list of lower-cased tokens.
    """
    return doc.lower().split(" ")

In [3]:
# Toy corpus of seven short news-style snippets used by the TF-IDF and
# similarity examples below.
document_0 = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption."
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin is riding a horse while hunting deer. Vladimir Putin always seems so serious about things - even riding horses. Is he crazy?"

# The list position of each document is the index reported in the pairwise
# similarity tuples later on.
all_documents = [document_0, document_1, document_2, document_3, document_4, document_5, document_6]

In [4]:
def jaccard_similarity(query, document):
    """Return the Jaccard similarity of two token collections.

    Both arguments are reduced to sets, so duplicates and ordering are
    ignored. The score is ``|intersection| / |union|``.

    Args:
        query: Iterable of hashable tokens.
        document: Iterable of hashable tokens.

    Returns:
        A float in ``[0, 1]``. When both inputs are empty the union is empty
        and the ratio is undefined; 0.0 is returned in that case, mirroring
        how ``cosine_similarity`` in this file treats zero-magnitude vectors.
    """
    query_set = set(query)
    document_set = set(document)
    union = query_set | document_set
    if not union:
        # Avoid ZeroDivisionError for two empty inputs.
        return 0.0
    return len(query_set & document_set) / len(union)

def term_frequency(term, tokenized_document):
    """Return the raw number of occurrences of ``term`` in the document.

    Args:
        term: The token to look for.
        tokenized_document: Sequence of tokens exposing a ``count`` method.

    Returns:
        The occurrence count as reported by ``tokenized_document.count``.
    """
    occurrences = tokenized_document.count(term)
    return occurrences

def sublinear_term_frequency(term, tokenized_document):
    """Return the log-scaled term frequency ``1 + ln(count)``.

    Args:
        term: The token to look for.
        tokenized_document: Sequence of tokens exposing a ``count`` method.

    Returns:
        ``1 + math.log(count)`` when the term occurs at least once,
        otherwise ``0`` (the logarithm is undefined for a zero count).
    """
    occurrences = tokenized_document.count(term)
    if occurrences != 0:
        return 1 + math.log(occurrences)
    return 0

def augmented_term_frequency(term, tokenized_document):
    """Return the augmented term frequency ``0.5 + 0.5 * tf / max_tf``.

    ``tf`` is the number of occurrences of ``term`` and ``max_tf`` is the
    count of the most frequent token in the same document.

    Args:
        term: The token to score.
        tokenized_document: Sequence of hashable tokens.

    Returns:
        A float in ``[0.5, 1.0]``. A term that does not occur scores 0.5; an
        empty document also yields 0.5, since no term can occur in it.
    """
    from collections import Counter

    if not tokenized_document:
        # max() over an empty document would raise ValueError.
        return 0.5
    # Count every token in a single pass instead of calling count() per token.
    counts = Counter(tokenized_document)
    max_count = max(counts.values())
    return 0.5 + (0.5 * counts[term]) / max_count

def inverse_document_frequencies(tokenized_documents):
    """Compute the inverse document frequency of every token in the corpus.

    For each distinct token the value is ``1 + ln(N / df)``, where ``N`` is
    the number of documents and ``df`` is the number of documents containing
    the token at least once.

    Args:
        tokenized_documents: List of documents, each a list of hashable
            tokens.

    Returns:
        A dict mapping each distinct token to its IDF value. An empty corpus
        yields an empty dict.
    """
    # Deduplicate every document once so each membership test below is a set
    # lookup instead of a linear scan of the token list.
    document_vocabularies = [set(doc) for doc in tokenized_documents]
    all_tokens_set = {item for sublist in tokenized_documents for item in sublist}
    num_documents = len(tokenized_documents)

    idf_values = {}
    for tkn in all_tokens_set:
        # Every token comes from some document, so the frequency is >= 1.
        document_frequency = sum(tkn in vocab for vocab in document_vocabularies)
        idf_values[tkn] = 1 + math.log(num_documents / document_frequency)
    return idf_values

def tfidf(documents):
    """Build a TF-IDF vector for every document in ``documents``.

    Each document is tokenized with ``tokenize``; IDF weights come from
    ``inverse_document_frequencies`` and term frequencies from
    ``sublinear_term_frequency``. Every vector has one entry per distinct
    corpus token, in the iteration order of the IDF dict.

    Args:
        documents: Iterable of raw text documents.

    Returns:
        A list with one list of ``tf * idf`` weights per document. The
        weights are not length-normalized.
    """
    token_lists = [tokenize(text) for text in documents]
    idf_by_term = inverse_document_frequencies(token_lists)
    return [
        [
            sublinear_term_frequency(term, tokens) * weight
            for term, weight in idf_by_term.items()
        ]
        for tokens in token_lists
    ]

In [5]:
# in Scikit-Learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the vectorizer to mirror the hand-rolled version: the same
# tokenizer, sublinear term frequency and unsmoothed IDF, plus L2 row
# normalization.
# min_df=1 keeps every term that occurs in at least one document. The integer
# 0 used previously meant the same thing but is rejected by parameter
# validation in recent scikit-learn releases.
sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=1, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
sklearn_representation = sklearn_tfidf.fit_transform(all_documents)

In [6]:
# Show the scikit-learn TF-IDF row for the first document as a plain list.
print(sklearn_representation.toarray()[0].tolist())

[0.0, 0.3112012154045971, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.18380045100490253, 0.18380045100490253, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2403535665263221, 0.0, 0.0, 0.0, 0.18380045100490253, 0.0, 0.2403535665263221, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2403535665263221, 0.0, 0.2403535665263221, 0.2403535665263221, 0.0, 0.15071899912592052, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2403535665263221, 0.0, 0.0, 0.127247335483483, 0.2403535665263221, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2403535665263221, 0.0, 0.2403535665263221, 0.0, 0.0, 0.0, 0.2403535665263221, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2403535665263221, 0.2403535665263221, 0.10904129874372748, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2403535665263221, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [7]:
def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two numeric vectors.

    The vectors are paired element-wise with ``zip``, so the dot product only
    covers the length of the shorter one.

    Args:
        vector1: Sequence of numbers.
        vector2: Sequence of numbers.

    Returns:
        ``dot(vector1, vector2) / (|vector1| * |vector2|)``, or ``0`` when
        either vector has zero magnitude.
    """
    dot_product = sum(a * b for a, b in zip(vector1, vector2))
    norm1 = math.sqrt(sum(component ** 2 for component in vector1))
    norm2 = math.sqrt(sum(component ** 2 for component in vector2))
    magnitude = norm1 * norm2
    # A zero magnitude would make the ratio undefined; report no similarity.
    return dot_product / magnitude if magnitude else 0

# Score every ordered pair of documents (including each document against
# itself) with cosine similarity, first on the hand-rolled TF-IDF vectors.
tfidf_representation = tfidf(all_documents)
our_tfidf_comparisons = []

for count_0, doc_0 in enumerate(tfidf_representation):
    for count_1, doc_1 in enumerate(tfidf_representation):
        our_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

# Same pairwise scoring on the scikit-learn vectors. Convert the matrix with
# toarray() once up front rather than again on every outer iteration.
sklearn_dense = sklearn_representation.toarray()
skl_tfidf_comparisons = []

for count_0, doc_0 in enumerate(sklearn_dense):
    for count_1, doc_1 in enumerate(sklearn_dense):
        skl_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

# Print both rankings side by side, most similar pairs first, so the two
# implementations can be compared line by line.
for x in zip(sorted(our_tfidf_comparisons, reverse=True), sorted(skl_tfidf_comparisons, reverse=True)):
    print(x)

((1.0000000000000002, 4, 4), (1.0000000000000002, 6, 6))
((1.0, 6, 6), (1.0000000000000002, 2, 2))
((1.0, 5, 5), (1.0000000000000002, 0, 0))
((1.0, 2, 2), (1.0, 5, 5))
((1.0, 1, 1), (1.0, 4, 4))
((1.0, 0, 0), (1.0, 3, 3))
((0.9999999999999999, 3, 3), (1.0, 1, 1))
((0.2931092569884059, 4, 2), (0.29310925698840595, 4, 2))
((0.2931092569884059, 2, 4), (0.29310925698840595, 2, 4))
((0.1650630690646461, 6, 3), (0.16506306906464616, 6, 3))
((0.1650630690646461, 3, 6), (0.16506306906464616, 3, 6))
((0.14060334967136978, 3, 2), (0.14060334967136984, 3, 2))
((0.14060334967136978, 2, 3), (0.14060334967136984, 2, 3))
((0.11766551247749865, 3, 0), (0.11766551247749867, 3, 0))
((0.11766551247749865, 0, 3), (0.11766551247749867, 0, 3))
((0.11478807222952392, 5, 3), (0.11478807222952396, 5, 3))
((0.11478807222952392, 3, 5), (0.11478807222952396, 3, 5))
((0.11212208176085793, 6, 1), (0.11212208176085793, 6, 1))
((0.11212208176085793, 1, 6), (0.11212208176085793, 1, 6))
((0.08140732228934985, 1, 0), (0

In [8]:
# First document's vector from the hand-rolled tfidf(). Unlike the
# scikit-learn matrix (built with norm='l2'), these weights are raw tf * idf
# products with no length normalization.
print(tfidf_representation[0])

[2.9459101490553135, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.252762968495368, 2.9459101490553135, 0.0, 0.0, 0.0, 0.0, 2.9459101490553135, 0.0, 0.0, 0.0, 0.0, 2.9459101490553135, 0.0, 1.5596157879354227, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.9459101490553135, 0.0, 0.0, 0.0, 0.0, 0.0, 3.8142592685777856, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.252762968495368, 2.9459101490553135, 0.0, 0.0, 0.0, 0.0, 1.336472236621213, 2.9459101490553135, 0.0, 0.0, 2.9459101490553135, 0.0, 0.0, 0.0, 0.0, 2.9459101490553135, 2.252762968495368, 2.9459101490553135, 0.0, 0.0, 0.0, 2.9459101490553135, 1.8472978603872037, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.9459101490553135, 2.9459101490553135, 0.0, 0.0, 0.0, 0.0, 0.0]


In [9]:
# The scikit-learn row for the first document, for comparison with the
# hand-rolled vector. NOTE(review): the column order presumably follows the
# vectorizer's own vocabulary rather than the idf dict order, so entries do
# not line up position by position; confirm before comparing.
print(sklearn_representation.toarray()[0].tolist())

[0.0, 0.3112012154045971, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.18380045100490253, 0.18380045100490253, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2403535665263221, 0.0, 0.0, 0.0, 0.18380045100490253, 0.0, 0.2403535665263221, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2403535665263221, 0.0, 0.2403535665263221, 0.2403535665263221, 0.0, 0.15071899912592052, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2403535665263221, 0.0, 0.0, 0.127247335483483, 0.2403535665263221, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2403535665263221, 0.0, 0.2403535665263221, 0.0, 0.0, 0.0, 0.2403535665263221, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2403535665263221, 0.2403535665263221, 0.10904129874372748, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2403535665263221, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
