In [1]:
import math
from textblob import TextBlob as tb

In [2]:
def tf(word,blob):
    return blob.words.count(word)/len(blob.words)


def n_containing(word,bloblist):
    return sum(1 for blob in bloblist if word in blob.words)


def idf(word,bloblist):
    return math.log(len(bloblist)/(1+n_containing(word,bloblist)))


def tfidf(word,blob,bloblist):
    return tf(word,blob)*idf(word,bloblist)

In [3]:
documents1 = tb("""Deep Text: Using Text Analytics to Conquer Information Overload, Get Real Value from Social Media,
and Add Bigger Text to Big Data by Tom Reamy, published by Information Today Inc. (available July 2016) . I have 
known Tom for many years and can attest to his expertise in working on real use cases that generate business value
for organizations. In his book, Tom pulls from these experiences to share some of his most challenging cases.
Whether you’re interested in gaining insight from a stream of social media content or making use of a huge amount
of business information, Deep Text is the text mining book that provides tested insights and examples for doing this
effectively""")

documents2 = tb("""Introduction to Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and Hinrich
Schütze, published by Cambridge University Press. If I had to recommend an introductory text mining book, this is
the one. This book, which is also used by the Stanford University program, is a comprehensive manual that provides
a great overview of text mining, explains all the terminology and still manages to generate the interest to learn
even more. While it’s not a book for business readers, it’s a great resource for helping your technical team grasp
the basics.”""")

documents3 = tb("""The Text Mining Handbook: Advanced Approaches in Analyzing Unstructured Data by Ronen Feldman and
James Sanger, published by Cambridge University Press. This is the text mining book to turn to if you’re looking
for practical examples, software and applied text mining. The concrete text mining examples alone make it a valuable
resource for business readers.""")

documents4 = tb("""Foundations of Statistical Natural Language Processing by Christopher D. Manning and Hinrich Schütze,
published by MIT Press. This is a comprehensive, well written, and clear text mining book that provides lots of detail
on theory in a way that is easily understood by the non-expert. After a general introduction, it covers the most
commonly used methods and algorithms. Like no any other text mining books, this is the book that you want to read
if you are not a pure business person who wants to grasp the economic value of text mining.""")

In [4]:
bloblist = [documents1, documents2, documents3, documents4]
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in document 1
	Word: from, TF-IDF: 0.01793
	Word: Tom, TF-IDF: 0.01793
	Word: his, TF-IDF: 0.01793
Top words in document 2
	Word: great, TF-IDF: 0.01459
	Word: s, TF-IDF: 0.01459
	Word: Introduction, TF-IDF: 0.0073
Top words in document 3
	Word: Mining, TF-IDF: 0.04864
	Word: The, TF-IDF: 0.03648
	Word: Text, TF-IDF: 0.02019
Top words in document 4
	Word: Foundations, TF-IDF: 0.00753
	Word: Statistical, TF-IDF: 0.00753
	Word: Natural, TF-IDF: 0.00753


## Using TFIDF Direct features

In [5]:
documents5 = """Deep Text: Using Text Analytics to Conquer Information Overload, Get Real Value from Social Media,
and Add Bigger Text to Big Data by Tom Reamy, published by Information Today Inc. (available July 2016) . I have 
known Tom for many years and can attest to his expertise in working on real use cases that generate business value
for organizations. In his book, Tom pulls from these experiences to share some of his most challenging cases.
Whether you’re interested in gaining insight from a stream of social media content or making use of a huge amount
of business information, Deep Text is the text mining book that provides tested insights and examples for doing this
effectively"""

documents6 = """Introduction to Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and Hinrich
Schütze, published by Cambridge University Press. If I had to recommend an introductory text mining book, this is
the one. This book, which is also used by the Stanford University program, is a comprehensive manual that provides
a great overview of text mining, explains all the terminology and still manages to generate the interest to learn
even more. While it’s not a book for business readers, it’s a great resource for helping your technical team grasp
the basics.”"""

documents7 = """The Text Mining Handbook: Advanced Approaches in Analyzing Unstructured Data by Ronen Feldman and
James Sanger, published by Cambridge University Press. This is the text mining book to turn to if you’re looking
for practical examples, software and applied text mining. The concrete text mining examples alone make it a valuable
resource for business readers."""

documents8 = """Foundations of Statistical Natural Language Processing by Christopher D. Manning and Hinrich Schütze,
published by MIT Press. This is a comprehensive, well written, and clear text mining book that provides lots of detail
on theory in a way that is easily understood by the non-expert. After a general introduction, it covers the most
commonly used methods and algorithms. Like no any other text mining books, this is the book that you want to read
if you are not a pure business person who wants to grasp the economic value of text mining."""

In [6]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
tfidf1 = TfidfVectorizer()

In [7]:
boblist1=[documents5]
features1 = tfidf1.fit_transform(boblist1)

In [8]:
pd.DataFrame(features1.todense(),columns=tfidf1.get_feature_names())

Unnamed: 0,2016,add,amount,analytics,and,attest,available,big,bigger,book,...,to,today,tom,use,using,value,whether,working,years,you
0,0.06742,0.06742,0.06742,0.06742,0.20226,0.06742,0.06742,0.06742,0.06742,0.13484,...,0.26968,0.06742,0.20226,0.13484,0.06742,0.13484,0.06742,0.06742,0.06742,0.06742


In [9]:
weights1 = np.asarray(features1.mean(axis=0)).ravel().tolist()
weights_df1 = pd.DataFrame({'term': tfidf1.get_feature_names(), 'weight': weights1})

In [10]:
weights_df1.sort_values(by='weight', ascending=False).head(10)

Unnamed: 0,term,weight
62,text,0.3371
46,of,0.26968
67,to,0.26968
30,his,0.20226
4,and,0.20226
32,in,0.20226
25,from,0.20226
24,for,0.20226
34,information,0.20226
69,tom,0.20226


In [11]:
boblist2=[documents6]
features2 = tfidf1.fit_transform(boblist2)

In [12]:
pd.DataFrame(features2.todense(),columns=tfidf1.get_feature_names())

Unnamed: 0,all,also,an,and,basics,book,business,by,cambridge,christopher,...,text,that,the,this,to,university,used,which,while,your
0,0.08165,0.08165,0.08165,0.163299,0.08165,0.244949,0.08165,0.244949,0.08165,0.08165,...,0.163299,0.08165,0.408248,0.163299,0.326599,0.163299,0.08165,0.08165,0.08165,0.08165


In [13]:
weights2 = np.asarray(features2.mean(axis=0)).ravel().tolist()
weights_df2 = pd.DataFrame({'term': tfidf1.get_feature_names(), 'weight': weights2})

In [14]:
weights_df2.sort_values(by='weight', ascending=False).head(10)

Unnamed: 0,term,weight
55,the,0.408248
57,to,0.326599
25,is,0.244949
5,book,0.244949
7,by,0.244949
56,this,0.163299
26,it,0.163299
16,great,0.163299
53,text,0.163299
13,for,0.163299
