In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Two toy documents that together form the example corpus used throughout.
documentA = "the man went out for a walk"
documentB = "the children sat around a tree"

In [4]:
# Tokenize each document on whitespace. A bare split() (rather than split(' '))
# collapses runs of spaces and ignores leading/trailing whitespace, so no
# empty-string tokens can leak into the vocabulary.
bagOfWordsA = documentA.split()
bagOfWordsB = documentB.split()

In [5]:
# Corpus vocabulary: every distinct token that occurs in either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

In [6]:
# Display the combined vocabulary of both documents.
uniqueWords

{'a',
 'around',
 'children',
 'for',
 'man',
 'out',
 'sat',
 'the',
 'tree',
 'walk',
 'went'}

In [7]:
# Raw term counts per document over the shared vocabulary. Every vocabulary
# word starts at 0 so that words absent from a document keep an explicit zero.
numOfWordsA = {vocabWord: 0 for vocabWord in uniqueWords}
numOfWordsB = {vocabWord: 0 for vocabWord in uniqueWords}

for word in bagOfWordsA:
    numOfWordsA[word] = numOfWordsA[word] + 1

for word in bagOfWordsB:
    numOfWordsB[word] = numOfWordsB[word] + 1

In [8]:
# Show NLTK's English stop-word list. The list is only displayed here; the
# result is not assigned to a variable.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available
# locally (e.g. via nltk.download('stopwords')) — confirm on a fresh environment.
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

### Term Frequency

The number of times a word appears in a document divided by the total number of words in the document. Every document has its own term frequency.


In [9]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency (TF) of every word for a single document.

    Args:
        wordDict: Mapping of word -> raw count of that word in the document.
        bagOfWords: Sequence of the document's tokens; its length is the
            denominator of every frequency.

    Returns:
        dict mapping each word in ``wordDict`` to ``count / len(bagOfWords)``
        as a float. For an empty document every frequency is 0.0.
    """
    bagOfWordsCount = float(len(bagOfWords))
    if bagOfWordsCount == 0:
        # An empty document contains no terms; avoid dividing by zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / bagOfWordsCount for word, count in wordDict.items()}

In [10]:
# Term frequencies of each document, from its word counts and its token list.
tfA = computeTF(wordDict=numOfWordsA, bagOfWords=bagOfWordsA)
tfB = computeTF(wordDict=numOfWordsB, bagOfWords=bagOfWordsB)

In [11]:
# Display both term-frequency dictionaries together (as a tuple).
tfA, tfB

({'the': 0.14285714285714285,
  'walk': 0.14285714285714285,
  'sat': 0.0,
  'tree': 0.0,
  'children': 0.0,
  'went': 0.14285714285714285,
  'man': 0.14285714285714285,
  'for': 0.14285714285714285,
  'out': 0.14285714285714285,
  'around': 0.0,
  'a': 0.14285714285714285},
 {'the': 0.16666666666666666,
  'walk': 0.0,
  'sat': 0.16666666666666666,
  'tree': 0.16666666666666666,
  'children': 0.16666666666666666,
  'went': 0.0,
  'man': 0.0,
  'for': 0.0,
  'out': 0.0,
  'around': 0.16666666666666666,
  'a': 0.16666666666666666})

### Inverse Document Frequency (IDF)

The log of the number of documents $N$ divided by the number of documents that contain the word $w$: $\mathrm{idf}(w) = \log\left(N / \mathrm{df}_w\right)$. Inverse document frequency determines the weight of rare words across all documents in the corpus.


In [12]:
def computeIDF(documents):
    """Compute the inverse document frequency (IDF) of every word in a corpus.

    Args:
        documents: List of per-document mappings word -> count. A word counts
            as present in a document when its count is greater than zero.

    Returns:
        dict mapping each word seen in any document to ``log(N / df)`` (natural
        log), where ``N`` is the number of documents and ``df`` is the number
        of documents that contain the word. A word that occurs in no document
        gets 0.0. An empty ``documents`` list yields an empty dict.
    """
    import math

    N = len(documents)

    # Build the vocabulary from every document, not only the first one, so
    # that documents with differing key sets do not raise KeyError.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    # A word with document frequency 0 would divide by zero; it appears
    # nowhere in the corpus, so it is given a weight of 0.0.
    return {
        word: math.log(N / float(df)) if df > 0 else 0.0
        for word, df in docFreq.items()
    }

In [13]:
# Inverse document frequencies over the two-document corpus.
idfs = computeIDF(documents=[numOfWordsA, numOfWordsB])

In [14]:
# Display the IDF weight computed for every vocabulary word.
idfs

{'the': 0.0,
 'walk': 0.6931471805599453,
 'sat': 0.6931471805599453,
 'tree': 0.6931471805599453,
 'children': 0.6931471805599453,
 'went': 0.6931471805599453,
 'man': 0.6931471805599453,
 'for': 0.6931471805599453,
 'out': 0.6931471805599453,
 'around': 0.6931471805599453,
 'a': 0.0}

Lastly, TF-IDF is simply the product of TF and IDF: $\mathrm{tfidf}(w, d) = \mathrm{tf}(w, d) \cdot \mathrm{idf}(w)$.

In [17]:
def computeTFIDF(tfBagofWords, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tfBagofWords: Mapping word -> term frequency for one document.
        idfs: Mapping word -> inverse document frequency. It must contain
            every word of ``tfBagofWords``; a missing word raises KeyError.

    Returns:
        dict mapping each word of ``tfBagofWords`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBagofWords.items()}

In [18]:
# TF-IDF scores of each document: its term frequencies weighted by the IDFs.
tfidfA = computeTFIDF(tfBagofWords=tfA, idfs=idfs)
tfidfB = computeTFIDF(tfBagofWords=tfB, idfs=idfs)

In [19]:
# One row per document, one column per vocabulary word.
df = pd.DataFrame(data=[tfidfA, tfidfB])

In [20]:
# Display the manually computed TF-IDF table.
df

Unnamed: 0,the,walk,sat,tree,children,went,man,for,out,around,a
0,0.0,0.099021,0.0,0.0,0.0,0.099021,0.099021,0.099021,0.099021,0.0,0.0
1,0.0,0.0,0.115525,0.115525,0.115525,0.0,0.0,0.0,0.0,0.115525,0.0


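Rather than manually implementing TF-IDF ourselves, we could use the class provided by sklearn. Note that its numbers differ from the manual result above: by default `TfidfVectorizer` uses a smoothed IDF, $\ln\left(\frac{1 + N}{1 + \mathrm{df}_w}\right) + 1$, L2-normalizes each document vector, and its default tokenizer ignores single-character tokens such as "a" (which is why that column is missing).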

In [22]:
# Fit scikit-learn's TfidfVectorizer on the two documents and collect the
# resulting weights in a DataFrame with one column per vocabulary term.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. tolist() keeps feature_names a
# plain Python list, as it was before.
feature_names = vectorizer.get_feature_names_out().tolist()
dense = vectors.todense()
denseList = dense.tolist()
df = pd.DataFrame(denseList, columns=feature_names)

In [23]:
# Display the TF-IDF table produced by scikit-learn.
df

Unnamed: 0,around,children,for,man,out,sat,the,tree,walk,went
0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.0,0.42616,0.42616
1,0.471078,0.471078,0.0,0.0,0.0,0.471078,0.335176,0.471078,0.0,0.0
