# TF-IDF from Scratch

In [1]:
import pandas as pd
import sklearn as sk
import math
import nltk
from nltk.corpus import stopwords


In [None]:
set(stopwords.words('english'))


In [3]:
def computeTF(wordDict, doc):
    """
        tf(t,d) = count of t in d / number of words in d

    :param wordDict:
    :param doc:
    :return:
    """
    tfDict = {}
    corpusCount = len(doc)
    for word, count in wordDict.items():
        tfDict[word] = count/float(corpusCount)
    return(tfDict)

def computeIDF(docList):
    """
        idf(t) = log(N/(df + 1))
    :param docList:
    :return:
    """
    idfDict = {}
    N = len(docList)

    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / (float(val) + 1))

    return (idfDict)

def computeTFIDF(tfBow, idfs):
    """
    tf-idf(t, d) = tf(t, d) * log(N/(df + 1))
    :param tfBow:
    :param idfs:
    :return:
    """
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return(tfidf)


In [4]:
sentence1 = "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
sentence2 = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"


In [7]:
#split so each word have their own string
sentence1_list = nltk.word_tokenize(sentence1)
sentence2_list = nltk.word_tokenize(sentence2)
total= set(sentence1_list).union(set(sentence2_list))
print (total)

{'question', 'great', 'wat', 'rate', 'only', 'T', 'in', 'FA', 'tkts', 'final', 'jurong', 'point', 'Go', 'la', 'crazy..', 'Cine', 'until', 'got', 'to', 'receive', '2', 'comp', 'std', '&', 'wkly', 'amore', 'Available', 'world', 'n', '87121', ')', '08452810075over18', '2005', 'Cup', 'Text', 'entry', 'apply', '.', 'there', 'win', 'buffet', 'e', 'May', "'s", '21st', '(', 'txt', 'Free', '...', ',', 'a', 'C', 'bugis'}


In [9]:
def create_word_dict(total, sentence):
    wordDict = dict.fromkeys(total, 0)
    for word in sentence:
        wordDict[word] += 1
    return wordDict

wordDictA = create_word_dict(total,sentence1_list)
wordDictB = create_word_dict(total,sentence2_list)

print (wordDictA)
print()
print (wordDictB)

{'question': 0, 'great': 1, 'wat': 1, 'rate': 0, 'only': 1, 'T': 0, 'in': 1, 'FA': 0, 'tkts': 0, 'final': 0, 'jurong': 1, 'point': 1, 'Go': 1, 'la': 1, 'crazy..': 1, 'Cine': 1, 'until': 1, 'got': 1, 'to': 0, 'receive': 0, '2': 0, 'comp': 0, 'std': 0, '&': 0, 'wkly': 0, 'amore': 1, 'Available': 1, 'world': 1, 'n': 1, '87121': 0, ')': 0, '08452810075over18': 0, '2005': 0, 'Cup': 0, 'Text': 0, 'entry': 0, 'apply': 0, '.': 0, 'there': 1, 'win': 0, 'buffet': 1, 'e': 1, 'May': 0, "'s": 0, '21st': 0, '(': 0, 'txt': 0, 'Free': 0, '...': 2, ',': 1, 'a': 0, 'C': 0, 'bugis': 1}

{'question': 1, 'great': 0, 'wat': 0, 'rate': 1, 'only': 0, 'T': 1, 'in': 1, 'FA': 2, 'tkts': 1, 'final': 1, 'jurong': 0, 'point': 0, 'Go': 0, 'la': 0, 'crazy..': 0, 'Cine': 0, 'until': 0, 'got': 0, 'to': 3, 'receive': 1, '2': 1, 'comp': 1, 'std': 1, '&': 1, 'wkly': 1, 'amore': 0, 'Available': 0, 'world': 0, 'n': 0, '87121': 1, ')': 1, '08452810075over18': 1, '2005': 1, 'Cup': 1, 'Text': 1, 'entry': 2, 'apply': 1, '.': 1,

# Term Freq


In [20]:
tfFirst = computeTF(wordDictA, sentence1_list)
tfSecond = computeTF(wordDictB, sentence2_list)
print ("TERM FREQUENCY OF SENTENCE1:\n",tfFirst)
print()
print ("TERM FREQUENCY OF SENTENCE2:\n",tfSecond)

TERM FREQUENCY OF SENTENCE1:
 {'question': 0.0, 'great': 0.043478260869565216, 'wat': 0.043478260869565216, 'rate': 0.0, 'only': 0.043478260869565216, 'T': 0.0, 'in': 0.043478260869565216, 'FA': 0.0, 'tkts': 0.0, 'final': 0.0, 'jurong': 0.043478260869565216, 'point': 0.043478260869565216, 'Go': 0.043478260869565216, 'la': 0.043478260869565216, 'crazy..': 0.043478260869565216, 'Cine': 0.043478260869565216, 'until': 0.043478260869565216, 'got': 0.043478260869565216, 'to': 0.0, 'receive': 0.0, '2': 0.0, 'comp': 0.0, 'std': 0.0, '&': 0.0, 'wkly': 0.0, 'amore': 0.043478260869565216, 'Available': 0.043478260869565216, 'world': 0.043478260869565216, 'n': 0.043478260869565216, '87121': 0.0, ')': 0.0, '08452810075over18': 0.0, '2005': 0.0, 'Cup': 0.0, 'Text': 0.0, 'entry': 0.0, 'apply': 0.0, '.': 0.0, 'there': 0.043478260869565216, 'win': 0.0, 'buffet': 0.043478260869565216, 'e': 0.043478260869565216, 'May': 0.0, "'s": 0.0, '21st': 0.0, '(': 0.0, 'txt': 0.0, 'Free': 0.0, '...': 0.08695652173913

# IDF

In [13]:
idfs = computeIDF([wordDictA, wordDictB])

idfs

{'question': 0.3010299956639812,
 'great': 0.3010299956639812,
 'wat': 0.3010299956639812,
 'rate': 0.3010299956639812,
 'only': 0.3010299956639812,
 'T': 0.3010299956639812,
 'in': 0.3010299956639812,
 'FA': 0.3010299956639812,
 'tkts': 0.3010299956639812,
 'final': 0.3010299956639812,
 'jurong': 0.3010299956639812,
 'point': 0.3010299956639812,
 'Go': 0.3010299956639812,
 'la': 0.3010299956639812,
 'crazy..': 0.3010299956639812,
 'Cine': 0.3010299956639812,
 'until': 0.3010299956639812,
 'got': 0.3010299956639812,
 'to': 0.3010299956639812,
 'receive': 0.3010299956639812,
 '2': 0.3010299956639812,
 'comp': 0.3010299956639812,
 'std': 0.3010299956639812,
 '&': 0.3010299956639812,
 'wkly': 0.3010299956639812,
 'amore': 0.3010299956639812,
 'Available': 0.3010299956639812,
 'world': 0.3010299956639812,
 'n': 0.3010299956639812,
 '87121': 0.3010299956639812,
 ')': 0.3010299956639812,
 '08452810075over18': 0.3010299956639812,
 '2005': 0.3010299956639812,
 'Cup': 0.3010299956639812,
 'Text

# TDIDF

In [14]:
#running our two sentences through the IDF:
idfFirst = computeTFIDF(tfFirst, idfs)
idfSecond = computeTFIDF(tfSecond, idfs)
print (idfFirst)
print()
print (idfSecond)

{'question': 0.0, 'great': 0.01308826068104266, 'wat': 0.01308826068104266, 'rate': 0.0, 'only': 0.01308826068104266, 'T': 0.0, 'in': 0.01308826068104266, 'FA': 0.0, 'tkts': 0.0, 'final': 0.0, 'jurong': 0.01308826068104266, 'point': 0.01308826068104266, 'Go': 0.01308826068104266, 'la': 0.01308826068104266, 'crazy..': 0.01308826068104266, 'Cine': 0.01308826068104266, 'until': 0.01308826068104266, 'got': 0.01308826068104266, 'to': 0.0, 'receive': 0.0, '2': 0.0, 'comp': 0.0, 'std': 0.0, '&': 0.0, 'wkly': 0.0, 'amore': 0.01308826068104266, 'Available': 0.01308826068104266, 'world': 0.01308826068104266, 'n': 0.01308826068104266, '87121': 0.0, ')': 0.0, '08452810075over18': 0.0, '2005': 0.0, 'Cup': 0.0, 'Text': 0.0, 'entry': 0.0, 'apply': 0.0, '.': 0.0, 'there': 0.01308826068104266, 'win': 0.0, 'buffet': 0.01308826068104266, 'e': 0.01308826068104266, 'May': 0.0, "'s": 0.0, '21st': 0.0, '(': 0.0, 'txt': 0.0, 'Free': 0.0, '...': 0.02617652136208532, ',': 0.01308826068104266, 'a': 0.0, 'C': 0.0

In [18]:

#putting it in a dataframe
idf = pd.DataFrame([idfFirst, idfSecond])
idf.head()

Unnamed: 0,question,great,wat,rate,only,T,in,FA,tkts,final,...,'s,21st,(,txt,Free,....1,",",a,C,bugis
0,0.0,0.013088,0.013088,0.0,0.013088,0.0,0.013088,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.026177,0.013088,0.0,0.0,0.013088
1,0.008136,0.0,0.0,0.008136,0.0,0.008136,0.008136,0.016272,0.008136,0.008136,...,0.016272,0.008136,0.008136,0.008136,0.008136,0.0,0.0,0.008136,0.008136,0.0
