# NLP
## TFIDF

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
documentA = 'the man went out for a walk'
documentB = 'the childern set around the fire'
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')
bagOfWordsA

['the', 'man', 'went', 'out', 'for', 'a', 'walk']

In [3]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
uniqueWords

{'a',
 'around',
 'childern',
 'fire',
 'for',
 'man',
 'out',
 'set',
 'the',
 'walk',
 'went'}

### number of counts unique word in each document

In [6]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)
numOfWordsA

{'went': 0,
 'set': 0,
 'man': 0,
 'childern': 0,
 'a': 0,
 'around': 0,
 'for': 0,
 'walk': 0,
 'out': 0,
 'the': 0,
 'fire': 0}

In [7]:
for word in bagOfWordsA:
    numOfWordsA[word] +=1
    
numOfWordsA 

{'went': 1,
 'set': 0,
 'man': 1,
 'childern': 0,
 'a': 1,
 'around': 0,
 'for': 1,
 'walk': 1,
 'out': 1,
 'the': 1,
 'fire': 0}

In [8]:
numOfWordsB = dict.fromkeys(uniqueWords, 0)
numOfWordsB
for word in bagOfWordsB:
    numOfWordsB[word] +=1
    
numOfWordsB 

{'went': 0,
 'set': 1,
 'man': 0,
 'childern': 1,
 'a': 0,
 'around': 1,
 'for': 0,
 'walk': 0,
 'out': 0,
 'the': 2,
 'fire': 1}

In [10]:
def computeTF(wordDict, bagOfWords):
    tfDict={}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / bagOfWordsCount
        
    return tfDict


tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsA, bagOfWordsA)        
    

In [15]:
import math
def computeIDF(documents):
    N = len(documents)
    idfDict = dict.fromkeys(documents[0].keys(),0)
    for document in documents:
        for word, value in document.items():
            if value>0:
                idfDict[word] +=1 
                
   
    for word, value in idfDict.items():
        idfDict[word] = math.log(N/value)
    
    return idfDict
    

In [17]:
idfs = computeIDF([numOfWordsA, numOfWordsB])

In [20]:
def computeTFIDF(tfs, idfs):
    tfidf = {}
    for word, value in tfs.items():
        tfidf[word] = value * idfs[word]
        
    return tfidf
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
tfidfA
tfidfB

{'went': 0.09902102579427789,
 'set': 0.0,
 'man': 0.09902102579427789,
 'childern': 0.0,
 'a': 0.09902102579427789,
 'around': 0.0,
 'for': 0.09902102579427789,
 'walk': 0.09902102579427789,
 'out': 0.09902102579427789,
 'the': 0.0,
 'fire': 0.0}