# NLP
## TFIDF

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
documentA = 'the man went out for a walk'
documentB = 'the childern set around the fire'
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')
bagOfWordsA

['the', 'man', 'went', 'out', 'for', 'a', 'walk']

In [3]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
uniqueWords

{'a',
 'around',
 'childern',
 'fire',
 'for',
 'man',
 'out',
 'set',
 'the',
 'walk',
 'went'}

### number of counts unique word in each document

In [4]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)
numOfWordsA

{'out': 0,
 'man': 0,
 'fire': 0,
 'for': 0,
 'childern': 0,
 'the': 0,
 'a': 0,
 'set': 0,
 'around': 0,
 'walk': 0,
 'went': 0}

In [5]:
for word in bagOfWordsA:
    numOfWordsA[word] +=1
    
numOfWordsA 

{'out': 1,
 'man': 1,
 'fire': 0,
 'for': 1,
 'childern': 0,
 'the': 1,
 'a': 1,
 'set': 0,
 'around': 0,
 'walk': 1,
 'went': 1}

In [6]:
numOfWordsB = dict.fromkeys(uniqueWords, 0)
numOfWordsB
for word in bagOfWordsB:
    numOfWordsB[word] +=1
    
numOfWordsB 

{'out': 0,
 'man': 0,
 'fire': 1,
 'for': 0,
 'childern': 1,
 'the': 2,
 'a': 0,
 'set': 1,
 'around': 1,
 'walk': 0,
 'went': 0}

In [7]:
def computeTF(wordDict, bagOfWords):
    tfDict={}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / bagOfWordsCount
        
    return tfDict


tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsA, bagOfWordsA)        
    

In [8]:
import math
def computeIDF(documents):
    N = len(documents)
    idfDict = dict.fromkeys(documents[0].keys(),0)
    for document in documents:
        for word, value in document.items():
            if value>0:
                idfDict[word] +=1 
                
   
    for word, value in idfDict.items():
        idfDict[word] = math.log(N/value)
    
    return idfDict
    

In [9]:
idfs = computeIDF([numOfWordsA, numOfWordsB])

In [10]:
def computeTFIDF(tfs, idfs):
    tfidf = {}
    for word, value in tfs.items():
        tfidf[word] = value * idfs[word]
        
    return tfidf
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
tfidfA
tfidfB

{'out': 0.09902102579427789,
 'man': 0.09902102579427789,
 'fire': 0.0,
 'for': 0.09902102579427789,
 'childern': 0.0,
 'the': 0.0,
 'a': 0.09902102579427789,
 'set': 0.0,
 'around': 0.0,
 'walk': 0.09902102579427789,
 'went': 0.09902102579427789}

In [11]:
vectorizer = TfidfVectorizer()
vecotors = vectorizer.fit_transform([documentA,documentB])
vecotors

<2x10 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [12]:
feature_names = vectorizer.get_feature_names()
feature_names



['around',
 'childern',
 'fire',
 'for',
 'man',
 'out',
 'set',
 'the',
 'walk',
 'went']

In [13]:
dense = vecotors.todense()
dense

matrix([[0.        , 0.        , 0.        , 0.4261596 , 0.4261596 ,
         0.4261596 , 0.        , 0.30321606, 0.4261596 , 0.4261596 ],
        [0.40740124, 0.40740124, 0.40740124, 0.        , 0.        ,
         0.        , 0.40740124, 0.57973867, 0.        , 0.        ]])

In [14]:
import pandas as pd

In [17]:
denselist = dense.tolist()
denselist
df = pd.DataFrame(denselist, columns=feature_names)
df

Unnamed: 0,around,childern,fire,for,man,out,set,the,walk,went
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0
