In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
# Toy corpus: the two short documents used throughout this notebook.
d1 = 'the man went out for a walk'
d2 = 'the children sat around fire'

In [5]:
# Bag of words for document 1: split on single spaces (no lowercasing or punctuation handling).
bow1 = d1.split(' ')

In [7]:
# Bag of words for document 2: split on single spaces (no lowercasing or punctuation handling).
bow2 = d2.split(' ')

In [8]:
# Shared vocabulary: every distinct token that appears in either document.
uniqueWords = set(bow1) | set(bow2)

In [9]:
# Display the combined vocabulary.
uniqueWords

{'a',
 'around',
 'children',
 'fire',
 'for',
 'man',
 'out',
 'sat',
 'the',
 'walk',
 'went'}

##### Let's create a dictionary of words and their occurrence counts for each document in the corpus

In [10]:
def _countWords(vocabulary, tokens):
    """Return a word -> occurrence-count dict covering the whole vocabulary.

    Words of ``vocabulary`` that never appear in ``tokens`` keep a count of 0.
    """
    counts = dict.fromkeys(vocabulary, 0)
    for token in tokens:
        counts[token] += 1
    return counts


# One count dictionary per document, both keyed by the shared vocabulary.
numofWords1 = _countWords(uniqueWords, bow1)
numofWords2 = _countWords(uniqueWords, bow2)

In [12]:
# Word counts for document 1 over the shared vocabulary (0 means the word is absent from d1).
print(numofWords1)

{'out': 1, 'children': 0, 'for': 1, 'around': 0, 'sat': 0, 'walk': 1, 'the': 1, 'went': 1, 'a': 1, 'fire': 0, 'man': 1}


**problem : importance of words is not taken into account**

* Solution 1 - remove stop words

* Solution 2 - use TF-IDF

In [16]:
# Fetch NLTK's stop-word corpus and list the English stop words.
# Note: the list is only printed here; the manual TF/IDF computation below
# still works on the unfiltered tokens in bow1/bow2.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /Users/vipul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### TF-IDF - Term Frequency and Inverse Document Frequency

#### Term Frequency - the ratio of the number of times a word appears in a document to the total number of words in that document

In [17]:
def tfcompute(dictWords, bagofWords):
    """Compute the term frequency of every vocabulary word in one document.

    Args:
        dictWords: Mapping of word -> raw count of that word in the document.
        bagofWords: The document's tokens; only its length is used, as the
            denominator.

    Returns:
        dict mapping each word in ``dictWords`` to ``count / len(bagofWords)``.
        For an empty document every frequency is ``0.0``.
    """
    totalWords = len(bagofWords)

    if totalWords == 0:
        # An empty document would otherwise raise ZeroDivisionError.
        return {word: 0.0 for word in dictWords}

    return {word: count / totalWords for word, count in dictWords.items()}

In [21]:
# Term frequencies of each document over the shared vocabulary, printed for inspection.
tf1 = tfcompute(numofWords1, bow1)
print(f'\n\ntf1 >>> \n\n{tf1}')
tf2 = tfcompute(numofWords2, bow2)
print(f'\n\ntf2 >>> \n\n{tf2}')



tf1 >>> 

{'out': 0.14285714285714285, 'children': 0.0, 'for': 0.14285714285714285, 'around': 0.0, 'sat': 0.0, 'walk': 0.14285714285714285, 'the': 0.14285714285714285, 'went': 0.14285714285714285, 'a': 0.14285714285714285, 'fire': 0.0, 'man': 0.14285714285714285}


tf2 >>> 

{'out': 0.0, 'children': 0.2, 'for': 0.0, 'around': 0.2, 'sat': 0.2, 'walk': 0.0, 'the': 0.2, 'went': 0.0, 'a': 0.0, 'fire': 0.2, 'man': 0.0}


#### Inverse Document Frequency - the logarithm of the number of documents divided by the number of documents that contain the word *w*.

*This way it determines the weight of rare words across all the documents in the corpus*

In [22]:
def idfCompute(documents):
    """Compute the inverse document frequency of every word in the corpus.

    Args:
        documents: Sequence of word -> count mappings, one per document. A
            word counts as present in a document when its count is > 0.

    Returns:
        dict mapping each word to ``log(N / df)``, where ``N`` is the number
        of documents and ``df`` is the number of documents containing the
        word. Words that occur in no document get ``0.0``. An empty corpus
        yields an empty dict.
    """
    import math

    N = len(documents)

    # Build the vocabulary from every document (first-seen order) rather than
    # only documents[0], so an empty corpus or documents with differing key
    # sets do not raise IndexError / KeyError.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    # df == 0 would divide by zero. Such a word has a term frequency of 0 in
    # every document, so its weight never affects a TF-IDF product.
    return {
        word: math.log(N / df) if df else 0.0
        for word, df in docFreq.items()
    }

In [23]:
# IDF weights computed over the two-document corpus.
idf12 = idfCompute([numofWords1,numofWords2])

In [24]:
# Display the IDF weights; a word present in both documents gets log(2/2) = 0.
idf12

{'out': 0.6931471805599453,
 'children': 0.6931471805599453,
 'for': 0.6931471805599453,
 'around': 0.6931471805599453,
 'sat': 0.6931471805599453,
 'walk': 0.6931471805599453,
 'the': 0.0,
 'went': 0.6931471805599453,
 'a': 0.6931471805599453,
 'fire': 0.6931471805599453,
 'man': 0.6931471805599453}

In [25]:
def tfidfCompute(tfbow, idf=None):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tfbow: Mapping of word -> term frequency for one document.
        idf: Mapping of word -> inverse document frequency. When omitted, the
            notebook-level ``idf12`` is looked up at call time.

    Returns:
        dict mapping each word in ``tfbow`` to ``tf * idf[word]``.

    Raises:
        KeyError: If a word in ``tfbow`` has no entry in ``idf``.
    """
    if idf is None:
        # Resolve the default lazily. Writing ``idf=idf12`` in the signature
        # binds whatever object idf12 referred to when this cell ran, so a
        # later recomputation of idf12 would be silently ignored (and the
        # cell would fail outright if run before idf12 exists).
        idf = idf12

    return {word: val * idf[word] for word, val in tfbow.items()}

In [27]:
# TF-IDF scores for document 1 (tf1 weighted by the corpus IDF), displayed below.
tfidf1 = tfidfCompute(tf1,idf12)
tfidf1

{'out': 0.09902102579427789,
 'children': 0.0,
 'for': 0.09902102579427789,
 'around': 0.0,
 'sat': 0.0,
 'walk': 0.09902102579427789,
 'the': 0.0,
 'went': 0.09902102579427789,
 'a': 0.09902102579427789,
 'fire': 0.0,
 'man': 0.09902102579427789}

In [28]:
# TF-IDF scores for document 2 (tf2 weighted by the corpus IDF), displayed below.
tfidf2 = tfidfCompute(tf2,idf12)
tfidf2

{'out': 0.0,
 'children': 0.13862943611198905,
 'for': 0.0,
 'around': 0.13862943611198905,
 'sat': 0.13862943611198905,
 'walk': 0.0,
 'the': 0.0,
 'went': 0.0,
 'a': 0.0,
 'fire': 0.13862943611198905,
 'man': 0.0}

In [44]:
# Same idea via scikit-learn. The numbers differ from the manual version above:
# English stop words are dropped (requested here) and, with TfidfVectorizer's
# default settings, the IDF is smoothed and each row is L2-normalised.
vectorizer = TfidfVectorizer(stop_words='english')

vectors = vectorizer.fit_transform([d1,d2])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. It returns an ndarray, so convert it
# to keep feature_names a plain list as before.
feature_names = vectorizer.get_feature_names_out().tolist()

In [45]:
# Display the sparse TF-IDF matrix summary (one row per document, one column per feature).
vectors

<2x5 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [46]:
# Display the vocabulary that remains after stop-word removal.
feature_names

['children', 'man', 'sat', 'walk', 'went']

In [47]:
# Expand the sparse matrix so the individual weights are visible.
# Note: todense() yields a numpy `matrix` object, as the output below shows.
dense = vectors.todense()
dense

matrix([[0.        , 0.57735027, 0.        , 0.57735027, 0.57735027],
        [0.70710678, 0.        , 0.70710678, 0.        , 0.        ]])

In [48]:
# Convert to nested Python lists (one inner list per document) for DataFrame construction.
denselist = dense.tolist()

In [49]:
# Tabulate the TF-IDF weights: rows are documents, columns are the feature words.
df = pd.DataFrame(denselist,columns=feature_names)

In [50]:
# Display the TF-IDF table.
df

Unnamed: 0,children,man,sat,walk,went
0,0.0,0.57735,0.0,0.57735,0.57735
1,0.707107,0.0,0.707107,0.0,0.0


In [51]:
# Example sentence containing several inflected forms related to "happy".
s = 'I am happy because I got happily treated with happy treats by someone who was even happier that the happiest guy in the world.'

In [52]:
# Display the example sentence.
s

'I am happy because I got happily treated with happy treats by someone who was even happier that the happiest guy in the world.'

##### root form = happy

###### Stemming ------ happi 

###### Lemmatization ------ happy