In [None]:
import pandas as pd
import numpy as np
import autocorrect
import spacy

# Toy corpus used by every later cell; each string is one document.
# "playeed" in the first sentence is misspelled -- presumably on purpose, to
# exercise the spell-correcting tokenizer defined further down in this cell.
docs = [
    'He playeed football',
    'He plays cricket',
    'He had sandwich for dinner'
]
# The bare string literal below is never assigned or used; it only keeps an
# alternative corpus around in disabled form.
'''
docs = [
    'new york times',
    'new york post',
    'los angeles times'
]

'''

class SpellTokenizer(object):
    """Callable that tokenizes text and spell-corrects every token.

    An instance is meant to be assigned to a spaCy pipeline's ``make_doc``
    attribute. It runs the pipeline's own tokenizer, passes each token's text
    through ``autocorrect.spell`` and builds a fresh ``Doc`` from the
    corrected words.

    NOTE(review): newer autocorrect releases expose a ``Speller`` class;
    confirm that the installed version still provides ``autocorrect.spell``.
    """

    def __init__(self, nlp):
        """Remember the vocabulary and tokenizer of the pipeline ``nlp``."""
        self.vocab = nlp.vocab
        # Keep the tokenizer of the pipeline we were given, so __call__ does
        # not depend on a module-level variable that happens to be named `nlp`.
        self.tokenizer = nlp.tokenizer

    def __call__(self, text):
        """Return a ``Doc`` whose words are the spell-corrected tokens of ``text``."""
        doc = self.tokenizer(text)
        words = [autocorrect.spell(token.orth_) for token in doc]
        # Only the corrected words are handed to Doc; the original spacing of
        # `text` is not forwarded.
        return spacy.tokens.Doc(self.vocab, words = words)
    
# Load spaCy's small English pipeline, then swap its make_doc hook for a
# SpellTokenizer so that texts are spell-corrected while they are tokenized.
nlp = spacy.load("en_core_web_sm")
nlp.make_doc = SpellTokenizer(nlp)

In [None]:
def Tokens(doc, lemma = False):
    """Turn one document string into a list of token strings.

    The text is processed with the module-level ``nlp`` pipeline.

    Args:
        doc: The document text.
        lemma: When true, each token contributes its lemma; otherwise it
            contributes its lower-cased surface form.

    Returns:
        A list of strings, one per token, in document order.
    """
    parsed = nlp(doc)

    if not lemma:
        return [tok.lower_ for tok in parsed]

    # Lemmas equal to one of these markers are replaced by the lower-cased
    # token text (presumably '-PRON-' is the placeholder lemma that some spaCy
    # versions assign to pronouns -- TODO confirm).
    pronoun_markers = ('PRON', '-PRON-')
    return [
        tok.lower_ if tok.lemma_ in pronoun_markers else tok.lemma_
        for tok in parsed
    ]

def Vocab(tokens):
    """Return the set of distinct tokens found across all documents.

    Args:
        tokens: An iterable of per-document token sequences (lists, tuples or
            any other iterable of hashable tokens).

    Returns:
        A ``set`` containing every token that occurs in at least one document;
        an empty set when ``tokens`` is empty.
    """
    vocabulary = set()
    for doc_token in tokens:
        # Updating the set in place avoids re-copying an ever-growing list for
        # each document, and works for any iterable of tokens.
        vocabulary.update(doc_token)
    return vocabulary

def Dict(tokens, vocab):
    """Build one word-count dict per document over a shared vocabulary.

    Args:
        tokens: An iterable of per-document token sequences.
        vocab: An iterable of all words that may occur in ``tokens``.

    Returns:
        A list with one dict per document. Every dict has all vocabulary
        words as keys, mapped to the number of times the word occurs in that
        document (0 when it does not occur).

    Raises:
        KeyError: If a document contains a word that is not in ``vocab``.
    """
    # Materialise the vocabulary once so a one-shot iterable (e.g. a
    # generator) is not exhausted before the per-document dicts are built.
    vocab_words = list(vocab)

    docsDict = []
    for doc_token in tokens:
        # A fresh zeroed dict per document; sharing one would mix the counts.
        docDict = dict.fromkeys(vocab_words, 0)
        for word in doc_token:
            docDict[word] += 1
        docsDict.append(docDict)
    return docsDict
    
    

In [None]:
# Bag-of-words stage: lemmatized tokens per document, the shared vocabulary,
# and one word-count dict per document.
tokens = [Tokens(doc, True) for doc in docs]
vocab = Vocab(tokens)
docsDict = Dict(tokens, vocab)
# Cell output: one row per document (labelled with its text), one column per word.
pd.DataFrame(docsDict, index = docs)


In [None]:
def computeTF(docsDict, tokens):
    """Compute the term frequency of every word in every document.

    Args:
        docsDict: A sequence of per-document dicts mapping word -> count,
            aligned by position with ``tokens``.
        tokens: A sequence of per-document token lists; the length of each
            list is the denominator for that document.

    Returns:
        A list with one dict per document, mapping each word of the matching
        count dict to ``count / number_of_tokens``. A document with no tokens
        gets 0.0 for every word instead of raising ``ZeroDivisionError``.
    """
    tfDicts = []
    for idx, doc_token in enumerate(tokens):
        wordDict = docsDict[idx]
        bowCount = len(doc_token)
        if bowCount == 0:
            # Empty document: no term has any frequency.
            tfDicts.append(dict.fromkeys(wordDict, 0.0))
            continue
        tfDicts.append(
            {word: count / float(bowCount) for word, count in wordDict.items()}
        )
    return tfDicts

In [None]:
# Term-frequency stage: per-document count dicts divided by document length.
docsTF = computeTF(docsDict, tokens)
# Cell output: one row per document, one column per vocabulary word.
pd.DataFrame(docsTF, index = docs)

In [None]:
def computeIDFList(docList):
    """Return, for each document, a dict of log2-scaled weights for its words.

    Args:
        docList: A sequence of per-document dicts mapping word -> numeric value.

    Returns:
        A list with one dict per document. A word whose value in that document
        is greater than 0 maps to ``log2(N / 1)``, where ``N`` is the number
        of documents; every other word maps to the integer 0.

    NOTE(review): presence is counted inside a single document only, so the
    count is always 0 or 1 and the weight never reflects how many documents
    contain the word -- confirm this is intended.
    """
    import math

    doc_total = len(docList)
    weights_per_doc = []

    for doc in docList:
        weights = {}
        for word, val in doc.items():
            if val > 0:
                # The word is present in this document: its count here is 1.
                weights[word] = math.log2(doc_total / 1.0)
            else:
                weights[word] = 0
        weights_per_doc.append(weights)

    return weights_per_doc

def computeIDF(docList):
    """Compute the inverse document frequency of every word.

    Args:
        docList: A sequence of per-document dicts mapping word -> numeric
            value (raw counts or term frequencies); a value greater than 0
            means the word occurs in that document.

    Returns:
        A dict mapping each word to ``log2(N / df)``, where ``N`` is the
        number of documents and ``df`` is the number of documents in which
        the word occurs. A word that occurs in no document maps to 0.0, and
        an empty ``docList`` yields an empty dict.
    """
    import math

    N = len(docList)
    if N == 0:
        return {}

    # Document frequency: each document counts at most once per word, no
    # matter how large the word's value is in that document.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    idfDict = {}
    for word, df in docFreq.items():
        # Guard against words that are absent from every document.
        idfDict[word] = math.log2(N / float(df)) if df > 0 else 0.0

    return idfDict

# IDF stage: computeIDF receives the per-document TF dicts and returns a single
# word -> idf dict for the whole corpus.
docsIDF = computeIDF(docsTF)
# Cell output: the idf dict shown as a one-row frame (row label 0).
pd.DataFrame(docsIDF, index = [0])

In [None]:
def computeTFIDF(docsTF, idfs):
    """Multiply each document's term frequencies by the matching idf values.

    Args:
        docsTF: An iterable of per-document dicts mapping word -> term frequency.
        idfs: A dict mapping word -> idf; it must contain every word that
            appears in ``docsTF``.

    Returns:
        A list with one dict per document, mapping each word to
        ``tf * idfs[word]``.

    Raises:
        KeyError: If a word from ``docsTF`` is missing from ``idfs``.
    """
    return [
        {term: tf * idfs[term] for term, tf in doc_tf.items()}
        for doc_tf in docsTF
    ]

# Final stage: weight each document's term frequencies by the corpus-level idf.
docsTFDIF = computeTFIDF(docsTF, docsIDF)
# Cell output: one row per document, one column per vocabulary word.
pd.DataFrame(docsTFDIF, index = docs)