In [None]:
import pandas as pd
import numpy as np
import autocorrect
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Document Term Matrix

In [None]:
# Toy corpus for the document-term-matrix demo below.
# NOTE(review): 'playeed' looks like a deliberate misspelling meant to exercise
# the spell-correcting tokenizer defined later in the notebook — confirm.
docs = [
    'He playeed football',
    'He plays cricket',
    'He had sandwich for dinner'
]

In [None]:
def Summary (vectorizer, docs):
    """Fit `vectorizer` on `docs` and return the result as a labelled table.

    Parameters
    ----------
    vectorizer : object
        Anything exposing `fit_transform(docs)` (returning a sparse matrix)
        and `get_feature_names_out()` or the legacy `get_feature_names()`.
    docs : sequence of str
        The documents to vectorize; they are also used as the row index.

    Returns
    -------
    pandas.DataFrame
        One row per document, one column per vocabulary term.
    """
    # .toarray() gives a plain ndarray; .todense() gives the discouraged np.matrix.
    denseVector = vectorizer.fit_transform(docs).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # for vectorizers that predate get_feature_names_out().
    if hasattr(vectorizer, "get_feature_names_out"):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()
    summary = pd.DataFrame(denseVector, columns = words, index = docs)
    return summary
    
# Build the document-term matrix step by step and show it as a table
# (rows are the documents, columns are the vocabulary terms).
cv = CountVectorizer(lowercase = True)
# .toarray() gives a plain ndarray; .todense() gives the discouraged np.matrix.
DTM = cv.fit_transform(docs).toarray()
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# on releases that predate get_feature_names_out().
if hasattr(cv, "get_feature_names_out"):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()
summary = pd.DataFrame(DTM,columns = words, index = docs)
summary

In [None]:
import spacy
# Shared spaCy pipeline. `process` and `SpellTokenizer.__call__` below read
# this module-level `nlp`, so this cell must run before they are called.
nlp = spacy.load("en_core_web_sm")

def process(doc):
    """Return `doc` rewritten as its space-separated lemmas.

    The text is run through the module-level `nlp` pipeline. Each token
    contributes its lemma, except tokens whose lemma is the placeholder
    'PRON' or '-PRON-', which contribute their lowercased surface form
    (`token.lower_`) instead.
    """
    pronoun_placeholders = ('PRON', '-PRON-')
    return " ".join(
        tok.lower_ if tok.lemma_ in pronoun_placeholders else tok.lemma_
        for tok in nlp(doc)
    )

class SpellTokenizer(object):
    """Callable that tokenizes text and spell-corrects every token.

    Calling an instance with a string returns a `spacy.tokens.Doc` whose
    words are the spell-corrected forms of the tokens produced by the
    wrapped pipeline's tokenizer.

    Parameters
    ----------
    nlp : spaCy pipeline
        Its `vocab` and `tokenizer` are captured at construction time.
    """

    def __init__(self, nlp):
        self.vocab = nlp.vocab
        # Keep the tokenizer of the pipeline we were given, so calls do not
        # silently depend on whatever the module-level `nlp` happens to be.
        self.tokenizer = nlp.tokenizer
        # Newer autocorrect releases expose a callable `Speller` class and no
        # longer ship the module-level `spell` function; support both.
        if hasattr(autocorrect, "Speller"):
            self.spell = autocorrect.Speller()
        else:
            self.spell = autocorrect.spell

    def __call__(self, text):
        """Tokenize `text`, spell-correct each token, and build a Doc from them."""
        doc = self.tokenizer(text)
        words = [self.spell(token.orth_) for token in doc]
        return spacy.tokens.Doc(self.vocab, words = words)

# Replace the pipeline's `make_doc` with the spell-correcting tokenizer.
# NOTE(review): spaCy builds the Doc for `nlp(text)` via `make_doc`, so from
# here on every `nlp(...)` call should see spell-corrected tokens — confirm.
nlp.make_doc = SpellTokenizer(nlp)    

In [None]:
# Document-term counts again, this time passing `process` as the vectorizer's
# preprocessor; the last expression displays the resulting table.
cv = CountVectorizer(preprocessor = process)
Summary(cv, docs)

In [None]:
# Same table, but weighted by TF-IDF instead of raw counts.
Summary(TfidfVectorizer(preprocessor = process), docs)

Term Frequency–Inverse Document Frequency (TFIDF)

In [None]:

# Larger toy corpus. This rebinds `docs`, so the cells below operate on this
# five-sentence list rather than the three-sentence one defined earlier.
docs = [
    'He playeed football',
    'He plays cricket',
    'He had sandwich for dinner',
    'Sandwich i had for lunch was great',
    "He is neither a friend nor is he a foe",    
    
]
#Summary(CountVectorizer(preprocessor = process, ngram_range=(1, 3)), docs)

#cv = CountVectorizer(preprocessor = process)
#DTM = cv.fit_transform(docs).todense()
#words = cv.get_feature_names()
#summary = pd.DataFrame(DTM,columns = words, index = docs)
#summary

In [None]:
#Summary(TfidfVectorizer(preprocessor = process, ngram_range=(1, 3)), docs)

In [None]:
def _getSimilarDocs(query, docs, vectorizer):
    """Fit `vectorizer` on `docs` and return the document closest to `query`.

    Returns a `(document, index)` tuple for the entry of `docs` with the
    highest cosine similarity to `query`.
    """
    # .toarray() yields a plain ndarray. The np.matrix produced by .todense()
    # is rejected by recent scikit-learn releases inside cosine_similarity.
    dtm = vectorizer.fit_transform(docs).toarray()

    # np.asarray keeps the query an ndarray regardless of what getVector returns.
    query_vector = np.asarray(getVector(query, vectorizer))
    similarities = computeSimilarities(query_vector, dtm)
    mostSimilarDocIdx = getMostSimilarIdx(similarities)
    return docs[mostSimilarDocIdx], mostSimilarDocIdx

def getSimilarDocsCV(query, docs):
    """Return `(document, index)` of the doc most similar to `query` by raw counts.

    Documents and query are vectorized with a `CountVectorizer` over unigrams
    and bigrams, using `process` as the preprocessor.
    """
    vectorizer = CountVectorizer(preprocessor = process, ngram_range=(1, 2))
    return _getSimilarDocs(query, docs, vectorizer)

def getSimilarDocsTfidf(query, docs):
    """Return `(document, index)` of the doc most similar to `query` by TF-IDF.

    Documents and query are vectorized with a `TfidfVectorizer` over unigrams
    and bigrams, using `process` as the preprocessor.
    """
    vectorizer = TfidfVectorizer(preprocessor = process, ngram_range=(1, 2))
    return _getSimilarDocs(query, docs, vectorizer)

def getVector(query, vectorizer):
    """Vectorize a single query string with an already-fitted vectorizer.

    Parameters
    ----------
    query : str
        The text to vectorize.
    vectorizer : object
        A fitted vectorizer whose `transform` returns a sparse matrix.

    Returns
    -------
    numpy.ndarray
        A dense array with one row (the query).
    """
    # .toarray() returns a plain ndarray; the np.matrix from .todense() is
    # rejected by recent scikit-learn releases.
    query_vector = vectorizer.transform([query]).toarray()
    return query_vector

def computeSimilarities(query_vector, dtm):
    """Cosine similarity between one query vector and every row of `dtm`.

    Parameters
    ----------
    query_vector : array-like
        The query, either as a single row or as a 1-D vector.
    dtm : array-like
        One row per document, with the same number of columns as the query.

    Returns
    -------
    numpy.ndarray
        1-D array with one similarity score per row of `dtm`.
    """
    # np.asarray strips the np.matrix subclass (as produced by `.todense()`),
    # which recent scikit-learn releases refuse to accept.
    doc_rows = np.atleast_2d(np.asarray(dtm))
    query_row = np.atleast_2d(np.asarray(query_vector))
    # Compare the query against the documents only; the document-to-document
    # similarities of the full pairwise matrix were never used.
    similarities = cosine_similarity(query_row, doc_rows)[0]
    return similarities

def getMostSimilarIdx(similarities):
    """Return the position of the largest value in `similarities`.

    Ties resolve to the first occurrence; multi-dimensional input is
    indexed as if flattened.
    """
    scores = np.asarray(similarities)
    return scores.argmax()

def getLeastSimilarIdx(similarities):
    """Return the position of the smallest value in `similarities`.

    Ties resolve to the first occurrence; multi-dimensional input is
    indexed as if flattened.
    """
    scores = np.asarray(similarities)
    return scores.argmin()

# Run the same query through both vectorizers; each call prints the
# (best-matching document, its index in `docs`) tuple.
print(getSimilarDocsTfidf("dinner was awesome", docs))
print(getSimilarDocsCV("dinner was awesome", docs))

In [None]:
# News-style corpus for the comparison cells below. This rebinds `docs` again,
# replacing the sentence corpus used so far.
docs = [
    'Welcome to the weekly book review, my favorite' ,
    'This isnt news, but the president discussed his favorite book',
    'In the news today the president said',
    'Obama stands by EPA about pollution',
    'Obama against Wall street'
]



Word Vector 

In [None]:
def getSimilarDocsWordVector(query, docs):
    """Return `(document, index)` of the doc whose spaCy vector is closest to `query`.

    Every document and the query are turned into a vector via
    `nlp(text).vector`. The query's cosine similarity to each document is
    taken from the pairwise similarity matrix, and the highest one wins.
    """
    vectors = [nlp(text).vector for text in docs]   # one vector per document
    vectors.append(nlp(query).vector)               # query goes last
    pairwise = cosine_similarity(vectors)
    # Last row is the query; drop its final entry (the query against itself).
    query_scores = pairwise[-1][:-1]
    best_idx = getMostSimilarIdx(query_scores)
    return docs[best_idx], best_idx

#getSimilarDocsWordVector("President coal", docs)

In [None]:
# Compare the three representations on the same query; each call prints the
# (best-matching document, its index in `docs`) tuple.
print("TFIDF Vector: ", getSimilarDocsTfidf("President coal", docs))
print("Count Vector: ", getSimilarDocsCV("President coal", docs))
print("Word Vector: ", getSimilarDocsWordVector("President coal", docs))

In [None]:
# Show the shape of the vector spaCy produces for a one-word text.
nlp("President").vector.shape

In [None]:
# Show the raw vector values for the same one-word text.
nlp("President").vector 

In [None]:
# Score the query against one corpus sentence using spaCy's `Doc.similarity`.
# NOTE(review): `en_core_web_sm` is a small model that, per the spaCy docs,
# ships without static word vectors, so this score may be of limited use —
# consider a `_md`/`_lg` model; confirm.
doc1 = nlp("President coal")
doc2 = nlp("Obama stands by EPA about pollution")
doc1.similarity(doc2)