In [1]:
import pandas as pd
import numpy as np
import autocorrect
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Document Term Matrix

In [3]:
docs = [
    'He playeed football',
    'He plays cricket',
    'He had sandwich for dinner'
]

In [4]:
def Summary (vectorizer, docs):
    denseVector = vectorizer.fit_transform(docs).todense()
    words = vectorizer.get_feature_names()
    summary = pd.DataFrame(denseVector,columns = words, index = docs)
    return summary
    
cv = CountVectorizer(lowercase = True)
DTM = cv.fit_transform(docs).todense()
words = cv.get_feature_names()
summary = pd.DataFrame(DTM,columns = words, index = docs)
summary

Unnamed: 0,cricket,dinner,football,for,had,he,playeed,plays,sandwich
He playeed football,0,0,1,0,0,1,1,0,0
He plays cricket,1,0,0,0,0,1,0,1,0
He had sandwich for dinner,0,1,0,1,1,1,0,0,1


In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")

def process(doc):
    s_doc = nlp(doc)
    tokens = []
    for token in s_doc:
        #print(token, token.lemma_, token.pos_)
        if(token.lemma_ == 'PRON' or token.lemma_ == '-PRON-'):
            tokens.append(token.lower_)
        else:
            tokens.append(token.lemma_)    
    #print (tokens)
    return " ".join(tokens)

class SpellTokenizer(object):
    
    def __init__(self, nlp):
        self.vocab = nlp.vocab
    
    def __call__(self, text):
        doc = nlp.tokenizer(text)
        words = [autocorrect.spell(i.orth_) for i in doc]
        return spacy.tokens.Doc(self.vocab, words = words)

nlp.make_doc = SpellTokenizer(nlp)    

In [6]:
cv = CountVectorizer(preprocessor = process)
Summary(cv, docs)

Unnamed: 0,cricket,dinner,football,for,have,he,play,sandwich
He playeed football,0,0,1,0,0,1,1,0
He plays cricket,1,0,0,0,0,1,1,0
He had sandwich for dinner,0,1,0,1,1,1,0,1


In [7]:
Summary(TfidfVectorizer(preprocessor = process), docs)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Unnamed: 0,cricket,dinner,football,for,have,he,play,sandwich
He playeed football,0.0,0.0,0.720333,0.0,0.0,0.425441,0.547832,0.0
He plays cricket,0.720333,0.0,0.0,0.0,0.0,0.425441,0.547832,0.0
He had sandwich for dinner,0.0,0.479528,0.0,0.479528,0.479528,0.283217,0.0,0.479528


Term Frequency–Inverse Document Frequency (TFIDF)

In [8]:

docs = [
    'He playeed football',
    'He plays cricket',
    'He had sandwich for dinner',
    'Sandwich i had for lunch was great',
    "He is neither a friend nor is he a foe",    
    
]
#Summary(CountVectorizer(preprocessor = process, ngram_range=(1, 3)), docs)

#cv = CountVectorizer(preprocessor = process)
#DTM = cv.fit_transform(docs).todense()
#words = cv.get_feature_names()
#summary = pd.DataFrame(DTM,columns = words, index = docs)
#summary

In [None]:
#Summary(TfidfVectorizer(preprocessor = process, ngram_range=(1, 3)), docs)

In [9]:
def getSimilarDocsCV(query, docs):
    
    vectorizer = CountVectorizer(preprocessor = process, ngram_range=(1, 2)) #CountVectorizer
    dtm = vectorizer.fit_transform(docs).todense()
    
    query_vector = getVector(query, vectorizer)
    similarities = computeSimilarities(query_vector, dtm)
    mostSimilarDocIdx = getMostSimilarIdx(similarities)
    return docs[mostSimilarDocIdx], mostSimilarDocIdx;

def getSimilarDocsTfidf(query, docs):
    
    vectorizer = TfidfVectorizer(preprocessor = process, ngram_range=(1, 2)) #TfidfVectorizer
    dtm = vectorizer.fit_transform(docs).todense()
    
    query_vector = getVector(query, vectorizer)
    similarities = computeSimilarities(query_vector, dtm)
    mostSimilarDocIdx = getMostSimilarIdx(similarities)
    return docs[mostSimilarDocIdx], mostSimilarDocIdx;

def getVector(query, vectorizer):
    query_vector = vectorizer.transform([query]).todense()
    return query_vector

def computeSimilarities(query_vector, dtm):
    all_vectors = np.concatenate((dtm, query_vector))
    similarities = cosine_similarity(all_vectors)[-1][:-1]
    return similarities

def getMostSimilarIdx(similarities):
    return np.argmax(similarities)

def getLeastSimilarIdx(similarities):
    return np.argmin(similarities)

print(getSimilarDocsTfidf("dinner was awesome", docs))
print(getSimilarDocsCV("dinner was awesome", docs))

('He had sandwich for dinner', 2)
('He is neither a friend nor is he a foe', 4)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [10]:
docs = [
    'Welcome to the weekly book review, my favorite' ,
    'This isnt news, but the president discussed his favorite book',
    'In the news today the president said',
    'Obama stands by EPA about pollution',
    'Obama against Wall street'
]



Word Vector 

In [11]:
def getSimilarDocsWordVector(query, docs):
    dtm = list(map(lambda doc: nlp(doc).vector,  docs) )         # Word Vectors
    query_vector = nlp(query).vector
    all_vectors = dtm + [query_vector]
    similarities = cosine_similarity(all_vectors)[-1][:-1]
    mostSimilarDocIdx = getMostSimilarIdx(similarities)
    return docs[mostSimilarDocIdx], mostSimilarDocIdx;

#getSimilarDocsWordVector("President coal", docs)

In [12]:
print("TFDIF Vector: ", getSimilarDocsTfidf("President coal", docs))
print("Count Vector: ", getSimilarDocsCV("President coal", docs))
print("Word Vector: ", getSimilarDocsWordVector("President coal", docs))

TFDIF Vector:  ('In the news today the president said', 2)
Count Vector:  ('In the news today the president said', 2)
Word Vector:  ('Obama stands by EPA about pollution', 3)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [13]:
nlp("President").vector.shape

(384,)

In [14]:
nlp("President").vector 

array([-1.94145131e+00,  1.30941749e+00,  5.03938103e+00,  2.40374613e+00,
       -1.32454109e+00,  4.81367064e+00, -3.52691984e+00, -2.37943292e+00,
        1.00874412e+00, -8.12833607e-02, -6.13715649e-01,  1.07234538e+00,
       -5.55517972e-02, -2.64275670e+00,  4.40189362e-01,  2.88332200e+00,
        4.80573475e-01, -3.61144185e+00, -1.87545180e+00, -8.40450287e-01,
       -1.44577527e+00,  8.83387804e-01, -2.09199727e-01, -7.49470472e-01,
       -3.38787842e+00, -6.93196595e-01, -1.42271012e-01,  5.82755804e-01,
       -3.62339759e+00, -2.46968341e+00, -9.55463350e-01, -2.34903288e+00,
        2.64658833e+00, -1.75681674e+00,  3.16621542e-01, -1.17007875e+00,
       -1.64229321e+00,  1.12336838e+00, -3.52560341e-01,  8.93644214e-01,
       -1.95932913e+00,  3.50284338e+00,  1.79270291e+00, -1.82425094e+00,
       -3.59568787e+00,  2.62061596e+00, -7.66131222e-01,  7.05506206e-01,
        2.88515615e+00, -4.78165269e-01, -1.67668509e+00, -1.52667725e+00,
       -2.33446097e+00, -

In [None]:
doc1 = nlp("President coal")
doc2 = nlp("Obama stands by EPA about pollution")
doc1.similarity(doc2)