# Document Similarity Using LSI

In [117]:
import os
import gensim
from gensim.models import LsiModel
from gensim import models
from gensim import corpora
from gensim.utils import lemmatize
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import remove_stopwords, stem_text
from gensim.parsing.preprocessing import strip_numeric
import pandas as pd
from gensim import similarities

In [118]:
# Load the corpus: every non-blank line of the text file is one document.
# The file is read line by line because recent pandas versions reject
# sep='\n' in read_csv with a ValueError (a line terminator cannot be used
# as the field separator). The Series keeps the name 0 and a 0..n-1 index
# so later positional lookups such as cor[i] behave as before.
with open("for_nlp.txt", encoding="utf-8") as corpus_file:
    cor = pd.Series(
        [line.rstrip("\r\n") for line in corpus_file if line.strip()],
        name=0,
        dtype=object,
    )

In [119]:
def preprocessing():
    """Yield one token stream per document in the module-level corpus ``cor``.

    Each document is passed through ``stem_text`` and then ``strip_numeric``
    before being handed to ``gensim.utils.tokenize`` with ``lower=True``.

    Yields:
        The object returned by ``gensim.utils.tokenize`` for each document.
        NOTE(review): gensim documents this as a lazy iterator, so each
        yielded value can be consumed only once; wrap it in ``list()`` if the
        tokens are needed more than once.
    """
    for raw_text in cor:
        stemmed = stem_text(raw_text)
        cleaned = strip_numeric(stemmed)
        token_stream = gensim.utils.tokenize(cleaned, lower=True)
        yield token_stream

In [120]:
# Build the token -> id mapping from a single pass over the preprocessed
# corpus. `texts` is a generator and is exhausted once the dictionary has
# been built; call preprocessing() again for any further pass.
texts = preprocessing()
dictionary = corpora.Dictionary(documents=texts)
# Keep at most 700 tokens, each appearing in at least one document.
# no_above is not passed, so the library default for overly common tokens
# still applies.
dictionary.filter_extremes(keep_n=700, no_below=1)

In [121]:
# Convert every preprocessed document into a bag-of-words vector
# (a fresh preprocessing() pass is needed because generators are single-use).
doc_term_matrix = list(map(dictionary.doc2bow, preprocessing()))

# Fit TF-IDF on the bag-of-words corpus, then wrap the corpus so that
# iterating over it produces TF-IDF-weighted vectors.
tfidf = models.TfidfModel(doc_term_matrix)
corpus_tfidf = tfidf[doc_term_matrix]

In [122]:
# Fit a 10-topic LSI model on the TF-IDF-weighted corpus.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)

# Query text that will be compared against every document in the corpus.
doc = 'Lohith'

# Tokenise the query with the same steps preprocessing() applies to the
# corpus (stem_text -> strip_numeric -> tokenize). A plain lower().split()
# leaves unstemmed words and attached punctuation, which never match the
# stemmed entries stored in the dictionary.
query_tokens = list(
    gensim.utils.tokenize(strip_numeric(stem_text(doc)), lower=True)
)
vec_bow = dictionary.doc2bow(query_tokens)

In [123]:
# The LSI model was trained on TF-IDF vectors, so the query and the indexed
# documents must both be TF-IDF-weighted before projection; feeding raw
# bag-of-words counts would place them in a different vector space from the
# one the model was fitted on.
vec_lsi = lsi[tfidf[vec_bow]]
index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

# Similarity of the query to each document, in corpus order.
unsorted_similarity = index[vec_lsi]

# Rank documents from most to least similar; ties keep corpus order.
sorted_similarity = sorted(
    enumerate(unsorted_similarity),
    key=lambda item: item[1],
    reverse=True,
)

# Use a dedicated loop variable so the similarity index object stored in
# `index` is not overwritten by an integer document position.
for doc_id, similarity in sorted_similarity:
    print(similarity, cor[doc_id])

1.0 Lohith has Mtech in Data Science & Engineering.
0.0 Vikram works for Optum Global Solution.
0.0 Vivek is studying in MVIT college.
0.0 Prathik resides in Germany.
0.0 Ganesh works for TCS Bangalore.
