# Document Similarity & Topic Modelling

In [1]:
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd


def convert_tag(tag):
    """Map a tag produced by nltk.pos_tag to the tag used by wordnet.synsets.

    Only the first character of ``tag`` is inspected.

    Args:
        tag: Part-of-speech tag string, e.g. 'NN' or 'VBD'.

    Returns:
        'n', 'a', 'r' or 'v' when the tag starts with N, J, R or V
        respectively; None for any other leading character.
    """
    wordnet_pos_by_initial = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
    # dict.get yields None for an unmapped initial.
    return wordnet_pos_by_initial.get(tag[0])


def doc_to_synsets(doc):
    """Return a list of synsets for the words in a document.

    The document is tokenized and part-of-speech tagged with nltk. For each
    token, the tag is converted with convert_tag and the first synset that
    wordnet returns for that word/tag combination is kept. Tokens for which
    no synset is found are skipped.

    Args:
        doc: The document text.

    Returns:
        List of synsets, in token order.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(doc))

    first_synsets = []
    for word, pos in tagged_tokens:
        candidates = wn.synsets(word, convert_tag(pos))
        if len(candidates) > 0:
            # Only the first synset of each word/tag combination is used.
            first_synsets.append(candidates[0])

    return first_synsets


def similarity_score(s1, s2):
    """Calculate the normalized similarity score of s1 onto s2.

    For each synset in ``s1`` the largest ``path_similarity`` to any synset
    in ``s2`` is taken. Comparisons that return None are ignored, and a
    synset in ``s1`` with no usable comparison is left out of the average.

    Args:
        s1: Iterable of synsets (objects exposing ``path_similarity``).
        s2: Sequence of synsets to compare against.

    Returns:
        The mean of the per-synset best scores, or 0.0 when there is nothing
        to average (``s1`` is empty, ``s2`` is empty, or every comparison
        returned None). Previously this case raised ZeroDivisionError.
    """
    best_scores = []
    for synset in s1:
        scores = [
            score
            for score in (synset.path_similarity(other) for other in s2)
            if score is not None
        ]
        if scores:
            best_scores.append(max(scores))

    # Guard the division: no comparable synsets means no measurable similarity.
    if not best_scores:
        return 0.0
    return sum(best_scores) / len(best_scores)


def document_path_similarity(doc1, doc2):
    """Find the symmetrical similarity between doc1 and doc2.

    Both documents are converted to synset lists with doc_to_synsets, then
    similarity_score is evaluated in each direction and the two results are
    averaged.

    Args:
        doc1: First document text.
        doc2: Second document text.

    Returns:
        The average of the two directional similarity scores.
    """
    first_synsets = doc_to_synsets(doc1)
    second_synsets = doc_to_synsets(doc2)

    forward = similarity_score(first_synsets, second_synsets)
    backward = similarity_score(second_synsets, first_synsets)
    return (forward + backward) / 2

### test_document_path_similarity

Use this function to check if doc_to_synsets and similarity_score are correct.

*This function should return the similarity score as a float.*

In [12]:
#Returns the similarity score
def test_document_path_similarity():
    doc1 = 'This is a function to test document_path_similarity.'
    doc2 = 'Use this function to see if your code in doc_to_synsets \
    and similarity_score is correct!'
    return document_path_similarity(doc1, doc2)

In [13]:
# Run the sanity check; the returned score is shown as the cell output.
test_document_path_similarity()

0.554265873015873

In [122]:
# Load the paraphrase pairs (columns Quality, D1, D2) and preview the first rows.
paraphrases = pd.read_csv('paraphrases.csv')
paraphrases.head()

Unnamed: 0,Quality,D1,D2
0,1,"Ms Stewart, the chief executive, was not expec...","Ms Stewart, 61, its chief executive officer an..."
1,1,After more than two years' detention under the...,After more than two years in detention by the ...
2,1,"""It still remains to be seen whether the reven...","""It remains to be seen whether the revenue rec..."
3,0,"And it's going to be a wild ride,"" said Allan ...","Now the rest is just mechanical,"" said Allan H..."
4,1,The cards are issued by Mexico's consulates to...,The card is issued by Mexico's consulates to i...



### most_similar_docs


In [124]:
def most_similar_docs():
    """Find the most similar pair of documents among true paraphrases.

    Reads 'paraphrases.csv', keeps the rows whose Quality is 1, and scores
    each (D1, D2) pair with document_path_similarity.

    Returns:
        A tuple (D1, D2, similarity_score) for the pair with the highest
        score.
    """
    all_pairs = pd.read_csv('paraphrases.csv')
    true_paraphrases = all_pairs[all_pairs['Quality'] == 1]

    scored_pairs = [
        (d1, d2, document_path_similarity(d1, d2))
        for d1, d2 in zip(true_paraphrases['D1'], true_paraphrases['D2'])
    ]

    # Rank by the score, which is the third element of each tuple.
    return max(scored_pairs, key=lambda scored: scored[2])

# Show the highest-scoring pair among the rows with Quality == 1.
most_similar_docs()

('"Indeed, Iran should be put on notice that efforts to try to remake Iraq in their image will be aggressively put down," he said.',
 '"Iran should be on notice that attempts to remake Iraq in Iran\'s image will be aggressively put down," he said.\n',
 0.9753086419753086)

### label_accuracy


In [126]:
def label_accuracy(threshold=0.75):
    """Measure how well path similarity predicts the paraphrase labels.

    Reads 'paraphrases.csv', scores every (D1, D2) pair with
    document_path_similarity, labels a pair 1 when its score is strictly
    greater than ``threshold`` and 0 otherwise, and compares those labels
    with the Quality column.

    Args:
        threshold: Similarity score above which a pair is labelled as a
            paraphrase. Defaults to 0.75.

    Returns:
        The accuracy of the predicted labels as a float.
    """
    from sklearn.metrics import accuracy_score

    paraphrases = pd.read_csv('paraphrases.csv')

    # Named predicted_labels (not similarity_score) so the module-level
    # similarity_score function is not shadowed inside this function.
    predicted_labels = [
        1 if document_path_similarity(d1, d2) > threshold else 0
        for d1, d2 in zip(paraphrases['D1'], paraphrases['D2'])
    ]

    # accuracy_score takes the ground truth first, then the predictions.
    return accuracy_score(paraphrases['Quality'], predicted_labels)
# Show the accuracy of the similarity-based labels against the Quality column.
label_accuracy()

0.80000000000000004

## Part 2 - Topic Modelling


In [128]:
import pickle
import gensim
from sklearn.feature_extraction.text import CountVectorizer

# Load the pickled list of newsgroup documents.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this against a 'newsgroups' file from a trusted source.
with open('newsgroups', 'rb') as f:
    newsgroup_data = pickle.load(f)

# CountVectorizer configuration:
#   - token_pattern keeps only tokens of three or more word characters
#   - stop_words drops English stop words
#   - min_df drops tokens that appear in fewer than 20 documents
#   - max_df drops tokens that appear in more than 20% of the documents
vect = CountVectorizer(
    min_df=20,
    max_df=0.2,
    stop_words='english',
    token_pattern='(?u)\\b\\w\\w\\w+\\b',
)

# Learn the vocabulary and build the document-term matrix in one step.
X = vect.fit_transform(newsgroup_data)

# Convert the sparse matrix to a gensim corpus (documents are rows, not columns).
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# Invert vocabulary_ (word -> id) to get the id -> word mapping.
id_map = {word_id: word for word, word_id in vect.vocabulary_.items()}


In [130]:
# Use the gensim.models.ldamodel.LdaModel constructor to fit an LDA topic model
# on the corpus: 10 topics, 25 passes, with random_state fixed at 0.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, id2word=id_map, num_topics=10, passes=25, random_state=0)
ldamodel 

<gensim.models.ldamodel.LdaModel at 0x7f3180e73588>

### lda_topics

Find the 10 topics and, for each topic, its 10 most significant words.

In [132]:
def lda_topics():
    """Return the 10 topics of ldamodel with the 10 top words of each.

    Returns:
        Whatever ldamodel.print_topics returns for num_topics=10 and
        num_words=10.
    """
    topic_count = 10
    words_per_topic = 10
    return ldamodel.print_topics(num_topics=topic_count, num_words=words_per_topic)
# Show the ten topics, each with its ten top-weighted words.
lda_topics()

[(0,
  '0.059*"car" + 0.019*"cars" + 0.018*"000" + 0.013*"100" + 0.012*"high" + 0.011*"speed" + 0.011*"oil" + 0.011*"driving" + 0.010*"auto" + 0.010*"200"'),
 (1,
  '0.025*"time" + 0.018*"just" + 0.017*"bike" + 0.014*"don" + 0.014*"good" + 0.011*"know" + 0.010*"used" + 0.009*"use" + 0.009*"want" + 0.009*"turn"'),
 (2,
  '0.019*"just" + 0.019*"don" + 0.014*"new" + 0.013*"good" + 0.012*"know" + 0.011*"think" + 0.010*"years" + 0.009*"little" + 0.009*"people" + 0.009*"really"'),
 (3,
  '0.026*"don" + 0.020*"think" + 0.020*"does" + 0.016*"say" + 0.015*"people" + 0.014*"know" + 0.013*"just" + 0.013*"did" + 0.011*"believe" + 0.011*"argument"'),
 (4,
  '0.015*"way" + 0.014*"pain" + 0.014*"edu" + 0.014*"good" + 0.013*"things" + 0.013*"pick" + 0.012*"pitt" + 0.012*"soon" + 0.011*"gordon" + 0.011*"low"'),
 (5,
  '0.023*"people" + 0.021*"god" + 0.016*"ground" + 0.013*"science" + 0.011*"religion" + 0.011*"current" + 0.011*"point" + 0.010*"theory" + 0.009*"example" + 0.009*"evidence"'),
 (6,
  '0.02

### topic_distribution

Find the topic distribution of the new document `new_doc` under the fitted LDA model.

In [133]:
# Unseen document, wrapped in a single-element list, that topic_distribution
# passes to vect.transform.
new_doc = ["\n\nIt's my understanding that the freezing will start to occur because \
of the\ngrowing distance of Pluto and Charon from the Sun, due to it's\nelliptical orbit. \
It is not due to shadowing effects. \n\n\nPluto can shadow Charon, and vice-versa.\n\nGeorge \
Krumins\n-- "]

In [139]:
def topic_distribution():
    """Return the topic distribution that ldamodel assigns to new_doc.

    new_doc is vectorized with the already-fitted vect, converted to a
    gensim corpus, and passed through ldamodel.

    Returns:
        The model output for the first (only) document in new_doc.
    """
    doc_term_matrix = vect.transform(new_doc)
    # Documents are rows of the matrix, hence documents_columns=False.
    doc_corpus = gensim.matutils.Sparse2Corpus(doc_term_matrix, documents_columns=False)
    per_document_topics = list(ldamodel[doc_corpus])
    return per_document_topics[0]

# Show the (topic id, weight) pairs inferred for new_doc.
topic_distribution()

[(0, 0.020000452424091701),
 (1, 0.020002134173931124),
 (2, 0.020001910923382809),
 (3, 0.020002012138214002),
 (4, 0.020001258337738501),
 (5, 0.020001088685979801),
 (6, 0.020000190954947098),
 (7, 0.57220067832463106),
 (8, 0.020001762152735143),
 (9, 0.26778851188434882)]

### topic_names

In [140]:
def topic_names():
    """Return the hand-assigned names for the ten topics.

    Returns:
        A list of ten topic-name strings, one per topic.
    """
    assigned_names = [
        'Automobiles',
        'Travel',
        'Politics',
        'Society & Lifestyle',
        'Health',
        'Religion',
        'Computers & IT',
        'Business',
        'Sports',
        'Science',
    ]
    return assigned_names