This notebook shows an example recommendation system built with doc2vec. We will use the [CMU Book Summaries dataset](http://www.cs.cmu.edu/~dbamman/booksummaries.html). Alternatively, the dataset's link can be found in the `BookSummaries_Link.md` file under the Data folder in Chapter 7.


In [1]:
# !pip install gensim
# !pip install nltk

In [2]:
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
# Read the dataset's README to understand the data format.
# Each line is split on tabs; field 2 is used as the book title and field 6 as its summary.

data_path = "../data/bigdata/booksumms/booksummaries.txt"
mydata = {}  # titles-summaries dictionary object
# The with-block closes the file handle as soon as the whole file has been read,
# instead of leaving it open for the lifetime of the kernel.
with open(data_path, encoding="utf-8") as summaries_file:
    for line in summaries_file:
        temp = line.split("\t")
        # If two records share a title, the later summary overwrites the earlier one.
        mydata[temp[2]] = temp[6]

In [7]:
# Prepare the data for doc2vec, then build, train and save a doc2vec model.
# Every summary becomes one TaggedDocument: its word tokens, tagged with the book title.
train_doc2vec = [
    TaggedDocument(word_tokenize(summary), tags=[title])
    for title, summary in mydata.items()
]

# dm=1 selects the distributed-memory training mode; words seen fewer than
# min_count times across the corpus are dropped from the vocabulary.
model = Doc2Vec(
    vector_size=50,
    alpha=0.025,
    min_count=10,
    dm=1,
    epochs=2,
)
model.build_vocab(train_doc2vec)
model.train(
    train_doc2vec,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# Persist the trained model so that it can be reloaded without retraining.
model.save("../output/book_summ_d2v.model")

In [9]:
# Use the model to look for similar texts.
# Reload the model from disk rather than relying on the in-memory training result.
model = Doc2Vec.load("../output/book_summ_d2v.model")

# This is a sentence from the summary of "Animal Farm" on Wikipedia:
# https://en.wikipedia.org/wiki/Animal_Farm
sample = """
Napoleon enacts changes to the governance structure of the farm, replacing meetings with a committee of pigs who will run the farm.
 """

# Tokenize the unseen text the same way as the training summaries, then infer its vector.
new_vector = model.infer_vector(word_tokenize(sample))

# gensim 4 renamed the document-vector store from `docvecs` to `dv`; accessing the
# old name triggers a deprecation warning. Prefer `dv` and fall back to `docvecs`
# only on older gensim releases that do not have it.
doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs

# Rank the training documents by similarity to the inferred vector and
# print the resulting (title, similarity) pairs.
sims = doc_vectors.most_similar([new_vector])
print(sims)

[('The Old Red Hippopotamus', 0.7321776151657104), ("Father Malachy's Miracle", 0.6923838257789612), ('Dave at Night', 0.6897841095924377), ('Attica', 0.6746035218238831), ('No Thoroughfare', 0.6661922335624695), ('The Dressmaker of Khair Khana', 0.6614097952842712), ('The Extremes', 0.660919725894928), ('Wycliffe and the Cycle of Death', 0.6549147963523865), ('The Secret River', 0.6527315974235535), ('Wycliffe and the Winsor Blue', 0.6500592827796936)]


  sims = model.docvecs.most_similar([new_vector])
