In [45]:
# MatchingBookNamesAndDescriptions.ipynb
#
# Solve HackerRank problem by assigning a book name to each description
########################################################################

import sys

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Return book_names, descriptions read from "file_name" (STDIN if
# "file_name" is None ).
#
# Expected layout: a first line holding the count N, then N book-name
# lines, one separator line (skipped without being inspected), then N
# description lines.  Line terminators are stripped from the returned
# strings.  Raises ValueError if the first line is not an integer.
def read_inputs( file_name ):
    if file_name is None:
        # Read STDIN directly instead of through "with": leaving a
        # "with sys.stdin" block closes STDIN for the rest of the
        # process (e.g. later notebook cells).
        return _read_inputs_from_stream( sys.stdin )
    with open( file_name, "r" ) as stream:
        return _read_inputs_from_stream( stream )

# Parse the count / names / separator / descriptions layout from an
# already-open text stream; the stream is left open for the caller.
def _read_inputs_from_stream( stream ):
    count = int( stream.readline() )
    book_names = [ stream.readline().rstrip( "\r\n" ) for _ in range( count ) ]
    stream.readline() # ***** separator
    descriptions = [ stream.readline().rstrip( "\r\n" ) for _ in range( count ) ]
    return book_names, descriptions

# Return a CountVectorizer fitted on "book_names_and_descriptions", so
# its vocabulary covers the words of both the names and the descriptions.
# NOTE: the vectorizer is built with default settings; no stop-word list
# is passed, so (per the scikit-learn default stop_words=None) stop-words
# are NOT removed here.
def create_count_vectorizer( book_names_and_descriptions ):
    vectorizer = CountVectorizer()
    vectorizer.fit( book_names_and_descriptions )
    return vectorizer
    
# Return the tf-idf weighted term matrix for "documents", using the
# already-fitted "count_vectorizer" to obtain the raw term counts.
# A new TfidfTransformer is fitted on every call, so the idf weights are
# derived from "documents" alone, not from any other document set.
def extract_tfidf( documents, count_vectorizer ):
    tfidf_transformer = TfidfTransformer()
    term_counts = count_vectorizer.transform( documents )
    return tfidf_transformer.fit_transform( term_counts )
    
# First version:  greedy, independent matching on tfidf similarity.
# The similarity matrix has one row per description and one column per
# book_name (description tfidfs times transposed book_name tfidfs).
# The argmax is taken along axis 0, i.e. over the rows, so the result
# holds one entry per book_name: the 1-origin index of the description
# with the highest similarity to that book_name.  Choices are made
# independently, so the same description can be picked for several
# book_names.
# NOTE(review): the earlier comment described the result as the best
# title for each description, which would be an argmax along axis 1;
# confirm which direction the problem statement requires.
def find_matching_book_names( description_tfidfs, book_name_tfidfs ):
    similarity = description_tfidfs.dot( book_name_tfidfs.transpose() )
    best_rows = similarity.argmax( axis=0 )
    return best_rows + 1

# Read the problem input from "file_name" (STDIN if None), compute tfidf
# vectors for book names and descriptions over a shared vocabulary, and
# print the 1-origin indices returned by find_matching_book_names, one
# per line.
def matching_book_names_and_descriptions( file_name ):
    names, texts = read_inputs( file_name )
    # One vocabulary for both sets, so the two tfidf matrices share columns.
    vectorizer = create_count_vectorizer( names + texts )
    name_vectors = extract_tfidf( names, vectorizer )
    text_vectors = extract_tfidf( texts, vectorizer )
    best_matches = find_matching_book_names( text_vectors, name_vectors )
    # The result is indexed as a single-row 2-D matrix: [ 0, column ].
    n_columns = best_matches.shape[ 1 ]
    for column in range( n_columns ):
        print( best_matches[ 0, column ] )
    
#############################################################################

# Notebook driver: run the matcher on a local sample input file.
# NOTE(review): the path is hard-coded and machine-specific; passing None
# instead makes read_inputs read from STDIN.
file_name = "C:/Users/IBM_ADMIN/MatchingBookNamesAndDescriptions.Sample0.input"
matching_book_names_and_descriptions( file_name )


3
1
2
4
5
[[3 1 2 4 5]]


prod:
 [[0.10811388 0.15268131 0.13828193 0.         0.        ]
 [0.13458328 0.         0.60004395 0.         0.        ]
 [0.70305106 0.         0.20257538 0.         0.        ]
 [0.20432894 0.         0.05254303 0.36264873 0.        ]
 [0.10019668 0.         0.08086898 0.         0.65024353]]
