In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("./input"))

import sys  

reload(sys)  
sys.setdefaultencoding('utf8')

# Any results you write to the current directory are saved as output.

['ratings.csv', 'tags.csv', 'books.csv', 'to_read.csv', 'book_tags.csv']


In [3]:
!pip install sklearn

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [5]:
books = pd.read_csv('./input/books.csv', encoding = "utf-8").drop_duplicates(subset=['title'])
ratings = pd.read_csv('./input/ratings.csv', encoding = "ISO-8859-1")
book_tags = pd.read_csv('./input/book_tags.csv', encoding = "ISO-8859-1")
tags = pd.read_csv('./input/tags.csv')

tags_join_DF = pd.merge(book_tags, tags, left_on='tag_id', right_on='tag_id', how='inner')
tags_join_DF.head()

to_read = pd.read_csv('./input/to_read.csv')
to_read.head()

Unnamed: 0,user_id,book_id
0,9,8
1,15,398
2,15,275
3,37,7173
4,34,380


In [6]:
books_with_tags = pd.merge(books, tags_join_DF, left_on='book_id', right_on='goodreads_book_id', how='inner')
temp_df = books_with_tags.groupby('book_id')['tag_name'].apply(' '.join).reset_index()
temp_df.head()
books = pd.merge(books, temp_df, left_on='book_id', right_on='book_id', how='outer')
books['corpus'] = (pd.Series(books[['authors', 'tag_name']]
                .fillna('')
                .values.tolist()
                ).str.join(' '))

In [7]:
tf_corpus = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix_corpus = tf_corpus.fit_transform(books['corpus'])
cosine_sim_corpus = linear_kernel(tfidf_matrix_corpus, tfidf_matrix_corpus)
indices1 = pd.Series(books.index, index=books['title'])

# Build a 1-dimensional array with book titles
titles = books['title']
indices = pd.Series(books.index, index=books['title'])

# Function that get book recommendations based on the cosine similarity score of books tags
def corpus_recommendations(title):
    idx = indices1[title]
    sim_scores = list(enumerate(cosine_sim_corpus[idx]))
    if len(sim_scores[0])==2:
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    else:
        sim_scores = sorted(sim_scores, key=lambda x: x[1][0], reverse=True)
    sim_scores = sim_scores[1:11]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

corpus_recommendations("The Hobbit")

18      The Fellowship of the Ring (The Lord of the Ri...
154            The Two Towers (The Lord of the Rings, #2)
160     The Return of the King (The Lord of the Rings,...
4883          Unfinished Tales ofNúmenor and Middle-Earth
2271                                The Children of Húrin
605              The Silmarillion (Middle-Earth Universe)
8097                   The Complete Guide to Middle-Earth
1115     The History of the Hobbit, Part One: Mr. Baggins
461                             The Hobbit: Graphic Novel
35                              The Giver (The Giver, #1)
Name: title, dtype: object

In [8]:
temp = indices["The Hunger Games (The Hunger Games, #1)"]
sim_scores = list(enumerate(cosine_sim_corpus[temp]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
len(sim_scores[0])

2

In [15]:
corpus_recommendations("Twilight (Twilight, #1)").values

array([u'The Great Gatsby', u'The Hunger Games (The Hunger Games, #1)',
       u'The Fault in Our Stars', u'Pride and Prejudice',
       u"Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",
       u'Private Games (Private #3)', u'The Death of Ivan Ilych',
       u'The Namesake', u'The Catcher in the Rye', u'Our Mutual Friend'],
      dtype=object)

In [10]:
recommendations = [] 
error = []

for index, row in books.iterrows():
    temp = [row['book_id'],row['title']]
    rec =  corpus_recommendations(row['title'])
    for r in rec.values.tolist():
        df = books[books['title']==r]
        temp.append(df.book_id.tolist()[0])
    recommendations.append(temp)


In [14]:
corpus_recommendations("The Hunger Games (The Hunger Games, #1)")

4                                        The Great Gatsby
2                                 Twilight (Twilight, #1)
5                                  The Fault in Our Stars
1       Harry Potter and the Sorcerer's Stone (Harry P...
9                                     Pride and Prejudice
6174                           Private Games (Private #3)
1974                              The Death of Ivan Ilych
7                                  The Catcher in the Rye
374                                          The Namesake
5794                                    Our Mutual Friend
Name: title, dtype: object

In [11]:
recommendations

[[1,
  u'The Hunger Games (The Hunger Games, #1)',
  5,
  3,
  6,
  2,
  10,
  6294,
  2002,
  8,
  378,
  5907],
 [2,
  u"Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",
  1,
  6,
  5,
  3,
  10,
  6294,
  7996,
  7983,
  2002,
  8],
 [3, u'Twilight (Twilight, #1)', 5, 1, 6, 10, 2, 6294, 2002, 378, 8, 5907],
 [4,
  u'To Kill a Mockingbird',
  533,
  7240,
  9888,
  6493,
  8394,
  9362,
  477,
  1358,
  1424,
  1600],
 [5, u'The Great Gatsby', 3, 1, 6, 10, 2, 6294, 2002, 8, 378, 5907],
 [6, u'The Fault in Our Stars', 5, 3, 1, 2, 10, 6294, 2002, 8, 5907, 378],
 [7, u'The Hobbit', 19, 155, 161, 4976, 2309, 611, 8272, 1129, 466, 36],
 [8, u'The Catcher in the Rye', 10, 5, 1, 3, 6, 2, 2002, 6294, 30, 7996],
 [9,
  u'Angels & Demons  (Robert Langdon, #1)',
  145,
  170,
  201,
  240,
  729,
  1484,
  3246,
  3247,
  4885,
  5332],
 [10, u'Pride and Prejudice', 1, 5, 3, 6, 2, 8, 2002, 7996, 6294, 7983],
 [11, u'The Kite Runner', 8695, 13, 8694, 365, 8698, 2493, 357, 33, 830, 8909

In [38]:
df = pd.DataFrame(recommendations)

df.columns = ['book_id', 'title', '1', '2','3','4','5','6','7','8','9','10']

df

Unnamed: 0,book_id,title,1,2,3,4,5,6,7,8,9,10
0,1,"The Hunger Games (The Hunger Games, #1)",5,3,6,2,10,6175,1975,8,375,5795
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,1,6,5,3,10,6175,7833,7820,1975,8
2,3,"Twilight (Twilight, #1)",5,1,6,10,2,6175,1975,375,8,5795
3,4,To Kill a Mockingbird,528,7096,9676,6368,8219,9164,473,1343,1405,1580
4,5,The Great Gatsby,3,1,6,10,2,6175,1975,8,375,5795
5,6,The Fault in Our Stars,5,3,1,2,10,6175,1975,8,5795,375
6,7,The Hobbit,19,155,161,4884,2272,606,8098,1116,462,36
7,8,The Catcher in the Rye,10,5,1,3,6,2,1975,6175,30,7833
8,9,"Angels & Demons (Robert Langdon, #1)",145,170,199,237,723,1464,3195,3196,4796,5233
9,10,Pride and Prejudice,1,5,3,6,2,8,1975,7833,6175,7820


In [39]:
df.to_csv("recommendations.csv",index=False, header=True)