In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset = pd.read_csv('preprocessed_ratings.csv', sep=',', error_bad_lines=False, encoding='latin-1')

In [3]:
dataset

Unnamed: 0,User_ID,ISBN,Book_Rating,Book_Title,Book-Author,Year_Of_Publication,Publisher,Location,Age
0,53,451,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"strafford, missouri, usa",34.0
1,53,280,0,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown","strafford, missouri, usa",34.0
2,53,647,0,The Da Vinci Code,Dan Brown,2003,Doubleday,"strafford, missouri, usa",34.0
3,53,2028,0,Wild Animus,Rich Shapero,2004,Too Far,"strafford, missouri, usa",34.0
4,53,241,0,Four To Score (A Stephanie Plum Novel),Janet Evanovich,1999,St. Martin's Paperbacks,"strafford, missouri, usa",34.0
...,...,...,...,...,...,...,...,...,...
137568,482,1739,8,Seven Habits Of Highly Effective People,Stephen R. Covey,1990,Free Press,"n/a, channel islands, guernsey",35.0
137569,482,892,0,Illusions: The Adventures of a Reluctant Messiah,Richard Bach,1994,Dell Publishing Company,"n/a, channel islands, guernsey",35.0
137570,1666,16,9,The Professor and the Madman,Simon Winchester,1998,HarperCollins Publishers,"kansas city, missouri, usa",44.0
137571,1666,1705,5,"ROAD LESS TRAVELED : A New Psychology of Love,...",M. Scott Peck,1980,Touchstone,"kansas city, missouri, usa",44.0


In [4]:
books = dataset.filter(['Book_Title' , 'Book-Author', 'Year_Of_Publication','Publisher']).copy()
books = books.drop_duplicates(subset='Book_Title', keep='first')

users = dataset.filter(['User_ID', 'Location', 'Age']).copy()
users = users.drop_duplicates(subset='User_ID', keep='first')

In [5]:
books

Unnamed: 0,Book_Title,Book-Author,Year_Of_Publication,Publisher
0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown"
2,The Da Vinci Code,Dan Brown,2003,Doubleday
3,Wild Animus,Rich Shapero,2004,Too Far
4,Four To Score (A Stephanie Plum Novel),Janet Evanovich,1999,St. Martin's Paperbacks
...,...,...,...,...
19416,Childhood's End,Arthur C. Clarke,1987,Del Rey Books
20246,Creature,John Saul,1990,Bantam Books
20958,Behind the Scenes at the Museum,Kate Atkinson,1996,Black Swan
29271,Wuthering Heights (Penguin Classics),Emily Bronte,1985,Penguin USA


In [6]:
users.iloc[0]

User_ID                           53
Location    strafford, missouri, usa
Age                             34.0
Name: 0, dtype: object

In [7]:
def similarity_matrix(dataset):
    string = dataset.to_string(header=False,index=False).split('\n')
    tfidf = TfidfVectorizer(stop_words='english')
    dataset_matrix = tfidf.fit_transform(string)
    similar_matrix = linear_kernel(dataset_matrix,dataset_matrix)
    return similar_matrix

In [8]:
# book_str = books.to_string(header=False,index=False).split('\n')


In [9]:
# tfidf = TfidfVectorizer(stop_words='english')
# books_matrix = tfidf.fit_transform(book_str)

In [10]:
# similarity_matrix = linear_kernel(books_matrix,books_matrix)

In [11]:
book_mapping = pd.Series(books.index,index = books['Book_Title'])
user_mapping = pd.Series(users.index,index = users['User_ID'])
user_mapping

User_ID
53           0
94         100
217        172
273        612
586        679
         ...  
1250    137561
2332    137567
482     137568
1666    137570
2580    137572
Length: 2954, dtype: int64

In [12]:
def cal_sim_score(similarity_matrix,book_index):
     #get similarity values with other books
    #similarity_score is the list of index and similarity matrix
    similarity_score = list(enumerate(similarity_matrix[book_index]))#sort in descending order the similarity score of book inputted with all the other books
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)# Get the scores of the 15 most similar movies. Ignore the first movie.
    similarity_score = similarity_score[1:15]#return movie names using the mapping series
    return similarity_score

In [13]:
def recommend_books(book):
    book_index = book_mapping[book]
    sim_matrix = similarity_matrix(books)
    similarity_score= cal_sim_score(sim_matrix,book_index)
    book_indices = [i[0] for i in similarity_score]
    return (books['Book_Title'].iloc[book_indices])

In [14]:
def similar_users(user):
    user_index = user_mapping[user]
    sim_matrix = similarity_matrix(users)
    similarity_score = cal_sim_score(sim_matrix,user_index)
    user_indices = [i[0] for i in similarity_score]
    return (users['User_ID'].iloc[user_indices])

In [15]:
recommend_books('Flesh Tones: A Novel')

3836                                      Flesh and Blood
1980                                          Rose Madder
2139    Back When We Were Grownups : A Novel (Ballanti...
697                                  Name of the Rose-Nla
4904    What We Keep : A Novel (Ballantine Reader's Ci...
1638                                         What We Keep
336                                        P Is for Peril
71                                        The Murder Book
62      Big Stone Gap: A Novel (Ballantine Reader's Ci...
1104     The Diary of Ellen Rimbauer: My Life at Rose Red
77      Big Cherry Holler: A Big Stone Gap Novel (Ball...
69                                            The Surgeon
1330                             Billy Straight : A Novel
337                                         Summer Island
Name: Book_Title, dtype: object

In [17]:
similar_users(53)

132009    1829
113138     359
97148     1645
93796     1090
95310     1979
131101    1466
126013    1164
131995    2877
120075     910
30588     2716
102634    1292
5519       285
61801      161
76190     1246
Name: User_ID, dtype: int64