Kaynak [Books : Visualisation and recommendation](https://www.kaggle.com/nayansakhiya/books-visualisation-and-recommendation)

In [1]:
import re
import string
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# goodreads data
books_data = pd.read_csv('../../data/book_10k/books.csv',error_bad_lines = False)
tags_data = pd.read_csv('../../data/book_10k/book_tags.csv')
ratings_data = pd.read_csv('../../data/book_10k/ratings.csv')
book_tags = pd.read_csv('../../data/book_10k/tags.csv')

# book crossing data
user_cols = ['user_id', 'location', 'age']
cross_users_data = pd.read_csv('../../data/book_x/BX-Users.csv', sep=';', names=user_cols, encoding='latin-1', low_memory=False, skiprows=1)
book_cols = ['isbn', 'book_title' ,'book_author','year_of_publication', 'publisher', 'img_s', 'img_m', 'img_l']
cross_books_data = pd.read_csv('../../data/book_x/BX_Books.csv', sep=';', names=book_cols, encoding='latin-1', low_memory=False, skiprows=1)
rating_cols = ['user_id', 'isbn', 'rating']
cross_ratings_data = pd.read_csv('../../data/book_x/BX-Book-Ratings.csv', sep=';', names=rating_cols, encoding='latin-1', low_memory=False, skiprows=1)

In [3]:
books_data = books_data.drop(columns=['id', 'best_book_id', 'work_id', 'isbn', 'isbn13', 'title','work_ratings_count',
                                   'work_text_reviews_count', 'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 
                                    'image_url','small_image_url'])

In [4]:
books_data

Unnamed: 0,book_id,books_count,authors,original_publication_year,original_title,language_code,average_rating,ratings_count
0,2767052,272,Suzanne Collins,2008.0,The Hunger Games,eng,4.34,4780653
1,3,491,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,eng,4.44,4602479
2,41865,226,Stephenie Meyer,2005.0,Twilight,en-US,3.57,3866839
3,2657,487,Harper Lee,1960.0,To Kill a Mockingbird,eng,4.25,3198671
4,4671,1356,F. Scott Fitzgerald,1925.0,The Great Gatsby,eng,3.89,2683664
...,...,...,...,...,...,...,...,...
9995,7130616,19,Ilona Andrews,2010.0,Bayou Moon,eng,4.09,17204
9996,208324,19,Robert A. Caro,1990.0,Means of Ascent,eng,4.25,12582
9997,77431,60,Patrick O'Brian,1977.0,The Mauritius Command,eng,4.35,9421
9998,8565083,7,Peggy Orenstein,2011.0,Cinderella Ate My Daughter: Dispatches from th...,eng,3.65,11279


In [5]:
books_data = books_data.dropna()
cross_books_data = cross_books_data.drop(columns=['img_s', 'img_m', 'img_l'])

In [6]:
ratings_data = ratings_data.sort_values("user_id")
ratings_data.drop_duplicates(subset =["user_id","book_id"], keep = False, inplace = True) 
books_data.drop_duplicates(subset='original_title',keep=False,inplace=True)
book_tags.drop_duplicates(subset='tag_id',keep=False,inplace=True)
tags_data.drop_duplicates(subset=['tag_id','goodreads_book_id'],keep=False,inplace=True)
cross_ratings_data.drop_duplicates(subset =["user_id","isbn"], keep = False, inplace = True) 
cross_books_data.drop_duplicates(subset='book_title',keep=False,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [8]:
cross_books_data['book_title'] = cross_books_data['book_title'].apply(lambda x:clean_text(x))

In [9]:
merge_data = pd.merge(cross_books_data, cross_ratings_data, on='isbn')
merge_data =  merge_data.sort_values('isbn', ascending=True)
merge_data.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,user_id,rating
626418,0000913154,the way things work an illustrated encyclopedi...,C. van Amerongen (translator),1967,Simon & Schuster,171118,8
587195,0001010565,mogs christmas,Judith Kerr,1992,Collins,209516,0
587194,0001010565,mogs christmas,Judith Kerr,1992,Collins,86123,0
441049,0001046713,twopence to cross the mersey,Helen Forrester,1992,HarperCollins Publishers,196149,0
263949,000104687X,ts eliot reading the wasteland and other poems,T.S. Eliot,1993,HarperCollins Publishers,23902,6


# content based recommondation

In [10]:
content_data = books_data[['original_title','authors','average_rating']]
content_data = content_data.astype(str)

In [11]:
content_data['content'] = content_data['original_title'] + ' ' + content_data['authors'] + ' ' + content_data['average_rating']

In [13]:
content_data.head()

Unnamed: 0,original_title,authors,average_rating,content
0,The Hunger Games,Suzanne Collins,4.34,The Hunger Games Suzanne Collins 4.34
1,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré",4.44,Harry Potter and the Philosopher's Stone J.K. ...
3,To Kill a Mockingbird,Harper Lee,4.25,To Kill a Mockingbird Harper Lee 4.25
4,The Great Gatsby,F. Scott Fitzgerald,3.89,The Great Gatsby F. Scott Fitzgerald 3.89
5,The Fault in Our Stars,John Green,4.26,The Fault in Our Stars John Green 4.26


In [14]:
content_data = content_data.reset_index()
indices = pd.Series(content_data.index, index=content_data['original_title'])

In [16]:
indices

original_title
The Hunger Games                                                                                0
Harry Potter and the Philosopher's Stone                                                        1
To Kill a Mockingbird                                                                           2
The Great Gatsby                                                                                3
The Fault in Our Stars                                                                          4
                                                                                             ... 
Billy Budd, Sailor                                                                           8170
Bayou Moon                                                                                   8171
Means of Ascent                                                                              8172
The Mauritius Command                                                                        8173
Cinde

In [17]:
#removing stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(content_data['authors'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(8175, 5484)

In [18]:
cosine_sim_author = linear_kernel(tfidf_matrix, tfidf_matrix)

In [19]:
def get_recommendations_books(title, cosine_sim=cosine_sim_author):
    idx = indices[title]

    # Get the pairwsie similarity scores of all books with that book
    sim_scores = list(enumerate(cosine_sim_author[idx]))

    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar books
    sim_scores = sim_scores[1:11]

    # Get the book indices
    book_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar books
    return list(content_data['original_title'].iloc[book_indices])

In [26]:
def author_book_shows(book):
    for book in book:
        print(book)
    print("-------" * 9)

In [27]:
books1 = get_recommendations_books('The Lord of the Rings', cosine_sim_author)
author_book_shows(books1)
books2 =get_recommendations_books('Shadow Kiss', cosine_sim_author)
author_book_shows(books2)

 The Fellowship of the Ring
The Two Towers
The Return of the King
The Lord of the Rings
The Hobbit and The Lord of the Rings
The Children of Húrin
The Silmarillion
The History of the Hobbit, Part One: Mr. Baggins
The Hobbit
The Hunger Games
---------------------------------------------------------------
Shadow Kiss
Spirit Bound
Blood Promise
Last Sacrifice 
Bloodlines
The Golden Lily
The Indigo Spell
The Fiery Heart
Succubus Blues
Silver Shadows
---------------------------------------------------------------


# colloaborative recommendation

collab yapmadan once

In [28]:
merge_data = merge_data[:40000]

In [29]:
book_rating = pd.pivot_table(merge_data, index='user_id', values='rating', columns='book_title', fill_value=0)
book_rating

book_title,Unnamed: 1_level_0,the year china discovered america,a beginners guide,a space odyssey a novel by arthur c clarke,allamerican favorites,an action plan to protect yourself your family your assets and your community on january,backup recovery,barbary lane a tales of the city omnibus,beers and a chinese meal,black chicks review flicks a film and video guide with flava,...,zone food blocks the quick and easy mixandmatch counter for staying in the zone,zone perfect meals in minutes fast and simple healthy recipes from the bestselling authorof the zone and mastering the zone,zoo animals a smithsonian guide smithsonian guides,zoology coloring book,zoyas story an afghan womans struggle for freedom,zucchini out west,zuleika dobson penguin modern classics,zulu dawn,â¡dã­melo tu,â¡trato hecho spanish for real life edition
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278692,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278771,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278818,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278843,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
book_corr = np.corrcoef(book_rating.T)
book_corr.shape

(12020, 12020)

In [37]:
book_list=  list(book_rating)
book_titles =[] 
for i in range(len(book_list)):
    book_titles.append(book_list[i])

In [38]:
def get_recommendation_collabarative(books_list):
    similar_books = np.zeros(book_corr.shape[0])
    
    for book in books_list:    
        book_index = book_titles.index(book)
        similar_books += book_corr[book_index] 
    book_preferences = []
    for i in range(len(book_titles)):
        book_preferences.append((book_titles[i],similar_books[i]))
        
    return sorted(book_preferences, key= lambda x: x[1], reverse=True)

In [39]:
list_of_books = ['one hundred years of solitude',
                 'stardust',
                 'mogs christmas',
                 'dragonmede',
                 'twopence to cross the mersey',
                 'the candywine development']

In [45]:
books8 = get_recommendation_collabarative(list_of_books)

In [48]:
i=0
n =0
while n < 9:
    similar_books_to_read= books8[i][0]
    i += 1
    if similar_books_to_read in list_of_books:
        continue
    else:
        print(similar_books_to_read)
        n += 1


  the year china discovered america
 a beginners guide
 a space odyssey  a novel by arthur c clarke
 allamerican favorites
 an action plan to protect yourself your family your assets  and your community on january  
 backup  recovery
 barbary lane  a tales of the city omnibus
 beers and a chinese meal
