# 95_ML 실습02 - good_books 2

### `이전시간`

In [6]:
import numpy as np 
import pandas as pd

books = pd.read_csv('./Datasets/goodbooks/books.csv', encoding = 'ISO-8859-1')
ratings = pd.read_csv('./Datasets/goodbooks/ratings.csv', encoding = 'ISO-8859-1')
book_tags = pd.read_csv('./Datasets/goodbooks/book_tags.csv', encoding = 'ISO-8859-1')
tags = pd.read_csv('./Datasets/goodbooks/tags.csv',)

tags_join_DF = pd.merge(book_tags, tags, left_on='tag_id', right_on = 'tag_id', how='inner')
to_read = pd.read_csv('./Datasets/goodbooks/to_read.csv')

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(books['authors'])

In [8]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [9]:
titles = books['title']
indices = pd.Series(books.index, index=books['title'])

In [10]:
sim_scores = list(enumerate(cosine_sim[indices['The Hobbit']]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

In [11]:
sim_scores = sim_scores[1:11]
book_indices = [i[0] for i in sim_scores]
titles.iloc[book_indices]

18      The Fellowship of the Ring (The Lord of the Ri...
154            The Two Towers (The Lord of the Rings, #2)
160     The Return of the King (The Lord of the Rings,...
188     The Lord of the Rings (The Lord of the Rings, ...
963     J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
4975        Unfinished Tales of NÃºmenor and Middle-Earth
2308                               The Children of HÃºrin
610              The Silmarillion (Middle-Earth Universe)
8271                   The Complete Guide to Middle-Earth
1128     The History of the Hobbit, Part One: Mr. Baggins
Name: title, dtype: object

In [12]:
books_with_tags = pd.merge(books, tags_join_DF, left_on = 'book_id',
                            right_on = 'goodreads_book_id', how='inner')

-----------------------

In [13]:
# 이번에는 tag로 Tfidf
tf1 = TfidfVectorizer(analyzer = 'word', ngram_range=(1,2),
                    # analyzer : Whether the feature should be made of word or character n-grams. // {'word', 'char', 'char_wb'} or callable, default='word'
                                        # ngram_range : tuple (min_n, max_n), default=(1, 1) 
                                        # The lower and upper boundary of the range of n-values for different n-grams to be extracted.
                        min_df=0, stop_words='english')
                        # min_df : When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold // default 1
                                    # stop_words : {'english'}, list, default=None 
                                    # If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 
                                    # 'english' is currently the only supported string value.

tfidf_matrix1 = tf1.fit_transform(books_with_tags['tag_name'].head(1000))
                    # fit_transform : Learn vocabulary and idf, return document-term matrix.
                                # raw_documents : An iterable which generates either str, unicode or file objects.

cosine_sim1 = linear_kernel(tfidf_matrix1, tfidf_matrix1)   
            # linear_kernel : Compute the linear kernel between X and Y.

In [17]:
titles1 = books['title']
indices1 = pd.Series(books.index, index=books['title'])

def tags_recommendations(title):
    idx = indices1[title]
    sim_scores = list(enumerate(cosine_sim1[idx]))
    sim_scores = sorted(sim_scores, key=lambda x : x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    book_indices = [i[0] for i in sim_scores]

    return titles.iloc[book_indices]

In [18]:
# 태그로 찾아본 The Hobbit와 유사한 책
tags_recommendations('The Hobbit').head(20)

## tag를 사용하니 연관성이 좋아진다고 함.

16             Catching Fire (The Hunger Games, #2)
31                                  Of Mice and Men
107    Confessions of a Shopaholic (Shopaholic, #1)
125                       Dune (Dune Chronicles #1)
149                                    The Red Tent
206          One for the Money (Stephanie Plum, #1)
214                                Ready Player One
231             The Gunslinger (The Dark Tower, #1)
253          Shiver (The Wolves of Mercy Falls, #1)
313                         Inkheart (Inkworld, #1)
Name: title, dtype: object

-----------------

In [19]:
# 임시로 book id 마다 tag를 붙이고
temp_df = books_with_tags.groupby('book_id')['tag_name'].apply(' '.join).reset_index()
temp_df.head()

Unnamed: 0,book_id,tag_name
0,1,to-read fantasy favorites currently-reading yo...
1,2,to-read fantasy favorites currently-reading yo...
2,3,to-read fantasy favorites currently-reading yo...
3,5,to-read fantasy favorites currently-reading yo...
4,6,to-read fantasy young-adult fiction harry-pott...


In [20]:
# books에 합치기
books = pd.merge(books, temp_df, left_on='book_id', right_on = 'book_id', how='inner')
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,to-read fantasy favorites currently-reading yo...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,to-read fantasy favorites currently-reading yo...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,to-read fantasy favorites currently-reading yo...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,to-read favorites currently-reading young-adul...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,to-read favorites currently-reading young-adul...


In [23]:
pd.Series(books[['authors', 'tag_name']].fillna('').values.tolist())
        # 두 컬럼을 리스트로 합치고,

0       [Suzanne Collins, to-read fantasy favorites cu...
1       [J.K. Rowling, Mary GrandPrÃ©, to-read fantasy...
2       [Stephenie Meyer, to-read fantasy favorites cu...
3       [Harper Lee, to-read favorites currently-readi...
4       [F. Scott Fitzgerald, to-read favorites curren...
                              ...                        
9995    [Ilona Andrews, to-read fantasy favorites curr...
9996    [Robert A. Caro, to-read favorites currently-r...
9997    [Patrick O'Brian, to-read favorites currently-...
9998    [Peggy Orenstein, to-read favorites currently-...
9999    [John Keegan, to-read favorites currently-read...
Length: 10000, dtype: object

In [21]:
# 저자 이름과 태그를 합치기
books['corpus'] = (pd.Series(books[['authors', 'tag_name']].fillna('').values.tolist()).str.join(' '))
                        
books['corpus'][:3]

0    Suzanne Collins to-read fantasy favorites curr...
1    J.K. Rowling, Mary GrandPrÃ© to-read fantasy f...
2    Stephenie Meyer to-read fantasy favorites curr...
Name: corpus, dtype: object

In [24]:
tf_corpus = TfidfVectorizer(analyzer = 'word', ngram_range=(1,2),
                    # analyzer : Whether the feature should be made of word or character n-grams. // {'word', 'char', 'char_wb'} or callable, default='word'
                                        # ngram_range : tuple (min_n, max_n), default=(1, 1) 
                                        # The lower and upper boundary of the range of n-values for different n-grams to be extracted.
                        min_df=0, stop_words='english')
                        # min_df : When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold // default 1
                                    # stop_words : {'english'}, list, default=None 
                                    # If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 
                                    # 'english' is currently the only supported string value.

tfidf_matrix_corpus = tf_corpus.fit_transform(books['corpus'])
                    # fit_transform : Learn vocabulary and idf, return document-term matrix.
                                # raw_documents : An iterable which generates either str, unicode or file objects.

cosine_sim_corpus = linear_kernel(tfidf_matrix_corpus, tfidf_matrix_corpus)   
            # linear_kernel : Compute the linear kernel between X and Y.

In [25]:
titles = books['title']
indices = pd.Series(books.index, index = books['title'])

In [26]:
# 추천함수 만들기
def corpus_recommendations(title):
    idx = indices1[title]
    sim_scores = list(enumerate(cosine_sim_corpus[idx]))
    sim_scores = sorted(sim_scores, key=lambda x : x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    book_indices = [i[0] for i in sim_scores]

    return titles.iloc[book_indices]

In [27]:
# Hobbit과 비슷한 것은
corpus_recommendations('The Hobbit')

188     The Lord of the Rings (The Lord of the Rings, ...
154            The Two Towers (The Lord of the Rings, #2)
160     The Return of the King (The Lord of the Rings,...
18      The Fellowship of the Ring (The Lord of the Ri...
610              The Silmarillion (Middle-Earth Universe)
4975        Unfinished Tales of NÃºmenor and Middle-Earth
2308                               The Children of HÃºrin
963     J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
465                             The Hobbit: Graphic Novel
8271                   The Complete Guide to Middle-Earth
Name: title, dtype: object

----------------

In [28]:
corpus_recommendations('Twilight (Twilight, #1)')

51                                 Eclipse (Twilight, #3)
48                                New Moon (Twilight, #2)
991                    The Twilight Saga (Twilight, #1-4)
833                         Midnight Sun (Twilight, #1.5)
731     The Short Second Life of Bree Tanner: An Eclip...
1618    The Twilight Saga Complete Collection  (Twilig...
4087    The Twilight Saga: The Official Illustrated Gu...
2020             The Twilight Collection (Twilight, #1-3)
72                                The Host (The Host, #1)
219     Twilight: The Complete Illustrated Movie Compa...
Name: title, dtype: object

In [29]:
corpus_recommendations('Romeo and Juliet')

352                      Othello
769                Julius Caesar
124                       Hamlet
153                      Macbeth
247    A Midsummer Night's Dream
838       The Merchant of Venice
854                Twelfth Night
529       Much Ado About Nothing
713                    King Lear
772      The Taming of the Shrew
Name: title, dtype: object