# EXPLORING BOOK DATA

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gzip
import json

In [2]:
liked_books= ['11250317','1059502','16131027','70355','26083576']

In [7]:
liked_book_mapping = {}
with open('book_id_map.csv','rt') as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id,book_id = line.strip().split(',') #since it is a csv file
        liked_book_mapping[csv_id]=book_id    

In [14]:
liked_book_mapping['0']

'34684622'

In [8]:
#length of the mapping file
len(liked_book_mapping)

2360651

In [15]:
#streaming goodreads_interactions file- 4.0 GB
overlap_users = set() #for unique users
with open("goodreads_interactions.csv",'rt') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id,csv_id,_,rating,_= line.split(',')
        
        if user_id in overlap_users:
            continue
            
        try:
            rating = int(rating)
        
        except ValueError:
            continue
            
        #to create the book_id as csv_id used in mapping dict
        book_id = liked_book_mapping[csv_id]
        
        #if book_id is present in list of liked books with a higher rating 
        if book_id in liked_books and rating>= 4:
            overlap_users.add(user_id)

In [16]:
len(overlap_users) #number of users 

5115

This content manager overlaps the user ids that have liked the same books as the ones in the list. If the users have rated a book higher, then the program would assume that the user has a similar taste in books. 
The set created adds the users with similar book choices by creating unique objects.

This process is a part of collection of data and analysing user experience. The recommendation system's algorithm would collect this data and compare it with further collected user engagement to analyse and filter the necessary information. 

In [17]:
#to create a list of books liked by the users with a similar taste
pot_lines = [] #list of books liked by the users
with open('goodreads_interactions.csv','rt') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id,csv_id,_,rating,_= line.split(',')
        if user_id in overlap_users:
            book_id = liked_book_mapping[csv_id]
            pot_lines.append([user_id,book_id,rating])
    

In [18]:
len(pot_lines)

3266995

In [19]:
recs = pd.DataFrame(pot_lines,columns=['user_id','book_id','rating'])
#convert book_id to str type
recs['book_id'] = recs['book_id'].astype(str)

In [21]:
recs

Unnamed: 0,user_id,book_id,rating
0,43,7603,0
1,43,12813565,5
2,43,87640,5
3,43,91767,0
4,43,48855,4
...,...,...,...
3266990,875482,19620877,5
3266991,875482,717174,0
3266992,875482,717172,1
3266993,875482,7947434,4


In [22]:
#top recommendations based on the users' similar book choices
top_rec = recs['book_id'].value_counts().head(10)
top_rec = top_rec.index.values
top_rec

array(['11250317', '2767052', '3', '11870085', '2657', '4671', '6148028',
       '19063', '15881', '5'], dtype=object)

In [23]:
#checking if the top recommendations are available in the book_titles json file
book_titles = pd.read_json("book_titles.json")

In [24]:
book_titles.head()

Unnamed: 0,book_id,title,ratings,cover_image,url,description,modified_title,word_count
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://images.gr-assets.com/books/1304100136m...,https://www.goodreads.com/book/show/7327624-th...,Omnibus book club edition containing the Ladie...,the unschooled wizard sun wolf and starhawk 12,14
1,6066819,Best Friends Forever,51184,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/6066819-be...,Addie Downs and Valerie Adler were eight when ...,best friends forever,104
2,287141,The Aeneid for Boys and Girls,46,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/287141.The...,"Relates in vigorous prose the tale of Aeneas, ...",the aeneid for boys and girls,55
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://images.gr-assets.com/books/1316637798m...,https://www.goodreads.com/book/show/6066812-al...,"To Kara's astonishment, she discovers that a p...",alls fairy in love and war avalon web of magic 8,164
4,287149,The Devil's Notebook,986,https://images.gr-assets.com/books/1328768789m...,https://www.goodreads.com/book/show/287149.The...,"Wisdom, humor, and dark observations by the fo...",the devils notebook,34


In [28]:
book_titles.drop(['description','word_count'],axis=1,inplace=True)
book_titles

Unnamed: 0,book_id,title,ratings,cover_image,url,modified_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://images.gr-assets.com/books/1304100136m...,https://www.goodreads.com/book/show/7327624-th...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/6066819-be...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/287141.The...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://images.gr-assets.com/books/1316637798m...,https://www.goodreads.com/book/show/6066812-al...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://images.gr-assets.com/books/1328768789m...,https://www.goodreads.com/book/show/287149.The...,the devils notebook
...,...,...,...,...,...,...
1308952,17805813,"Ondine (Ondine Quartet, #0.5)",327,https://images.gr-assets.com/books/1379766592m...,https://www.goodreads.com/book/show/17805813-o...,ondine ondine quartet 05
1308953,331839,Jacqueline Kennedy Onassis: Friend of the Arts,18,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/331839.Jac...,jacqueline kennedy onassis friend of the arts
1308954,2685097,The Spaniard's Blackmailed Bride,112,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/2685097-th...,the spaniards blackmailed bride
1308955,2342551,The Children's Classic Poetry Collection,36,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/2342551.Th...,the childrens classic poetry collection


In [30]:
book_titles[book_titles["book_id"].isin(top_rec)]

  mask |= (ar1 == a)


Unnamed: 0,book_id,title,ratings,cover_image,url,modified_title


In [31]:
top_rec.dtype

dtype('O')

In [32]:
book_titles['book_id']= book_titles['book_id'].astype(str)

In [33]:
book_titles[book_titles["book_id"].isin(top_rec)]

Unnamed: 0,book_id,title,ratings,cover_image,url,modified_title
284473,2767052,"The Hunger Games (The Hunger Games, #1)",4899965,https://images.gr-assets.com/books/1447303603m...,https://www.goodreads.com/book/show/2767052-th...,the hunger games the hunger games 1
373254,19063,The Book Thief,1193697,https://images.gr-assets.com/books/1390053681m...,https://www.goodreads.com/book/show/19063.The_...,the book thief
435078,11250317,The Song of Achilles,48003,https://images.gr-assets.com/books/1331154660m...,https://www.goodreads.com/book/show/11250317-t...,the song of achilles
463463,4671,The Great Gatsby,2758812,https://images.gr-assets.com/books/1490528560m...,https://www.goodreads.com/book/show/4671.The_G...,the great gatsby
569831,5,Harry Potter and the Prisoner of Azkaban (Harr...,1876252,https://images.gr-assets.com/books/1499277281m...,https://www.goodreads.com/book/show/5.Harry_Po...,harry potter and the prisoner of azkaban harry...
790927,2657,To Kill a Mockingbird,3255518,https://images.gr-assets.com/books/1361975680m...,https://www.goodreads.com/book/show/2657.To_Ki...,to kill a mockingbird
833311,6148028,"Catching Fire (The Hunger Games, #2)",1854746,https://images.gr-assets.com/books/1358273780m...,https://www.goodreads.com/book/show/6148028-ca...,catching fire the hunger games 2
878545,3,Harry Potter and the Sorcerer's Stone (Harry P...,4765497,https://images.gr-assets.com/books/1474154022m...,https://www.goodreads.com/book/show/3.Harry_Po...,harry potter and the sorcerers stone harry pot...
995137,15881,Harry Potter and the Chamber of Secrets (Harry...,1821802,https://images.gr-assets.com/books/1474169725m...,https://www.goodreads.com/book/show/15881.Harr...,harry potter and the chamber of secrets harry ...
1095301,11870085,The Fault in Our Stars,2429317,https://images.gr-assets.com/books/1360206420m...,https://www.goodreads.com/book/show/11870085-t...,the fault in our stars


These recommendations are generic top tier popular books that does not showcase the intricacy of a recommendation system. 
We require a system that displays books that are not as popular as these based on the liked book ratings. 

In [34]:
#calculating a score for the books to recommend accordingly
all_potential = recs['book_id'].value_counts()

In [35]:
all_potential

11250317    4916
2767052     3386
3           3160
11870085    2853
2657        2726
            ... 
21529367       1
19241702       1
351522         1
22925248       1
100905         1
Name: book_id, Length: 481324, dtype: int64

In [39]:
all_potential = all_potential.to_frame().reset_index()
all_potential.columns=['book_id','book_count']

In [40]:
all_potential.head()

Unnamed: 0,book_id,book_count
0,11250317,4916
1,2767052,3386
2,3,3160
3,11870085,2853
4,2657,2726


In [41]:
#merging the recommended titles with the book title dataframe, based on common titles. -- inner join on book_id
all_potential = all_potential.merge(book_titles, how='inner',on='book_id')

In [42]:
all_potential.head()

Unnamed: 0,book_id,book_count,title,ratings,cover_image,url,modified_title
0,11250317,4916,The Song of Achilles,48003,https://images.gr-assets.com/books/1331154660m...,https://www.goodreads.com/book/show/11250317-t...,the song of achilles
1,2767052,3386,"The Hunger Games (The Hunger Games, #1)",4899965,https://images.gr-assets.com/books/1447303603m...,https://www.goodreads.com/book/show/2767052-th...,the hunger games the hunger games 1
2,3,3160,Harry Potter and the Sorcerer's Stone (Harry P...,4765497,https://images.gr-assets.com/books/1474154022m...,https://www.goodreads.com/book/show/3.Harry_Po...,harry potter and the sorcerers stone harry pot...
3,11870085,2853,The Fault in Our Stars,2429317,https://images.gr-assets.com/books/1360206420m...,https://www.goodreads.com/book/show/11870085-t...,the fault in our stars
4,2657,2726,To Kill a Mockingbird,3255518,https://images.gr-assets.com/books/1361975680m...,https://www.goodreads.com/book/show/2657.To_Ki...,to kill a mockingbird


In [43]:
all_potential['book_score'] = all_potential['book_count'] * (all_potential['book_count']/all_potential['ratings'])

In [44]:
all_potential.sort_values('book_score',ascending=False).head(10)

Unnamed: 0,book_id,book_count,title,ratings,cover_image,url,modified_title,book_score
1068,32454291,315,Circe,77,https://images.gr-assets.com/books/1503079581m...,https://www.goodreads.com/book/show/32454291-c...,circe,1288.636364
2110,26856502,185,"Vengeful (Villains, #2)",35,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/26856502-v...,vengeful villains 2,977.857143
3334,36307629,125,"King of Scars (King of Scars, #1)",22,https://images.gr-assets.com/books/1506962795m...,https://www.goodreads.com/book/show/36307629-k...,king of scars king of scars 1,710.227273
0,11250317,4916,The Song of Achilles,48003,https://images.gr-assets.com/books/1331154660m...,https://www.goodreads.com/book/show/11250317-t...,the song of achilles,503.448868
2127,24909347,183,"Obsidio (The Illuminae Files, #3)",82,https://images.gr-assets.com/books/1501704611m...,https://www.goodreads.com/book/show/24909347-o...,obsidio the illuminae files 3,408.402439
2514,29749098,158,"Catwoman: Soulstealer (DC Icons, #3)",73,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/29749098-c...,catwoman soulstealer dc icons 3,341.972603
5176,31373184,86,"Untitled (Untitled Trilogy, #1)",31,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/31373184-u...,untitled untitled trilogy 1,238.580645
4004,29422091,107,"These Rebel Waves (Stream Raiders, #1)",55,https://images.gr-assets.com/books/1507232288m...,https://www.goodreads.com/book/show/29422091-t...,these rebel waves stream raiders 1,208.163636
1089,18162954,311,Galatea,479,https://images.gr-assets.com/books/1375741967m...,https://www.goodreads.com/book/show/18162954-g...,galatea,201.922756
1269,26032825,278,"The Cruel Prince (The Folk of the Air, #1)",400,https://images.gr-assets.com/books/1493047153m...,https://www.goodreads.com/book/show/26032825-t...,the cruel prince the folk of the air 1,193.21


The books recommended by this system fall under the similar genre of the liked books list.

In [47]:
all_potential[all_potential['book_count']>300].sort_values('book_score',ascending=False).head(10)

Unnamed: 0,book_id,book_count,title,ratings,cover_image,url,modified_title,book_score
1068,32454291,315,Circe,77,https://images.gr-assets.com/books/1503079581m...,https://www.goodreads.com/book/show/32454291-c...,circe,1288.636364
0,11250317,4916,The Song of Achilles,48003,https://images.gr-assets.com/books/1331154660m...,https://www.goodreads.com/book/show/11250317-t...,the song of achilles,503.448868
1089,18162954,311,Galatea,479,https://images.gr-assets.com/books/1375741967m...,https://www.goodreads.com/book/show/18162954-g...,galatea,201.922756
1013,33590260,326,"Untitled (Throne of Glass, #7)",1190,https://images.gr-assets.com/books/1488914165m...,https://www.goodreads.com/book/show/33590260-u...,untitled throne of glass 7,89.307563
630,30025336,456,All the Crooked Saints,2722,https://images.gr-assets.com/books/1500451773m...,https://www.goodreads.com/book/show/30025336-a...,all the crooked saints,76.390889
711,28220826,420,When the Moon Was Ours,2631,https://images.gr-assets.com/books/1489647354m...,https://www.goodreads.com/book/show/28220826-w...,when the moon was ours,67.04675
674,67696,438,The Last of the Wine,3113,https://images.gr-assets.com/books/1403194003m...,https://www.goodreads.com/book/show/67696.The_...,the last of the wine,61.626727
476,23447923,560,The Inexplicable Logic of My Life,5137,https://images.gr-assets.com/books/1465572387m...,https://www.goodreads.com/book/show/23447923-t...,the inexplicable logic of my life,61.047304
1092,25760792,310,"Timekeeper (Timekeeper, #1)",1599,https://images.gr-assets.com/books/1478538580m...,https://www.goodreads.com/book/show/25760792-t...,timekeeper timekeeper 1,60.100063
209,29283884,872,The Gentleman's Guide to Vice and Virtue,12854,https://images.gr-assets.com/books/1492601464m...,https://www.goodreads.com/book/show/29283884-t...,the gentlemans guide to vice and virtue,59.155438


In [48]:

def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)
all_potential[~all_potential['book_id'].isin(liked_books)].head(10).style.format({'url':make_clickable,'cover_image':show_image})

Unnamed: 0,book_id,book_count,title,ratings,cover_image,url,modified_title,book_score
1,2767052,3386,"The Hunger Games (The Hunger Games, #1)",4899965,,Goodreads,the hunger games the hunger games 1,2.339812
2,3,3160,"Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",4765497,,Goodreads,harry potter and the sorcerers stone harry potter 1,2.095395
3,11870085,2853,The Fault in Our Stars,2429317,,Goodreads,the fault in our stars,3.350575
4,2657,2726,To Kill a Mockingbird,3255518,,Goodreads,to kill a mockingbird,2.282609
5,4671,2632,The Great Gatsby,2758812,,Goodreads,the great gatsby,2.511017
6,6148028,2533,"Catching Fire (The Hunger Games, #2)",1854746,,Goodreads,catching fire the hunger games 2,3.459282
7,19063,2522,The Book Thief,1193697,,Goodreads,the book thief,5.328391
8,15881,2472,"Harry Potter and the Chamber of Secrets (Harry Potter, #2)",1821802,,Goodreads,harry potter and the chamber of secrets harry potter 2,3.354253
9,5,2459,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",1876252,,Goodreads,harry potter and the prisoner of azkaban harry potter 3,3.222745
10,41865,2455,"Twilight (Twilight, #1)",3941381,,Goodreads,twilight twilight 1,1.529166


In [50]:
popular = all_potential[all_potential['book_count']>300].sort_values('book_score',ascending=False).head(10)

In [51]:
popular[~popular['book_id'].isin(liked_books)].head(10).style.format({'url':make_clickable,'cover_image':show_image})

Unnamed: 0,book_id,book_count,title,ratings,cover_image,url,modified_title,book_score
1068,32454291,315,Circe,77,,Goodreads,circe,1288.636364
1089,18162954,311,Galatea,479,,Goodreads,galatea,201.922756
1013,33590260,326,"Untitled (Throne of Glass, #7)",1190,,Goodreads,untitled throne of glass 7,89.307563
630,30025336,456,All the Crooked Saints,2722,,Goodreads,all the crooked saints,76.390889
711,28220826,420,When the Moon Was Ours,2631,,Goodreads,when the moon was ours,67.04675
674,67696,438,The Last of the Wine,3113,,Goodreads,the last of the wine,61.626727
476,23447923,560,The Inexplicable Logic of My Life,5137,,Goodreads,the inexplicable logic of my life,61.047304
1092,25760792,310,"Timekeeper (Timekeeper, #1)",1599,,Goodreads,timekeeper timekeeper 1,60.100063
209,29283884,872,The Gentleman's Guide to Vice and Virtue,12854,,Goodreads,the gentlemans guide to vice and virtue,59.155438


This recommendation displays the popular books that match the liked book choices unlike the previous recommendation, that only catered to the needs of displaying popular books. Those books too were read by the users but were not similar to the next set. 