In [1]:
import numpy as np
import pandas as pd

# Allow up to 100 columns in DataFrame displays so the wide book tables are shown in full.
pd.options.display.max_columns = 100

## Import datasets

In [2]:
# Directory that holds every input CSV; change it here if the data folder moves.
DATA_DIR = "../data"

# Source: goodbooks-10k
# https://www.kaggle.com/zygmunt/goodbooks-10k
btag = pd.read_csv(f"{DATA_DIR}/book_tags.csv")  # tag counts per goodreads_book_id
book = pd.read_csv(f"{DATA_DIR}/books.csv")      # book metadata
rati = pd.read_csv(f"{DATA_DIR}/ratings.csv")    # user ratings per book_id
tags = pd.read_csv(f"{DATA_DIR}/tags.csv")       # tag_id -> tag_name lookup
trea = pd.read_csv(f"{DATA_DIR}/to_read.csv")    # user / book_id "to read" pairs

# Source: goodreads-best-books
# https://www.kaggle.com/meetnaren/goodreads-best-books
bdat = pd.read_csv(f"{DATA_DIR}/book_data.csv")  # descriptions, formats, pages and genres

In [3]:
# Peek at the first row of books.csv to see which columns it provides.
book.head(1)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...


In [4]:
# Peek at the first row of ratings.csv (user_id, book_id, rating).
rati.head(1)

Unnamed: 0,user_id,book_id,rating
0,1,258,5


In [5]:
# Peek at the first row of to_read.csv (user_id, book_id).
trea.head(1)

Unnamed: 0,user_id,book_id
0,9,8


In [6]:
# Peek at the first row of book_tags.csv (goodreads_book_id, tag_id, count).
btag.head(1)

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697


In [7]:
# Peek at the first row of tags.csv (tag_id, tag_name).
tags.head(1)

Unnamed: 0,tag_id,tag_name
0,0,-


In [8]:
# Peek at the first row of book_data.csv to see which columns it provides.
bdat.head(1)

Unnamed: 0,book_authors,book_desc,book_edition,book_format,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url
0,Suzanne Collins,Winning will make you famous. Losing means cer...,,Hardcover,9780440000000.0,374 pages,4.33,5519135,160706,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...


## Cleaning

### book

In [9]:
# Count missing values per column in book.
book.isna().sum()

book_id                         0
goodreads_book_id               0
best_book_id                    0
work_id                         0
books_count                     0
isbn                          700
isbn13                        585
authors                         0
original_publication_year      21
original_title                585
title                           0
language_code                1084
average_rating                  0
ratings_count                   0
work_ratings_count              0
work_text_reviews_count         0
ratings_1                       0
ratings_2                       0
ratings_3                       0
ratings_4                       0
ratings_5                       0
image_url                       0
small_image_url                 0
dtype: int64

In [10]:
# Which language are the books with a missing language_code written in?
# Collect their titles and eyeball the tail of the list.
titles = book.loc[book["language_code"].isna(), "title"].tolist()
titles[900:]

['A Perfect Spy',
 "Screw It, Let's Do It: Lessons In Life",
 'Revolting Rhymes',
 'The Guns of Navarone',
 'Tell-All',
 'The Lady & Sons Savannah Country Cookbook',
 'No One Writes to the Colonel and Other Stories',
 'The Climb: Tragic Ambitions on Everest',
 'An Old-Fashioned Girl',
 'Richard II',
 'Prisoner of My Desire',
 'The Book on the Taboo Against Knowing Who You Are',
 'Tampa',
 'The Unofficial Harry Potter Cookbook: From Cauldron Cakes to Knickerbocker Glory--More Than 150 Magical Recipes for Wizards and Non-Wizards Alike (Unofficial Cookbook)',
 'The Automatic Millionaire: A Powerful One-Step Plan to Live and Finish Rich',
 'Ghettoside: A True Story of Murder in America',
 'Once a Runner',
 'The Ear, the Eye, and the Arm',
 'Stay Out of the Basement  (Goosebumps, #2)',
 'One Day at Horrorland (Goosebumps, #16)',
 'Footfall',
 'Stone Soup',
 'Girl With Curious Hair',
 'The Revenge of the Baby-Sat',
 'The Secret Between Us',
 'Knife Edge (Noughts & Crosses, #2)',
 'The Cake M

In [11]:
# Every book with a missing language_code appears to be in English,
# so fill the gaps with the generic "en" code.
book["language_code"] = book["language_code"].fillna("en")

In [12]:
# Number of books per language code, after the missing codes were filled with "en".
book["language_code"].sort_values().value_counts()

eng      6341
en-US    2070
en       1088
en-GB     257
ara        64
en-CA      58
fre        25
ind        21
spa        20
ger        13
jpn         7
per         7
por         6
pol         6
dan         3
nor         3
fil         2
ita         2
rus         1
nl          1
tur         1
rum         1
swe         1
mul         1
vie         1
Name: language_code, dtype: int64

In [13]:
# Keep only the books whose language code is one of the English variants.
langs = ['eng', 'en-US', 'en-GB', 'en-CA', 'en']
enbook = book.loc[book['language_code'].isin(langs)]
enbook.shape

(9814, 23)

In [14]:
# Cast isbn13 to object dtype and list the resulting column dtypes.
# NOTE(review): the cast presumably only relabels the dtype, leaving values such as
# 9780439000000.0 as floats rather than strings - confirm before using isbn13 as a key.
enbook = enbook.astype({"isbn13":"object"})
enbook.dtypes

book_id                        int64
goodreads_book_id              int64
best_book_id                   int64
work_id                        int64
books_count                    int64
isbn                          object
isbn13                        object
authors                       object
original_publication_year    float64
original_title                object
title                         object
language_code                 object
average_rating               float64
ratings_count                  int64
work_ratings_count             int64
work_text_reviews_count        int64
ratings_1                      int64
ratings_2                      int64
ratings_3                      int64
ratings_4                      int64
ratings_5                      int64
image_url                     object
small_image_url               object
dtype: object

In [15]:
# Count missing values per column in the English-only subset.
enbook.isna().sum()

book_id                        0
goodreads_book_id              0
best_book_id                   0
work_id                        0
books_count                    0
isbn                         628
isbn13                       542
authors                        0
original_publication_year     20
original_title               575
title                          0
language_code                  0
average_rating                 0
ratings_count                  0
work_ratings_count             0
work_text_reviews_count        0
ratings_1                      0
ratings_2                      0
ratings_3                      0
ratings_4                      0
ratings_5                      0
image_url                      0
small_image_url                0
dtype: int64

In [16]:
# leaving isbn, isbn13, original_publication_year and original_title as is

### bdat

In [17]:
# Count missing values per column in bdat to see how complete it is.
bdat.isna().sum()

book_authors             0
book_desc             1331
book_edition         48848
book_format           1656
book_isbn            12866
book_pages            2522
book_rating              0
book_rating_count        0
book_review_count        0
book_title               0
genres                3242
image_url              683
dtype: int64

In [18]:
# Inspect the distinct book_isbn values to judge whether they are usable as identifiers.
bdat["book_isbn"].unique()

array(['9.78044E+12', '9.78006E+12', '9.78068E+12', '9.78032E+12',
       '9.78038E+12', '9.78007E+12', '9.78045E+12', '9.78035E+12',
       '9.78053E+12', '9.78039E+12', '9.78031E+12', '9.78074E+12',
       '9.78014E+12', '9.78081E+12', '9.78067E+12', '9.78142E+12',
       '9.7804E+12', nan, '9.78016E+12', '9.78052E+12', '9.78159E+12',
       '9.78055E+12', '9.78079E+12', '9.7801E+12', '9.78077E+12',
       '9.78157E+12', '9.78019E+12', '9.78062E+12', '9.78001E+12',
       '9.78037E+12', '9.78034E+12', '9.78024E+12', '9.78075E+12',
       '9.78076E+12', '9.7814E+12', '9.78057E+12', '9.78093E+12',
       '9.78156E+12', '9.7816E+12', '9.78158E+12', '9.78E+12',
       '9.78189E+12', '9.78141E+12', '9.78161E+12', '9.7808E+12',
       '9.78043E+12', '9.78015E+12', '9.78144E+12', '9.78069E+12',
       '9.78059E+12', '9.78097E+12', '9.78096E+12', '9.78098E+12',
       '9.78184E+12', '9.78125E+12', '9.78049E+12', '9.78054E+12',
       '9.78186E+12', '9.78155E+12', '9.78092E+12', '9.78162E+12'

In [19]:
# Drop the book_isbn and image_url columns from bdat.
# book_isbn only holds scientific-notation strings such as '9.78044E+12' (the exact
# digits were lost in an earlier float conversion), so it cannot identify a book.
# Reassigning (instead of inplace=True) with errors="ignore" keeps this cell safe to
# re-run: a second execution no longer raises KeyError for the already-removed columns.
bdat = bdat.drop(columns=["book_isbn", "image_url"], errors="ignore")

In [20]:
# Column dtypes of bdat after dropping the unusable columns.
bdat.dtypes

book_authors          object
book_desc             object
book_edition          object
book_format           object
book_pages            object
book_rating          float64
book_rating_count      int64
book_review_count      int64
book_title            object
genres                object
dtype: object

### Merging bdat and enbook

In [21]:
# Author format in bdat: co-authors are separated by "|".
bdat["book_authors"].unique()

array(['Suzanne Collins', 'J.K. Rowling|Mary GrandPré', 'Harper Lee', ...,
       'Howard Megdal', 'Mimi Baird|Eve Claxton', 'Leah Price'],
      dtype=object)

In [22]:
# Author format in enbook: co-authors are separated by ", ".
enbook["authors"].unique()

array(['Suzanne Collins', 'J.K. Rowling, Mary GrandPré',
       'Stephenie Meyer', ..., 'Ian Mortimer', 'Peggy Orenstein',
       'John Keegan'], dtype=object)

In [23]:
# Build an author key in bdat's format by turning the ", " separator into "|".
# regex=False makes the literal (non-regex) replacement explicit.
enbook = enbook.assign(
    nauthors=enbook["authors"].str.replace(", ", "|", regex=False)
)
enbook["nauthors"].unique()

array(['Suzanne Collins', 'J.K. Rowling|Mary GrandPré', 'Stephenie Meyer',
       ..., 'Ian Mortimer', 'Peggy Orenstein', 'John Keegan'],
      dtype=object)

In [24]:
# bdat rows whose author string also occurs in enbook's normalised author key.
bdat.loc[bdat["book_authors"].isin(enbook["nauthors"]), "book_authors"]

0                   Suzanne Collins
1        J.K. Rowling|Mary GrandPré
2                        Harper Lee
4                   Stephenie Meyer
5                      Markus Zusak
                    ...            
54248              Sherrilyn Kenyon
54257                Robert Cormier
54262                 Richard Yates
54278       Kiyohiko Azuma|あずま きよひこ
54294                 Siri Hustvedt
Name: book_authors, Length: 18563, dtype: object

In [25]:
# bdat rows whose title also occurs among enbook's titles.
bdat.loc[bdat["book_title"].isin(enbook["title"]), "book_title"]

2                                    To Kill a Mockingbird
3                                      Pride and Prejudice
5                                           The Book Thief
7                                              Animal Farm
8                                       Gone with the Wind
                               ...                        
54121                                            The Girls
54152    What Got You Here Won't Get You There: How Suc...
54162            The Amazing Adventures of Kavalier & Clay
54238                                       The Confession
54251                                          The Promise
Name: book_title, Length: 6189, dtype: object

In [26]:
# Inner-join bdat with enbook on (title, authors): bdat's book_title/book_authors
# against enbook's title/nauthors. Only rows present in both tables are kept.
# NOTE(review): the merged preview shows several bdat rows (different formats/editions)
# matching the same book_id, so ds is not one row per book - confirm that is intended
# before treating book_id as a unique key.
ds = pd.merge(bdat, enbook, right_on=["title", "nauthors"], left_on=["book_title", "book_authors"], how="inner")
ds.shape

(4898, 34)

In [27]:
# Columns kept in the working dataset, in display order.
cols = ['book_authors', 'book_title', 'original_title', 'genres', 'book_desc', 'book_edition', 'book_format',
       'book_pages', 'original_publication_year', 'book_rating', 'book_rating_count', 'book_review_count',
       'book_id', 'goodreads_book_id']
# .copy() makes ds an independent frame: a later cell assigns into ds["genres"], and
# doing that on a bare column slice can raise SettingWithCopyWarning.
ds = ds[cols].copy()
ds

Unnamed: 0,book_authors,book_title,original_title,genres,book_desc,book_edition,book_format,book_pages,original_publication_year,book_rating,book_rating_count,book_review_count,book_id,goodreads_book_id
0,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,The unforgettable novel of a childhood in a sl...,50th Anniversary,Paperback,324 pages,1960.0,4.27,3745197,79450,4,2657
1,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,Harper Lee's Pulitzer Prize-winning masterwork...,,Mass Market Paperback,309 pages,1960.0,4.27,3746569,79475,4,2657
2,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,One of a series of fiction titles for schools....,New Windmill,Hardcover,287 pages,1960.0,4.27,3746774,79478,4,2657
3,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,"An unforgettable story of the violent, intoler...",,Paperback,285 pages,1960.0,4.27,3747139,79489,4,2657
4,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,"'Shoot all the bluejays you want, if you can h...",,Kindle Edition,385 pages,1960.0,4.27,3747228,79492,4,2657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,Mercer Mayer,I Was So Mad,I Was So Mad (Look-Look),Childrens|Picture Books|Childrens,Mercer Mayer's Little Critter is having quite ...,,Paperback,24 pages,1983.0,4.22,17125,193,6498,386421
4894,Kazuo Ishiguro,When We Were Orphans,When We Were Orphans,Fiction|Historical|Historical Fiction|Mystery|...,"From the Booker Prize-winning, bestselling aut...",,Paperback,336 pages,2000.0,3.47,21291,2037,6364,28923
4895,Sun Tzu|Thomas Cleary,The Art of War,孫子兵法 [Sūnzi bīngfǎ],Nonfiction|Classics|Philosophy|History|War,Here is a seminal work on the philosophy of su...,,Paperback,172 pages,-500.0,3.96,247330,7816,403,10534
4896,Emma Cline,The Girls,The Girls,Fiction|Historical|Historical Fiction|Adult,Evie Boyd is desperate to be noticed. In the s...,,Hardcover,355 pages,2016.0,3.47,130881,14439,1174,26893819


In [28]:
# grouping genres into broader genres
#
# broad_genres is the closed list of target categories; genre_dict maps each raw
# genre label to one of them. An empty string means "drop this label".
# Six entries used to point at labels missing from broad_genres ("Horror and mystery",
# "Poetry", "Action and adventure", "Personal development"); each now uses the
# broad genre that a sibling key already maps to ('Horror', 'Poetry Plays', 'Action',
# 'Personal Development'). The assertion below the dict guards against regressions.

broad_genres = ["Anthology","Art","Autobiography and biography","Business and money","Childrens","Comics and graphic novels","Computers and technology","Cookbooks, Food and Wine","Crafts, Hobbies and Home","Education and Teaching","Engineering and Transportation","Health and fitness","History","Humor and Entertainment","Law","LGBTQ and gender studies","Literature and fiction","Maths and science","Medical books","Mystery, thriller, suspense and horror","Parenting and relationships","Politics and social sciences","Reference","Religion and spirituality","Romance","Science fiction and fantasy","Self-Help","Sports and outdoors","Teen and young adult","Travel"]
genre_dict = {'Childrens':"Childrens",
 'Folk Tales':"Literature and fiction",
 'Suspense':"Mystery, thriller, suspense and horror",
 'Prayer':"Religion and spirituality",
 'Social Issues':"Politics and social sciences",
 'Graphic Novels':"Comics and graphic novels",
 'Teaching':"Education and Teaching",
 'Marriage':"Parenting and relationships",
 'Dogs':"Crafts, Hobbies and Home",
 'Germany':"Travel",
 'Southern Gothic':"Science fiction and fantasy",
 'Superheroes':"Comics and graphic novels",
 'Young Adult Fantasy':"Teen and young adult",
 'Movies':"Art",
 'College':"Education and Teaching",
 'Romantic':"Romance",
 'Plays':"Literature and fiction",
 'Catholic':"Religion and spirituality",
 'Love':"Parenting and relationships",
 'Cultural':"Politics and social sciences",
 'Media Tie In':"",
 'Money':"Business and money",
 'Relationships':"Parenting and relationships",
 'Shapeshifters':"Science fiction and fantasy",
 'Mystery':"Mystery, thriller, suspense and horror",
 'Islam':"Religion and spirituality",
 'Werewolves':"Science fiction and fantasy",
 'Memoir':"Autobiography and biography",
 'Mental Health':"Health and fitness",
 'How To':"Crafts, Hobbies and Home",
 '20th Century':"History",
 'Adoption':"Parenting and relationships",
 'Arthurian':"Science fiction and fantasy",
 'Philosophy':"Politics and social sciences",
 'Politics':"Politics and social sciences",
 'Education':"Education and Teaching",
 'Sequential Art':"Art",
 'Poetry Plays':"Literature and fiction",
 'Space Opera':"Science fiction and fantasy",
 'Juvenile':"Teen and young adult",
 'Dystopia':"Science fiction and fantasy",
 'Pirates':"Science fiction and fantasy",
 'Witches':"Science fiction and fantasy",
 'Christian Non Fiction':"Religion and spirituality",
 'Chick Lit':"Romance",
 'Evolution':"Maths and science",
 'The United States Of America':"Travel",
 'World History':"History",
 'Genetics':"Maths and science",
 'Urban':"Literature and fiction",
 'Abuse':"Self-Help",
 'Theory':"",
 'Latin American Literature':"Literature and fiction",
 'Vegan':"Cookbooks, Food and Wine",
 'English Literature':"Literature and fiction",
 'Parenting':"Parenting and relationships",
 'Steampunk':"Science fiction and fantasy",
 'Road Trip':"Humor and Entertainment",
 'Teen':"Teen and young adult",
 'Historical Romance':"Literature and fiction",
 'Greece':"Travel",
 'Buddhism':"Religion and spirituality",
 'Church History':"History",
 'Management':"Business and money",
 'Finance':"Business and money",
 'Police':"Mystery, thriller, suspense and horror",
 'Bdsm':"Parenting and relationships",
 'Ethiopia':"Travel",
 'Punk':"Art",
 'Jazz':"Art",
 'Crime':"Mystery, thriller, suspense and horror",
 'Literary Fiction':"Literature and fiction",
 'Batman':"Comics and graphic novels",
 'War':"History",
 'Academic':"Education and Teaching",
 'Womens Fiction':"Literature and fiction",
 'French Literature':"Literature and fiction",
 'Skepticism':"Politics and social sciences",
 'Fables':"Literature and fiction",
 'Currency':"Business and money",
 'Taoism':"Religion and spirituality",
 'India':"Travel",
 'Did Not Finish':"",
 'Ghosts':"Mystery, thriller, suspense and horror",
 'Drawing':"Art",
 'Personal Finance':"Business and money",
 'Portugal':"Travel",
 'Basketball':"Sports and outdoors",
 'Nutrition':"Health and fitness",
 'Literary Criticism':"Literature and fiction",
 'Gardening':"Crafts, Hobbies and Home",
 'Animals':"Crafts, Hobbies and Home",
 'Aliens':"Science fiction and fantasy",
 'Spanish Literature':"Literature and fiction",
 'Read For School':"Education and Teaching",
 'Mythology':"Religion and spirituality",
 'Anthropology':"Politics and social sciences",
 'Indian Literature':"Literature and fiction",
 'Biography Memoir':"Autobiography and biography",
 'Chemistry':"Maths and science",
 'United States':"Travel",
 'Medieval History':"History",
 'Sports Romance':"Romance",
 'Medievalism':"History",
 'Medicine':"Medical books",
 'Fairies':"Science fiction and fantasy",
 'Books About Books':"",
 'Paranormal':"Mystery, thriller, suspense and horror",
 'Mathematics':"Maths and science",
 'Denmark':"Travel",
 'Realistic Fiction':"Literature and fiction",
 'Inspirational':"Self-Help",
 'Anthologies':"Anthology",
 'Scotland':"Travel",
 'Iran':"Travel",
 'Gender Studies':"LGBTQ and gender studies",
 'Gender':"LGBTQ and gender studies",
 'Self Help':"Self-Help",
 'Personal Development':"Self-Help",
 'Sweden':"Travel",
 'European Literature':"Literature and fiction",
 'Marathi':"Politics and social sciences",
 'Transport':"Engineering and Transportation",
 'Womens':"LGBTQ and gender studies",
 'Fiction':"Literature and fiction",
 'Hinduism':"Religion and spirituality",
 'Fairy Tales':"Literature and fiction",
 'Dark':"Mystery, thriller, suspense and horror",
 'Spider Man':"Comics and graphic novels",
 'Theatre':"Literature and fiction",
 'Victorian':"History",
 'Spain':"Travel",
 '17th Century':"History",
 'Asian Literature':"Literature and fiction",
 'History':"History",
 'Autobiography':"Autobiography and biography",
 'Lovecraftian':"Science fiction and fantasy",
 'Psychology':"Politics and social sciences",
 'Italy':"Travel",
 'Vegetarian':"Cookbooks, Food and Wine",
 'Western Africa':"Travel",
 'Bangladesh':"Travel",
 'Poverty':"Politics and social sciences",
 'Japanese Literature':"Literature and fiction",
 'Sudan':"Travel",
 'Medical':"Medical books",
 'Fantasy':"Science fiction and fantasy",
 'New Age':"Religion and spirituality",
 'Comics':"Comics and graphic novels",
 'Adult':"Literature and fiction",
 'Productivity':"Self-Help",
 'Apocalyptic':"Science fiction and fantasy",
 'Business':"Business and money",
 'Military Fiction':"Literature and fiction",
 'Funny':"Humor and Entertainment",
 'Science':"Maths and science",
 'Classic Literature':"Literature and fiction",
 'Dc Comics':"Comics and graphic novels",
 'Erotica':"Romance",
 'Theology':"Religion and spirituality",
 'Sociology':"Politics and social sciences",
 'Detective':"Mystery, thriller, suspense and horror",
 'Unfinished':"",
 'Hugo Awards':"",
 'Nonfiction':"Politics and social sciences",
 'Academia':"Education and Teaching",
 'North American Hi...':"History",
 'Contemporary Romance':"Literature and fiction",
 'Dragons':"Science fiction and fantasy",
 'Cthulhu Mythos':"Science fiction and fantasy",
 'Emergency Services':"Politics and social sciences",
 'Historical':"History",
 'Outdoors':"Sports and outdoors",
 'Technology':"Computers and technology",
 'Civil War':"History",
 'Novella':"Literature and fiction",
 'Dark Fantasy':"Science fiction and fantasy",
 'Drama':"Literature and fiction",
 'Journalism':"Politics and social sciences",
 'Nobel Prize':"",
 'X Men':"Comics and graphic novels",
 'Art History':"Art",
 'Writing':"Literature and fiction",
 'School':"Education and Teaching",
 'Queer':"LGBTQ and gender studies",
 'Canadian Literature':"Literature and fiction",
 'Middle Grade':"Teen and young adult",
 'Christianity':"Religion and spirituality",
 'M M Romance':"Romance",
 '40k':"",
 'Polyamory':"Parenting and relationships",
 'Spirituality':"Religion and spirituality",
 'China':"Travel",
 'Russia':"Travel",
 'Biology':"Maths and science",
 'Gothic':"Mystery, thriller, suspense and horror",
 'Wicca':"Religion and spirituality",
 'New Adult':"Teen and young adult",
 'Turkish Literature':"Literature and fiction",
 'Alternate History':"Science fiction and fantasy",
 'Microhistory':"History",
 'Linguistics':"Politics and social sciences",
 'Kids':"Childrens",
 'Superman':"Comics and graphic novels",
 'Counselling':"Politics and social sciences",
 'Mental Illness':"Health and fitness",
 'Zen':"Religion and spirituality",
 'Physics':"Maths and science",
 'American':"Travel",
 'Christian':"Religion and spirituality",
 'Culinary':"Cookbooks, Food and Wine",
 'French Revolution':"History",
 'Latin American':"Travel",
 'Fitness':"Health and fitness",
 'Biography':"Autobiography and biography",
 'Star Wars':"Comics and graphic novels",
 'Death':"Self-Help",
 '18th Century':"History",
 'Nature':"Sports and outdoors",
 'Climbing':"Sports and outdoors",
 'Time Travel':"Science fiction and fantasy",
 'Israel':"Travel",
 'Lds':"",
 'Buisness':"Business and money",
 'Photography':"Art",
 'Space':"Maths and science",
 'Family':"Parenting and relationships",
 'Horses':"Crafts, Hobbies and Home",
 'Czech Literature':"Literature and fiction",
 'Military':"History",
 'Native Americans':"Politics and social sciences",
 'Italian Literature':"Literature and fiction",
 'Lgbt':"LGBTQ and gender studies",
 'Short Stories':"Anthology",
 'Tragedy':"Literature and fiction",
 'Baseball':"Sports and outdoors",
 'Young Adult Contemporary':"Teen and young adult",
 'Health':"Health and fitness",
 'Essays':"Anthology",
 'Sports':"Sports and outdoors",
 'Canada':"Travel",
 'Novels':"Literature and fiction",
 'Food':"Cookbooks, Food and Wine",
 'Turkish':"Travel",
 'True Crime':"Literature and fiction",
 'Clean Romance':"Romance",
 'Lesbian':"LGBTQ and gender studies",
 'Abandoned':"",
 'Christian Living':"Religion and spirituality",
 'Language':"Politics and social sciences",
 'Natural History':"Maths and science",
 'Poetry':"Literature and fiction",
 'Vampires':"Mystery, thriller, suspense and horror",
 'Ancient':"History",
 'Sports and Games':"Sports and outdoors",
 'European History':"History",
 'Disability':"Health and fitness",
 'Criticism':"Literature and fiction",
 'Zombies':"Mystery, thriller, suspense and horror",
 'Magical Realism':"Science fiction and fantasy",
 'Football':"Sports and outdoors",
 'English History':"History",
 'Metaphysics':"Politics and social sciences",
 'Communication':"Politics and social sciences",
 'Picture Books':"Childrens",
 'Storytime':"Childrens",
 'Romantic Suspense':"Romance",
 'Adventure':"Literature and fiction",
 'Christian Fiction':"Religion and spirituality",
 'Audiobook':"",
 'Rwanda':"Travel",
 'Witchcraft':"Crafts, Hobbies and Home",
 'German Literature':"Literature and fiction",
 'Art and Photography':"Art",
 'Nigeria':"Travel",
 '19th Century':"History",
 'Urban Fantasy':"Science fiction and fantasy",
 'Roman':"History",
 'Church':"Religion and spirituality",
 'African Literature':"Literature and fiction",
 'Social Science':"Politics and social sciences",
 'Africa':"Travel",
 'Reference':"Reference",
 'Paranormal Romance':"Science fiction and fantasy",
 'Epic Fantasy':"Science fiction and fantasy",
 'Scandinavian Literature':"Literature and fiction",
 'American History':"History",
 'Leadership':"Self-Help",
 'Engineering':"Engineering and Transportation",
 'Science Fiction Fantasy':"Science fiction and fantasy",
 'Vegetarianism':"Cookbooks, Food and Wine",
 'Australia':"Travel",
 'Gay':"LGBTQ and gender studies",
 'Judaism':"Religion and spirituality",
 'Book Club':"Literature and fiction",
 'Religion':"Religion and spirituality",
 'Humanities':"Politics and social sciences",
 'High Fantasy':"Science fiction and fantasy",
 'Popular Science':"Maths and science",
 'Romance':"Romance",
 'Irish Literature':"Literature and fiction",
 'Faith':"Religion and spirituality",
 'Comic Strips':"Comics and graphic novels",
 'Social Movements':"Politics and social sciences",
 'Art':"Art",
 'True Story':"History",
 'Brain':"Maths and science",
 'Geology':"Maths and science",
 'Neuroscience':"Maths and science",
 'Combat':"Sports and outdoors",
 'Mountaineering':"Sports and outdoors",
 'Comedy':"Humor and Entertainment",
 'Swedish Literature':"Literature and fiction",
 'American Revolution':"History",
 'Birds':"Crafts, Hobbies and Home",
 'Christmas':"Crafts, Hobbies and Home",
 'Psychological Thriller':"Mystery, thriller, suspense and horror",
 'Spanish Civil War':"History",
 'Espionage':"Mystery, thriller, suspense and horror",
 'African American':"Politics and social sciences",
 'Regency':"History",
 'Terrorism':"History",
 'World War II':"History",
 'Southern':"History",
 'Speculative Fiction':"Science fiction and fantasy",
 'Shojo':"Comics and graphic novels",
 'Russian Literature':"Literature and fiction",
 'Horror':"Mystery, thriller, suspense and horror",
 'Cookbooks':"Cookbooks, Food and Wine",
 'Political Science':"Politics and social sciences",
 'Pop Culture':"Humor and Entertainment",
 'Games':"Humor and Entertainment",
 'Social Justice':"Politics and social sciences",
 'Psychoanalysis':"Politics and social sciences",
 'Travel':"Travel",
 'Astronomy':"Maths and science",
 'Polygamy':"LGBTQ and gender studies",
 'World War I':"History",
 'Architecture':"Art",
 'Contemporary':"History",
 'Zimbabwe':"Travel",
 'Eastern Africa':"Travel",
 'Weird Fiction':"Literature and fiction",
 'France':"Travel",
 'Law':"Law",
 'British Literature':"Literature and fiction",
 'Economics':"Business and money",
 'Sexuality':"Parenting and relationships",
 'Graphic Novels Comics':"Comics and graphic novels",
 'Ancient History':"History",
 'Americana':"History",
 'Video Games':"Humor and Entertainment",
 'Paganism':"Religion and spirituality",
 'Holocaust':"History",
 'Mystery Thriller':"Mystery, thriller, suspense and horror",
 'Jewish':"Religion and spirituality",
 'African American Literature':"Literature and fiction",
 'Classical Studies':"History",
 'Entrepreneurship':"Self-Help",
 'Survival':"Sports and outdoors",
 'Egypt':"Travel",
 'Southern Africa':"Travel",
 'Rabbits':"Crafts, Hobbies and Home",
 'Holiday':"Self-Help",
 'Presidents':"Autobiography and biography",
 'Science Fiction':"Science fiction and fantasy",
 'Asia':"Travel",
 'Eastern Philosophy':"Religion and spirituality",
 'Supernatural':"Science fiction and fantasy",
 'American Civil War':"History",
 'Historical Fiction':"Literature and fiction",
 'Environment':"Engineering and Transportation",
 'Art Design':"Art",
 'Musicians':"Art",
 'Spy Thriller':"Literature and fiction",
 'South Africa':"Travel",
 'Sustainability':"Engineering and Transportation",
 'Epic':"Literature and fiction",
 'Historical Fantasy':"Science fiction and fantasy",
 'Logic':"Politics and social sciences",
 'Culture':"Politics and social sciences",
 'Action':"Literature and fiction",
 'Westerns':"Literature and fiction",
 'Folklore':"Literature and fiction",
 'Atheism':"Religion and spirituality",
 'Ireland':"Travel",
 'International Relations':"Politics and social sciences",
 'Marvel':"Comics and graphic novels",
 'Northern Africa':"Travel",
 'Noir':"Art",
 'Feminism':"Politics and social sciences",
 'Martial Arts':"Sports and outdoors",
 'Young Adult':"Teen and young adult",
 'Adult Fiction':"Literature and fiction",
 'Manga':"Comics and graphic novels",
 'Cycling':"Sports and outdoors",
 'Military History':"History",
 'Pakistan':"Travel",
 'High School':"Teen and young adult",
 'Literature':"Literature and fiction",
 'Cyberpunk':"Science fiction and fantasy",
 'Love Story':"Romance",
 'Ghost Stories':"Mystery, thriller, suspense and horror",
 'Food and Drink':"Cookbooks, Food and Wine",
 'Race':"Politics and social sciences",
 'Foodie':"Cookbooks, Food and Wine",
 'Tudor Period':"History",
 'Design':"Art",
 'Magic':"Science fiction and fantasy",
 'Comic Book':"Comics and graphic novels",
 'Cats':"Crafts, Hobbies and Home",
 'Retellings':"History",
 'New York':"Travel",
 'Food Writing':"Cookbooks, Food and Wine",
 'Portuguese Literature':"Literature and fiction",
 'Soccer':"Sports and outdoors",
 '16th Century':"History",
 'Humor':"Humor and Entertainment",
 'Legal Thriller':"Mystery, thriller, suspense and horror",
 'Glbt':"LGBTQ and gender studies",
 'Japan':"Travel",
 'Classics':"Literature and fiction",
 'Collections':"Anthology",
 'Coming Of Age':"Teen and young adult",
 'Thriller':"Mystery, thriller, suspense and horror",
 'Cooking':"Cookbooks, Food and Wine",
 'Romanticism':"Romance",
 'Music':"Art"
}

# Every non-empty mapping target must be one of the declared broad genres.
_unknown_targets = set(genre_dict.values()) - set(broad_genres) - {""}
assert not _unknown_targets, f"genre_dict maps to undeclared broad genres: {sorted(_unknown_targets)}"

In [29]:
def categorise(lst, dct):
    """
    Map raw labels to their broader categories and join them into one string.

    Each item of ``lst`` is looked up in ``dct``. Items that are not keys of
    ``dct``, or whose mapped value is empty, are skipped. Each category is kept
    once, in the order it is first encountered, so the result is the same on
    every run (a plain ``set`` would make the order vary between runs).

    Input:
    lst: an iterable of strings to look up in the dictionary. A missing value
         (None or NaN, e.g. from splitting a missing cell) is treated as empty.
    dct: a dictionary mapping raw labels to replacement values

    Output:
    a single string with the distinct replacement values separated by "|"
    ("" when nothing maps)
    """
    # A missing cell arrives as None or float NaN, neither of which is iterable
    if lst is None or (isinstance(lst, float) and lst != lst):
        return ""
    mapped = (dct.get(label, "") for label in lst)
    # dict.fromkeys removes duplicates while preserving first-seen order
    return "|".join(dict.fromkeys(value for value in mapped if value))

# Split each "|"-separated genre string and collapse it to broad genres.
# Note: this overwrites ds["genres"], so running the cell a second time would
# re-map values that have already been mapped.
ds["genres"] = ds["genres"].str.split("|").map(
    lambda raw_genres: categorise(raw_genres, genre_dict)
)
ds.head()

Unnamed: 0,book_authors,book_title,original_title,genres,book_desc,book_edition,book_format,book_pages,original_publication_year,book_rating,book_rating_count,book_review_count,book_id,goodreads_book_id
0,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,Literature and fiction|History|Education and T...,The unforgettable novel of a childhood in a sl...,50th Anniversary,Paperback,324 pages,1960.0,4.27,3745197,79450,4,2657
1,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,Literature and fiction|History|Education and T...,Harper Lee's Pulitzer Prize-winning masterwork...,,Mass Market Paperback,309 pages,1960.0,4.27,3746569,79475,4,2657
2,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,Literature and fiction|History|Education and T...,One of a series of fiction titles for schools....,New Windmill,Hardcover,287 pages,1960.0,4.27,3746774,79478,4,2657
3,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,Literature and fiction|History|Education and T...,"An unforgettable story of the violent, intoler...",,Paperback,285 pages,1960.0,4.27,3747139,79489,4,2657
4,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,Literature and fiction|History|Education and T...,"'Shoot all the bluejays you want, if you can h...",,Kindle Edition,385 pages,1960.0,4.27,3747228,79492,4,2657


### Ratings and tags

In [30]:
# Keep only the ratings whose book_id is present in the merged book table.
rati_ds = rati.loc[rati["book_id"].isin(ds["book_id"])]
rati_ds

Unnamed: 0,user_id,book_id,rating
1,2,4081,4
2,2,260,5
3,2,9296,5
6,2,315,3
7,2,33,4
...,...,...,...
5976462,15292,854,4
5976466,35336,317,5
5976467,21879,1403,5
5976468,21879,5674,5


In [31]:
# Restrict the book-tag counts to the books kept in ds, then replace each tag_id
# with its readable tag_name via the tags lookup table.
enbtag = btag.loc[btag["goodreads_book_id"].isin(ds["goodreads_book_id"])]
tags_ds = pd.merge(enbtag, tags, on="tag_id", how="inner").drop(columns="tag_id")
tags_ds

Unnamed: 0,goodreads_book_id,count,tag_name
0,8,2823,to-read
1,13,75870,to-read
2,21,33603,to-read
3,24,1009,to-read
4,26,27858,to-read
...,...,...,...
396589,32848471,12,single-dad
396590,32848471,10,best-of-2017
396591,32848471,9,kickass-heroine
396592,32848471,7,workplace-romance


### Working datasets

In [32]:
# Persist the three working datasets next to the raw inputs.
# NOTE(review): the index is written too (no index=False), so readers of these
# files presumably need index_col=0 - confirm against the downstream notebooks.
ds.to_csv(path_or_buf="../data/_clean_books.csv")
rati_ds.to_csv(path_or_buf="../data/_clean_ratings.csv")
tags_ds.to_csv(path_or_buf="../data/_clean_tags.csv")