Create the dataset for the Genre table

In [1]:
# Imports
import pandas as pd
import numpy as np

In [2]:
# Load the filtered book data from disk into a dataframe.
csv_file = 'filtered_data.csv'
book_df = pd.read_csv(csv_file)

# Only the ISBN and genres columns are needed for the genre tables; take an
# independent copy so later edits never touch book_df.
genre_df = book_df.loc[:, ['isbn', 'genres']].copy()

genre_df.head()

Unnamed: 0,isbn,genres
0,9780062348678,"['Young Adult', 'Contemporary', 'LGBT', 'Roman..."
1,B0745KTJSY,"['Historical Fiction', 'Fiction', 'Christian F..."
2,9781408857908,"['Fantasy', 'Romance', 'Young Adult', 'New Adu..."
3,9781786495259,"['Fiction', 'Romance', 'LGBT', 'Contemporary',..."
4,9780143129554,"['Fiction', 'Classics', 'Literature', 'Novels'..."


In [3]:
# Strip the list punctuation ([, ] and ') from the stringified genre lists so
# each row becomes a plain ", "-separated string of genre names.
#
# A single explicit regex character class replaces three separate calls that
# each relied on the pandas default for `regex`, which is not the same in
# pandas 1.x and pandas 2.x.
# Note: apostrophes inside a genre name are removed as well, and double
# quotes are left untouched.

genre_df2 = genre_df.assign(
    genres=genre_df['genres'].str.replace(r"[\[\]']", "", regex=True)
)
genre_df2

Unnamed: 0,isbn,genres
0,9780062348678,"Young Adult, Contemporary, LGBT, Romance, Fict..."
1,B0745KTJSY,"Historical Fiction, Fiction, Christian Fiction..."
2,9781408857908,"Fantasy, Romance, Young Adult, New Adult, Fae,..."
3,9781786495259,"Fiction, Romance, LGBT, Contemporary, Queer, A..."
4,9780143129554,"Fiction, Classics, Literature, Novels, America..."
...,...,...
3810,9781250078285,"Graphic Novels, Young Adult, Science Fiction, ..."
3811,9781447256274,"Fantasy, Fiction, Steampunk, Mystery, Urban Fa..."
3812,B01KZOA32U,"Mystery, Thriller, Fiction, Suspense, Mystery ..."
3813,B00XKC1KTK,"Survival, Contemporary Romance"


In [4]:
# Split every row's genre string on ", " and flatten the per-book lists into
# one long list of genre names (duplicates kept, original order preserved).

new_df = genre_df2['genres'].copy()

# np.char is the public home of the vectorised string helpers; going through
# np.core is deprecated since NumPy 2.0.
# astype('str') converts every value to text first, so a missing value would
# become the literal string 'nan' rather than raising.
genre_values = np.char.split(new_df.values.astype('str'), ', ')
flatten_list = [item for sublist in genre_values for item in sublist]
flatten_list

['Young Adult',
 'Contemporary',
 'LGBT',
 'Romance',
 'Fiction',
 'Audiobook',
 'Queer',
 'Realistic Fiction',
 'Coming Of Age',
 'Young Adult Contemporary',
 'Historical Fiction',
 'Fiction',
 'Christian Fiction',
 'Classics',
 'Christian',
 'Romance',
 'Historical',
 'Young Adult',
 'Adult',
 'Inspirational',
 'Fantasy',
 'Romance',
 'Young Adult',
 'New Adult',
 'Fae',
 'Magic',
 'Fiction',
 'Young Adult Fantasy',
 'High Fantasy',
 'Paranormal',
 'Fiction',
 'Romance',
 'LGBT',
 'Contemporary',
 'Queer',
 'Audiobook',
 'Adult',
 'Gay',
 'Coming Of Age',
 'M M Romance',
 'Fiction',
 'Classics',
 'Literature',
 'Novels',
 'American',
 'Literary Fiction',
 'Contemporary',
 'Science Fiction',
 '20th Century',
 'School',
 'Contemporary',
 'Novels',
 'Book Club',
 'Adult',
 'True Story',
 'Fiction',
 'Drama',
 'Abuse',
 'New Adult',
 'Inspirational',
 'Fiction',
 'Contemporary',
 'LGBT',
 'Literary Fiction',
 'Adult',
 'Novels',
 'Adult Fiction',
 'Queer',
 'Audiobook',
 'Literature',
 '

In [5]:
# Reduce the flattened genre list to its unique values, keeping the order in
# which each genre first appears.
#
# dict.fromkeys de-duplicates in a single pass while preserving insertion
# order, avoiding the list-membership scan for every element.

unique_genre_list = list(dict.fromkeys(flatten_list))

unique_genre_list

['Young Adult',
 'Contemporary',
 'LGBT',
 'Romance',
 'Fiction',
 'Audiobook',
 'Queer',
 'Realistic Fiction',
 'Coming Of Age',
 'Young Adult Contemporary',
 'Historical Fiction',
 'Christian Fiction',
 'Classics',
 'Christian',
 'Historical',
 'Adult',
 'Inspirational',
 'Fantasy',
 'New Adult',
 'Fae',
 'Magic',
 'Young Adult Fantasy',
 'High Fantasy',
 'Paranormal',
 'Gay',
 'M M Romance',
 'Literature',
 'Novels',
 'American',
 'Literary Fiction',
 'Science Fiction',
 '20th Century',
 'School',
 'Book Club',
 'True Story',
 'Drama',
 'Abuse',
 'Adult Fiction',
 'Adventure',
 'Science Fiction Fantasy',
 'Urban Fantasy',
 'Teen',
 'Young Adult Romance',
 'Retellings',
 'Dystopia',
 'Fairy Tales',
 'Contemporary Romance',
 'Disability',
 'Erotica',
 'Angels',
 'Vampires',
 'Epic',
 'Poetry',
 'Philosophy',
 'Portugal',
 'Portuguese Literature',
 'Unfinished',
 'Mystery',
 'Love Story',
 'Space',
 'Thriller',
 'Chick Lit',
 'Picture Books',
 'Childrens',
 'Animals',
 'Humor',
 'Kids'

In [6]:
# Sanity check: total number of genre entries, then the number of distinct genres.
print(len(flatten_list), len(unique_genre_list), sep='\n')

33829
561


In [7]:
# Build the genre lookup table: one row per unique genre, plus a primary key
# (genre_id) equal to the row position offset by 500.
genre_tables = (
    pd.DataFrame(unique_genre_list, columns=['genre'])
    .assign(genre_id=lambda frame: frame.index + 500)
)

genre_tables

Unnamed: 0,genre,genre_id
0,Young Adult,500
1,Contemporary,501
2,LGBT,502
3,Romance,503
4,Fiction,504
...,...,...
556,Soldiers,1056
557,Microhistory,1057
558,Outdoors,1058
559,Paranormal Urban Fantasy,1059


Now that we have the unique genres, each with its own genre_id, we need to link every ISBN to its genres in a new table.

In [8]:
# Build two parallel lists with one entry per (book, genre) pair; they are
# zipped into a dataframe in a later cell.
last_isbn_list = []
last_genre_list = []

# Walk the two columns together instead of using iterrows().
# Splitting on ',' leaves a leading space on every genre after the first;
# that whitespace is stripped once the lists are turned into a dataframe.
for isbn, genres in zip(genre_df2['isbn'], genre_df2['genres']):
    for genre in genres.split(','):
        last_isbn_list.append(isbn)
        last_genre_list.append(genre)

# Check the length of each list; the two numbers should be identical.

print(len(last_isbn_list))
print(len(last_genre_list))

33829
33829


In [9]:
# Pair every ISBN with its genre, load the pairs into a two-column dataframe,
# and trim leading/trailing whitespace from the genre names.
book_genre_table = (
    pd.DataFrame(list(zip(last_isbn_list, last_genre_list)), columns=['isbn', 'genre'])
    .assign(genre=lambda pairs: pairs['genre'].str.strip())
)

book_genre_table

Unnamed: 0,isbn,genre
0,9780062348678,Young Adult
1,9780062348678,Contemporary
2,9780062348678,LGBT
3,9780062348678,Romance
4,9780062348678,Fiction
...,...,...
33824,9781760111236,Book Club
33825,9781760111236,Contemporary
33826,9781760111236,Literary Fiction
33827,9781760111236,Literature
