# Initial EDA

In [59]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import json
import gzip
import os
import random

# Seed the stdlib `random` module; `combine_data` draws its per-file samples with `random.sample`.
random.seed(42) # seeding for rerunning purposes

In [146]:
# Sanity check: open one gzipped JSON-lines file and print the `book_id` of its first record.
# NOTE(review): `root` is assigned in a separate cell (the data-location cell); on a fresh
# kernel that cell has to run before this one.
head = 'goodreads_books_children.json.gz'
filename_test = os.path.join(root, head)
with gzip.open(filename_test) as file: 
    for l in file:
        d = json.loads(l)  # each line is parsed as one standalone JSON document
        print(d['book_id'])
        break  # only the first record is inspected

287141


In [93]:
def load_data(file_name, head = 2999):
    """Parse the leading lines of a gzipped JSON-lines file.

    Parameters
    ----------
    file_name : path of the gzip file; each line is decoded with ``json.loads``.
    head : int or None
        Reading stops as soon as more than ``head`` lines have been consumed,
        so at most ``head + 1`` records come back. ``None`` reads every line.

    Returns
    -------
    list
        The parsed records, in file order.
    """
    records = []
    with gzip.open(file_name) as stream:
        # Line-by-line decoding; not performant for a complete data file.
        for lines_read, raw_line in enumerate(stream, start=1):
            records.append(json.loads(raw_line))
            if head is None:
                continue
            if lines_read > head:
                break
    return records

def combine_data(root, sample_size=1000, file_prefix='goodreads_books_'):
    """Sample records from every book file under ``root`` into one DataFrame.

    Parameters
    ----------
    root : str
        Directory holding the gzipped JSON-lines files.
    sample_size : int, default 1000
        Number of records drawn without replacement from each file. A file
        that yields fewer records contributes all of them, instead of making
        ``random.sample`` raise ``ValueError``.
    file_prefix : str or None, default 'goodreads_books_'
        Only files named ``<file_prefix>*.json.gz`` are read. Other cells load
        ``goodreads_interactions_*`` and ``goodreads_reviews_*`` files from the
        same directory; those hold interaction/review records and must not be
        mixed into the book table. Pass ``None`` to accept any ``.json.gz`` file.

    Returns
    -------
    pandas.DataFrame
        One row per sampled record, files concatenated in sorted-name order.
    """
    # os.listdir returns names in arbitrary order; sorting keeps the seeded
    # sampling reproducible. The suffix check also skips stray non-data files.
    book_files = sorted(
        name for name in os.listdir(root)
        if name.endswith('.json.gz')
        and (file_prefix is None or name.startswith(file_prefix))
    )

    final_book_data = []
    for genre in book_files:
        # load_data is called with its default `head`, so the sample is drawn
        # from the leading records of each file, not from the whole file.
        raw_data = load_data(os.path.join(root, genre))
        n_draw = min(sample_size, len(raw_data))
        final_book_data.extend(random.sample(raw_data, n_draw))
    return pd.DataFrame(final_book_data)

In [51]:
#### Point this at wherever the data is located (the data was removed from Git due to its size).
#### Set the GOODREADS_DATA_DIR environment variable to override the default path below.
root = os.environ.get(
    'GOODREADS_DATA_DIR',
    r'/Users/michaelfronda/Desktop/Main/OMSA/22-FA-ISYE6740/Project-data',
)

In [94]:
# Build the sampled book table. `combine_data` samples with `random.sample`, so the rows drawn
# depend on the state of the `random` module when this cell runs.
df = combine_data(root)




In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   isbn                  7000 non-null   object
 1   text_reviews_count    7000 non-null   object
 2   series                7000 non-null   object
 3   country_code          7000 non-null   object
 4   language_code         7000 non-null   object
 5   popular_shelves       7000 non-null   object
 6   asin                  7000 non-null   object
 7   is_ebook              7000 non-null   object
 8   average_rating        7000 non-null   object
 9   kindle_asin           7000 non-null   object
 10  similar_books         7000 non-null   object
 11  description           7000 non-null   object
 12  format                7000 non-null   object
 13  link                  7000 non-null   object
 14  authors               7000 non-null   object
 15  publisher             7000 non-null   

In [149]:
# `load_data` stops only once more than `head` lines were read, so head=99999 loads the
# first 100000 interaction records.
interactions_children = pd.DataFrame(load_data(os.path.join(root, 'goodreads_interactions_children.json.gz'), head=99999))

In [150]:
len(interactions_children)

100000

In [151]:
# Distinct `book_id` values taken from the loaded interaction rows whose `is_read` equals True.
is_read = interactions_children[interactions_children['is_read'] == True]['book_id'].unique()

In [152]:
np.size(is_read)

19009

In [165]:
def load_data2(file_name, head = 999, book_ids = None):
    """Load records from a gzipped JSON-lines file, keeping only selected books.

    Parameters
    ----------
    file_name : path of the gzip file; each line is decoded with ``json.loads``.
    head : int or None
        Reading stops once more than ``head`` matching records were kept, so
        at most ``head + 1`` records are returned. ``None`` scans the whole file.
    book_ids : iterable, optional
        ``book_id`` values to keep. When omitted, the notebook-level ``is_read``
        is used, which preserves the original call signature.

    Returns
    -------
    list
        Parsed records whose ``'book_id'`` is in ``book_ids``, in file order.
    """
    if book_ids is None:
        book_ids = is_read  # notebook global built from the interactions sample

    # Build a set once: `x in ndarray` rescans the whole array for every line,
    # while a set lookup is constant time. `tolist()` unwraps NumPy scalars.
    to_list = getattr(book_ids, 'tolist', None)
    wanted = set(to_list() if callable(to_list) else book_ids)

    data = []
    with gzip.open(file_name) as fin:
        for line in fin:
            record = json.loads(line)
            if record['book_id'] in wanted:
                data.append(record)

            # Same stopping rule as before: quit after more than `head` matches.
            if head is not None and len(data) > head:
                break
    return data

# With the default head=999 this keeps at most 1000 children's-book records whose
# `book_id` is found in `is_read`.
children_books_read = load_data2(os.path.join(root, 'goodreads_books_children.json.gz'))

In [183]:
children_books_read[0]['book_id']

'6066812'

In [179]:
# head=999999 caps the load at the first 1000000 review records of the file.
reviews_children = pd.DataFrame(load_data(os.path.join(root, 'goodreads_reviews_children.json.gz'),head=999999))

In [170]:
reviews_children.sample(2)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
73871,0806f14a989d4c5c6057f5155f009df8,30312747,075adfb3c59b82ff8ce0819940ffe844,5,Beautiful illustrations + great for discussion...,Sat Apr 01 08:05:10 -0700 2017,Sat Apr 01 08:07:01 -0700 2017,Wed Mar 01 00:00:00 -0800 2017,,0,0
12316,da76b2e808d445af86cc59044dda319e,138959,edc763332f193ba0750136cacef54220,3,"This was not at all like the movie, but still ...",Wed May 29 03:46:38 -0700 2013,Mon Jun 17 19:23:44 -0700 2013,Mon Jun 17 00:00:00 -0700 2013,Wed May 29 00:00:00 -0700 2013,0,0


In [168]:
# Tabular form of the filtered book records returned by `load_data2`.
df_children = pd.DataFrame(children_books_read)

In [211]:
# Left join on `book_id`: every book row is repeated once per matching review, and a book
# with no review among the loaded reviews keeps a single row with missing review columns.
combined = pd.merge(df_children, reviews_children, how='left', on=['book_id'])

In [212]:
combined[combined['book_id'] == '10806008'].T

Unnamed: 0,60,61,62,63,64,65,66,67,68,69,...,250,251,252,253,254,255,256,257,258,259
isbn,1419700251,1419700251,1419700251,1419700251,1419700251,1419700251,1419700251,1419700251,1419700251,1419700251,...,1419700251,1419700251,1419700251,1419700251,1419700251,1419700251,1419700251,1419700251,1419700251,1419700251
text_reviews_count,890,890,890,890,890,890,890,890,890,890,...,890,890,890,890,890,890,890,890,890,890
series,[799654],[799654],[799654],[799654],[799654],[799654],[799654],[799654],[799654],[799654],...,[799654],[799654],[799654],[799654],[799654],[799654],[799654],[799654],[799654],[799654]
country_code,US,US,US,US,US,US,US,US,US,US,...,US,US,US,US,US,US,US,US,US,US
language_code,en-US,en-US,en-US,en-US,en-US,en-US,en-US,en-US,en-US,en-US,...,en-US,en-US,en-US,en-US,en-US,en-US,en-US,en-US,en-US,en-US
popular_shelves,"[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...",...,"[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun...","[{'count': '10607', 'name': 'to-read'}, {'coun..."
asin,,,,,,,,,,,...,,,,,,,,,,
is_ebook,false,false,false,false,false,false,false,false,false,false,...,false,false,false,false,false,false,false,false,false,false
average_rating,4.04,4.04,4.04,4.04,4.04,4.04,4.04,4.04,4.04,4.04,...,4.04,4.04,4.04,4.04,4.04,4.04,4.04,4.04,4.04,4.04
kindle_asin,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40,...,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40,B006LGNU40


In [242]:
# Concatenate all reviews of a book into one tab-separated string and broadcast it to
# every row of that book. A book without a matching review leaves the left merge with a
# missing `review_text`; missing values are dropped before joining so `str.join` does not
# raise TypeError, and such a book gets an empty string.
combined['text'] = (
    combined.groupby('book_id')['review_text']
    .transform(lambda reviews: '\t'.join(reviews.dropna().astype(str)))
)


In [243]:
# All rows of a book carry the same `text`, so dropping duplicate (book_id, text) pairs
# collapses the review-level rows to one row per book.
bid_rev = combined[['book_id', 'text']].drop_duplicates()

In [244]:
# Attach the aggregated review text to the book metadata; the inner join keeps only
# books that have a row in `bid_rev`.
final_final = pd.merge(df_children, bid_rev, how='inner', on=['book_id'])

In [245]:
# scale this process to each genre!!! 
final_final

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,text
0,1934876569,6,[151854],US,,"[{'count': '515', 'name': 'to-read'}, {'count'...",,false,4.22,,...,,2009,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,6066812,98,701117,All's Fairy in Love and War (Avalon: Web of Ma...,All's Fairy in Love and War (Avalon: Web of Ma...,"This was a really cute book, though to be hone..."
1,0590417010,193,[],US,eng,"[{'count': '450', 'name': 'to-read'}, {'count'...",,false,4.43,B017RORXNI,...,,1995,https://www.goodreads.com/book/show/89378.Dog_...,https://images.gr-assets.com/books/1360057676m...,89378,1331,86259,Dog Heaven,Dog Heaven,"Really cute, sweet, and charming.\tThis beauti..."
2,0531301060,3,[],US,,"[{'count': '10', 'name': 'to-read'}, {'count':...",,false,3.68,,...,,1999,https://www.goodreads.com/book/show/2592648-it...,https://s.gr-assets.com/assets/nophoto/book/11...,2592648,21,2613165,It's Funny Where Ben's Train Takes Him,It's Funny Where Ben's Train Takes Him,"Rated and reviewed by Dylan, age 7: \n This bo..."
3,9512310201,1,[223802],US,fin,"[{'count': '4', 'name': 'to-read'}, {'count': ...",,false,3.53,,...,,1976,https://www.goodreads.com/book/show/8030991-ka...,https://images.gr-assets.com/books/1295309151m...,8030991,34,12634950,"Katso eteesi, Lotta!","Katso eteesi, Lotta!","Ihan normaali Lotta-kirja, nakojaan luen kaikk..."
4,0374428115,7,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",,false,4.38,,...,,2008,https://www.goodreads.com/book/show/926662.Gro...,https://s.gr-assets.com/assets/nophoto/book/11...,926662,45,911665,Growltiger's Last Stand and Other Poems,Growltiger's Last Stand and Other Poems,"Beatifully illustrated, classic children's tal..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0141301074,110,[],US,eng,"[{'count': '1199', 'name': 'to-read'}, {'count...",,false,3.94,B0096HFOZO,...,,1998,https://www.goodreads.com/book/show/430537.The...,https://images.gr-assets.com/books/1327868821m...,430537,1598,866909,The Twits,The Twits,How can you not like Roald Dahl?\tFalse\tRead ...
996,9025833926,105,[522686],US,nl,"[{'count': '2528', 'name': 'to-read'}, {'count...",,false,4.15,,...,,2003,https://www.goodreads.com/book/show/1835829.De...,https://s.gr-assets.com/assets/nophoto/book/11...,1835829,4224,1329309,De brief voor de koning,De brief voor de koning,A really lovely story. Reading it as an adult ...
997,0688099386,8,[691570],US,eng,"[{'count': '25', 'name': 'to-read'}, {'count':...",,false,3.66,,...,,,https://www.goodreads.com/book/show/4987435-sc...,https://s.gr-assets.com/assets/nophoto/book/11...,4987435,25,5053308,"School's Out (Class Clown, #4)","School's Out (Class Clown, #4)","The ""Class Clown"" is back!Lucas Cott is ready ..."
998,0544939077,48,[1131304],US,,"[{'count': '14', 'name': 'to-read'}, {'count':...",,false,3.87,B01N5F6FQI,...,,2017,https://www.goodreads.com/book/show/30971640-w...,https://images.gr-assets.com/books/1473599117m...,30971640,150,51589424,"What Is Chasing Duck? (The Giggle Gang, #1)","What Is Chasing Duck? (The Giggle Gang, #1)",What's chasing Duck? It has big teeth and is s...


In [246]:
final_final.loc[999]['text']

'A hungry dinosaur is looking for lunch and his lunch is looking for a snack who\'s looking for a bite to eat. With all of these creatures wiggling around in the dinosaur\'s stomach he lets out a monster burp and everyone escapes! \n A funny read aloud that brings to mind the Old Lady who Swallowed a Fly.\tcumulative picture book where each creature munches on the smaller one and all are devoured by a hungry T-Rex.\tCute -- dinosaur storytime possibility.\tThis book is perfect for ECE and read aloud! It is the story of a dinosaur who is looking for his lunch (and what the lunch itself is looking to eat). It is a great sequence book with good vocabulary, large text and pictures, and a simple story. Awesome!\tGood for Dino fans and Dino themed ST. Also good for "I\'m Gonna Eat You Up" story time.\tDemonstrates chain reactions/cause and effect well. May upset younger audiences, but hey, it\'s the circle of life.\tI had tremendous success in storytime with this one. The kids loved it! Shor

In [236]:
# Monday 
# TODO: scale this process to get review data for 1000 books for each predefined genre
# TODO: Initial Clusters 

In [238]:
# Tuesday 
# TODO: Continue Clustering books, different methods?

In [54]:
# NOTE(review): no visible cell assigns a DataFrame named `raw_data` (the only visible
# `raw_data` is a local list inside `combine_data`); presumably it came from a cell that
# was later removed — confirm before relying on Restart & Run All.
raw_data.sample(10)

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
3556,1467712345.0,27,[],US,,"[{'count': '38', 'name': 'to-read'}, {'count':...",,False,3.26,B00SG65DMU,...,4.0,,2015.0,https://www.goodreads.com/book/show/19354685-t...,https://images.gr-assets.com/books/1416180554m...,19354685,81,27415839,Trash Mountain,Trash Mountain
4727,340917555.0,53,[146020],US,en-GB,"[{'count': '1012', 'name': 'to-read'}, {'count...",,False,3.78,B007LO1KVQ,...,,,2006.0,https://www.goodreads.com/book/show/124675.The...,https://images.gr-assets.com/books/1415582629m...,124675,3205,120063,"The Secret Seven Adventure (The Secret Seven, #2)","The Secret Seven Adventure (The Secret Seven, #2)"
335,750013931.0,3,[381005],US,,"[{'count': '224', 'name': 'to-read'}, {'count'...",,False,4.23,,...,,,,https://www.goodreads.com/book/show/1914630.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,1914630,41,80619,"The Oaken Throne (The Deptford Histories, #2)","The Oaken Throne (The Deptford Histories, #2)"
584,823424006.0,25,[],US,,"[{'count': '23', 'name': 'to-read'}, {'count':...",,False,3.47,,...,2.0,,2012.0,https://www.goodreads.com/book/show/12710065-f...,https://images.gr-assets.com/books/1328019578m...,12710065,87,17843561,Fox Tails: Four Fables from Aesop,Fox Tails: Four Fables from Aesop
8990,9953875367.0,3,[],US,ara,"[{'count': '2505', 'name': 'to-read'}, {'count...",,False,3.9,,...,,,2008.0,https://www.goodreads.com/book/show/8082219,https://s.gr-assets.com/assets/nophoto/book/11...,8082219,6,1021311,عفريت الأرقام,عفريت الأرقام
9120,1612440916.0,6,[],US,en-US,"[{'count': '187', 'name': 'to-read'}, {'count'...",,False,4.17,B008AK7ALE,...,8.0,,2012.0,https://www.goodreads.com/book/show/15831747-v...,https://images.gr-assets.com/books/1367002342m...,15831747,13,21373662,Vin and the Dorky Duet,Vin and the Dorky Duet
8751,,3,[155079],US,eng,"[{'count': '89', 'name': 'to-read'}, {'count':...",,False,3.96,B00GVFU7R4,...,,,2009.0,https://www.goodreads.com/book/show/1137256.Ca...,https://images.gr-assets.com/books/1338051257m...,1137256,14,1124501,"Caddy Ever After (Casson Family, #4)","Caddy Ever After (Casson Family, #4)"
5684,8877828226.0,2,[206569],US,ita,"[{'count': '216', 'name': 'to-read'}, {'count'...",,False,3.95,,...,1.0,Brutte Storie,1999.0,https://www.goodreads.com/book/show/9749359-i-...,https://images.gr-assets.com/books/1445556344m...,9749359,15,413020,I cinici celti,I cinici celti
5131,,1,[],US,,"[{'count': '568', 'name': 'to-read'}, {'count'...",B00JMV8TAG,True,4.05,B00JMV8TAG,...,,,,https://www.goodreads.com/book/show/22930358-a...,https://s.gr-assets.com/assets/nophoto/book/11...,22930358,13,3719102,AN OLD-FASHIONED GIRL. (Annotated) (Louisa May...,AN OLD-FASHIONED GIRL. (Annotated) (Louisa May...
6135,,1,[],US,,"[{'count': '976', 'name': 'to-read'}, {'count'...",,False,3.89,,...,,,,https://www.goodreads.com/book/show/26217381-g...,https://images.gr-assets.com/books/1441263911m...,26217381,23,2801485,Gümüş Patenler,Gümüş Patenler


In [9]:
raw_data.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')

In [33]:
# popular shelves - top user generated shelves for a book, defines genres 
raw_data['popular_shelves'][96]

[{'count': '7173', 'name': 'to-read'},
 {'count': '235', 'name': 'vampires'},
 {'count': '195', 'name': 'currently-reading'},
 {'count': '194', 'name': 'young-adult'},
 {'count': '186', 'name': 'favorites'},
 {'count': '109', 'name': 'fantasy'},
 {'count': '104', 'name': 'vampire'},
 {'count': '81', 'name': 'owned'},
 {'count': '80', 'name': 'books-i-own'},
 {'count': '77', 'name': 'ya'},
 {'count': '76', 'name': 'series'},
 {'count': '73', 'name': 'paranormal'},
 {'count': '50', 'name': 'supernatural'},
 {'count': '42', 'name': 'heather-brewer'},
 {'count': '41', 'name': 'fiction'},
 {'count': '37', 'name': 'horror'},
 {'count': '31', 'name': 'teen'},
 {'count': '31', 'name': 'urban-fantasy'},
 {'count': '30', 'name': 'the-chronicles-of-vladimir-tod'},
 {'count': '30', 'name': 'vladimir-tod'},
 {'count': '21', 'name': 'romance'},
 {'count': '21', 'name': 'library'},
 {'count': '18', 'name': 'owned-books'},
 {'count': '18', 'name': 'to-buy'},
 {'count': '16', 'name': 'high-school'},
 {

In [32]:
display(raw_data['similar_books'][96]) # list of books that users who like the current book also like

display(raw_data.iloc[96,:])

['25861113',
 '7430195',
 '18765937',
 '6120544',
 '3247550',
 '9266753',
 '6976966',
 '25764778',
 '17696585',
 '23256849',
 '642463',
 '2842796',
 '263172',
 '10874337']

isbn                                                                     
text_reviews_count                                                      9
series                                                           [176160]
country_code                                                           US
language_code                                                         eng
popular_shelves         [{'count': '7173', 'name': 'to-read'}, {'count...
asin                                                           B0042JSOQC
is_ebook                                                             true
average_rating                                                       4.35
kindle_asin                                                    B004IYJDXY
similar_books           [25861113, 7430195, 18765937, 6120544, 3247550...
description             It all comes down to this.\nVlad's running out...
format                                                                   
link                    https://www.go

### Checks for 'similar_books' 

Are the entries of `similar_books` book IDs or ISBNs?

In [25]:
# `book_id` values parsed from the JSON files are strings (e.g. '6066812'), so comparing
# the column with the integer 7430195 can never match and an empty result proves nothing.
# Compare as strings so the lookup is valid whichever dtype the column has.
raw_data[raw_data['book_id'].astype(str) == '7430195']

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series


In [40]:
# Second check: is the first `similar_books` entry stored as an ISBN instead?
mask = raw_data['isbn'].map(lambda isbn_value: isbn_value == '25861113')
raw_data[mask]

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series


In [41]:
# check entire dataset for similar books