*Goal*: 
1. find out if the dataset fits the data requirements
2. get interactions dataframe ['user_id', 'item_id', 'date'(%Y-%m), 'timestamp']
3. get item meta dataframe ['item_id', meta_col('genre_id' or 'tag_id', etc...)]
4. get list of item meta [meta_col]

*Data Requirements*:
* user activity is >80% of all time intervals
* month 1, user has >5 rates
* month 0, to pre-train model
* items are comparable
* timestamp
* implicit feedback
* ranking problem


*Conclusion*:

In [1]:
import os
import sys

# Make the parent of the current working directory importable so that
# modules living one level above this notebook can be imported below.
# The path is normalised (no trailing '/..') and only added when missing, so
# re-running this cell does not pile up duplicate sys.path entries.
project_root = os.path.abspath(os.pardir)
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import pandas as pd 
import re
import numpy as np

from data_utils import getDF, getDF_n_lines, load_data
from dataset_evaluation_utils import * 

from datetime import datetime, timezone

import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's 'whitegrid' theme for the plots drawn in this notebook.
sns.set_style('whitegrid')

# Optional plotly plotting backend, currently disabled:
# import plotly.offline as py
# pd.options.plotting.backend = "plotly"
# py.init_notebook_mode() # graphs charts inline (IPython).

# A4 paper in landscape orientation, in inches (width, height);
# presumably a figure size — its use is not visible in this part of the notebook.
a4_dims = (11.7, 8.27)

# paths

In [3]:
# Dataset identity and where its raw files / dumped artefacts live.
dataset_name = 'Goodreads'
dataset_folderpath = '../datasets/goodreads/'
dump_foldername = 'goodreads_dump/'

# Output locations derived from the dump folder.
# rule: what/which_data_set/sample_version/what/
(
    images_path,
    output_path,
    heatmaps_path,
    diversity_graphpath,
    diversity_filepath,
) = get_folderpaths(dump_foldername)

___

# Goodreads

https://mengtingwan.github.io/data/goodreads#datasets

## column names

In [None]:
# Target column layout of the interactions dataframe this notebook aims to build.
inter_columns = [
    'user_id',
    'item_id',
    'date',
    'timestamp',
]
# TODO: pick the item-meta column before enabling the two lines below.
# meta_col = ''
# item_meta_columns = ['item_id', meta_col]

In [5]:
# Load the full interactions table (slow: the original author measured ~5 min).
# The recorded output shows roughly 228.6M rows of small non-negative integers,
# so compact integer dtypes are requested up front instead of pandas' default
# int64; this shrinks the in-memory frame by roughly 3-4x.
# NOTE(review): the dtypes assume every column is free of missing values and
# stays within the chosen integer range — confirm against the full file.
orig_df = pd.read_csv(
    dataset_folderpath + 'goodreads_interactions.csv',
    dtype={
        'user_id': 'int32',
        'book_id': 'int32',
        'is_read': 'int8',
        'rating': 'int8',
        'is_reviewed': 'int8',
    },
)
orig_df


# snippet of datasets

In [6]:
# Show the interactions table again; pandas truncates the view to the first and last rows.
orig_df

Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed
0,0,948,1,5,0
1,0,947,1,5,1
2,0,946,1,5,0
3,0,945,1,5,0
4,0,944,1,5,0
...,...,...,...,...,...
228648337,876144,24772,0,0,0
228648338,876144,23847,1,4,0
228648339,876144,23950,1,3,0
228648340,876144,374106,1,5,1


In [4]:
# Peek at the first records of the deduplicated interactions dump
# (per-user book entries with read flag, rating and date fields).
load_data(
    os.path.join(dataset_folderpath, 'goodreads_interactions_dedup.json.gz'),
    head=3,
)

[{'user_id': '8842281e1d1347389f2ab93d60773d4d',
  'book_id': '34684622',
  'review_id': 'a53868823f065a0e20fd4ae98b820674',
  'is_read': False,
  'rating': 0,
  'review_text_incomplete': '',
  'date_added': 'Tue Oct 17 09:40:11 -0700 2017',
  'date_updated': 'Tue Oct 17 09:40:12 -0700 2017',
  'read_at': '',
  'started_at': ''},
 {'user_id': '8842281e1d1347389f2ab93d60773d4d',
  'book_id': '34536488',
  'review_id': '9f08c5f991f87f3b7ae4ce779c2aac10',
  'is_read': False,
  'rating': 0,
  'review_text_incomplete': '',
  'date_added': 'Fri Oct 13 07:19:50 -0700 2017',
  'date_updated': 'Fri Oct 13 07:19:50 -0700 2017',
  'read_at': '',
  'started_at': ''},
 {'user_id': '8842281e1d1347389f2ab93d60773d4d',
  'book_id': '34017076',
  'review_id': '14da595c5b0c38b1247888f62f74a772',
  'is_read': False,
  'rating': 0,
  'review_text_incomplete': '',
  'date_added': 'Fri Oct 06 09:32:42 -0700 2017',
  'date_updated': 'Fri Oct 06 09:32:43 -0700 2017',
  'read_at': '',
  'started_at': ''},
 {'u

In [5]:
# Peek at the first author records (id, name, rating and review counts).
# NOTE(review): the recorded output lists 4 authors although head=3 was
# requested — check whether load_data's `head` bound is off by one.
load_data(
    os.path.join(dataset_folderpath, 'goodreads_book_authors.json.gz'),
    head=3,
)

[{'average_rating': '3.98',
  'author_id': '604031',
  'text_reviews_count': '7',
  'name': 'Ronald J. Fields',
  'ratings_count': '49'},
 {'average_rating': '4.08',
  'author_id': '626222',
  'text_reviews_count': '28716',
  'name': 'Anita Diamant',
  'ratings_count': '546796'},
 {'average_rating': '3.92',
  'author_id': '10333',
  'text_reviews_count': '5075',
  'name': 'Barbara Hambly',
  'ratings_count': '122118'},
 {'average_rating': '3.68',
  'author_id': '9212',
  'text_reviews_count': '36262',
  'name': 'Jennifer Weiner',
  'ratings_count': '888522'}]

In [6]:
# Peek at the first book -> genre records; each book carries a dict that maps
# a genre label to a count.
load_data(
    os.path.join(dataset_folderpath, 'goodreads_book_genres_initial.json.gz'),
    head=3,
)

[{'book_id': '5333265',
  'genres': {'history, historical fiction, biography': 1}},
 {'book_id': '1333909',
  'genres': {'fiction': 219, 'history, historical fiction, biography': 5}},
 {'book_id': '7327624',
  'genres': {'fantasy, paranormal': 31,
   'fiction': 8,
   'mystery, thriller, crime': 1,
   'poetry': 1}},
 {'book_id': '6066819',
  'genres': {'fiction': 555, 'romance': 23, 'mystery, thriller, crime': 10}}]

In [7]:
# Peek at the first book-series records (title, series id, work counts).
load_data(
    os.path.join(dataset_folderpath, 'goodreads_book_series.json.gz'),
    head=3,
)

[{'numbered': 'true',
  'note': '',
  'description': '',
  'title': 'Sun Wolf and Starhawk',
  'series_works_count': '9',
  'series_id': '189911',
  'primary_work_count': '3'},
 {'numbered': 'true',
  'note': '',
  'description': 'This series is also known as * Avalon : Jalinan Sihir (Bahasa Indonesia) See also the spin-off manga series \x01.',
  'title': 'Avalon: Web of Magic',
  'series_works_count': '14',
  'series_id': '151854',
  'primary_work_count': '12'},
 {'numbered': 'true',
  'note': '',
  'description': 'Plot-wise, "Crowner\'s Crusade" is a prequel to the series, but #15 in publication order.',
  'title': 'Crowner John Mystery',
  'series_works_count': '15',
  'series_id': '169353',
  'primary_work_count': '15'},
 {'numbered': 'true',
  'note': '',
  'description': '',
  'title': "Pluto's Snitch",
  'series_works_count': '3',
  'series_id': '1052227',
  'primary_work_count': '3'}]

In [8]:
# Peek at the first "work" records (one work groups several book editions,
# judging by the books_count / best_book_id fields in the output).
load_data(
    os.path.join(dataset_folderpath, 'goodreads_book_works.json.gz'),
    head=3,
)

[{'books_count': '1',
  'reviews_count': '6',
  'original_publication_month': '8',
  'default_description_language_code': '',
  'text_reviews_count': '1',
  'best_book_id': '5333265',
  'original_publication_year': '1984',
  'original_title': 'W. C. Fields: A Life on Film',
  'rating_dist': '5:1|4:1|3:1|2:0|1:0|total:3',
  'default_chaptering_book_id': '',
  'original_publication_day': '',
  'original_language_id': '',
  'ratings_count': '3',
  'media_type': 'book',
  'ratings_sum': '12',
  'work_id': '5400751'},
 {'books_count': '22',
  'reviews_count': '10162',
  'original_publication_month': '',
  'default_description_language_code': '',
  'text_reviews_count': '741',
  'best_book_id': '25717',
  'original_publication_year': '2001',
  'original_title': 'Good Harbor',
  'rating_dist': '5:517|4:1787|3:2763|2:966|1:196|total:6229',
  'default_chaptering_book_id': '',
  'original_publication_day': '',
  'original_language_id': '',
  'ratings_count': '6229',
  'media_type': 'book',
  'ra

In [9]:
# Peek at the first book metadata records (ISBN, shelves, authors, publisher, ...).
load_data(
    os.path.join(dataset_folderpath, 'goodreads_books.json.gz'),
    head=3,
)

[{'isbn': '0312853122',
  'text_reviews_count': '1',
  'series': [],
  'country_code': 'US',
  'language_code': '',
  'popular_shelves': [{'count': '3', 'name': 'to-read'},
   {'count': '1', 'name': 'p'},
   {'count': '1', 'name': 'collection'},
   {'count': '1', 'name': 'w-c-fields'},
   {'count': '1', 'name': 'biography'}],
  'asin': '',
  'is_ebook': 'false',
  'average_rating': '4.00',
  'kindle_asin': '',
  'similar_books': [],
  'description': '',
  'format': 'Paperback',
  'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
  'authors': [{'author_id': '604031', 'role': ''}],
  'publisher': "St. Martin's Press",
  'num_pages': '256',
  'publication_day': '1',
  'isbn13': '9780312853129',
  'publication_month': '9',
  'edition_information': '',
  'publication_year': '1984',
  'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
  'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
  'book_id': '5333265',
  'ratings_count': '3',
  'w

In [10]:
# Peek at the first spoiler-annotated reviews; each review carries a
# 'timestamp' date string and a list of [flag, sentence] pairs.
load_data(
    os.path.join(dataset_folderpath, 'goodreads_reviews_spoiler.json.gz'),
    head=3,
)

[{'user_id': '8842281e1d1347389f2ab93d60773d4d',
  'timestamp': '2017-08-30',
  'review_sentences': [[0, 'This is a special book.'],
   [0,
    'It started slow for about the first third, then in the middle third it started to get interesting, then the last third blew my mind.'],
   [0,
    'This is what I love about good science fiction - it pushes your thinking about where things can go.'],
   [0,
    "It is a 2015 Hugo winner, and translated from its original Chinese, which made it interesting in just a different way from most things I've read."],
   [0,
    'For instance the intermixing of Chinese revolutionary history - how they kept accusing people of being "reactionaries", etc.'],
   [0, 'It is a book about science, and aliens.'],
   [0,
    'The science described in the book is impressive - its a book grounded in physics and pretty accurate as far as I could tell.'],
   [1,
    'Though when it got to folding protons into 8 dimensions I think he was just making stuff up - intere

In [28]:
# Load the complete deduplicated interactions dump via the project's getDF helper.
# Quick-look alternative kept from the original author:
# getDF_n_lines(dataset_folderpath+'goodreads_interactions_dedup.json.gz', head=3)
inter_dedup = getDF(
    os.path.join(dataset_folderpath, 'goodreads_interactions_dedup.json.gz')
)
inter_dedup

# get all genres

In [24]:
# Flatten the nested per-book genre dicts of a 10000-record sample into one
# 'genres.<label>' column per genre label (NaN where a book lacks the genre).
# NOTE(review): `_` is also IPython's "last output" variable; a descriptive
# name would be safer, but later cells read `_`, so the name is kept here.
_ = pd.json_normalize(
    load_data(
        os.path.join(dataset_folderpath, 'goodreads_book_genres_initial.json.gz'),
        head=10000,
    )
)
_

Unnamed: 0,book_id,"genres.history, historical fiction, biography",genres.fiction,"genres.fantasy, paranormal","genres.mystery, thriller, crime",genres.poetry,genres.romance,genres.non-fiction,genres.children,genres.young-adult,"genres.comics, graphic"
0,5333265,1.0,,,,,,,,,
1,1333909,5.0,219.0,,,,,,,,
2,7327624,,8.0,31.0,1.0,1.0,,,,,
3,6066819,,555.0,,10.0,,23.0,,,,
4,287140,,,,,,,3.0,,,
...,...,...,...,...,...,...,...,...,...,...,...
9996,34745794,,,,,,,,,,
9997,13036881,,6.0,,,,35.0,,,1.0,
9998,13036880,,1.0,2.0,,,3.0,,,,
9999,34679927,,,,,,,,,,


In [22]:
# List the flattened column names: 'book_id' followed by one 'genres.<label>' column per genre label.
_.columns

Index(['book_id', 'genres.history, historical fiction, biography',
       'genres.fiction', 'genres.fantasy, paranormal',
       'genres.mystery, thriller, crime', 'genres.poetry', 'genres.romance',
       'genres.non-fiction', 'genres.children', 'genres.young-adult',
       'genres.comics, graphic'],
      dtype='object')

In [None]:
# Scratch check of the parsing idiom used below: drop the 'genres.' prefix,
# then split a compound label on ', ' into its individual genre names.
'genres.history, historical fiction, biography'[len('genres.'):].split(', ')

In [None]:
import itertools

# For every genre column (everything after the leading 'book_id' column),
# strip the 'genres.' prefix and split compound labels on ', '.
_list = [col[len('genres.'):].split(', ') for col in _.columns[1:]]

# Flatten the per-column lists into a single list of individual genre names.
merged = list(itertools.chain.from_iterable(_list))
merged

In [None]:
# Genre labels exactly as they appear in the data (compound labels kept whole),
# obtained by dropping the 'genres.' prefix from every column after 'book_id'.
[c[len('genres.'):] for c in _.columns[1:]]