In [4]:
import os
import pandas as pd
import numpy as np
import gzip
import json

In [2]:
path_folder = 'G:\BIG DATA\Recommender System Udacity\Books\data'

In [3]:
def load_data(file_name, head = 500):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
            
            # break if reaches the 100th line
            if (head is not None) and (count > head):
                break
    return data

In [5]:
book_genres_path = os.path.join(path_folder, 'goodreads_book_genres_initial.json.gz')
book_work_path = os.path.join(path_folder, 'goodreads_book_works.json.gz')
book_authors_path = os.path.join(path_folder, 'goodreads_book_authors.json.gz')
books_path = os.path.join(path_folder, 'goodreads_books.json.gz')


user_maps_path = os.path.join(path_folder, 'user_id_map.csv') 
book_maps_path = os.path.join(path_folder, 'book_id_map.csv')
interactions_path = os.path.join(path_folder, 'goodreads_interactions.csv')

In [6]:
val = None

In [None]:
book_genres_data = load_data(book_genres_path, val)
#book_work_data = load_data(book_work_path, val)
book_authors_data = load_data(book_authors_path, val)
books_data = load_data(books_path, val)

### Book data

In [None]:
books_df = pd.DataFrame.from_dict(books_data)

In [None]:
books_df.columns

In [None]:
books_df['authors'][:50][45]

In [None]:
selected_vars_book = ['book_id', 'is_ebook', 'average_rating', 'num_pages',
                      'authors', 'publication_year', 'text_reviews_count',
                      'ratings_count']

books_df = books_df[selected_vars_book]

selected_vars_book_names = ['book_id', 'is_ebook', 'average_rating_book', 'num_pages',
                          'authors', 'publication_year', 'text_reviews_count_book',
                          'ratings_count_book']

books_df.columns = selected_vars_book_names


In [None]:
def get_author_id(l):
    try:
        for d in l:
            if d['role'] == '':
                return str(d['author_id'])
        return str(l[0]['author_id'])
    except IndexError:
        return 'Anonymous'
    

In [None]:
books_df['author_id'] = books_df['authors'].apply(lambda l: get_author_id(l))

### Interacciones

In [13]:
interactions = pd.read_csv(interactions_path, nrows=val)

In [14]:
print(interactions.user_id.nunique()) 
print(interactions.book_id.nunique()) 
print(interactions.user_id.nunique() * interactions.book_id.nunique())
print(len(interactions)) 

228
59139
13483692
100000


### Authors data

In [15]:
book_authors_data

[{'average_rating': '3.98',
  'author_id': '604031',
  'text_reviews_count': '7',
  'name': 'Ronald J. Fields',
  'ratings_count': '49'},
 {'average_rating': '4.08',
  'author_id': '626222',
  'text_reviews_count': '28716',
  'name': 'Anita Diamant',
  'ratings_count': '546796'},
 {'average_rating': '3.92',
  'author_id': '10333',
  'text_reviews_count': '5075',
  'name': 'Barbara Hambly',
  'ratings_count': '122118'},
 {'average_rating': '3.68',
  'author_id': '9212',
  'text_reviews_count': '36262',
  'name': 'Jennifer Weiner',
  'ratings_count': '888522'},
 {'average_rating': '3.82',
  'author_id': '149918',
  'text_reviews_count': '96',
  'name': 'Nigel Pennick',
  'ratings_count': '1740'},
 {'average_rating': '3.89',
  'author_id': '3041852',
  'text_reviews_count': '85',
  'name': 'Alfred J. Church',
  'ratings_count': '947'},
 {'average_rating': '4.17',
  'author_id': '215594',
  'text_reviews_count': '6',
  'name': 'Michael Halberstam',
  'ratings_count': '23'},
 {'average_rati

In [16]:
authors_df = pd.DataFrame.from_dict(book_authors_data).drop('name', axis=1)

### Generos

In [17]:
genres_df = pd.DataFrame.from_dict(book_genres_data)
genres_df['genres_mode'] = genres_df['genres'].apply(lambda d: max(d, key=lambda k: d[k]) if len(d)!=0 else 'No genre')
genres_df['genres_mode_conf'] = genres_df['genres'].apply(lambda d: max(d.values())/sum(d.values()) 
                                                          if ((len(d)!=0) and (sum(d.values())!=0)) else 0)


In [18]:
genres_df

Unnamed: 0,book_id,genres,genres_mode,genres_mode_conf
0,5333265,"{'history, historical fiction, biography': 1}","history, historical fiction, biography",1.000000
1,1333909,"{'fiction': 219, 'history, historical fiction,...",fiction,0.977679
2,7327624,"{'fantasy, paranormal': 31, 'fiction': 8, 'mys...","fantasy, paranormal",0.756098
3,6066819,"{'fiction': 555, 'romance': 23, 'mystery, thri...",fiction,0.943878
4,287140,{'non-fiction': 3},non-fiction,1.000000
...,...,...,...,...
99996,24679885,"{'young-adult': 42, 'mystery, thriller, crime'...",young-adult,0.420000
99997,27883170,"{'fantasy, paranormal': 60, 'fiction': 16, 'yo...","fantasy, paranormal",0.659341
99998,35992614,{'romance': 2},romance,1.000000
99999,8713454,"{'fiction': 88, 'history, historical fiction, ...",fiction,0.478261


In [19]:
genres_df.genres_mode.unique()

array(['history, historical fiction, biography', 'fiction',
       'fantasy, paranormal', 'non-fiction', 'romance',
       'mystery, thriller, crime', 'No genre', 'children', 'poetry',
       'young-adult', 'comics, graphic'], dtype=object)

In [20]:
genres_df_fil = genres_df[['book_id', 'genres_mode', 'genres_mode_conf']]

### Work

In [21]:
work_df = pd.DataFrame.from_dict(book_work_data)

In [22]:
work_df

Unnamed: 0,books_count,reviews_count,original_publication_month,default_description_language_code,text_reviews_count,best_book_id,original_publication_year,original_title,rating_dist,default_chaptering_book_id,original_publication_day,original_language_id,ratings_count,media_type,ratings_sum,work_id
0,1,6,8,,1,5333265,1984,W. C. Fields: A Life on Film,5:1|4:1|3:1|2:0|1:0|total:3,,,,3,book,12,5400751
1,22,10162,,,741,25717,2001,Good Harbor,5:517|4:1787|3:2763|2:966|1:196|total:6229,,,,6229,book,20150,1323437
2,2,268,,,7,7327624,1987,,5:49|4:58|3:26|2:5|1:3|total:141,,,,141,book,568,8948723
3,38,89252,7,,3504,6066819,2009,Best Friends Forever,5:9152|4:16855|3:19507|2:6210|1:1549|total:53273,,14,,53273,book,185670,6243154
4,2,49,,,5,287140,1990,Runic Astrology: Starcraft and Timekeeping in ...,5:6|4:1|3:3|2:3|1:2|total:15,,,,15,book,51,278577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99996,46,1214,,,75,1094330,1926,The Benson Murder Case,5:83|4:157|3:182|2:51|1:19|total:492,,,,492,book,1710,1081147
99997,4,23,5,,3,892238,1976,The Diary of Samuel Pepys 1668,5:5|4:5|3:5|2:0|1:0|total:15,,11,,15,book,60,877477
99998,35,1077,,,45,1094332,1928,The Bishop Murder Case,5:116|4:166|3:131|2:50|1:33|total:496,,,,496,book,1770,1081149
99999,9,2023,11,,85,83,1986,Rising from the Plains,5:504|4:478|3:176|2:24|1:8|total:1190,,17,,1190,book,5016,1391039


## Union

In [23]:
aux_df = books_df.merge(genres_df_fil, how='left', on='book_id').merge(authors_df, how='left', on='author_id')

aux_df['book_id'] = aux_df['book_id'].apply(lambda x: str(x))
interactions['book_id'] = interactions['book_id'].apply(lambda x: str(x))
whole_data_df = interactions.merge(aux_df, how='left', on='book_id')
    


In [24]:
whole_data_df

Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed,is_ebook,average_rating_book,num_pages,authors,publication_year,text_reviews_count_book,ratings_count_book,author_id,genres_mode,genres_mode_conf,average_rating,text_reviews_count,ratings_count
0,0,948,1,5,0,,,,,,,,,,,,,
1,0,947,1,5,1,,,,,,,,,,,,,
2,0,946,1,5,0,,,,,,,,,,,,,
3,0,945,1,5,0,,,,,,,,,,,,,
4,0,944,1,5,0,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,227,882,1,3,0,,,,,,,,,,,,,
99996,227,59168,1,3,0,,,,,,,,,,,,,
99997,227,24428,1,3,0,,,,,,,,,,,,,
99998,227,59167,1,2,0,,,,,,,,,,,,,


In [25]:
aux_df

Unnamed: 0,book_id,is_ebook,average_rating_book,num_pages,authors,publication_year,text_reviews_count_book,ratings_count_book,author_id,genres_mode,genres_mode_conf,average_rating,text_reviews_count,ratings_count
0,5333265,false,4.00,256,"[{'author_id': '604031', 'role': ''}]",1984,1,3,604031,"history, historical fiction, biography",1.000000,3.98,7,49
1,1333909,false,3.23,,"[{'author_id': '626222', 'role': ''}]",2001,6,10,626222,fiction,0.977679,4.08,28716,546796
2,7327624,false,4.03,600,"[{'author_id': '10333', 'role': ''}]",1987,7,140,10333,"fantasy, paranormal",0.756098,3.92,5075,122118
3,6066819,false,3.49,368,"[{'author_id': '9212', 'role': ''}]",2009,3282,51184,9212,fiction,0.943878,3.68,36262,888522
4,287140,false,3.40,,"[{'author_id': '149918', 'role': ''}]",,5,15,149918,non-fiction,1.000000,3.82,96,1740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99996,24679885,true,4.19,,"[{'author_id': '26372', 'role': ''}]",,3,95,26372,young-adult,0.420000,3.94,28714,568158
99997,27883170,true,3.51,33,"[{'author_id': '175855', 'role': ''}]",2016,80,276,175855,"fantasy, paranormal",0.659341,3.76,24392,371713
99998,35992614,true,4.33,,"[{'author_id': '15428911', 'role': ''}]",2017,10,15,15428911,romance,1.000000,4.04,65,224
99999,8713454,false,3.33,358,"[{'author_id': '385819', 'role': ''}]",,66,310,385819,fiction,0.478261,3.52,189,796


# Model

In [35]:
from surprise.prediction_algorithms.knns import KNNWithMeans

from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
reader = Reader(rating_scale=(1, 5))

In [36]:
ibcf_author_df = whole_data_df[['user_id', 'author_id','rating']]
ibcf_author_df.columns = ['user_id', 'author_id', 'raw_ratings']
ibcf_books_df = whole_data_df[['user_id', 'book_id','rating']]
ibcf_books_df.columns = ['user_id','book_id' , 'raw_ratings']
ibcf_genres_df = whole_data_df[['user_id', 'genres_mode','rating']]
ibcf_genres_df.columns = ['user_id','genres_mode' , 'raw_ratings']
ibcf_author_df = Dataset.load_from_df(ibcf_author_df, reader)
ibcf_books_df = Dataset.load_from_df(ibcf_books_df, reader)
ibcf_genres_df = Dataset.load_from_df(ibcf_genres_df, reader)

In [None]:

# We'll use the famous SVD algorithm.
algo_authors = KNNWithMeans(k=5)
algo_genres = KNNWithMeans(k=5)
algo_books = KNNWithMeans(k=5)
# Run 5-fold cross-validation and print results
cross_validate(algo_authors, ibcf_author_df, measures=['RMSE', 'MAE'], cv=5, verbose=True)
cross_validate(algo_genres, ibcf_genres_df, measures=['RMSE', 'MAE'], cv=5, verbose=True)
cross_validate(algo_books, ibcf_books_df, measures=['RMSE', 'MAE'], cv=5, verbose=True)

### IBCF genres

### IBCF authors

### IBCF books