## Dataset processing

In [1]:
import gzip
import json
from collections import Counter

In [2]:
def load_data(file_name, start, end):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            count += 1
            if count < start:
                continue
            if count > end:
                break
            
            d = json.loads(l)
            data.append(d)
    return data

In [3]:
def process_data(dir, stop, step):
    temp = [0]
    start = 0
    end = step
    out_users = []
    out_books = []
    count = 0
    while count < stop:
        temp = load_data(dir, start=start, end = end)
        if len(temp) == 0:
            break
        start = end+1
        end = start + step
        user_id = []
        book_id = []
        
        for x in temp:
            if x['is_read'] is True:
                user_id.append(x['user_id'])
                book_id.append(x['book_id'])
                
        out_users.append(Counter(user_id))
        out_books.append(Counter(book_id))
        count += len(temp)
        del temp
        del user_id
        del book_id
        print(count)
        
    return out_users, out_books

In [4]:
def find_relevant_ratings(dir, to_take_u, to_take_b, start=0, stop=100, step = 1):
    temp = [0]
    start = 0
    count = 0
    end = step
    out_ratings = []
    out_users = []
    out_books = []
    while count < stop:
        temp = load_data(dir, start=start, end = end)
        
        if len(temp) == 0:
            break
        start = end+1
        end = start + step
        
        for x in temp:
            if x['is_read'] is True:
                if x['user_id'] in to_take_u and x['book_id'] in to_take_b:
                    out_ratings.append(x['rating'])
                    out_users.append(x['user_id'])
                    out_books.append(x['book_id'])
    
        count += len(temp)
        del temp
        print(count)
        
    return out_ratings, out_users, out_books

In [5]:
def process_books(dir, to_take, start, stop, step):
    temp = [0]
    start = 0
    count = 0
    end = step
    out = []
    out_book_id = []
    out_description = []
    out_work_id = []
    while count < stop:
        temp = load_data(dir, start=start, end = end)
        
        if len(temp) == 0:
            break
        start = end+1
        end = start + step
        
        for x in temp:
            if x['book_id'] in to_take:
                out.append(x)
                out_book_id.append(x['book_id'])
                out_description.append(x['description'])
                out_work_id.append(x['work_id'])
    
        count += len(temp)
        del temp
        print(count)
        
    return out, out_book_id, out_description, out_work_id

In [8]:
DIR = 'goodreadsdata/goodreads_books.json.gz' 

In [9]:
out_users, out_books = process_data(DIR, stop=1e8, step = 4e6)
out_users_cum = Counter()
for x in out_users:
    out_users_cum = out_users_cum+x

out_books_cum = Counter()
for x in out_books:
    out_books_cum = out_books_cum+x
    
# with open("user_counts.json", "w") as f:
#     json.dump(dict(out_users_cum), f)
    
# with open("book_counts.json", "w") as f:
#     json.dump(dict(out_books_cum), f)

KeyboardInterrupt: 

In [9]:
# with open('user_counts.json') as f:
#     dt_users = json.load(f)

# dt_users = Counter(dt_users)

# with open('book_counts.json') as f:
#     dt_books = json.load(f)

# dt_books = Counter(dt_books)

# top_users = dict(dt_users.most_common(5000))
# top_books = dict(dt_books.most_common(5000))

# top_users_id = list(top_users.keys())
# top_books_id = list(top_books.keys())

# r, u, b = find_relevant_ratings(DIR, top_users_id, top_books_id, start=0, stop=1e8, step = 1e6)

# with open('ratings.json','w') as f:
#     json.dump(a, f)

In [10]:
# out, out_book_id, out_description, out_work_id = process_books(DIR, top_books_id, start=0, stop=1e8, step=1e5)

# with open('book_info.json', 'w') as f:
#     json.dump([out, out_book_id, out_description, out_work_id], f)

In [11]:
with open('user_counts.json') as f:
    dt_users = json.load(f)

dt_users = Counter(dt_users)

with open('book_counts.json') as f:
    dt_books = json.load(f)

dt_books = Counter(dt_books)

top_users = dict(dt_users.most_common(5000))
top_books = dict(dt_books.most_common(5000))

top_users_id = list(top_users.keys())
top_books_id = list(top_books.keys())

with open('ratings.json') as f:
    ratings_data = json.load(f)

with open('book_info.json') as f:
    book_info = json.load(f)

book_info_id = book_info[1]
book_info_description = book_info[2]
book_info_word_id = book_info[3]

In [12]:
import selenium

In [13]:
link = book_info[0][0]['link']

In [14]:
import requests
from bs4 import BeautifulSoup

# URL of the book page
url = "https://www.goodreads.com/book/show/54270.Mein_Kampf"

# Make a request to fetch the page content
headers = {'User-Agent': 'Mozilla/5.0'}  # Goodreads may block bot-like requests without headers
response = requests.get(url, headers=headers)

# Parse the HTML
soup = BeautifulSoup(response.content, "html.parser")

In [15]:
# Find all the genre spans
genre_spans = soup.find_all('span', class_='BookPageMetadataSection__genreButton')

# Extract the text of each genre
genres = [span.get_text(strip=True) for span in genre_spans]

# Print the genres
print(genre_spans)

[]


In [16]:
def find_genres(book_list):
    genres = []
    for book in book_list:
        url = book['link']
        headers = {'User-Agent': 'Mozilla/5.0'}  # Goodreads may block bot-like requests without headers
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")
        genre_spans = soup.find_all('span', class_='BookPageMetadataSection__genreButton')
        if len(genre_spans) == 0:
            print(url)
        genres.append([span.get_text(strip=True) for span in genre_spans])
    return genres

In [17]:
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import time # To demonstrate speedup

def fetch_and_parse_genres(book):
    """Fetches and parses genres for a single book's URL."""
    url = book.get('link')
    if not url:
        print("Warning: Book entry missing 'link'")
        return [] # Return empty list if no link

    headers = {'User-Agent': 'Mozilla/5.0'} # Be a good citizen
    genres_for_book = []
    try:
        # Consider adding a timeout
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)

        soup = BeautifulSoup(response.content, "html.parser")
        # Updated selector based on potential Goodreads changes (check current site structure)
        # Common patterns: data-testid="genres" or specific class names
        genre_elements = soup.select('a[href*="/genres/"]') # Example selector, ADJUST AS NEEDED
        
        # If the above doesn't work, revert to original or inspect the page source:
        if not genre_elements:
             genre_spans = soup.find_all('span', class_='BookPageMetadataSection__genreButton') # Original selector
             genres_for_book = [span.get_text(strip=True) for span in genre_spans]
        else:
             # Process the elements found by the selector
             # This might need adjustment depending on the exact HTML structure
             genres_for_book = [elem.get_text(strip=True) for elem in genre_elements if '/genres/' in elem.get('href', '')]
             # Simple deduplication if needed
             genres_for_book = list(dict.fromkeys(genres_for_book))


        if not genres_for_book:
            print(f"Warning: No genres found for {book.get('title'), url}")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        # Decide how to handle errors: return empty list, None, or raise exception
        return []
    except Exception as e:
        print(f"Error parsing {url}: {e}")
        return [] # Return empty list on parsing error

    return genres_for_book

# --- Parallel execution function ---
def find_genres_parallel(book_list, max_workers=10):
    """Finds genres for a list of books in parallel using threads."""
    all_genres = []
    # Use ThreadPoolExecutor to manage threads
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # map applies the function to each item in book_list concurrently
        # It returns results in the order the tasks were submitted
        results = executor.map(fetch_and_parse_genres, book_list)
        all_genres = list(results) # Convert the iterator to a list

    return all_genres

In [18]:
genres = find_genres_parallel(book_info[0], max_workers=20)



- find most commonly used books
- extract descriptions
- merge indices for same books using the works list i.e. if two books have the same work_id aggregate the ratings for those by picking only one of the ids

In [19]:
import pandas as pd

dt = pd.DataFrame({'rating':ratings_data[0], 'user_id':ratings_data[1], 'book_id':ratings_data[2]})
other_dt = pd.DataFrame({'book_id':book_info_id, 'work_id':book_info_word_id, 'description':book_info_description, 'genres':genres})
temp = pd.merge(dt, other_dt, how='left', on='book_id')

  from pandas.core import (


In [20]:
duplicate_work_ids = temp.groupby('work_id')['book_id'].nunique()
duplicate_work_ids = duplicate_work_ids[duplicate_work_ids > 1]

result = temp[temp['work_id'].isin(duplicate_work_ids.index)].copy()

In [21]:
work_data = load_data('goodreads_book_works.json.gz', 0, 1e8)

In [22]:
work_id_title = {x['work_id']:x['original_title'] for x in work_data}
work_id_best_book_id = {x['work_id']:x['best_book_id'] for x in work_data}
work_id_pub_year = {x['work_id']:x['original_publication_year'] for x in work_data}

In [23]:
temp['book_id'] = temp.apply(lambda row: work_id_best_book_id.get(row['work_id'], row['book_id']), axis = 1)

In [24]:
temp['title'] = temp.apply(lambda row: work_id_title.get(row['work_id']), axis = 1)
temp['title'] = temp['title'].apply(lambda x: 'missing' if len(x)==0 else x)

In [25]:
temp['year_pub'] = temp.apply(lambda row: work_id_pub_year.get(row['work_id'], 'missing'), axis = 1)

In [110]:
temp['romance'] = temp['genres'].apply(lambda x: 1 if len(set(x).intersection(['Romance', 'romance']))>0 else 0)
temp['history'] = temp['genres'].apply(lambda x: 1 if len(set(x).intersection(['History', 'history']))>0 else 0)

temp['biography'] = temp['genres'].apply(lambda x: 1 if len(set(x).intersection(['Biography', 'biography', 'Autobiography']))>0 else 0)

temp['fantasy'] = temp['genres'].apply(lambda x: 1 if len(set(x).intersection(['Fantasy', 'fantasy']))>0 else 0)

temp['fiction'] = temp['genres'].apply(lambda x: 1 if len(set(x).intersection(['Fiction', 'fiction']))>0 else 0)

temp['mistery'] = temp['genres'].apply(lambda x: 1 if len(set(x).intersection(['Mystery', 'mystery']))>0 else 0)

temp['classic'] = temp['genres'].apply(lambda x: 1 if len(set(x).intersection(['Classic', 'classic', 'Classics', 'classics']))>0 else 0)

In [90]:
a = ['1', '2']
b = ['1', '2', '3', '4']
set(a).intersection(b) # {'1', '2'}

{'1', '2'}

In [111]:
dataset_clean = temp.copy()

In [112]:
dataset_clean.to_csv('dataset_clean.csv')

In [62]:
matrix_form = dataset_clean.pivot_table(index='user_id', columns='book_id', values='rating', fill_value=0).astype(int)

In [9]:
import pandas as pd

In [10]:
dataset_clean = pd.read_csv('dataset_clean.csv')

In [113]:
dataset_clean.head()

Unnamed: 0,rating,user_id,book_id,work_id,description,genres,title,year_pub,romance,history,biography,young_adult,fantasy,fiction,mystery,classic,mistery
0,4,467ed8f03548be6c2d9228f9a2f7b2ea,32860355,53464530,Their romance shaped a nation. The rest was hi...,"[Historical Fiction, Romance, Young Adult, His...",Alex and Eliza,2017,1,0,0,1,0,1,0,0,0
1,4,467ed8f03548be6c2d9228f9a2f7b2ea,33607640,54427214,"Fiona Davis, author of The Dollhouse, returns ...","[Historical Fiction, Fiction, Mystery, Histori...",missing,2017,0,0,0,0,0,1,1,0,1
2,4,467ed8f03548be6c2d9228f9a2f7b2ea,27833796,47815738,The miraculous new novel from New York Times-b...,"[Fiction, Historical Fiction, France, Book Clu...",The Light of Paris,2016,1,0,0,0,0,1,0,0,0
3,4,467ed8f03548be6c2d9228f9a2f7b2ea,30555488,48287641,Cora is a slave on a cotton plantation in Geor...,"[Historical Fiction, Fiction, Book Club, Histo...",The Underground Railroad,2016,0,0,0,0,0,1,0,0,0
4,5,467ed8f03548be6c2d9228f9a2f7b2ea,29430012,45743836,From the New York Timesbestselling author of ...,"[Historical Fiction, Fiction, Book Club, Histo...",A Gentleman in Moscow,2016,0,0,0,0,0,1,0,0,0


In [57]:
genres_list = load_data('goodreads_books.json.gz', 0, 10)

In [59]:
genres_list[0]

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [41]:
def genres_list_to_dict(genres_list):
    genres_dict = {}
    for x in genres_list:
        genres_dict[x['book_id']] = x['genres']
    return genres_dict

In [47]:
genre_dict = genres_list_to_dict(genres_list)

In [48]:
dataset_clean['genres'] = dataset_clean['book_id'].map(genre_dict)

In [49]:
dataset_clean[~dataset_clean['genres'].isnull()]

Unnamed: 0.1,Unnamed: 0,rating,user_id,book_id,work_id,description,title,year_pub,genres


In [54]:
for x in genres_list:
    if x['book_id'] in dataset_clean['book_id'].values:
        print(x['book_id'])
        print(x['genres'])
        print('---')