# Processing data from created CSV files

* Requires `data/10000_reviews.csv` and `data/book_titles.csv` to exist before running this notebook

In [1]:
import pandas as pd
import re
import ast
from sentence_transformers import SentenceTransformer
import joblib

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the raw user reviews. This file carries no book titles; they come from
# data/book_titles.csv and are merged in by book_id further down.
reviews_df = pd.read_csv('data/10000_reviews.csv')
# Summary statistics of the numeric columns, as a quick sanity check of the load
reviews_df.describe()

Unnamed: 0.1,Unnamed: 0,book_id,rating,n_votes,n_comments
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,691012.0,13375300.0,3.6958,3.0505,0.9392
std,398596.6,9243475.0,1.232321,12.915968,5.122336
min,14.0,1.0,0.0,0.0,0.0
25%,352582.5,6357565.0,3.0,0.0,0.0
50%,693865.5,13330940.0,4.0,0.0,0.0
75%,1036109.0,20359730.0,5.0,2.0,0.0
max,1377876.0,36252770.0,5.0,444.0,287.0


In [3]:
# Load the per-book metadata (title, description, ratings, ...) keyed by book_id

book_names_df = pd.read_csv('data/book_titles.csv')
# Summary statistics of the numeric columns
book_names_df.describe()

Unnamed: 0.1,Unnamed: 0,text_reviews_count,average_rating,num_pages,publication_day,publication_month,publication_year,book_id,ratings_count,work_id
count,6301.0,6301.0,6301.0,5968.0,5362.0,5505.0,5622.0,6301.0,6301.0,6301.0
mean,3150.0,2652.332804,3.972349,346.728887,14.02928,6.303724,2011.16222,13936000.0,51445.73,21120640.0
std,1819.086355,6319.195025,0.274742,144.932054,9.772724,3.274454,5.550981,9537153.0,195580.3,16181900.0
min,0.0,20.0,2.46,1.0,1.0,1.0,1950.0,1.0,42.0,114.0
25%,1575.0,332.0,3.81,274.0,5.0,4.0,2010.0,6492981.0,2561.0,6405906.0
50%,3150.0,825.0,3.99,336.0,13.0,6.0,2013.0,13612960.0,8227.0,18841560.0
75%,4725.0,2268.0,4.16,401.0,23.0,9.0,2014.0,21413850.0,28613.0,31175870.0
max,6300.0,142645.0,4.77,1859.0,31.0,12.0,2018.0,36252770.0,4899965.0,57997940.0


In [4]:
# Attach the book metadata to every review.
# A left join keeps each review row even if book_titles.csv has no matching book_id.

merged_df = pd.merge(reviews_df, book_names_df, how='left', on='book_id')

In [5]:
# Column names after the merge (names present in both inputs get _x / _y suffixes)
merged_df.columns

Index(['Unnamed: 0_x', 'user_id', 'book_id', 'review_id', 'rating',
       'review_text', 'date_added', 'date_updated', 'read_at', 'started_at',
       'n_votes', 'n_comments', 'Unnamed: 0_y', 'isbn', 'text_reviews_count',
       'series', 'country_code', 'language_code', 'popular_shelves', 'asin',
       'is_ebook', 'average_rating', 'kindle_asin', 'similar_books',
       'description', 'format', 'link', 'authors', 'publisher', 'num_pages',
       'publication_day', 'isbn13', 'publication_month', 'edition_information',
       'publication_year', 'url', 'image_url', 'ratings_count', 'work_id',
       'title', 'title_without_series'],
      dtype='object')

In [6]:
# Number of missing values in each column of the merged frame
merged_df.isna().sum()

Unnamed: 0_x               0
user_id                    0
book_id                    0
review_id                  0
rating                     0
review_text                0
date_added                 0
date_updated               0
read_at                  953
started_at              2920
n_votes                    0
n_comments                 0
Unnamed: 0_y               0
isbn                    1890
text_reviews_count         0
series                     0
country_code               0
language_code            507
popular_shelves            0
asin                    8862
is_ebook                   0
average_rating             0
kindle_asin             1051
similar_books              0
description               61
format                  1271
link                       0
authors                    0
publisher               1383
num_pages                394
publication_day         1372
isbn13                  1641
publication_month       1168
edition_information     8701
publication_ye

In [7]:
# Build one row per user from the books they rated favourably (rating >= 4).

df_high_rated = merged_df.loc[merged_df['rating'] >= 4]

# Per-user aggregation: the review texts are joined into a single string,
# every other column is collected into a list with one entry per review.
reviews_grouped = df_high_rated.groupby('user_id', as_index=False).agg(
    {
        'review_text': lambda texts: " ".join(texts),
        'book_id': list,
        'rating': list,
        'similar_books': list,
        'authors': list,
        'image_url': list,   # thumbnails, kept so book covers can be displayed
        'title': list,
    }
)


reviews_grouped.head()

Unnamed: 0,user_id,review_text,book_id,rating,similar_books,authors,image_url,title
0,0016a8010771c0c00c97f27dfc5cdd22,One stormy evening Judith McNaught stood at a ...,[129619],[5],"[['107770', '656836', '133499', '330721', '784...","[[{'author_id': '9885', 'role': ''}]]",[https://s.gr-assets.com/assets/nophoto/book/1...,"[A Kingdom of Dreams (Westmoreland, #1)]"
1,00214d8b0a020837cccf5f41eb563037,Even though I gave the book a pretty high rati...,[6582637],[4],"[['6753250', '10557809', '7044446', '188275', ...","[[{'author_id': '71688', 'role': ''}]]",[https://s.gr-assets.com/assets/nophoto/book/1...,"[Archangel's Kiss (Guild Hunter, #2)]"
2,00238d8a4c276c47f5d5e242f54a8f28,Woo the sequel to Hunting Lila! Loved the firs...,"[12410324, 13574417]","[4, 4]","[['13097681', '13181901', '12977172', '1348702...","[[{'author_id': '4573983', 'role': ''}], [{'au...",[https://images.gr-assets.com/books/1334998655...,"[Losing Lila (Lila, #2), Alienated (Alienated,..."
3,002a023d3de233b4bd3ec4fc3e9c581a,Read first time 11/7/15,[13508421],[4],"[['18135513', '15711420', '33283527', '1805398...","[[{'author_id': '835348', 'role': ''}]]",[https://images.gr-assets.com/books/1421837254...,"[Public Enemies (Immortal Game, #2)]"
4,00678bcab8da79ce5720200a3c2e4e7f,"""I think I realized that I would rather die be...",[13206760],[5],"[['9943270', '12812550', '10165761', '15723286...","[[{'author_id': '4684322', 'role': ''}]]",[https://images.gr-assets.com/books/1470056982...,"[Scarlet (The Lunar Chronicles, #2)]"


In [8]:
# Sanity check: after the groupby each user_id should appear exactly once,
# so the number of duplicated user_id rows is expected to be 0.

duplicates_subset = reviews_grouped.duplicated(subset='user_id')
duplicates_subset.sum()

np.int64(0)

In [9]:
# Number of whitespace-separated words in each user's combined review text
reviews_grouped['word_count'] = reviews_grouped['review_text'].map(
    lambda text: len(str(text).split())
)

# Keep only users whose combined review text has at least 15 words

reviews_grouped = reviews_grouped.loc[reviews_grouped['word_count'] >= 15].reset_index(drop=True)

reviews_grouped.head()

Unnamed: 0,user_id,review_text,book_id,rating,similar_books,authors,image_url,title,word_count
0,0016a8010771c0c00c97f27dfc5cdd22,One stormy evening Judith McNaught stood at a ...,[129619],[5],"[['107770', '656836', '133499', '330721', '784...","[[{'author_id': '9885', 'role': ''}]]",[https://s.gr-assets.com/assets/nophoto/book/1...,"[A Kingdom of Dreams (Westmoreland, #1)]",239
1,00214d8b0a020837cccf5f41eb563037,Even though I gave the book a pretty high rati...,[6582637],[4],"[['6753250', '10557809', '7044446', '188275', ...","[[{'author_id': '71688', 'role': ''}]]",[https://s.gr-assets.com/assets/nophoto/book/1...,"[Archangel's Kiss (Guild Hunter, #2)]",404
2,00238d8a4c276c47f5d5e242f54a8f28,Woo the sequel to Hunting Lila! Loved the firs...,"[12410324, 13574417]","[4, 4]","[['13097681', '13181901', '12977172', '1348702...","[[{'author_id': '4573983', 'role': ''}], [{'au...",[https://images.gr-assets.com/books/1334998655...,"[Losing Lila (Lila, #2), Alienated (Alienated,...",222
3,00678bcab8da79ce5720200a3c2e4e7f,"""I think I realized that I would rather die be...",[13206760],[5],"[['9943270', '12812550', '10165761', '15723286...","[[{'author_id': '4684322', 'role': ''}]]",[https://images.gr-assets.com/books/1470056982...,"[Scarlet (The Lunar Chronicles, #2)]",109
4,006f552534b15a7358a125f7505e0eea,Another excellent novel from Kanae Minato. Bui...,[19161835],[5],"[['7668119', '25367', '7516027', '3190020', '5...","[[{'author_id': '6426380', 'role': ''}, {'auth...",[https://images.gr-assets.com/books/1401076501...,[Confessions],54


In [10]:
# Text normaliser shared by the review and the description clean-up

def clean_text(text):
    """Return a lower-cased, single-spaced copy of ``text`` with markup removed.

    Steps, in order: strip ``<...>`` tags that open and close on the same line,
    turn line breaks into spaces, collapse every whitespace run to one space,
    replace backslash-escaped apostrophes with plain ones, then trim and
    lower-case the result.

    Parameters
    ----------
    text : str
        Raw review or description text.

    Returns
    -------
    str
        The cleaned text.
    """
    substitutions = (
        (r"<.*?>", ""),       # HTML tags
        (r"[\r\n]+", " "),    # line breaks
        (r"\s+", " "),        # runs of whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Escaped apostrophes (backslash + quote) become plain apostrophes
    return text.replace(r"\'", "'").strip().lower()

In [11]:
# Normalise every user's combined review text
reviews_grouped['cleaned_reviews'] = reviews_grouped['review_text'].map(clean_text)

In [12]:
# The raw text is no longer needed once cleaned_reviews exists
reviews = reviews_grouped.drop(columns='review_text')

In [13]:
# Persist the per-user processed reviews; index=False keeps the row index out of the file
reviews.to_csv('data/processed_reviews.csv', index=False)     # Processed review file

In [14]:
# Columns available in the book metadata
book_names_df.columns

Index(['Unnamed: 0', 'isbn', 'text_reviews_count', 'series', 'country_code',
       'language_code', 'popular_shelves', 'asin', 'is_ebook',
       'average_rating', 'kindle_asin', 'similar_books', 'description',
       'format', 'link', 'authors', 'publisher', 'num_pages',
       'publication_day', 'isbn13', 'publication_month', 'edition_information',
       'publication_year', 'url', 'image_url', 'book_id', 'ratings_count',
       'work_id', 'title', 'title_without_series'],
      dtype='object')

In [15]:
# Narrow the book metadata down to the columns used from here on

book_descriptions_df = book_names_df.loc[
    :,
    ['book_id', 'title', 'description', 'average_rating', 'similar_books', 'image_url', 'ratings_count'],
]
book_descriptions_df.head()

Unnamed: 0,book_id,title,description,average_rating,similar_books,image_url,ratings_count
0,38568,A Quick Bite (Argeneau #1),That hot guy tied to Lissianna Argeneau's bed?...,3.91,"['225669', '50789', '383143', '293616', '75960...",https://images.gr-assets.com/books/1410129015m...,32140
1,16107050,Tied (Fire Born #1),Normal people don't believe their nightmares s...,3.45,"['18074687', '18692520', '25679123', '25744874...",https://images.gr-assets.com/books/1361793669m...,79
2,19057,I Am the Messenger,protect the diamonds\nsurvive the clubs\ndig d...,4.09,"['1121894', '99375', '233906', '824499', '1006...",https://images.gr-assets.com/books/1398483261m...,94968
3,17333426,"Egghead: Or, You Can't Survive on Ideas Alone",A strange and charming collection of hilarious...,4.12,"['15824233', '13152144', '13529076', '18369456...",https://images.gr-assets.com/books/1400879372m...,6461
4,19398490,All the Light We Cannot See,Winner of the Pulitzer Prize\nFrom the highly ...,4.31,"['11807189', '11529868', '17332207', '13593526...",https://images.gr-assets.com/books/1440903901m...,53342


In [16]:
# Books without a description are removed; the index is renumbered from 0

book_descriptions_df = (
    book_descriptions_df[book_descriptions_df['description'].notna()]
    .reset_index(drop=True)
)

In [17]:
# Clean up description
# Swap the raw description for its cleaned version. This builds a new frame instead
# of mutating in place, and the guard makes a re-run of this cell a no-op rather than
# a KeyError on the already-removed 'description' column.

if 'description' in book_descriptions_df.columns:
    book_descriptions_df = (
        book_descriptions_df
        .assign(cleaned_description=book_descriptions_df['description'].apply(clean_text))
        .drop(columns='description')
    )
book_descriptions_df

Unnamed: 0,book_id,title,average_rating,similar_books,image_url,ratings_count,cleaned_description
0,38568,A Quick Bite (Argeneau #1),3.91,"['225669', '50789', '383143', '293616', '75960...",https://images.gr-assets.com/books/1410129015m...,32140,that hot guy tied to lissianna argeneau's bed?...
1,16107050,Tied (Fire Born #1),3.45,"['18074687', '18692520', '25679123', '25744874...",https://images.gr-assets.com/books/1361793669m...,79,normal people don't believe their nightmares s...
2,19057,I Am the Messenger,4.09,"['1121894', '99375', '233906', '824499', '1006...",https://images.gr-assets.com/books/1398483261m...,94968,protect the diamonds survive the clubs dig dee...
3,17333426,"Egghead: Or, You Can't Survive on Ideas Alone",4.12,"['15824233', '13152144', '13529076', '18369456...",https://images.gr-assets.com/books/1400879372m...,6461,a strange and charming collection of hilarious...
4,19398490,All the Light We Cannot See,4.31,"['11807189', '11529868', '17332207', '13593526...",https://images.gr-assets.com/books/1440903901m...,53342,winner of the pulitzer prize from the highly a...
...,...,...,...,...,...,...,...
6247,16097364,"Severed Heads, Broken Hearts",3.77,"['16065521', '14290962', '12157780', '15721669...",https://images.gr-assets.com/books/1374501795m...,1119,"robyn schneider's book, originally titled seve..."
6248,32766757,No Good Deed,3.85,[],https://images.gr-assets.com/books/1481308695m...,295,ellie hudson is the front-runner on the road t...
6249,17061,Coraline,4.03,"['2934112', '24774', '13538708', '34501', '371...",https://images.gr-assets.com/books/1493497435m...,325562,'an electrifyingly creepy tale likely to haunt...
6250,24397043,The Hollow Boy (Lockwood & Co. #3),4.36,"['24611884', '17464884', '20706799', '18405537...",https://images.gr-assets.com/books/1429031246m...,6088,as a massive outbreak of supernatural visitors...


In [18]:
# Persist the cleaned book descriptions; index=False keeps the row index out of the file
book_descriptions_df.to_csv('data/book_descriptions.csv', index=False)

### Description clean-up

In [19]:
# Work on independent copies from here on. A plain assignment would only alias the
# frames, so later column assignments on reviews_df would silently modify `reviews`
# (and book_descriptions_df) as well.
book_desc_df = book_descriptions_df.copy()
reviews_df = reviews.copy()

Investigate the shape of both dataframes

In [20]:
# Report the row and column counts of both working frames
print(
    "Book Descriptions Dataset:",
    f"Rows: {book_desc_df.shape[0]}, Columns: {book_desc_df.shape[1]}\n",
    sep="\n",
)

print(
    "Processed Reviews Dataset:",
    f"Rows: {reviews_df.shape[0]}, Columns: {reviews_df.shape[1]}",
    sep="\n",
)


Book Descriptions Dataset:
Rows: 6252, Columns: 7

Processed Reviews Dataset:
Rows: 4015, Columns: 9


Check for books that are in reviews but missing a description

In [21]:
# Flatten the per-user book_id lists into one Series of IDs.
# book_id already holds Python lists here (built by the groupby above). If this frame
# is ever reloaded from CSV the cells will be strings and must be parsed first,
# e.g. with ast.literal_eval.
all_review_book_ids = pd.Series(
    [book for user_books in reviews_df['book_id'] for book in user_books]
)

# IDs that occur in the reviews but have no row in the descriptions frame
missing_books = set(all_review_book_ids).difference(book_desc_df['book_id'])
print(f"❗ Missing book descriptions for {len(missing_books)} books")

❗ Missing book descriptions for 31 books


Remove book_ids that have no description, then drop users who are left with no books

In [22]:
# Keep only the book_ids that have a description, then drop users left with none.
def filter_valid_books(book_ids, valid_ids=None):
    """Return the entries of ``book_ids`` that have a book description.

    Parameters
    ----------
    book_ids : iterable
        Book IDs of one user.
    valid_ids : set, optional
        IDs considered valid. When omitted, the set is built from
        ``book_desc_df['book_id']`` on this call; pass a prebuilt set when
        filtering many rows so that it is not rebuilt every time.

    Returns
    -------
    list
        The valid IDs, in their original order.
    """
    if valid_ids is None:
        valid_ids = set(book_desc_df['book_id'])
    return [bid for bid in book_ids if bid in valid_ids]

# Build the lookup set once instead of once per book id.
valid_book_ids = set(book_desc_df['book_id'])

# assign() returns a new frame, so any other name bound to the same DataFrame
# is left unmodified.
# NOTE(review): only book_id is filtered; the parallel per-book lists (rating, title,
# image_url, ...) keep their entries for removed books — confirm nothing downstream
# pairs them with book_id by position.
reviews_df = reviews_df.assign(
    book_id=reviews_df['book_id'].apply(filter_valid_books, args=(valid_book_ids,))
)
reviews_df = reviews_df[reviews_df['book_id'].str.len() > 0].reset_index(drop=True)

Confirm that the rows were dropped

In [23]:
# Report the row and column counts again to confirm the filter took effect
print(
    "Book Descriptions Dataset:",
    f"Rows: {book_desc_df.shape[0]}, Columns: {book_desc_df.shape[1]}\n",
    sep="\n",
)

print(
    "Processed Reviews Dataset:",
    f"Rows: {reviews_df.shape[0]}, Columns: {reviews_df.shape[1]}",
    sep="\n",
)

Book Descriptions Dataset:
Rows: 6252, Columns: 7

Processed Reviews Dataset:
Rows: 3996, Columns: 9


Unwrap & Flatten similar_books and authors to convert nested string lists into clean Python lists

In [24]:
def unwrap_outer_to_list(cell):
    """
    Parse a string holding a Python literal; pass anything else through.

    cell: a string such as "['item1', 'item2']", or a value that is
          already parsed (list, dict, ...)
    returns: the literal parsed with ast.literal_eval when cell is a string
             (whatever type that literal has), [] when the string cannot be
             parsed, and cell itself when it is not a string
    """
    if not isinstance(cell, str):
        return cell  # already parsed, nothing to do
    try:
        parsed = ast.literal_eval(cell)
    except Exception:
        # Best effort: an unparsable string counts as "no items"
        return []
    return parsed

def flatten_similar(cell):
    """
    Merge the per-book similar-book lists of one user into a single flat list.

    cell: a list (or its string form) whose items are lists of IDs or the
          string form of such lists
    returns: all IDs in order of appearance; items that do not parse to a
             list are skipped
    """
    parsed_groups = (unwrap_outer_to_list(entry) for entry in unwrap_outer_to_list(cell))
    return [
        book
        for group in parsed_groups
        if isinstance(group, list)
        for book in group
    ]

def flatten_authors(cell):
    """
    Collect the author_id values of all books of one user into a flat list.

    cell: a list (or its string form) whose items are author dicts, lists of
          author dicts, or the string form of either
    returns: the 'author_id' of every dict that has that key, in order of
             appearance; anything else is skipped
    """
    collected = []
    for entry in unwrap_outer_to_list(cell):
        parsed = unwrap_outer_to_list(entry)
        # A lone dict is handled like a one-element list of dicts
        records = [parsed] if isinstance(parsed, dict) else parsed
        if not isinstance(records, list):
            continue
        for record in records:
            if isinstance(record, dict) and 'author_id' in record:
                collected.append(record['author_id'])
    return collected

In [25]:
# Replace the nested per-book string lists with flat lists of IDs.
# Caution: run this cell only once per session. The flatten_* helpers expect the
# nested form; given the already-flattened lists they return empty lists.
reviews_df['similar_books'] = reviews_df['similar_books'].map(flatten_similar)
reviews_df['authors']       = reviews_df['authors'].map(flatten_authors)

In [26]:
# Spot-check the first rows of both flattened columns
print("Flattened 'similar_books':", reviews_df['similar_books'].head(), sep="\n")

print("Flattened 'authors':", reviews_df['authors'].head(), sep="\n")

Flattened 'similar_books':
0    [107770, 656836, 133499, 330721, 784045, 36025...
1    [6753250, 10557809, 7044446, 188275, 5604848, ...
2    [13097681, 13181901, 12977172, 13487029, 10552...
3    [9943270, 12812550, 10165761, 15723286, 135819...
4    [7668119, 25367, 7516027, 3190020, 577399, 177...
Name: similar_books, dtype: object
Flattened 'authors':
0                [9885]
1               [71688]
2    [4573983, 5804715]
3             [4684322]
4     [6426380, 144369]
Name: authors, dtype: object


In [27]:
# Preview the first rows of both working frames.
# print() emits the heading; display() shows the frame itself.
print("Book Descriptions Preview:")
display(book_desc_df.head())

print("Processed Reviews Preview:")
display(reviews_df.head())

Book Descriptions Preview:


Unnamed: 0,book_id,title,average_rating,similar_books,image_url,ratings_count,cleaned_description
0,38568,A Quick Bite (Argeneau #1),3.91,"['225669', '50789', '383143', '293616', '75960...",https://images.gr-assets.com/books/1410129015m...,32140,that hot guy tied to lissianna argeneau's bed?...
1,16107050,Tied (Fire Born #1),3.45,"['18074687', '18692520', '25679123', '25744874...",https://images.gr-assets.com/books/1361793669m...,79,normal people don't believe their nightmares s...
2,19057,I Am the Messenger,4.09,"['1121894', '99375', '233906', '824499', '1006...",https://images.gr-assets.com/books/1398483261m...,94968,protect the diamonds survive the clubs dig dee...
3,17333426,"Egghead: Or, You Can't Survive on Ideas Alone",4.12,"['15824233', '13152144', '13529076', '18369456...",https://images.gr-assets.com/books/1400879372m...,6461,a strange and charming collection of hilarious...
4,19398490,All the Light We Cannot See,4.31,"['11807189', '11529868', '17332207', '13593526...",https://images.gr-assets.com/books/1440903901m...,53342,winner of the pulitzer prize from the highly a...


Processed Reviews Preview:


Unnamed: 0,user_id,book_id,rating,similar_books,authors,image_url,title,word_count,cleaned_reviews
0,0016a8010771c0c00c97f27dfc5cdd22,[129619],[5],"[107770, 656836, 133499, 330721, 784045, 36025...",[9885],[https://s.gr-assets.com/assets/nophoto/book/1...,"[A Kingdom of Dreams (Westmoreland, #1)]",239,one stormy evening judith mcnaught stood at a ...
1,00214d8b0a020837cccf5f41eb563037,[6582637],[4],"[6753250, 10557809, 7044446, 188275, 5604848, ...",[71688],[https://s.gr-assets.com/assets/nophoto/book/1...,"[Archangel's Kiss (Guild Hunter, #2)]",404,even though i gave the book a pretty high rati...
2,00238d8a4c276c47f5d5e242f54a8f28,"[12410324, 13574417]","[4, 4]","[13097681, 13181901, 12977172, 13487029, 10552...","[4573983, 5804715]",[https://images.gr-assets.com/books/1334998655...,"[Losing Lila (Lila, #2), Alienated (Alienated,...",222,woo the sequel to hunting lila! loved the firs...
3,00678bcab8da79ce5720200a3c2e4e7f,[13206760],[5],"[9943270, 12812550, 10165761, 15723286, 135819...",[4684322],[https://images.gr-assets.com/books/1470056982...,"[Scarlet (The Lunar Chronicles, #2)]",109,"""i think i realized that i would rather die be..."
4,006f552534b15a7358a125f7505e0eea,[19161835],[5],"[7668119, 25367, 7516027, 3190020, 577399, 177...","[6426380, 144369]",[https://images.gr-assets.com/books/1401076501...,[Confessions],54,another excellent novel from kanae minato. bui...


## Creating embeddings

In [28]:
# Text columns that are embedded below.
# NOTE(review): `reviews` was bound to the processed-reviews DataFrame earlier in this
# notebook; from this line on it names the column of cleaned review strings instead.
descriptions = book_desc_df['cleaned_description']
reviews = reviews_df['cleaned_reviews']

In [29]:
# Embed the book descriptions and the user reviews with the same model.
# encode() is documented for a string or a list of strings, so the inputs are passed
# as plain lists. A pandas Series only works there while its index happens to be
# exactly 0..n-1.
model = SentenceTransformer('all-MiniLM-L6-v2')
description_embedding = model.encode(list(descriptions))
reviews_embedding = model.encode(list(reviews))

In [30]:
# Persist both sets of embeddings with joblib.
# NOTE(review): data/processed_reviews.csv was written before users without any
# described book were dropped (4015 vs 3996 rows in the printed shapes), so its rows
# may not line up positionally with reviews_embedding — confirm how consumers align them.
joblib.dump(description_embedding, 'data/description_embedding.pkl')
joblib.dump(reviews_embedding, 'data/reviews_embedding.pkl')

['data/reviews_embedding.pkl']