# Setup

In [1]:
import pandas as pd

In [2]:
# Load the filtered book metadata and the filtered user ratings.
# ISBN is forced to text in both frames: ISBNs can begin with '0', so letting
# pandas infer a numeric dtype would strip the leading zeros and could leave the
# two frames with ISBN keys of different types when they are matched later on.
books_df = pd.read_csv(
    "./filtered_datasets/books_merged_filtered.csv",
    dtype={"ISBN": str},
)
ratings_df = pd.read_csv(
    "./filtered_datasets/ratings_1to10_over3.csv",
    dtype={"ISBN": str},
    low_memory=False,
)

## Data exploration

We can see we have a dataset with users' reviews on books

In [3]:
# Show the ratings table as loaded: User-ID, ISBN and Book-Rating columns.
ratings_df

Unnamed: 0,User-ID,ISBN,Book-Rating
0,277427,0375751513,9
1,277427,0440236738,9
2,277427,0441627404,10
3,277427,0446608890,10
4,277427,0679731148,9
...,...,...,...
7399,276463,1400031362,8
7400,276680,0312422156,10
7401,276680,0345436911,8
7402,276680,0375727132,8


In [4]:
# Peek at the first two book records to see which metadata columns are available.
books_df.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Publisher,Year-Of-Publication,categories,description,Image-URL-S,Image-URL-M,Image-URL-L
0,440234743,The Testament,John Grisham,Dell,1999,Fiction,"A suicidal billionaire, a burnt-out Washington...",http://images.amazon.com/images/P/0440234743.0...,http://images.amazon.com/images/P/0440234743.0...,http://images.amazon.com/images/P/0440234743.0...
1,553582909,Icebound,Dean R. Koontz,Bantam Books,2000,Fiction,A secret Arctic experiment turns into a frozen...,http://images.amazon.com/images/P/0553582909.0...,http://images.amazon.com/images/P/0553582909.0...,http://images.amazon.com/images/P/0553582909.0...


---

## Fixing the format

__The problem__: ALS requires integer identifiers, but an ISBN is not an integer.

__Solution__: Create integer book IDs to correlate the books with their ratings

In [5]:
# Give every book a sequential integer identifier: 0 .. len(books_df) - 1.
books_df['Book-Id'] = range(len(books_df))

# Build the column order with 'Book-Id' moved to the position held by 'ISBN',
# which places it directly in front of the ISBN column.
cols = books_df.columns.tolist()
isbn_position = cols.index('ISBN')
cols.remove('Book-Id')
cols.insert(isbn_position, 'Book-Id')
books_with_id_df = books_df[cols]

# Preview the first row to check the new column order.
books_with_id_df.head(1)

Unnamed: 0,Book-Id,ISBN,Book-Title,Book-Author,Publisher,Year-Of-Publication,categories,description,Image-URL-S,Image-URL-M,Image-URL-L
0,0,440234743,The Testament,John Grisham,Dell,1999,Fiction,"A suicidal billionaire, a burnt-out Washington...",http://images.amazon.com/images/P/0440234743.0...,http://images.amazon.com/images/P/0440234743.0...,http://images.amazon.com/images/P/0440234743.0...


Now we map each ISBN in the ratings to its book ID

In [6]:
# Build an ISBN -> Book-Id lookup from the books table.
# NOTE: if an ISBN occurred more than once in books_with_id_df, the last Book-Id would win.
isbn_to_id_map = dict(zip(books_with_id_df['ISBN'], books_with_id_df['Book-Id']))

# Translate every rating's ISBN into its Book-Id.
mapped_ids = ratings_df['ISBN'].map(isbn_to_id_map)

# Series.map yields NaN for ISBNs that are missing from the lookup, which would
# silently turn the ids into floats. Fail loudly instead, because the ids are
# meant to be integers for ALS.
unmapped_count = int(mapped_ids.isna().sum())
if unmapped_count:
    raise ValueError(
        f"{unmapped_count} rating rows have an ISBN with no matching Book-Id"
    )

# Add the 'Book-Id' column to ratings_df with an explicit integer dtype.
ratings_df['Book-Id'] = mapped_ids.astype('int64')

# Show the updated ratings (rich display instead of print).
ratings_df

      User-ID        ISBN  Book-Rating  Book-Id
0      277427  0375751513            9        9
1      277427  0440236738            9      278
2      277427  0441627404           10      202
3      277427  0446608890           10      159
4      277427  0679731148            9       63
...       ...         ...          ...      ...
7399   276463  1400031362            8      246
7400   276680  0312422156           10       35
7401   276680  0345436911            8       60
7402   276680  0375727132            8       81
7403   276680  0425185508            6      298

[7404 rows x 4 columns]


Finally, drop the ISBN column in favour of the book ID, and we have the correctly formatted ratings

In [7]:
# Remove the ISBN column now that Book-Id identifies each book.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op instead of a KeyError.
ratings_df = ratings_df.drop(columns=['ISBN'], errors='ignore')

In [8]:
# Display the ratings again to confirm ISBN is gone and Book-Id is present.
ratings_df

Unnamed: 0,User-ID,Book-Rating,Book-Id
0,277427,9,9
1,277427,9,278
2,277427,10,202
3,277427,10,159
4,277427,9,63
...,...,...,...
7399,276463,8,246
7400,276680,10,35
7401,276680,8,60
7402,276680,8,81


In [9]:
# Persist the final ratings table as CSV, leaving out the DataFrame index.
ratings_df.to_csv(
    "./filtered_datasets/Final/final_ratings.csv",
    index=False,
)

In [10]:
# Persist the books table (now carrying Book-Id) as CSV, leaving out the DataFrame index.
books_with_id_df.to_csv(
    "filtered_datasets/Final/final_books.csv",
    index=False,
)