In [122]:
import pandas as pd
import numpy as np

In [123]:
# Load the ratings table first, then the book catalogue it refers to.
reviews_df, books_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./filtered_datasets/final_ratings_collab.csv",
        "./filtered_datasets/final_books_collab.csv",
    )
)

In [124]:
# Peek at the first ratings row to confirm the column layout.
reviews_df.head(1)

Unnamed: 0,User-ID,Book-Rating,Book-Id
0,277427,9,9


In [125]:
# Peek at the first catalogue row to confirm the column layout.
books_df.head(1)

Unnamed: 0,Book-Id,ISBN,Book-Title,Book-Author,Publisher,Year-Of-Publication,categories,description,Image-URL-S,Image-URL-M,Image-URL-L
0,0,440234743,The Testament,John Grisham,Dell,1999,Fiction,"A suicidal billionaire, a burnt-out Washington...",http://images.amazon.com/images/P/0440234743.0...,http://images.amazon.com/images/P/0440234743.0...,http://images.amazon.com/images/P/0440234743.0...


In [129]:
# Distinct user IDs, in order of first appearance in the ratings table.
unique_user_ids = pd.unique(reviews_df['User-ID'])
unique_user_ids

array([277427, 278026, 278418, ..., 276307, 276463, 276680], dtype=int64)

In [131]:
# Number of distinct users that already have ratings.
len(unique_user_ids)

1345

In [88]:
def add_user_reviews(user_id, num_books, books=None):
    """Generate synthetic ratings from a single user for randomly chosen books.

    Parameters
    ----------
    user_id : int
        Identifier written to the 'User-ID' column of every generated row.
    num_books : int
        Number of catalogue rows to sample (without replacement) and rate.
    books : pandas.DataFrame, optional
        Catalogue to sample from; it must contain a 'Book-Id' column.
        When omitted, the notebook-level ``books_df`` is used.

    Returns
    -------
    pandas.DataFrame
        ``num_books`` rows with the columns 'User-ID', 'Book-Rating' and
        'Book-Id'. Ratings are integers in the closed range [1, 10].

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when ``num_books`` cannot be
        drawn without replacement (e.g. it exceeds the catalogue size).
    """
    catalogue = books_df if books is None else books

    # Rows are sampled without replacement, so the Book-Ids are distinct as
    # long as 'Book-Id' is unique in the catalogue (assumed, not checked here).
    selected_books = catalogue.sample(n=num_books, replace=False)['Book-Id'].values

    # People tend to leave more positive reviews overall, so raw scores are
    # drawn from a normal distribution centred on 7.
    raw_scores = np.random.normal(loc=7, scale=2, size=num_books)

    # Round to the nearest integer *before* casting: a bare astype(int)
    # truncates toward zero and drags the average rating down by about 0.5.
    ratings = np.clip(np.rint(raw_scores), 1, 10).astype(int)

    return pd.DataFrame({
        'User-ID': [user_id] * num_books,
        'Book-Rating': ratings,
        'Book-Id': selected_books,
    })

In [96]:
def add_reviews_for_existing_users(ratings_df, user_count, num_books, max_user_id=300000):
    """Generate synthetic ratings for brand-new users.

    Despite the name, no existing user receives extra ratings: the function
    picks ``user_count`` IDs that do NOT occur in ``ratings_df`` and calls
    ``add_user_reviews`` once for each of them.

    Parameters
    ----------
    ratings_df : pandas.DataFrame
        Existing ratings; only its 'User-ID' column is read.
    user_count : int
        Number of new user IDs to create.
    num_books : int
        Number of ratings generated per new user.
    max_user_id : int, optional
        Largest ID that may be assigned. New IDs are drawn from the integers
        ``0..max_user_id`` (inclusive) that are not already in use.

    Returns
    -------
    pandas.DataFrame
        ``user_count * num_books`` rows with the columns 'User-ID',
        'Book-Rating' and 'Book-Id' and a fresh 0..n-1 index. An empty frame
        with those columns is returned when ``user_count`` is 0.

    Raises
    ------
    ValueError
        If fewer than ``user_count`` unused IDs are available.
    """
    result_columns = ['User-ID', 'Book-Rating', 'Book-Id']

    taken_ids = ratings_df['User-ID'].unique()
    # Sorted array of free IDs; unlike iterating a set, the order is
    # deterministic, so a seeded run always picks the same users.
    free_ids = np.setdiff1d(np.arange(max_user_id + 1), taken_ids)

    if user_count > len(free_ids):
        raise ValueError(
            f"Requested {user_count} new users but only {len(free_ids)} "
            f"unused IDs exist in the range 0..{max_user_id}."
        )

    random_user_ids = np.random.choice(free_ids, size=user_count, replace=False)

    # Build every per-user frame first and concatenate once. Growing a frame
    # with concat inside the loop is quadratic, and starting from an empty
    # object-dtype frame depends on deprecated concat dtype handling.
    per_user_frames = [add_user_reviews(user_id, num_books) for user_id in random_user_ids]

    if not per_user_frames:
        return pd.DataFrame(columns=result_columns)

    return pd.concat(per_user_frames, ignore_index=True)

In [114]:
# Seed NumPy's global RNG so re-running this cell regenerates the same
# synthetic users and ratings (both np.random.* calls and the unseeded
# DataFrame.sample used by the helpers draw from this global state).
SEED = 42
NUM_NEW_USERS = 1000
BOOKS_PER_USER = 10

np.random.seed(SEED)
new_reviews = add_reviews_for_existing_users(reviews_df, NUM_NEW_USERS, BOOKS_PER_USER)

In [115]:
# Display the generated ratings (pandas truncates the middle rows).
new_reviews

Unnamed: 0,User-ID,Book-Rating,Book-Id
0,83655,4,1118
1,83655,4,745
2,83655,8,130
3,83655,6,372
4,83655,3,1440
...,...,...,...
9995,289290,5,303
9996,289290,5,1307
9997,289290,6,1475
9998,289290,4,2202


In [116]:
# Expect (number of new users * ratings per user) rows and 3 columns.
new_reviews.shape

(10000, 3)

In [117]:
# Sanity check: generated user IDs must not collide with any existing user ID.
overlapping_user_ids = set(reviews_df['User-ID']) & set(new_reviews['User-ID'])

if not overlapping_user_ids:
    print("No overlapping User IDs found.")
else:
    print("Overlapping User IDs found:", overlapping_user_ids)

No overlapping User IDs found.


In [118]:
# Sanity check: every generated Book-Id must exist in the catalogue.
# (The printed messages say "all_new_reviews_df" / "book_df"; the frames
# actually compared are new_reviews and books_df.)
existing_books_set = set(books_df['Book-Id'])
new_books_set = set(new_reviews['Book-Id'])

missing_books = {book_id for book_id in new_books_set if book_id not in existing_books_set}

if not missing_books:
    print("All Book-Id values in all_new_reviews_df are present in book_df.")
else:
    print("Book-Id values present in all_new_reviews_df but not in book_df:")
    print(missing_books)

All Book-Id values in all_new_reviews_df are present in book_df.


In [119]:
# Stack the generated ratings under the original ones and renumber the index.
updated_reviews_df = pd.concat(
    objs=[reviews_df, new_reviews],
    ignore_index=True,
)

In [120]:
# Row count should equal len(reviews_df) + len(new_reviews).
updated_reviews_df.shape

(17458, 3)

In [121]:
# Write the combined ratings to disk; the index is left out of the file.
updated_reviews_df.to_csv(
    path_or_buf="./filtered_datasets/final_ratings_collab_generated.csv",
    index=False,
)