# Book Recommendation System

This notebook implements a collaborative filtering recommendation system for books using deep learning. The system uses the Book-Crossing dataset to predict user ratings for books based on both user and book features.

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dot, Lambda
from sklearn.model_selection import train_test_split
import numpy as np
import numpy.ma as ma
from numpy import genfromtxt
from collections import defaultdict
pd.set_option("display.precision", 1)

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from numpy import genfromtxt
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
import csv
from tensorflow import keras

## Data Preprocessing and File Generation

This section handles the initial data processing steps:
1. Loads the original Book-Crossing dataset (books, users, and ratings)
2. Samples 10% of the ratings for training
3. Merges the data to create complete training examples
4. Processes and encodes categorical features
5. Generates necessary CSV files for training

In [11]:
import pandas as pd
import numpy as np
import csv
import pickle

def generate_csv_files():
    """Generate necessary CSV files from book, user, and ratings data using 10% of the ratings.
       Each row in the output files corresponds to one rating (with its associated book and user features),
       so all three files will have the same number of rows."""
    
    # Load original data with proper quoting for books
    books = pd.read_csv('data/BX_Books.csv', delimiter=';', encoding='latin-1', 
                        quoting=csv.QUOTE_NONNUMERIC)
    users = pd.read_csv('data/BX-Users.csv', delimiter=';', encoding='latin-1')
    ratings = pd.read_csv('data/BX-Book-Ratings.csv', delimiter=';', encoding='latin-1')
    
    # Sample 10% of the ratings (this is our primary unit of training)
    ratings_sampled = ratings.sample(frac=0.1, random_state=42)
    
    # Merge the sampled ratings with books and users so that each row gets its corresponding features.
    # This inner merge ensures that each row in the merged DataFrame is complete.
    merged = pd.merge(ratings_sampled, books, on='ISBN', how='inner')
    merged = pd.merge(merged, users, on='User-ID', how='inner')
    
    # At this point, every row in "merged" corresponds to one rating with its book and user features.
    total_rows = len(merged)
    print(f"Number of merged (training) rows: {total_rows}")
    
    # Extract the three dataframes from the merged DataFrame:
    #   - y_train: the ratings
    #   - item_train: the book features
    #   - user_train: the user features
    y_train_df = merged[['Book-Rating']].copy()
    item_train_df = merged[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']].copy()
    user_train_df = merged[['User-ID', 'Location', 'Age']].copy()
    
    # Process item features:
    item_train_df['Year-Of-Publication'] = pd.to_numeric(item_train_df['Year-Of-Publication'], errors='coerce')
    item_train_df['Book-Title'] = pd.factorize(item_train_df['Book-Title'])[0]
    item_train_df['Book-Author'] = pd.factorize(item_train_df['Book-Author'])[0]
    item_train_df['Publisher'] = pd.factorize(item_train_df['Publisher'])[0]
    item_train_df = item_train_df.fillna(0)
    
    # Process user features:
    user_train_df['Location'] = pd.factorize(user_train_df['Location'])[0]
    user_train_df = user_train_df.fillna(0)
    
    # Save CSV files with proper quoting to ensure consistency.
    item_train_df.to_csv('data/content_item_train.csv', index=False, header=False, quoting=csv.QUOTE_NONNUMERIC)
    user_train_df.to_csv('data/content_user_train.csv', index=False, header=False, quoting=csv.QUOTE_NONNUMERIC)
    y_train_df.to_csv('data/content_y_train.csv', index=False, header=False)
    
    # Save header files.
    with open('data/content_item_train_header.txt', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher'])
    
    with open('data/content_user_train_header.txt', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['User-ID', 'Location', 'Age'])
    
    # Create content_item_vecs.csv from numeric columns of item_train_df.
    item_vecs = item_train_df.select_dtypes(include=[np.number]).values
    np.savetxt('data/content_item_vecs.csv', item_vecs, delimiter=',')
    
    # Create a book list file (here renamed to "content_book_list.csv") for reference.
    # We use the original (non-factorized) book fields from the sampled books.
    book_list = books[books['ISBN'].isin(merged['ISBN'])][['ISBN', 'Book-Title', 'Book-Author']]
    book_list.to_csv('data/content_book_list.csv', index=False)
    
    # Create content_user_to_genre.pickle:
    # Since we may not have explicit genre info, we construct a simple user-to-author preference dictionary.
    user_to_author = {}
    for _, row in merged.iterrows():
        uid = row['User-ID']
        author = row['Book-Author']  # Use the original author string if desired (or factorized version)
        if uid not in user_to_author:
            user_to_author[uid] = {}
        user_to_author[uid][author] = user_to_author[uid].get(author, 0) + 1
        
    with open('data/content_user_to_genre.pickle', 'wb') as f:
        pickle.dump(user_to_author, f)
    
    # Read back row counts (from the in-memory dataframes) to confirm consistency:
    num_item_rows = len(item_train_df)
    num_user_rows = len(user_train_df)
    num_y_rows = len(y_train_df)
    print("Row counts in generated files:")
    print(f"content_item_train.csv: {num_item_rows}")
    print(f"content_user_train.csv: {num_user_rows}")
    print(f"content_y_train.csv: {num_y_rows}")

    if num_item_rows == num_user_rows == num_y_rows:
        print("Success: All generated files have the same number of rows.")
    else:
        print("Error: Mismatch in row counts!")

# Call the function to generate files.
generate_csv_files()

Number of merged (training) rows: 103130
Row counts in generated files:
content_item_train.csv: 103130
content_user_train.csv: 103130
content_y_train.csv: 103130
Success: All generated files have the same number of rows.


## Data Loading and Processing

This section includes functions to:
1. Load the preprocessed data files
2. Scale the features appropriately
3. Split the data into training and test sets
4. Display sample data for verification

In [3]:
import pickle
import numpy as np
from numpy import genfromtxt
from collections import defaultdict
import csv

def load_data():
    """Load all necessary data for the book recommendation system"""
    
    item_train = genfromtxt('./data/content_item_train.csv', delimiter=',', 
                           dtype=float, skip_header=0, filling_values=0)
    user_train = genfromtxt('./data/content_user_train.csv', delimiter=',', 
                           dtype=float, skip_header=0, filling_values=0)
    y_train    = genfromtxt('./data/content_y_train.csv', delimiter=',', 
                           dtype=float, skip_header=0, filling_values=0)
    
    with open('./data/content_item_train_header.txt', newline='') as f:
        item_features = list(csv.reader(f))[0]
    with open('./data/content_user_train_header.txt', newline='') as f:
        user_features = list(csv.reader(f))[0]
        
    item_vecs = genfromtxt('./data/content_item_vecs.csv', delimiter=',')
    
    book_dict = defaultdict(dict)
    count = 0
    with open('./data/content_book_list.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for line in reader:
            if count == 0:
                count += 1
            else:
                count += 1
                book_id = line[0]  # ISBN
                book_dict[book_id]["title"] = line[1]  # Book-Title
                book_dict[book_id]["author"] = line[2]  # Book-Author

    with open('./data/content_user_to_genre.pickle', 'rb') as f:
        user_to_genre = pickle.load(f)

    return(item_train, user_train, y_train, item_features, user_features, item_vecs, book_dict, user_to_genre)

In [4]:
item_train, user_train, y_train, item_features, user_features, item_vecs, book_dict, user_to_genre = load_data()


In [42]:
num_user_features = user_train.shape[1] - 1  # remove userid, rating count and ave rating during training
num_item_features = item_train.shape[1] - 1  # remove movie id at train time
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start
u_s = 1  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items
scaledata = True  # applies the standard scalar to data if true
print(f"Number of training vectors: {len(item_train)}")
print(f"Number of user vectors: {len(user_train)}")
print(f"Number of y vectors: {len(y_train)}")

Number of training vectors: 66003
Number of user vectors: 66003
Number of y vectors: 66003


In [21]:
def split_str(ifeatures, smax):
    ofeatures = []
    for s in ifeatures:
        if ' ' not in s:  # skip string that already have a space            
            if len(s) > smax:
                mid = int(len(s)/2)
                s = s[:mid] + " " + s[mid:]
        ofeatures.append(s)
    return(ofeatures)

In [7]:

import tabulate
def pprint_train(x_train, features,  vs, u_s, maxcount = 5, user=True):
    """ Prints user_train or item_train nicely """
    if user:
        flist = [".0f",".0f",".1f", 
                 ".1f", ".1f", ".1f", ".1f",".1f",".1f", ".1f",".1f",".1f", ".1f",".1f",".1f",".1f",".1f"]
    else:
        flist = [".0f",".0f",".1f", 
                 ".0f",".0f",".0f", ".0f",".0f",".0f", ".0f",".0f",".0f", ".0f",".0f",".0f",".0f",".0f"]

    head = features[:vs]
    if vs < u_s: print("error, vector start {vs} should be greater then user start {u_s}")
    for i in range(u_s):
        head[i] = "[" + head[i] + "]"
    genres = features[vs:]
    hdr = head + genres
    disp = [split_str(hdr, 5)]
    count = 0
    for i in range(0,x_train.shape[0]):
        if count == maxcount: break
        count += 1
        disp.append( [ 
                      x_train[i,0].astype(int),  
                      x_train[i,1].astype(int),   
                      x_train[i,2].astype(float), 
                      *x_train[i,3:].astype(float)
                    ])
    table = tabulate.tabulate(disp, tablefmt='html',headers="firstrow", floatfmt=flist, numalign='center')
    return(table)



In [22]:
pprint_train(user_train, user_features, uvs,  u_s, maxcount=5)

[Use r-ID],Loca tion,Age
0,0,1.5
1,0,-1.3
1,0,-1.3
-1,0,-0.3
-1,0,-1.3


In [23]:
pprint_train(item_train, item_features, ivs, i_s, maxcount=5, user=False)

[IS BN],Book- Title,Book- Author,Year-Of-P ublication,Publ isher
0,0,-0.8,0,0
0,0,-0.6,0,0
0,0,0.2,0,0
0,-1,-0.8,0,0
0,1,-0.7,0,0


In [24]:
print(f"y_train[:5]: {y_train[:5]}")

y_train[:5]: [0. 9. 0. 0. 0.]


In [25]:
# scale training data
if scaledata:
    item_train_save = item_train
    user_train_save = user_train

    scalerItem = StandardScaler()
    scalerItem.fit(item_train)
    item_train = scalerItem.transform(item_train)

    scalerUser = StandardScaler()
    scalerUser.fit(user_train)
    user_train = scalerUser.transform(user_train)

    print(np.allclose(item_train_save, scalerItem.inverse_transform(item_train)))
    print(np.allclose(user_train_save, scalerUser.inverse_transform(user_train)))

True
True


In [26]:
item_train, item_test = train_test_split(item_train, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, shuffle=True, random_state=1)
print(f"Book/item training data shape: {item_train.shape}")
print(f"Book/item test  data shape: {item_test.shape}")

Book/item training data shape: (66003, 5)
Book/item test  data shape: (16501, 5)


In [27]:
pprint_train(user_train, user_features, uvs, u_s, maxcount=5)

[Use r-ID],Loca tion,Age
0,0,0.3
1,0,-1.3
0,0,0.8
0,0,-1.3
1,0,0.1


In [28]:
scaler = MinMaxScaler((-1, 1))
scaler.fit(y_train.reshape(-1, 1))
ynorm_train = scaler.transform(y_train.reshape(-1, 1))
ynorm_test = scaler.transform(y_test.reshape(-1, 1))
print(ynorm_train.shape, ynorm_test.shape)

(66003, 1) (16501, 1)


In [29]:
print(f"user_train shape: {user_train[:, u_s:].shape}")
print(f"item_train shape: {item_train[:, i_s:].shape}")
print(f"ynorm_train shape: {ynorm_train.shape}")

user_train shape: (66003, 2)
item_train shape: (66003, 4)
ynorm_train shape: (66003, 1)


## Model Architecture

The recommendation system uses a neural network with:
1. User embedding layer
2. Item embedding layer
3. Dot product layer for rating prediction
4. Regularization to prevent overfitting

In [43]:
num_outputs = 32

tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear'),
])

item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear'),
])

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_NN(input_user)
vu = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vu)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_NN(input_item)
vm = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vm)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

In [36]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss=cost_fn)

In [37]:
# Check the shapes of your data
print(f"user_train shape: {user_train[:, u_s:].shape}")
print(f"item_train shape: {item_train[:, i_s:].shape}")
print(f"ynorm_train shape: {ynorm_train.shape}")

# Make sure all arrays have the same number of samples
min_samples = min(len(user_train), len(item_train), len(ynorm_train))
user_train_trimmed = user_train[:min_samples, u_s:]
item_train_trimmed = item_train[:min_samples, i_s:]
ynorm_train_trimmed = ynorm_train[:min_samples]



user_train shape: (66003, 2)
item_train shape: (66003, 4)
ynorm_train shape: (66003, 1)


## Model Training

This section includes:
1. Data preparation and scaling
2. Training/test split
3. Model training loop
4. Loss tracking and optimization

In [38]:
# Now fit the model with the trimmed data
tf.random.set_seed(1)
model.fit([user_train_trimmed, item_train_trimmed], ynorm_train_trimmed, epochs=30)

Epoch 1/30
[1m2063/2063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - loss: 0.5829
Epoch 2/30
[1m2063/2063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - loss: 0.5751
Epoch 3/30
[1m2063/2063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - loss: 0.5742
Epoch 4/30
[1m2063/2063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - loss: 0.5736
Epoch 5/30
[1m2063/2063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - loss: 0.5730
Epoch 6/30
[1m2063/2063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - loss: 0.5722
Epoch 7/30
[1m2063/2063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.5718
Epoch 8/30
[1m2063/2063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - loss: 0.5706
Epoch 9/30
[1m2063/2063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - loss: 0.5708
Epoch 10/30
[1m2063/2063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

<keras.src.callbacks.history.History at 0x1bf8616d250>

In [39]:
model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], ynorm_test)

[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.5677


0.5703388452529907

## Model Evaluation and Recommendations

This section includes:
1. Model evaluation metrics
2. Function to generate personalized book recommendations
3. Example recommendations for specific users

In [46]:
# Cell 1: Prepare the new user vector and replicate it for all books

import numpy as np
import pandas as pd

# For our book model the user features (as used for training) are:
# ['Location', 'Age'] (user ID is not used for the neural network input)
# Here, we define a new user with arbitrary values.
new_location = 2   # for example, the factorized value for location
new_age = 30       # the new user's age

# Create the new user vector (only the features required, shape: [1, 2])
new_user_vec = np.array([[new_location, new_age]])

# Load the precomputed item (book) vectors
# These vectors were generated from your content_item_train.csv and saved in content_item_vecs.csv.
item_vecs = np.loadtxt('data/content_item_vecs.csv', delimiter=',')
num_items = item_vecs.shape[0]

# Replicate the new user vector to match each item.
# This creates an array of shape (num_items, 2)
new_user_vecs = np.tile(new_user_vec, (num_items, 1))

print(f"New user vector replicated shape: {new_user_vecs.shape}")
print(f"Number of items: {num_items}")

New user vector replicated shape: (103130, 2)
Number of items: 103130


In [50]:
import numpy as np
import pandas as pd

# Predict ratings using the trained model.
predictions = model.predict([new_user_vecs, item_vecs]).flatten()

# Sort predictions in descending order.
sorted_indices = np.argsort(predictions)[::-1]

# Load the book list.
book_list = pd.read_csv('data/content_book_list.csv')

print("Number of item vectors:", item_vecs.shape[0])
print("Number of books in book_list:", len(book_list))

# Filter indices if necessary
valid_top_indices = [idx for idx in sorted_indices if idx < len(book_list)]
top_n = 10
top_indices = valid_top_indices[:top_n]
top_predictions = predictions[top_indices]

print("Top recommended books for the new user:")
for idx, pred in zip(top_indices, top_predictions):
    isbn = book_list.iloc[idx]['ISBN']
    title = book_list.iloc[idx]['Book-Title']
    author = book_list.iloc[idx]['Book-Author']
    print(f"ISBN: {isbn}, Title: {title}, Author: {author}, Predicted Rating: {pred:.3f}")

[1m3223/3223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step
Number of item vectors: 103130
Number of books in book_list: 60137
Top recommended books for the new user:
ISBN: 1569871205, Title: The Mutiny on the H. M. S. Bounty (Illustrated Classics), Author: William Bligh, Predicted Rating: 0.136
ISBN: 0451170857, Title: A Cat in Wolf's Clothing (Alice Nestleton Mysteries (Paperback)), Author: Lydia Adamson, Predicted Rating: 0.133
ISBN: 0425186105, Title: Gladiatrix: The True Story of History's Unknown Woman Warrior, Author: Amy Zoll, Predicted Rating: 0.133
ISBN: 3499264293, Title: Ein unverhofftes GestÃ?Â¤ndnis. Roman., Author: P. D. James, Predicted Rating: 0.132
ISBN: 0446607223, Title: Shadows on the Aegean, Author: Suzanne Frank, Predicted Rating: 0.132
ISBN: 0553272055, Title: Call Me Anna: The Autobiography of Patty Duke, Author: Patty Duke, Predicted Rating: 0.132
ISBN: 0312876637, Title: The Folk of the Fringe, Author: Orson Scott Card, Predicted Rating: 0.13