In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

In [6]:
# Read the two input tables: per-book metadata and the binary genre matrix
book_data, genre_mat = (
    pd.read_csv(csv_path)
    for csv_path in ('data/book_data.csv', 'data/bin_genre_mat.csv')
)

In [7]:
# Preview the first rows of the raw book metadata
book_data.head()

Unnamed: 0,BookID,Title,Author,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,GenreLink,Series,PublishYear,isbn,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,5,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling, Mary GrandPré",2028235,706012,215277,29265,14292,58998,435,Mass Market Paperback,English,2402163,Harry Potter,1999,043965548X,Scholastic Paperbacks,http://images.amazon.com/images/P/043965548X.0...,http://images.amazon.com/images/P/043965548X.0...,http://images.amazon.com/images/P/043965548X.0...
1,2,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré",1685420,668805,232468,41915,16344,45160,870,Paperback,English,2809203,Harry Potter,2003,0439358078,Scholastic Paperbacks,http://images.amazon.com/images/P/0439358078.0...,http://images.amazon.com/images/P/0439358078.0...,http://images.amazon.com/images/P/0439358078.0...
2,13,The Ultimate Hitchhiker's Guide to the Galaxy,Douglas Adams,168732,81509,32463,8707,4384,5443,815,Paperback,English,135328,The Hitchhiker's Guide to the Galaxy,1996,0345453743,Del Rey,http://images.amazon.com/images/P/0345453743.0...,http://images.amazon.com/images/P/0345453743.0...,http://images.amazon.com/images/P/0345453743.0...
3,4,Harry Potter and the Chamber of Secrets,J.K. Rowling,1714974,791685,317647,55360,15765,56282,352,Hardcover,English,6231171,Harry Potter,1998,0439554896,Arthur A. Levine Books,http://images.amazon.com/images/P/0439554896.0...,http://images.amazon.com/images/P/0439554896.0...,http://images.amazon.com/images/P/0439554896.0...
4,18,The Ultimate Hitchhiker's Guide,Douglas Adams,168734,81511,32464,8707,4384,5443,815,Hardcover,English,135328,The Hitchhiker's Guide to the Galaxy,1996,0517149257,Random House Value Publishing,http://images.amazon.com/images/P/0517149257.0...,http://images.amazon.com/images/P/0517149257.0...,http://images.amazon.com/images/P/0517149257.0...


In [8]:
# Collapse 'Series' into a 0/1 flag: missing values (and literal zeros) become 0,
# every other value becomes 1. Re-running the cell leaves the 0/1 column unchanged.
book_data['Series'] = book_data['Series'].fillna(0).ne(0).astype(int)

In [9]:
# Check that 'Series' now holds the 0/1 flag written by the previous cell
book_data.head(5)

Unnamed: 0,BookID,Title,Author,5 stars,4 stars,3 stars,2 stars,1 star,Number of reviewers,Pages,Book format,Language,GenreLink,Series,PublishYear,isbn,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,5,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling, Mary GrandPré",2028235,706012,215277,29265,14292,58998,435,Mass Market Paperback,English,2402163,1,1999,043965548X,Scholastic Paperbacks,http://images.amazon.com/images/P/043965548X.0...,http://images.amazon.com/images/P/043965548X.0...,http://images.amazon.com/images/P/043965548X.0...
1,2,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré",1685420,668805,232468,41915,16344,45160,870,Paperback,English,2809203,1,2003,0439358078,Scholastic Paperbacks,http://images.amazon.com/images/P/0439358078.0...,http://images.amazon.com/images/P/0439358078.0...,http://images.amazon.com/images/P/0439358078.0...
2,13,The Ultimate Hitchhiker's Guide to the Galaxy,Douglas Adams,168732,81509,32463,8707,4384,5443,815,Paperback,English,135328,1,1996,0345453743,Del Rey,http://images.amazon.com/images/P/0345453743.0...,http://images.amazon.com/images/P/0345453743.0...,http://images.amazon.com/images/P/0345453743.0...
3,4,Harry Potter and the Chamber of Secrets,J.K. Rowling,1714974,791685,317647,55360,15765,56282,352,Hardcover,English,6231171,1,1998,0439554896,Arthur A. Levine Books,http://images.amazon.com/images/P/0439554896.0...,http://images.amazon.com/images/P/0439554896.0...,http://images.amazon.com/images/P/0439554896.0...
4,18,The Ultimate Hitchhiker's Guide,Douglas Adams,168734,81511,32464,8707,4384,5443,815,Hardcover,English,135328,1,1996,0517149257,Random House Value Publishing,http://images.amazon.com/images/P/0517149257.0...,http://images.amazon.com/images/P/0517149257.0...,http://images.amazon.com/images/P/0517149257.0...


In [10]:
# Weighted mean star rating per book: sum(stars * votes) / sum(votes)
rating_columns = ['5 stars', '4 stars', '3 stars', '2 stars', '1 star']
star_weights = [5, 4, 3, 2, 1]  # star value paired with each entry of rating_columns

weighted_total = sum(
    weight * book_data[column]
    for weight, column in zip(star_weights, rating_columns)
)
vote_count = book_data[rating_columns].sum(axis=1)

book_data['average_rating'] = weighted_total / vote_count
book_data['average_rating']

0       4.571836
1       4.499098
2       4.357352
3       4.428023
4       4.357350
          ...   
3415    3.616282
3416    4.135051
3417    3.929368
3418    4.077090
3419    3.966531
Name: average_rating, Length: 3420, dtype: float64

In [11]:
# Age of each book in years, measured against a fixed reference year
reference_year = 2024
book_data['num_years'] = reference_year - book_data['PublishYear']

In [12]:
# Keep only the join key, the regression target and the raw feature columns
book_data_reduced = book_data.loc[
    :, ['BookID', 'average_rating', 'Pages', 'Book format', 'Series', 'num_years']
]
book_data_reduced.head()

Unnamed: 0,BookID,average_rating,Pages,Book format,Series,num_years
0,5,4.571836,435,Mass Market Paperback,1,25
1,2,4.499098,870,Paperback,1,21
2,13,4.357352,815,Paperback,1,28
3,4,4.428023,352,Hardcover,1,26
4,18,4.35735,815,Hardcover,1,28


In [13]:
# Join genre indicators with the reduced book table on the book identifier.
# The default inner join keeps only books present in both tables; both key
# columns ('bookId' and 'BookID') survive the merge.
merged_data = genre_mat.merge(
    book_data_reduced,
    left_on='bookId',
    right_on='BookID',
)
merged_data.head()

Unnamed: 0,bookId,adult,adventure,art,biography,bussiness,children,classics,comedy,contemporary,...,short-stories,sociology,tragedy,young-adult,BookID,average_rating,Pages,Book format,Series,num_years
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,4.499098,870,Paperback,1,21
1,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,4,4.428023,352,Hardcover,1,26
2,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,4.571836,435,Mass Market Paperback,1,25
3,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,13,4.357352,815,Paperback,1,28
4,18,1,1,0,0,0,0,1,1,1,...,0,0,0,1,18,4.35735,815,Hardcover,1,28


In [14]:
# One-hot encode the categorical 'Book format' column; get_dummies replaces it
# with one indicator column per distinct format value.
merged_data = pd.get_dummies(merged_data, columns=['Book format'])

In [15]:
# The scaler is only instantiated here; it is fitted further down on the
# training split rather than on the full table.
scaler = StandardScaler()

# Split the merged table into a feature matrix and a target vector.
# Both identifier columns and the target itself are excluded from the features.
non_feature_columns = ['bookId', 'BookID', 'average_rating']
X = merged_data.drop(columns=non_feature_columns).to_numpy()
y = merged_data['average_rating'].to_numpy()

In [16]:
# Hold out 20% of the rows for evaluation; random_state pins the split so it is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Learn per-feature mean and standard deviation from the training split only,
# then standardise that split with them.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[-0.96057198, -0.56722053, -0.23309925, ..., -0.01912147,
        -0.03313148, -0.01912147],
       [ 1.0410464 ,  1.76298273, -0.23309925, ..., -0.01912147,
        -0.03313148, -0.01912147],
       [-0.96057198, -0.56722053, -0.23309925, ..., -0.01912147,
        -0.03313148, -0.01912147],
       ...,
       [-0.96057198,  1.76298273, -0.23309925, ..., -0.01912147,
        -0.03313148, -0.01912147],
       [-0.96057198, -0.56722053, -0.23309925, ..., -0.01912147,
        -0.03313148, -0.01912147],
       [-0.96057198,  1.76298273, -0.23309925, ..., -0.01912147,
        -0.03313148, -0.01912147]])

In [18]:
# Standardise the held-out split with the statistics already fitted on the training split
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[-0.96057198, -0.56722053, -0.23309925, ..., -0.01912147,
        -0.03313148, -0.01912147],
       [ 1.0410464 , -0.56722053, -0.23309925, ..., -0.01912147,
        -0.03313148, -0.01912147],
       [-0.96057198, -0.56722053, -0.23309925, ..., -0.01912147,
        -0.03313148, -0.01912147],
       ...,
       [ 1.0410464 , -0.56722053, -0.23309925, ..., -0.01912147,
        -0.03313148, -0.01912147],
       [ 1.0410464 ,  1.76298273, -0.23309925, ..., -0.01912147,
        -0.03313148, -0.01912147],
       [-0.96057198, -0.56722053, -0.23309925, ..., -0.01912147,
        -0.03313148, -0.01912147]])

In [19]:
# Training split as float32 tensors; the targets are reshaped into a column
# vector so they line up with the model's (batch, 1) output.
X_torch_train = torch.tensor(X_train_scaled, dtype=torch.float32)
y_torch_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)

In [20]:
# Held-out split as float32 tensors; the targets are reshaped into a column
# vector so they line up with the model's (batch, 1) output.
X_torch_test = torch.tensor(X_test_scaled, dtype=torch.float32)
y_torch_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)

In [21]:
# Pair features with targets for each split
train_dataset = TensorDataset(X_torch_train, y_torch_train)
test_dataset = TensorDataset(X_torch_test, y_torch_test)

# Mini-batches of 32; only the training batches are reshuffled each epoch
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)

In [22]:
# Define the model
class ANN(nn.Module):
    """Feed-forward regressor with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of input features per sample.
        num_features: Width of the hidden layer (default 32).
    """

    def __init__(self, input_size, num_features=32):
        super().__init__()
        # Layers are created in the order fc1, relu, fc2 so parameter
        # initialisation and state_dict keys stay stable.
        self.fc1 = nn.Linear(in_features=input_size, out_features=num_features)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=num_features, out_features=1)

    def forward(self, x):
        """Return one unbounded prediction per input row (no output activation)."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [23]:
# # Instantiate the model
# model = ANN(X_train.shape[1])

# # Loss and optimizer
# criterion = nn.MSELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [24]:
def train_model(model, train_loader, test_loader, epochs, criterion, optimizer, num_features=32, verbose=False):
    """Train ``model`` and checkpoint the weights with the lowest validation loss.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free evaluation pass over ``test_loader``. Whenever the mean
    per-batch validation loss improves, the model's ``state_dict`` is written
    to ``book_rating_model_{num_features}.pth`` in the working directory.

    Args:
        model: Network to optimise (modified in place).
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        test_loader: Iterable of ``(inputs, targets)`` batches used for
            validation and checkpoint selection.
        epochs: Number of passes over ``train_loader``.
        criterion: Loss callable taking ``(outputs, targets)``.
        optimizer: Optimiser already bound to ``model``'s parameters.
        num_features: Only used to build the checkpoint file name.
        verbose: When True, print the losses after every epoch.

    Returns:
        dict with ``'best_err'`` (lowest mean validation loss seen, ``inf`` if
        no epoch ran) and ``'model_path'`` (checkpoint file name; the file only
        exists if at least one epoch improved on ``inf``).
    """
    model_path = f'book_rating_model_{num_features}.pth'
    best_err = float('inf')
    for epoch in range(epochs):
        # Training phase. train() must be re-applied every epoch because the
        # validation phase below switches the model to eval mode; calling it
        # only once before the loop would leave every epoch after the first
        # training in eval mode (wrong for Dropout/BatchNorm layers).
        model.train()
        train_err = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_err += loss.item()
        train_err /= len(train_loader)

        # Validation phase: no gradients, layers in inference behaviour
        model.eval()
        val_err = 0
        with torch.no_grad():
            for inputs, targets in test_loader:
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_err += loss.item()
        # Mean of the per-batch losses (a smaller final batch counts the same as a full one)
        val_err /= len(test_loader)

        # Save best model
        if val_err < best_err:
            best_err = val_err
            torch.save(model.state_dict(), model_path)
        if verbose:
            print(f"Epoch {epoch+1}, Train Loss: {train_err}, Validation Loss: {val_err}, Best Loss: {best_err}")
    print(f"Best error: {best_err}")
    return {'best_err': best_err, 'model_path': model_path}


In [21]:
# Training the model
# train_model(model, train_loader, test_loader, epochs=200)

In [22]:
# joblib.dump(scaler, 'scaler.save') 

In [25]:
# Hidden-layer widths to compare; one model is trained and checkpointed per width
num_features = [8,10,16,20,30,32,40,50]
results = []
for num_feat in num_features:
    # Reseed before every configuration so weight initialisation and batch
    # shuffling are repeatable across runs and every width sees the same
    # shuffle sequence.
    torch.manual_seed(42)
    model = ANN(X_train.shape[1], num_feat)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    result = train_model(model, train_loader, test_loader, criterion=criterion, optimizer=optimizer, epochs=200, num_features=num_feat)
    results.append(result)

Best error: 0.05283192993903702
Best error: 0.07689612528139894
Best error: 0.06831414387984709
Best error: 0.07055210948667744
Best error: 0.07296404259448702
Best error: 0.0739488232542168
Best error: 0.07382241653447802
Best error: 0.07328617860647765


In [26]:
# One dict per hidden-layer width: best validation loss and the checkpoint file it was saved to
results

[{'best_err': 0.05283192993903702, 'model_path': 'book_rating_model_8.pth'},
 {'best_err': 0.07689612528139894, 'model_path': 'book_rating_model_10.pth'},
 {'best_err': 0.06831414387984709, 'model_path': 'book_rating_model_16.pth'},
 {'best_err': 0.07055210948667744, 'model_path': 'book_rating_model_20.pth'},
 {'best_err': 0.07296404259448702, 'model_path': 'book_rating_model_30.pth'},
 {'best_err': 0.0739488232542168, 'model_path': 'book_rating_model_32.pth'},
 {'best_err': 0.07382241653447802, 'model_path': 'book_rating_model_40.pth'},
 {'best_err': 0.07328617860647765, 'model_path': 'book_rating_model_50.pth'}]

In [25]:
# Hidden-layer widths whose checkpoints are reloaded below (same list as used for training)
num_features = [8,10,16,20,30,32,40,50]

In [26]:
# Rebuild one network per hidden-layer width and restore its saved weights.
# NOTE(review): torch.load unpickles the file; only load checkpoints you produced yourself.
models = []
for i in num_features:
    checkpoint_path = f'book_rating_model_{i}.pth'
    model = ANN(X_train.shape[1], i)
    model.load_state_dict(torch.load(checkpoint_path))
    # eval() returns the module itself, so the restored model is stored in inference mode
    models.append(model.eval())
len(models)

8