In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# For Pre-Processing
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

# For GNN

import torch
import torch.nn as nn
import torch.nn.functional as F

import dgl
import dgl.function as fn
from dgl.nn import GraphConv

import networkx as nx
from networkx.algorithms import bipartite


In [2]:
# Load the three raw tables from the local Data/ directory.
ratings = pd.read_csv("Data/Ratings.csv")
# The saved output of this cell shows pandas echoing the Books.csv read in a
# warning -- presumably a DtypeWarning caused by chunked type inference
# (TODO confirm which column). low_memory=False makes pandas infer each
# column's dtype from the whole file at once, so columns get one consistent type.
books = pd.read_csv("Data/Books.csv", low_memory=False)
users = pd.read_csv("Data/Users.csv")

  books = pd.read_csv("Data/Books.csv")


In [3]:
# Show the (rows, columns) size of each raw table: books, ratings, users.
for frame in (books, ratings, users):
    display(frame.shape)

(271360, 8)

(1149780, 3)

(278858, 3)

### Pre-Processing User Data

In [4]:
def extract_country(location):
    """Return the country part of a comma-separated location string.

    The country is taken to be the text after the last comma, with
    surrounding whitespace removed, e.g.
    ``"stockton, california, usa"`` -> ``"usa"``.

    Parameters
    ----------
    location : str or missing value
        Free-text location. Non-string values (``None``, ``NaN``) are
        treated as missing instead of raising ``AttributeError``.

    Returns
    -------
    str or None
        The stripped last segment of ``location``. ``None`` when
        ``location`` is not a string. A string whose last segment is blank
        (e.g. ``"city, state, "``) yields ``""``, as before.
    """
    # Missing locations arrive as float NaN / None, which have no .split().
    if not isinstance(location, str):
        return None
    # Only the final segment matters, so split once from the right.
    return location.rsplit(',', 1)[-1].strip()

# Derive a Country column from the free-text Location column.
country_per_user = users['Location'].apply(extract_country)
users['Country'] = country_per_user

In [5]:
# Missing Age values can either be imputed or dropped. The imputation
# alternative is kept here for reference but is not active:
#     knn_imputer = KNNImputer(n_neighbors=5)
#     users['Age'] = knn_imputer.fit_transform(users[['Age']])
#
# Active choice: discard every row that has a missing value in any column.
users = users.dropna(axis=0, how='any')

In [6]:
# Size of the user table after dropping incomplete rows, then a 5-row preview.
user_table_shape = users.shape
display(user_table_shape)
users.head(n=5)

(168096, 4)

Unnamed: 0,User-ID,Location,Age,Country
1,2,"stockton, california, usa",18.0,usa
3,4,"porto, v.n.gaia, portugal",17.0,portugal
5,6,"santa monica, california, usa",61.0,usa
9,10,"albacete, wisconsin, spain",26.0,spain
10,11,"melbourne, victoria, australia",14.0,australia


### Pre-Processing Ratings and Books Data

In [7]:
# Attach book metadata to each rating. The inner join keeps only ratings
# whose ISBN also appears in the books table.
books_and_ratings = pd.merge(ratings, books, how='inner', on='ISBN')
print(f"Books and Ratings shape after merge: {books_and_ratings.shape}")

Books and Ratings shape after merge: (1031136, 10)


In [8]:
# Remove the three cover-image URL columns and keep only rows with a
# rating above zero (presumably 0 marks "no explicit rating" -- TODO confirm).
image_url_columns = ["Image-URL-S", "Image-URL-M", "Image-URL-L"]
has_positive_rating = books_and_ratings['Book-Rating'].gt(0)
books_and_ratings = (
    books_and_ratings
    .loc[has_positive_rating]
    .drop(columns=image_url_columns)
)
print(f"Books and Ratings shape after filtering: {books_and_ratings.shape}")

Books and Ratings shape after filtering: (383842, 7)


In [9]:
# Summary counts for the filtered ratings table.
unique_isbn_count = books_and_ratings['ISBN'].nunique()
rating_count = len(books_and_ratings)
unique_title_count = books_and_ratings['Book-Title'].nunique()

print(f"Number of unique books with ratings: {unique_isbn_count}")
print(f"Total number of ratings: {rating_count}")
print(f"Total number of unique titles with ratings: {unique_title_count}")

Number of unique books with ratings: 149836
Total number of ratings: 383842
Total number of unique titles with ratings: 135567


In [10]:
# Preview the first five rows of the filtered ratings-with-metadata table.
books_and_ratings.head(n=5)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle
3,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press
6,276744,038550120X,7,A Painted House,JOHN GRISHAM,2001,Doubleday
13,276747,0060517794,9,Little Altars Everywhere,Rebecca Wells,2003,HarperTorch


In [11]:
# Mean rating per ISBN, rounded to one decimal place, as a two-column
# frame (ISBN, Average-Rating).
avg_rating = (
    books_and_ratings
    .groupby('ISBN')['Book-Rating']
    .mean()
    .round(1)
    .rename('Average-Rating')
    .reset_index()
)
avg_rating.head(n=5)

Unnamed: 0,ISBN,Average-Rating
0,0000913154,8.0
1,0001046438,9.0
2,000104687X,6.0
3,0001047213,9.0
4,0001047973,9.0


In [12]:
# Add each book's average rating to every individual rating row (joined on ISBN).
avg_rating_df = books_and_ratings.merge(avg_rating, how='inner', on='ISBN')
display(avg_rating_df.head(n=5))
print(avg_rating_df.shape)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Average-Rating
0,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,5.0
1,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,3.0
2,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,6.0
3,276744,038550120X,7,A Painted House,JOHN GRISHAM,2001,Doubleday,7.6
4,276747,0060517794,9,Little Altars Everywhere,Rebecca Wells,2003,HarperTorch,8.0


(383842, 8)
