In [135]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

Step 1: Books

In [136]:
books = pd.read_csv("BX-Books.csv")
books.head

# Checking for any empty rows
empty_rows = books[books.isna().any(axis=1) | books.eq('').any(axis=1)]
print(empty_rows)

# Checking the Year of Publication first
YOF = books["Year-Of-Publication"]
values, counts = np.unique(YOF, return_counts=True)
for value, count in zip(values, counts):
    print("YOF:", value, ", Count:", count)

# Performing Scaling on the Numerical Data
mode_YOF = YOF.mode()[0]
books["Year-Of-Publication"] = books["Year-Of-Publication"].replace(0, mode_YOF)
books["Year-Of-Publication"] = books["Year-Of-Publication"].replace(2030, mode_YOF)
YOF = books["Year-Of-Publication"]
values, counts = np.unique(YOF, return_counts=True)
for value, count in zip(values, counts):
    print("Make:", value, ", Count:", count)

# Performing Data Manipulation on worded data to make them all lowercase
books["Book-Publisher"] = books["Book-Publisher"].str.lower()
books["Book-Author"] = books["Book-Author"].str.lower()
books["Book-Title"] = books["Book-Title"].str.lower()
print(books.iloc[0:50])

# Checking the ISBN and if they are formatted correctly
def isbn_check(isbn):
    if len(isbn) != 10:
        return False
    if not isbn[:-1].isdigit():
        return False
    if not (isbn[-1].isdigit() or isbn[-1] == "X"):
        return False
    return True

invalid_isbn = books[~books["ISBN"].apply(isbn_check)]
print(invalid_isbn)

# Performing Data Manipulation on the ISBNs to make them consistent
invalid_isbn_rows = books[books["ISBN"].str.contains('x')]
books.loc[invalid_isbn_rows.index, "ISBN"] = books.loc[invalid_isbn_rows.index, "ISBN"].str.upper()

invalid_isbn = books[~books["ISBN"].apply(isbn_check)]
print(invalid_isbn)

# View the head of the dataframe
books.head


Empty DataFrame
Columns: [ISBN, Book-Title, Book-Author, Year-Of-Publication, Book-Publisher]
Index: []
YOF: 0 , Count: 314
YOF: 1920 , Count: 2
YOF: 1927 , Count: 1
YOF: 1929 , Count: 1
YOF: 1930 , Count: 1
YOF: 1932 , Count: 1
YOF: 1936 , Count: 1
YOF: 1938 , Count: 1
YOF: 1942 , Count: 1
YOF: 1945 , Count: 1
YOF: 1946 , Count: 2
YOF: 1947 , Count: 1
YOF: 1949 , Count: 1
YOF: 1950 , Count: 2
YOF: 1951 , Count: 2
YOF: 1952 , Count: 3
YOF: 1953 , Count: 14
YOF: 1954 , Count: 6
YOF: 1955 , Count: 4
YOF: 1956 , Count: 5
YOF: 1957 , Count: 4
YOF: 1958 , Count: 5
YOF: 1959 , Count: 6
YOF: 1960 , Count: 6
YOF: 1961 , Count: 9
YOF: 1962 , Count: 6
YOF: 1963 , Count: 9
YOF: 1964 , Count: 9
YOF: 1965 , Count: 7
YOF: 1966 , Count: 10
YOF: 1967 , Count: 4
YOF: 1968 , Count: 5
YOF: 1969 , Count: 19
YOF: 1970 , Count: 25
YOF: 1971 , Count: 15
YOF: 1972 , Count: 31
YOF: 1973 , Count: 19
YOF: 1974 , Count: 24
YOF: 1975 , Count: 13
YOF: 1976 , Count: 46
YOF: 1977 , Count: 60
YOF: 1978 , Count: 55
YOF

<bound method NDFrame.head of              ISBN                                         Book-Title  \
0      0002005018                                       clara callan   
1      0374157065  flu: the story of the great influenza pandemic...   
2      0399135782                             the kitchen god's wife   
3      0440234743                                      the testament   
4      0452264464               beloved (plume contemporary fiction)   
...           ...                                                ...   
18180  0375411615                                         love, etc.   
18181  0836227751              the wit and whimsy of mary engelbreit   
18182  8433966634                            los detectives salvajes   
18183  0330353349                  the ice house (tv tie-in edition)   
18184  0394757645  trouble is my business (vintage crime/black li...   

                Book-Author  Year-Of-Publication             Book-Publisher  
0      richard bruce wright

Step 2: Users

In [137]:
import re
# Read the DB
users = pd.read_csv("BX-Users.csv")

""" Pre-processing the Age column """
# Manipulating the elements
users['User-Age'] = users['User-Age'].str.rstrip('"')
# Imputate the missing data using the mode age
ages = users['User-Age']
mode_age = ages.mode()[0]
ages = ages.fillna(mode_age).astype(int)

# Convert to object dtype
ages = ages.astype(object)

# Classify the ages into different age categories
for i in  range(len(ages)):
  if (ages[i] < 10):
    ages[i] = 'kids'
  elif ((ages[i] >= 10) & (ages[i] < 18)):
    ages[i] = 'adolescent'
  elif ((ages[i] >= 18) & (ages[i] < 40)):
    ages[i] = 'adult'
  elif ((ages[i] >= 40) & (ages[i] < 60)):
    ages[i] = 'middle-aged adult'
  else:
    ages[i] = 'old adult'
users['User-Age'] = ages
# Check to see if all rows have been classified
value_counts = users['User-Age'].value_counts()
unique_strings = value_counts.index
counts = value_counts.values
for string, count in zip(unique_strings, counts):
    print(f"Age Category: {string}, Count: {count}")


""" Pre-processing the Country column """
# Re-format elements in the column by casefolding 
users["User-Country"] = users["User-Country"].str.lower().str.strip()
# Getting keys from dictionaries of 
# existing countries from Github repository https://github.com/QuantumTech11/PythonDictionaries/blob/master/Countries%20Dictionary.py 
Countries = {
"Afghanistan" : "Kabul",
"Albania" : "Tirana",
"Algeria" : "Algiers",
"Andorra" : "Andorra la Vella",
"Angola" : "Luanda",
"Antigua and Barbuda" : "Saint John’s",
"Argentina" : "Buenos Aires",
"Armenia" : "Yerevan",
"Australia" : "Canberra",
"Austria" : "Vienna",
"Azerbaijan" : "Baku",
"The Bahamas" : "Nassau",
"Bahrain" : "Manama",
"Bangladesh" : "Dhaka",
"Barbados" : "Bridgetown",
"Belarus" : "Minsk",
"Belgium" : "Brussels",
"Belize" : "Belmopan",
"Benin" : "Porto-Novo",
"Bhutan" : "Thimphu",
"Bolivia" : "La Paz, Sucre",
"Bosnia and Herzegovina" : "Sarajevo",
"Botswana" : "Gaborone",
"Brazil" : "Brasilia",
"Brunei" : "Bandar Seri Begawan",
"Bulgaria" : "Sofia",
"Burkina Faso" : "Ouagadougou",
"Burundi" : "Bujumbura",
"Cambodia" : "Phnom Penh",
"Cameroon" : "Yaounde",
"Canada" : "Ottawa",
"Cape Verde" : "Praia",
"Central African Republic" : "Bangui",
"Chad" : "N’Djamena",
"Chile" : "Santiago",
"China" : "Beijing",
"Colombia" : "Bogota",
"Comoros" : "Moroni",
"Republic of the Congo": "Brazzaville",
"Democratic Republic of the Congo" : "Kinshasa",
"Costa Rica" : "San Jose",
"Cote d’Ivoire" : "Yamoussoukro",
"Croatia" : "Zagreb",
"Cuba" : "Havana",
"Cyprus" : "Nicosia",
"Czech Republic" : "Prague",
"Denmark" : "Copenhagen",
"Djibouti" : "Djibouti",
"Dominica" : "Roseau",
"Dominican Republic" : "Santo Domingo",
"East Timor" : "Dili",
"Ecuador" : "Quito",
"Egypt" : "Cairo",
"El Salvador" : "San Salvador",
"Equatorial Guinea" : "Malabo",
"Eritrea" : "Asmara",
"Estonia" : "Tallinn",
"Ethiopia" : "Addis Ababa",
"Fiji" : "Suva",
"Finland" : "Helsinki",
"France" : "Paris",
"Gabon" : "Libreville",
"The Gambia" : "Banjul",
"Georgia" : "Tbilisi",
"Germany" : "Berlin",
"Ghana" : "Accra",
"Greece" : "Athens",
"Grenada" : "Saint George’s",
"Guatemala" : "Guatemala City",
"Guinea" : "Conakry",
"Guinea-Bissau" : "Bissau",
"Guyana" : "Georgetown",
"Haiti" : "Port-au-Prince",
"Honduras" : "Tegucigalpa",
"Hungary" : "Budapest",
"Iceland" : "Reykjavik",
"India" : "New Delhi",
"Indonesia" : "Jakarta",
"Iran" : "Tehran",
"Iraq" : "Baghdad",
"Ireland" : "Dublin",
"Israel" : "Jerusalem",
"Italy" : "Rome",
"Jamaica" : "Kingston",
"Japan" : "Tokyo",
"Jordan" : "Amman",
"Kazakhstan" : "Astana",
"Kenya" : "Nairobi",
"Kiribati" : "Tarawa Atoll",
"North Korea" : "Pyongyang",
"South Korea" : "Seoul",
"Kosovo" : "Pristina",
"Kuwait" : "Kuwait City",
"Kyrgyzstan" : "Bishkek",
"Laos" : "Vientiane",
"Latvia" : "Riga",
"Lebanon" : "Beirut",
"Lesotho" : "Maseru",
"Liberia" : "Monrovia",
"Libya" : "Tripoli",
"Liechtenstein" : "Vaduz",
"Lithuania" : "Vilnius",
"Luxembourg" : "Luxembourg",
"Macedonia" : "Skopje",
"Madagascar" : "Antananarivo",
"Malawi" : "Lilongwe",
"Malaysia" : "Kuala Lumpur",
"Maldives" : "Male",
"Mali" : "Bamako",
"Malta" : "Valletta",
"Marshall Islands" : "Majuro",
"Mauritania" : "Nouakchott",
"Mauritius" : "Port Louis",
"Mexico" : "Mexico City",
"Federated States of Micronesia" : "Palikir",
"Moldova" : "Chisinau",
"Monaco" : "Monaco",
"Mongolia" : "Ulaanbaatar",
"Montenegro" : "Podgorica",
"Morocco" : "Rabat",
"Mozambique" : "Maputo",
"Myanmar" : "Naypyidaw",
"Namibia" : "Windhoek",
"Nauru" : "Yaren District",
"Nepal" : "Kathmandu",
"Netherlands" : "Amsterdam",
"New Zealand" : "Wellington",
"Nicaragua" : "Managua",
"Niger" : "Niamey",
"Nigeria" : "Abuja",
"Norway" : "Oslo",
"Oman" : "Muscat",
"Pakistan" : "Islamabad",
"Palau" : "Melekeok",
"Panama" : "Panama City",
"Papua New Guinea" : "Port Moresby",
"Paraguay" : "Asuncion",
"Peru" : "Lima",
"Philippines" : "Manila",
"Poland" : "Warsaw",
"Portugal" : "Lisbon",
"Qatar" : "Doha",
"Romania" : "Bucharest",
"Russia" : "Moscow",
"Rwanda" : "Kigali",
"Saint Kitts and Nevis" : "Basseterre",
"Saint Lucia" : "Castries",
"Saint Vincent and the Grenadines" : "Kingstown",
"Samoa" : "Apia",
"San Marino" : "San Marino",
"Sao Tome and Principe" : "Sao Tome",
"Saudi Arabia" : "Riyadh",
"Senegal" : "Dakar",
"Serbia" : "Belgrade",
"Seychelles" : "Victoria",
"Sierra Leone" : "Freetown",
"Singapore" : "Singapore",
"Slovakia" : "Bratislava",
"Slovenia" : "Ljubljana",
"Solomon Islands" : "Honiara",
"Somalia" : "Mogadishu",
"South Africa" : "Pretoria, Cape Town, Bloemfontein",
"South Sudan" : "Juba",
"Spain" : "Madrid",
"Sri Lanka" : "Colombo, Sri Jayewardenepura Kotte",
"Sudan" : "Khartoum",
"Suriname" : "Paramaribo",
"Swaziland" : "Mbabane",
"Sweden" : "Stockholm",
"Switzerland" : "Bern",
"Syria" : "Damascus",
"Taiwan" : "Taipei",
"Tajikistan" : "Dushanbe",
"Tanzania" : "Dodoma",
"Thailand" : "Bangkok",
"Togo" : "Lome",
"Tonga" : "Nuku’alofa",
"Trinidad and Tobago" : "Port-of-Spain",
"Tunisia" : "Tunis",
"Turkey" : "Ankara",
"Turkmenistan" : "Ashgabat",
"Tuvalu" : "Funafuti",
"Uganda" : "Kampala",
"Ukraine" : "Kyiv",
"United Arab Emirates" : "Abu Dhabi",
"United Kingdom" : "London",
"USA" : "Washington D.C.",
"Uruguay" : "Montevideo",
"Uzbekistan" : "Tashkent",
"Vanuatu" : "Port-Vila",
"Vatican City" : "Vatican City",
"Venezuela" : "Caracas",
"Vietnam" : "Hanoi",
"Yemen" : "Sanaa",
"Zambia" : "Lusaka",
"Zimbabwe" : "Harare"
}
countries = list(Countries.keys())
# Punctuation removal to facilitate change of elements due to spelling mistakes 
lowercase_countries = [x.lower() for x in countries]
for i in range(len(users)):
    country = str(users.at[i, 'User-Country'])
    country = re.sub(r'[\"\.\']', '', country)
    users.at[i, 'User-Country'] = country
# Finding common mistakes in inputs of elements and correcting it (kind of lamentising maybe?)
users.loc[users['User-Country'].str.contains('ame|sta', case=False), 'User-Country'] = 'usa'
users.loc[users['User-Country'].str.contains('eng|kind|scot|wales|uk', case=False), 'User-Country'] = 'united kingdom'
users.loc[users['User-Country'].str.contains('ita', case=False), 'User-Country'] = 'italy'
users.loc[users['User-Country'].str.contains('fra', case=False), 'User-Country'] = 'france'
# Classify data that are not names of a country and imputate missing data as blank
users.loc[~users['User-Country'].isin(lowercase_countries), 'User-Country'] = '-'

""" Pre-process the States column """
countries_with_states = ['usa', 'nigeria','mexico','india','brazil','germany','malaysia','austria','myanmar','australia','new zealand', 'south sudan','canada']
# Classify rows that have countries that don't have a states as a blank
users.loc[~users['User-Country'].isin(countries_with_states), 'User-State'] = '-'

""" Pre-process the City column"""
# Classify rows that don't have a country as blank
users.loc[users['User-Country'] == '-', 'User-City'] = '-'
# Imputate missing rows as blank
users.loc[users['User-City'].isna(), 'User-City'] = '-'

''' Test print '''
users.head(30)



Age Category: adult, Count: 36028
Age Category: middle-aged adult, Count: 8580
Age Category: old adult, Count: 1898
Age Category: adolescent, Count: 1618
Age Category: kids, Count: 175


Unnamed: 0,User-ID,User-City,User-State,User-Country,User-Age
0,8,timmins,ontario,canada,adult
1,9,germantown,tennessee,usa,adult
2,16,albuquerque,new mexico,usa,adult
3,17,chesapeake,virginia,usa,adult
4,19,-,-,-,adolescent
5,26,bellevue,washington,usa,adult
6,32,portland,oregon,usa,adult
7,39,cary,north carolina,usa,adult
8,42,appleton,wisconsin,usa,adolescent
9,44,black mountain,north carolina,usa,middle-aged adult


Step 3: Ratings

In [138]:
ratings = pd.read_csv("BX-Ratings.csv")
ratings.head

# Checking for any empty rows
empty_rows = ratings[ratings.isna().any(axis=1) | ratings.eq('').any(axis=1)]
print(empty_rows)

# Checking the ISBN and if they are formatted correctly
def isbn_check(isbn):
    if len(isbn) != 10:
        return False
    if not isbn[:-1].isdigit():
        return False
    if not (isbn[-1].isdigit() or isbn[-1] == "X"):
        return False
    return True

invalid_isbn = ratings[~ratings["ISBN"].apply(isbn_check)]
print(invalid_isbn)

# Performing Data Manipulation on the ISBNs to make them consistent
invalid_isbn_rows = ratings[ratings["ISBN"].str.contains('x')]
ratings.loc[invalid_isbn_rows.index, "ISBN"] = ratings.loc[invalid_isbn_rows.index, "ISBN"].str.upper()

invalid_isbn = ratings[~ratings["ISBN"].apply(isbn_check)]
print(invalid_isbn)

# Checking the ratings and seeing if any are unusual
Ratings = ratings["Book-Rating"]
values, counts = np.unique(Ratings, return_counts=True)
for value, count in zip(values, counts):
    print("Ratings:", value, ", Count:", count)

# Checking the User-IDs by finding the largest numbered ID first
num_rows = len(ratings)
max_ID = ratings["User-ID"].max()
print("Rows:", num_rows, ", Largest ID:", max_ID)

# Searching for any unusual data entries
valid_ID = (ratings["User-ID"] >= 0)
num_invalid_ID = num_rows
ID_counter = 0
for ID in valid_ID:
    ID_counter += 1
    if ID == False:
        print("Row:", ID_counter, ", ID:", ID)
    else:
        num_invalid_ID -= 1
        if num_invalid_ID == 0:
            print("Every ID is seemingly valid!")

Empty DataFrame
Columns: [User-ID, ISBN, Book-Rating]
Index: []
        User-ID        ISBN  Book-Rating
72566     95492  039592720x           10
106228   138844  039592720x           10
140240   187409  039592720x           10
155055   209156  039592720x            9
Empty DataFrame
Columns: [User-ID, ISBN, Book-Rating]
Index: []
Ratings: 1 , Count: 712
Ratings: 2 , Count: 1173
Ratings: 3 , Count: 2559
Ratings: 4 , Count: 3855
Ratings: 5 , Count: 20080
Ratings: 6 , Count: 16195
Ratings: 7 , Count: 35437
Ratings: 8 , Count: 51206
Ratings: 9 , Count: 34822
Ratings: 10 , Count: 38125
Rows: 204164 , Largest ID: 278854
Every ID is seemingly valid!


Merge Datasets

In [139]:
merged_data = pd.merge(ratings, books, on='ISBN', how='inner')
merged_data = pd.merge(merged_data, users, on='User-ID', how='inner')

In [140]:
merged_data.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Book-Publisher,User-City,User-State,User-Country,User-Age
0,276744,038550120X,7,a painted house,john grisham,2001,doubleday,torrance,california,usa,adult
1,276754,0684867621,8,the girl who loved tom gordon : a novel,stephen king,1999,scribner,alberta beach,alberta,canada,adult
2,276755,0451166892,5,the pillars of the earth,ken follett,1996,signet book,frankfurt am main,hessen,germany,adult
3,276762,0380711524,5,see jane run,joy fielding,1992,avon,duisburg,nordrhein-westfalen,germany,adult
4,276772,0553572369,7,pay dirt (mrs. murphy mysteries (paperback)),rita mae brown,1996,bantam,bonn,nordrhein-westfalen,germany,adult


Drop unnecessary columns and rename remaining ones for clarity

In [141]:
columns_to_drop = ['Book-Publisher', 'User-City', 'User-State']
merged_data.drop(columns_to_drop, axis=1, inplace=True)
column_renames = {
    'User-ID': 'id', 
    'ISBN': 'isbn',
    'Book-Rating': 'rating', 
    'Book-Author': 'author', 
    'Year-Of-Publication': 'yop', 
    'Book-Title': 'title', 
    'User-Age': 'age', 
    'User-Country': 'country'
}
merged_data.rename(columns=column_renames, inplace=True

SyntaxError: incomplete input (2077455323.py, line 13)

In [None]:
merged_data.shape

(204242, 8)

Split into train / test sets for validation

In [None]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(merged_data, test_size=0.50)

# Create integer mappings for user IDs and ISBNs in the training set
user_train = {id_: idx for idx, id_ in enumerate(train_data['id'].unique())}
isbn_train = {isbn: idx for idx, isbn in enumerate(train_data['isbn'].unique())}

# Create integer mappings for user IDs and ISBNs in the testing set
user_test = {id_: idx for idx, id_ in enumerate(test_data['id'].unique())}
isbn_test = {isbn: idx for idx, isbn in enumerate(test_data['isbn'].unique())}

# Apply integer mappings to training and testing data
train_data['u_user'] = train_data['id'].map(user_train)
train_data['u_isbn'] = train_data['isbn'].map(isbn_train)
test_data['u_user'] = test_data['id'].map(user_test)
test_data['u_isbn'] = test_data['isbn'].map(isbn_test)


# Reduce data to features for collaborative filtering
train_data = train_data[['u_user', 'u_isbn', 'rating']]
test_data = test_data[['u_user', 'u_isbn', 'rating']]

In [None]:
train_data.sample(5)

Unnamed: 0,u_user,u_isbn,rating
94074,8971,17463,10
157863,2387,12604,7
42975,12417,5483,7
178163,2806,6918,7
165378,125,303,9


In [None]:
# Create User-Book Interaction Matrix
n_users = train_data['u_user'].nunique()
n_books = train_data['u_isbn'].nunique()
train_matrix = np.zeros((n_users, n_books))
for entry in train_data.itertuples():
    train_matrix[entry.u_user, entry.u_isbn] = entry.rating

In [None]:
train_matrix.shape

(42397, 18179)

In [None]:
# Create Test Interaction Matrix
n_users_test = test_data['u_user'].nunique()
n_books_test = test_data['u_isbn'].nunique()
test_matrix = np.zeros((n_users_test, n_books_test))
for entry in test_data.itertuples():
    test_matrix[entry.u_user, entry.u_isbn] = entry.rating

Find similarity betwen users and items

In [None]:
# Calculate similarity matrices for collaborative filtering through cosine similarity
from sklearn.metrics.pairwise import pairwise_distances
train_matrix_small = train_matrix[:5000, :5000]
test_matrix_small = test_matrix[:5000, :5000]
user_similarity = pairwise_distances(train_matrix_small, metric='cosine')
item_similarity = pairwise_distances(train_matrix_small.T, metric='cosine')

In [None]:
# Function to predict ratings
def predict_books(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    else: 
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [None]:
item_prediction = predict_books(train_matrix_small, item_similarity, type='item')
user_prediction = predict_books(train_matrix_small, user_similarity, type='user')

Evaluate Collaborative Filtering using RMSE

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

print(f'Item-based CF RMSE: {rmse(item_prediction, test_matrix_small)}')
print(f'User-based CF RMSE: {rmse(user_prediction, test_matrix_small)}')

Item-based CF RMSE: 7.997508952927154
User-based CF RMSE: 7.996410502723139


In [None]:
from surprise import Reader, Dataset
from surprise import SVD, model_selection, accuracy

# Creating a 'Reader' object to set the limit of the ratings 
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings, reader)

model = SVD()

# Train on books dataset
%time model_selection.cross_validate(model, data, measures=['RMSE'], cv=5, verbose=True)

# train and test split
trainset, testset = model_selection.train_test_split(data, test_size=0.50)

# SVD model
model = SVD()
model.fit(trainset)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6098  1.6132  1.6213  1.6124  1.6188  1.6151  0.0043  
Fit time          1.95    2.05    2.15    1.99    2.00    2.03    0.07    
Test time         0.20    0.20    0.22    0.18    0.18    0.19    0.02    
CPU times: total: 13.1 s
Wall time: 13.4 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x28f03349b20>

In [None]:
# Display RMSE results
predictions = model.test(testset)
print(f"The accuracy is {accuracy.rmse(predictions)}")

RMSE: 1.6162
The accuracy is 1.6162345522605832


In [None]:
# Run a test case
uid = 276744  
iid = '038550120X' 
pred = model.predict(uid, iid, verbose=True)

user: 276744     item: 038550120X r_ui = None   est = 7.31   {'was_impossible': False}


In [None]:
n_ratings = pd.read_csv("BX-NewBooksRatings.csv")
n_ratings = n_ratings.rename(columns={'User-ID': 'id', 'ISBN': 'isbn', 'Book-Rating': 'n__bookrating'})
n_ratings= n_ratings.sample(frac=0.50)
n_ratings.head()

Unnamed: 0,id,isbn,n__bookrating
6799,66942,754022471,7
17088,170513,590440691,5
11666,110267,441008585,7
22083,225886,886778115,6
819,8245,761119396,9


In [None]:
ratings = ratings.rename(columns={'User-ID': 'id', 'ISBN': 'isbn', 'Book-Rating': 'book_rating'})
ratings.head()

Unnamed: 0,user_id,isbn,book_rating
0,276744,038550120X,7
1,276754,0684867621,8
2,276755,0451166892,5
3,276762,0380711524,5
4,276772,0553572369,7


In [None]:
# Display predicted and actual ratings
print(f'Predicted rating for ISBN {pred.iid} from user {pred.uid} is {pred.est:.2f}.\n')
actual_rtg= ratings[(ratings.user_id==pred.uid) & (ratings.isbn==pred.iid)].book_rating.values[0]
print(f'The actual rating given was {actual_rtg:.2f}.')

Predicted rating for ISBN 038550120X from user #276744 is 7.31.

The actual rating given was 7.00.
