# **Model Building**

In [None]:
# Environment setup (shell commands executed through IPython's `!` escape).
# Step 1 removes the four packages, step 2 reinstalls NumPy pinned to 1.26.4,
# and step 3 reinstalls the remaining packages with --no-deps so pip does not
# resolve (and possibly replace) the NumPy version pinned in step 2.
# NOTE(review): pandas, scikit-learn and scikit-surprise are NOT pinned here, so
# a later run may pull different versions; the recorded run installed
# pandas 2.3.3, scikit-learn 1.8.0 and scikit-surprise 1.1.4.
# NOTE(review): if any of these packages was already imported in the running
# kernel, a kernel restart is presumably needed before the imports below pick
# up the reinstalled versions -- TODO confirm.
!pip uninstall -y numpy pandas scikit-learn scikit-surprise
!pip install numpy==1.26.4  # Install a compatible NumPy 1.x version for Python 3.12
!pip install pandas scikit-learn scikit-surprise --no-deps # Reinstall dependent libraries ensuring compatibility
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset, Reader, SVD, KNNBaseline

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: pandas 2.3.3
Uninstalling pandas-2.3.3:
  Successfully uninstalled pandas-2.3.3
Found existing installation: scikit-learn 1.8.0
Uninstalling scikit-learn-1.8.0:
  Successfully uninstalled scikit-learn-1.8.0
Found existing installation: scikit-surprise 1.1.4
Uninstalling scikit-surprise-1.1.4:
  Successfully uninstalled scikit-surprise-1.1.4
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tsfresh 0.21.1 requires pandas>=0.25.0, which is not installed.
tsfresh 0.21.1

Collecting pandas
  Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4-cp312-cp312-linux_x86_64.whl
Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
Using cached scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (8.9 MB)
Installing collected packages: scikit-surprise, scikit-learn, pandas
Successfully installed pandas-2.3.3 scikit-learn-1.8.0 scikit-surprise-1.1.4


In [None]:
import pandas as pd

# Load the merged table from disk and show its (rows, columns) shape as the
# cell output.
merged_csv_path = r"/content/final_merged_data.csv"
df_final = pd.read_csv(merged_csv_path)
df_final.shape

(725639, 20)

In [None]:
# Reduce Dataset
# Count how many rows each user and each book contributes to the full table.
user_counts = df_final['User-ID'].value_counts()
book_counts = df_final['ISBN'].value_counts()

# Active users / popular books: those with at least 20 rows in the full table.
active_users = user_counts.loc[user_counts.ge(20)].index
popular_books = book_counts.loc[book_counts.ge(20)].index

# A row is kept only when BOTH its user and its book pass their threshold.
# The counts above are taken before this joint filter, so inside df_small a
# user or a book may end up with fewer than 20 rows.
from_active_user = df_final['User-ID'].isin(active_users)
for_popular_book = df_final['ISBN'].isin(popular_books)
df_small = df_final.loc[from_active_user & for_popular_book].copy()

# Distinct ids that survived the filter, in order of first appearance.
user_id_list = pd.unique(df_small["User-ID"]).tolist()
isbn_list = pd.unique(df_small["ISBN"]).tolist()

print("Unique Users:", len(user_id_list))
print("Unique Books:", len(isbn_list))
print("Reduced data shape:", df_small.shape)

Unique Users: 5343
Unique Books: 6694
Reduced data shape: (268588, 20)


In [None]:
# Build SVD Explicit Rating model
# The reader declares the rating scale used to clip predictions (0 to 10).
# NOTE(review): if a Book-Rating of 0 denotes an implicit interaction rather
# than an actual score (TODO confirm against the dataset), those rows should be
# filtered out before fitting an explicit-rating model.
reader = Reader(rating_scale=(0, 10))

# load_from_df expects exactly three columns in the order user, item, rating.
data = Dataset.load_from_df(
    df_small[['User-ID', 'ISBN', 'Book-Rating']],
    reader
)
# Train on every row of df_small; no hold-out split is made in this cell.
trainset = data.build_full_trainset()

# SVD initialises its factors from a random number generator; fixing
# random_state makes the fitted model (and everything derived from it, such as
# the recommendations and the pickled file) reproducible across runs.
svd = SVD(random_state=42)
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7eef43d5a210>

In [None]:
# Build KNN Collaborative Filtering model
# User-based neighbourhoods compared with the "pearson_baseline" similarity;
# min_support=5 is the minimum-overlap setting passed to the similarity measure.
sim_options = dict(
    name="pearson_baseline",
    user_based=True,
    min_support=5,
)

# Fitted on the same full trainset that the SVD cell built.
knn = KNNBaseline(sim_options=sim_options)
knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7eef31c90530>

In [None]:
# TF-IDF Content Based Model
# One document per row of df_small (missing titles become empty strings), so
# row i of tfidf_matrix corresponds to the i-th row of df_small by position.
title_corpus = df_small["Title_Clean_Lower"].fillna("")

tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix = tfidf.fit_transform(title_corpus)

In [None]:
# Content Similarity Score Function
def content_similarity(isbn1, isbn2):
    """Return the TF-IDF title similarity between two books.

    Each ISBN is resolved to the first row of ``df_small`` that carries it, and
    the linear kernel (dot product) of the two matching ``tfidf_matrix`` rows is
    returned.

    ``tfidf_matrix`` is addressed by row *position*. ``df_small`` still carries
    the index labels it inherited from ``df_final`` when it was filtered, so
    those labels must not be used as matrix row numbers; positions are derived
    from a boolean mask instead.

    Parameters
    ----------
    isbn1, isbn2 :
        ISBN values as stored in ``df_small['ISBN']``.

    Returns
    -------
    float or int
        The similarity score, or 0 when either ISBN is absent from ``df_small``.
    """
    # Positional row numbers (0..len-1) of every row holding each ISBN.
    rows1 = np.flatnonzero((df_small['ISBN'] == isbn1).to_numpy())
    rows2 = np.flatnonzero((df_small['ISBN'] == isbn2).to_numpy())

    # Unknown ISBN: keep the original best-effort contract and report 0.
    if rows1.size == 0 or rows2.size == 0:
        return 0

    # Two single-row inputs give a 1x1 kernel matrix; unwrap its only entry.
    sim = linear_kernel(tfidf_matrix[rows1[0]], tfidf_matrix[rows2[0]])
    return sim.flatten()[0]

In [None]:
def _safe_estimate(model, user_id, isbn):
    """Return ``model.predict(user_id, isbn).est``, or 0 if the call raises."""
    try:
        return model.predict(user_id, isbn).est
    except Exception:
        # Best-effort: a failing model contributes nothing to the blend, while
        # KeyboardInterrupt / SystemExit still propagate out of the long loop.
        return 0


def recommend_books_title(user_id, top_n=10, weights=(0.5, 0.3, 0.2)):
    """Rank books the user has not rated by a weighted hybrid score.

    For every ISBN in ``df_small`` that the user has no row for, three scores
    are blended: the SVD estimate, the KNN estimate, and the mean
    ``content_similarity`` to up to five books from the user's own rows.

    Parameters
    ----------
    user_id :
        Value to match against ``df_small["User-ID"]``.
    top_n : int, default 10
        Number of recommendations to return.
    weights : tuple of three floats, default (0.5, 0.3, 0.2)
        Weights applied to the SVD, KNN and content scores, in that order.

    Returns
    -------
    list of (title, isbn, score) tuples
        Sorted by descending score, at most ``top_n`` entries.

    Notes
    -----
    NOTE(review): the two model estimates live on the rating scale while the
    content score is a TF-IDF similarity; if that similarity is bounded by 1
    (TODO confirm), the content term barely moves the ranking at the default
    weights. Consider rescaling before blending.

    Every ``content_similarity`` call runs once per candidate/profile pair, so
    this function is slow on large tables.
    """
    w_svd, w_knn, w_content = weights

    # Books user has already rated
    user_isbns = df_small.loc[df_small["User-ID"] == user_id, "ISBN"]
    user_books = set(user_isbns)

    # Up to five of the user's books form the content profile. They are taken
    # in row order so the choice is deterministic (slicing a set is not) and is
    # computed once instead of on every loop iteration.
    # NOTE(review): whether row order reflects recency is not visible here.
    profile_books = user_isbns.drop_duplicates().head(5).tolist()

    # All books available
    all_books = df_small["ISBN"].unique()

    # For mapping ISBN -> Book Title. Deduplicate on ISBN alone so the index is
    # unique and .get() always yields a single title, never a Series.
    isbn_to_title = (
        df_small[["ISBN", "Book-Title"]]
        .drop_duplicates(subset="ISBN")
        .set_index("ISBN")["Book-Title"]
    )

    results = []

    for isbn in all_books:
        if isbn in user_books:
            continue

        # 1. SVD score
        svd_score = _safe_estimate(svd, user_id, isbn)

        # 2. KNN CF score
        knn_score = _safe_estimate(knn, user_id, isbn)

        # 3. Content similarity score (0 when the user has no books at all)
        if profile_books:
            cont_score = np.mean(
                [content_similarity(isbn, b) for b in profile_books]
            )
        else:
            cont_score = 0

        # Hybrid score
        final_score = w_svd * svd_score + w_knn * knn_score + w_content * cont_score

        # Append with title
        title = isbn_to_title.get(isbn, "Unknown Title")

        results.append((title, isbn, final_score))

    # Sort results, best score first
    results.sort(key=lambda item: item[2], reverse=True)

    return results[:top_n]

In [None]:
# Pick one demo user and print their top-10 hybrid recommendations.
# The draw is over rows of df_small, so users with more rows are more likely to
# be picked; random_state is fixed so re-running the cell selects the same user.
test_user = df_small['User-ID'].sample(1, random_state=42).iloc[0]
recommendations = recommend_books_title(test_user, top_n=10)

for title, isbn, score in recommendations:
    print(f"{title}  |  ISBN: {isbn}  |  Score: {round(score, 3)}")

Pippi Longstocking (Seafarer Book)  |  ISBN: 0140309578  |  Score: 5.274
The Other Boleyn Girl  |  ISBN: 0743227441  |  Score: 4.841
All I Need to Know I Learned from My Cat  |  ISBN: 0894808249  |  Score: 4.716
A Sand County Almanac (Outdoor Essays &amp; Reflections)  |  ISBN: 0345345053  |  Score: 3.99
Belgarath the Sorcerer  |  ISBN: 0345403959  |  Score: 3.988
The Curious Incident of the Dog in the Night-Time : A Novel  |  ISBN: 0385509456  |  Score: 3.987
The Neverending Story  |  ISBN: 0140386335  |  Score: 3.958
Einstein's Dreams  |  ISBN: 0446670111  |  Score: 3.936
The Vanished Man : A Lincoln Rhyme Novel  |  ISBN: 0743222008  |  Score: 3.93
Chicken Soup for the Teenage Soul II (Chicken Soup for the Soul Series)  |  ISBN: 1558746161  |  Score: 3.902


In [None]:
import pickle

# Persist the fitted models, the TF-IDF vectorizer and the reduced dataset.
# Each entry maps the destination file to the object pickled into it; entries
# are written in the order listed. tfidf_matrix itself is not written here.
artifacts = {
    '/content/svd_model.pkl': svd,              # SVD model
    '/content/knn_model.pkl': knn,              # KNN model
    '/content/tfidf_vectorizer.pkl': tfidf,     # TF-IDF vectorizer
    '/content/df_small.pkl': df_small,          # df_small (the reduced dataset)
}

for pkl_path, artifact in artifacts.items():
    with open(pkl_path, 'wb') as f:
        pickle.dump(artifact, f)

print("Models and df_small saved as .pkl files in /content/")

Models and df_small saved as .pkl files in /content/
