<a href="https://www.kaggle.com/code/ujjwalmalik/reccomendation-system-amazon-book-reviews?scriptVersionId=129465092" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from scipy import stats
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

In [30]:
# consider on user_id, book_id, and ratings
df = pd.read_csv('/kaggle/input/amazon-books-reviews/Books_rating.csv')
df = df[['Id','User_id','Title','review/score', 'review/time']]
df.rename(columns={'Id':'ProductId','User_id':'UserId','review/time':'Time','Title':'title','review/score':'Score'},inplace=True)
df.head()

Unnamed: 0,ProductId,UserId,title,Score,Time
0,1882931173,AVCGYZL8FQQTD,Its Only Art If Its Well Hung!,4.0,940636800
1,826414346,A30TK6U7DNS82R,Dr. Seuss: American Icon,5.0,1095724800
2,826414346,A3UH4UZ4RSVO82,Dr. Seuss: American Icon,5.0,1078790400
3,826414346,A2MVUWT453QH61,Dr. Seuss: American Icon,4.0,1090713600
4,826414346,A22X4XUPKF66MR,Dr. Seuss: American Icon,4.0,1107993600


In [31]:
#lots of fields where user is NaN
df = df.dropna(subset=['UserId'])

In [33]:
print("Size of 'ProductId' column:", len(df['ProductId']))
print("Size of 'UserId' column:", len(df['UserId']))

# Define the threshold values
product_id_threshold = 200 
user_id_threshold = 10

# Count the occurrences of ProductId and UserId
product_id_counts = df['ProductId'].value_counts()
user_id_counts = df['UserId'].value_counts()

# Filter out rows below the threshold
filtered_df = df[(df['ProductId'].isin(product_id_counts[product_id_counts >= product_id_threshold].index)) &
                 (df['UserId'].isin(user_id_counts[user_id_counts >= user_id_threshold].index))]

print("Size of 'ProductId' column:", len(filtered_df['ProductId']))
print("Size of 'UserId' column:", len(filtered_df['UserId']))

Size of 'ProductId' column: 2438213
Size of 'UserId' column: 2438213
Size of 'ProductId' column: 346930
Size of 'UserId' column: 346930


In [34]:
# Get unique UserIds and ProductIds
unique_user_ids = filtered_df['UserId'].unique()
unique_product_ids = filtered_df['ProductId'].unique() #unique ids for books are less

user_id_to_index = {user_id: index for index, user_id in enumerate(unique_user_ids)}
product_id_to_index = {product_id: index for index, product_id in enumerate(unique_product_ids)}

# clean matrix
matrix = np.zeros((len(unique_user_ids), len(unique_product_ids)))

# users as rows, books as columns with their ratings
for _, row in filtered_df.iterrows():
    user_id = row['UserId']
    product_id = row['ProductId']
    score = row['Score']
    
    user_index = user_id_to_index[user_id]
    product_index = product_id_to_index[product_id]
    
    if matrix[user_index][product_index] < score:
        matrix[user_index][product_index] = score
print(matrix.shape)
matrix

(25751, 1572)


array([[5., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 5., 0., 0.],
       [0., 0., 0., ..., 0., 5., 0.],
       [0., 0., 0., ..., 0., 5., 0.]])

In [54]:
matrix = stats.zscore(matrix, axis=0)

In [40]:
def calculate_mse(predicted_matrix, test_matrix):
    num_users = min(predicted_matrix.shape[0], test_matrix.shape[0])
    num_items = min(predicted_matrix.shape[1], test_matrix.shape[1])
    mse = np.mean((predicted_matrix[:num_users, :num_items] - test_matrix[:num_users, :num_items]) ** 2)
    return mse

def calculate_f1_score(recall, precision):
    if recall + precision == 0:
        return 0
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

def precision_at_k(actual_matrix, predicted_matrix, k, threshold):
    binary_predicted_matrix = predicted_matrix >= threshold
    
    precision = []
    for i in range(len(actual_matrix)):
        actual_indices = np.where(actual_matrix[i] >= threshold)[0]
        predicted_indices = np.argsort(~binary_predicted_matrix[i])[:k]
        common_indices = np.intersect1d(actual_indices, predicted_indices)
        precision.append(len(common_indices) / len(predicted_indices))
    
    return np.mean(precision)

def recall_at_k(true_matrix, pred_matrix, k, threshold):
    pred_matrix_sorted = np.argsort(pred_matrix, axis=1)[:, ::-1][:, :k]
    recall_scores = []
    for i in range(len(true_matrix)):
        true_positives = len(set(pred_matrix_sorted[i]).intersection(set(np.where(true_matrix[i] >= threshold)[0])))
        actual_positives = len(np.where(true_matrix[i] >= threshold)[0])
        if actual_positives > 0:
            recall_scores.append(true_positives / actual_positives)
    recall = np.mean(recall_scores)
    return recall

In [55]:
def split_train_test(matrix, test_size=0.2, random_state=42):
    train_matrix, test_matrix = train_test_split(matrix, test_size=test_size, random_state=random_state)
    return train_matrix, test_matrix

def calculate_svd(train_matrix, k=600):
    train_sparse = csr_matrix(train_matrix)
    # Perform SVD on the sparse matrix
    U_train, S_train, VT_train = svds(train_sparse, k=k)
    # Reverse the singular values, columns of U_train, and rows of VT_train
    S_train_k = np.diag(S_train[::-1])
    U_train_k = U_train[:, ::-1]
    VT_train_k = VT_train[::-1, :]
    
    return U_train_k, S_train_k, VT_train_k

train_matrix, test_matrix = split_train_test(matrix)

# training set
U_train, S_train, VT_train = calculate_svd(train_matrix)
U_train_pred = np.dot(train_matrix, VT_train.T)
train_pred_matrix = np.dot(U_train_pred, VT_train)

# Make predictions for the test set
U_test_pred = np.dot(test_matrix, VT_train.T)
predicted_matrix = np.dot(U_test_pred, VT_train)

# Calculate MSE 
train_mse = calculate_mse(train_matrix, train_pred_matrix)
test_mse = calculate_mse(test_matrix, predicted_matrix)

print("Train Set Mean Squared Error (MSE):", train_mse)
print("Test Set Mean Squared Error (MSE):", test_mse)

Train Set Mean Squared Error (MSE): 0.08058264355590046
Test Set Mean Squared Error (MSE): 0.13022686417695492


In [57]:
# Calculate Precision at k for the test set
precision = precision_at_k(test_matrix, predicted_matrix, k=10, threshold=3)

# Calculate Recall at k for the test set
recall = recall_at_k(test_matrix, predicted_matrix, k=10, threshold=3)

# Calculate F1 score
f1_score = calculate_f1_score(recall, precision)
print("RMSE (training): ", np.sqrt(train_mse) )
print("RMSE (test): ", np.sqrt(test_mse))
print("Precision @ 10: ", precision)
print("Recall @ 10:", recall)
print("F1 Score:", f1_score)

RMSE (training):  0.28387082195234586
RMSE (test):  0.36086959442013805
Precision @ 10:  0.6870510580469812
Recall @ 10: 0.8293337583839886
F1 Score: 0.7515172006442172


In [58]:
def fetch_relevant_items_for_user(user_id, relevant_items=5):
    # Get the index of the user
    user_index = user_id_to_index[user_id]
    user_embedding = U_train[user_index, :]
    
    similarity_scores = VT_train.T.dot(user_embedding)

    sorted_indices = similarity_scores.argsort()[::-1]
    top_relevant_indices = sorted_indices[:relevant_items]
    
    relevant_items = [list(product_id_to_index.keys())[list(product_id_to_index.values()).index(idx)] for idx in top_relevant_indices]
    relevant_titles = df.loc[df['ProductId'].isin(relevant_items), 'title'].tolist()
    
    # Remove any duplicate titles
    unique_relevant_titles = list(set(relevant_titles))
    
    # Get the final set of relevant items without duplicate titles
    final_relevant_items = []
    for title in unique_relevant_titles:
        final_relevant_items.append(title)
    
    return final_relevant_items

In [59]:
user_id = ["A30C4HNZBZYDI2", "A3OWUSU9RG4NMF", "A2BIFGERNRDLBB"] 
top_n = 5

for id in user_id:
    relevant_items = fetch_relevant_items_for_user(id, top_n)
    print(f"User: {id}")
    print("Relevant Items:")
    for i, item in enumerate(relevant_items):
        print(f"{i+1}. {item}")
    print()

User: A30C4HNZBZYDI2
Relevant Items:
1. One Last Time
2. Awaken The Giant Within (CD)
3. The Tao of Pooh
4. Where the Heart is
5. The Tao of Pooh (Turtleback School & Library Binding Edition)

User: A3OWUSU9RG4NMF
Relevant Items:
1. Unfit for Command: Swift Boat Veterans Speak Out Against John Kerry
2. Myths, Lies, and Downright Stupidity: Get Out the Shovel -- Why Everything You Know is Wrong
3. Liberal Fascism: The Secret History of the American Left, From Mussolini to the Politics of Meaning
4. America Alone: The End of the World as We Know It
5. Understanding Exposure

User: A2BIFGERNRDLBB
Relevant Items:
1. The Duke and I (Bridgerton Series, Book 1)
2. A Knight in Shining Armor
3. Whitney, My Love
4. Every Breath You Take
5. Paradise

