# Elective Project
## Amazon Recommendation System

### Install libraries

In [1]:
# install libraries

%pip install scikit-surprise
%pip install torch

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# base libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

# ml
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

# error handling
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

print(f'NumPy version: {np.__version__}')
print(f'Pandas version: {pd.__version__}')
print('All libraries imported successfully!')

NumPy version: 1.26.4
Pandas version: 2.2.2
All libraries imported successfully!


### Load data

In [3]:
# import data
data = pd.read_csv("/Users/lukeroberts/My Drive(lukejrobertsza@gmail.com)/Colab Notebooks/mit_adsp_notebooks/7.1.elective_project/ratings_Electronics.csv")

In [4]:
# copy the data
copy_of_data = data.copy()

In [5]:
# Assign column headers without replacing the first row
data.columns = ['user', 'item', 'rating', 'timestamp']
copy_of_data.columns = ['user', 'item', 'rating', 'timestamp']

In [6]:
# drop timestamp from data and copy_of_data

data = data.drop(columns='timestamp')
copy_of_data = copy_of_data.drop(columns='timestamp')

### Define functions

In [7]:
def function_unique_values(df):
    """
    Returns a dictionary with column names as keys and lists of unique values as values.
    Also prints the count of unique values per column.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame to analyze
        
    Returns:
    --------
    dict
        Dictionary with column names as keys and unique values as values
    """
    unique_values = {}
    print("Number of unique values per column:")
    for column in df.columns:
        uniques = df[column].unique()
        unique_values[column] = uniques
        print(f"{column}: {len(uniques):,} unique values")
    
    return unique_values

In [8]:
# Test the function on the data
unique_data = function_unique_values(data)

Number of unique values per column:
user: 4,201,696 unique values
item: 476,001 unique values
rating: 5 unique values


In [9]:
def function_sparsity(df):
    """
    Calculate the sparsity of a user-item interaction matrix.
    
    Sparsity is defined as: 1 - (number of observed interactions / total possible interactions)
    
    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing user-item interactions
        
    Returns:
    --------
    float
        Sparsity value between 0 and 1, where higher values indicate more sparsity
    """
    # Count unique users and items
    n_users = df['user'].nunique()
    n_items = df['item'].nunique()
    
    # Total possible interactions
    total_possible = n_users * n_items
    
    # Number of observed interactions
    observed = len(df)
    
    # Calculate sparsity
    sparsity = 1 - (observed / total_possible)
    
    print(f"Number of users: {n_users:,}")
    print(f"Number of items: {n_items:,}")
    print(f"Number of observed interactions: {observed:,}")
    print(f"Number of possible interactions: {total_possible:,}")
    print(f"Matrix sparsity: {sparsity:.6f} ({sparsity:.4%})")
    
    return sparsity

In [11]:
function_sparsity(data)

Number of users: 4,201,696
Number of items: 476,001
Number of observed interactions: 7,824,481
Number of possible interactions: 2,000,011,497,696
Matrix sparsity: 0.999996 (99.9996%)


0.9999960877819908

In [None]:
# Step 1: Filter users who have rated 20 or more items
user_rating_counts = data.groupby('user').size()
users_with_20_or_more_ratings = user_rating_counts[user_rating_counts >= 20].index
filtered_data_by_users = data[data['user'].isin(users_with_20_or_more_ratings)]

# Step 2: Filter items that have been rated by 20 or more users
item_rating_counts = filtered_data_by_users.groupby('item').size()
items_with_20_or_more_ratings = item_rating_counts[item_rating_counts >= 20].index
final_filtered_data = filtered_data_by_users[filtered_data_by_users['item'].isin(items_with_20_or_more_ratings)]

# Print the results
print(f"Original data shape: {data.shape}")
print(f"Filtered data shape after user threshold: {filtered_data_by_users.shape}")
print(f"Final filtered data shape after item threshold: {final_filtered_data.shape}")

In [None]:
# Reapply the user filter to ensure only users with 20 or more ratings are retained
users_with_20_or_more_ratings = final_filtered_data['user'].value_counts()
users_with_20_or_more_ratings = users_with_20_or_more_ratings[users_with_20_or_more_ratings >= 20].index

# Filter the final_filtered_data to include only these users
final_filtered_data = final_filtered_data[final_filtered_data['user'].isin(users_with_20_or_more_ratings)]

# Verify the results
print(f"Final filtered data shape: {final_filtered_data.shape}")
print(f"Number of unique users: {final_filtered_data['user'].nunique()}")
print(f"Minimum ratings per user: {final_filtered_data.groupby('user').size().min()}")

### Start Filtering

In [None]:
stop 

#### Users

In [12]:
# create user filtering

# list of all users
list_of_users = data['user']

# dictionary to store users in
dictionary_users = {}

# loop to count users' ratings
for user in list_of_users:

    # add +1 if the user is already in the dictionary
    if user in dictionary_users:
        dictionary_users[user] +=1

    # otherwise let the user = 1
    else:
        dictionary_users[user] = 1

In [14]:
# apply user filter

# set threshold
THRESHOLD_USER = 20

# empty list for users below the threshold
remove_users_below_threshold = []

# set for loop
# iterate through the dictionary to find users with fewer ratings than threshold
for user, rating_count in dictionary_users.items():
    if rating_count < THRESHOLD_USER:
        remove_users_below_threshold.append(user)

# print results
print(f"Number of users to remove: {len(remove_users_below_threshold):,}")
print(f"Percentage of users to remove: {len(remove_users_below_threshold) / len(dictionary_users):.2%}")
# Filter the data to keep only users with ratings >= THRESHOLD_USER
filtered_data = data[~data['user'].isin(remove_users_below_threshold)]


Number of users to remove: 4,188,345
Percentage of users to remove: 99.68%


In [None]:
# Filter the data to keep only users with ratings >= THRESHOLD_USER
filtered_data = data[~data['user'].isin(remove_users_below_threshold)]

# Print the new dataframe info and reduction stats
print(f"Original data shape: {data.shape}")
print(f"Filtered data shape: {filtered_data.shape}")
print(f"Reduction: {(1 - len(filtered_data)/len(data)):.2%} of data removed")

# Check sparsity of the filtered data
print("\nUnique values after user filtering:")
function_unique_values(filtered_data)
print("\nSparsity after user filtering:")
function_sparsity(filtered_data)

In [None]:
user_df = pd.DataFrame(filtered_data.groupby('user').count().sort_values(by='rating', ascending=True))
user_df

#### Items

In [None]:
# create item filtering

# list of items
list_of_items = data['item']

# Dictionary to store item counts
dictionary_items = {}

# Loop to count items' occurrences
for item in list_of_items:
    if item in dictionary_items:
        dictionary_items[item] += 1
    else:
        dictionary_items[item] = 1

In [None]:
# apply item filter

# set threshold
THRESHOLD_ITEM = 10

# empty list for items below the threshold
remove_items_below_threshold = []

# set for loop
# iterate through the dictionary to find items with fewer ratings than threshold
for item, item_rating_count in dictionary_items.items():
    if item_rating_count < THRESHOLD_ITEM:
        remove_items_below_threshold.append(item)

# print results
print(f"Number of items to remove: {len(remove_items_below_threshold):,}")
print(f"Percentage of items to remove: {len(remove_items_below_threshold) / len(dictionary_items):.2%}")

In [None]:
# Filter the data to keep only items with ratings >= THRESHOLD_ITEM
final_filtered_data = data[~data['item'].isin(remove_items_below_threshold)]

# Print the new dataframe info and reduction stats
print(f"Data before user filtering: {data.shape}")
print(f"Data after user filtering: {filtered_data.shape}")
print(f"Data after item filtering: {final_filtered_data.shape}")
print(f"Total reduction: {(1 - len(final_filtered_data)/len(data)):.2%} of original data removed")

# Check sparsity of the final filtered data
print("\nUnique values after item filtering:")
function_unique_values(final_filtered_data)
print("\nSparsity after item filtering:")
function_sparsity(final_filtered_data)

In [None]:
item_df = pd.DataFrame(final_filtered_data.groupby('item').count().sort_values(by='user', ascending=True))
item_df

In [None]:
print(user_df.shape, item_df.shape)

In [None]:

final_matrix_merged = pd.merge(user_df, item_df, how='left')

In [None]:
final_matrix_merged.groupby('user').count().sort_values(by='rating', ascending=True)