In [15]:
import os  # Used to check if file already exists
import pandas as pd  # Used to load and manipulate the ratings data
import zipfile  # Used to extract the contents of the zip file

# Define the URL and file name for the MovieLens dataset
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
filename = 'ml-latest-small.zip'

# Check if file already exists before downloading
if not os.path.exists(filename):
    # Download the MovieLens dataset
    !wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip

# Extract the dataset
with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall()

# Load the ratings data from the csv file
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')

# Replace missing ratings with the mean rating.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on the `ratings_df['rating']` selection is a
# chained in-place update, which recent pandas versions warn about and which
# does not modify `ratings_df` when copy-on-write is enabled.
mean_rating = ratings_df['rating'].mean()
ratings_df['rating'] = ratings_df['rating'].fillna(mean_rating)

# Remove rows that are exact duplicates of an earlier row
ratings_df = ratings_df.drop_duplicates()

# Remove rows that still have a missing value in any column
ratings_df = ratings_df.dropna()

# Convert the ratings data into a user-item rating matrix (rows: userId,
# columns: movieId, cells: rating). Combinations with no rating become NaN.
# NOTE: pivot raises ValueError if the same (userId, movieId) pair appears
# more than once; drop_duplicates() above only removes fully identical rows.
ratings_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating')

# Display the shape (rows, columns) of the cleaned dataframe as a sanity check
ratings_df.shape


(100836, 4)

In [25]:
from sklearn.decomposition import NMF

# Number of latent factors used by the factorization
k = 20

# Build the NMF model: random initialisation with a fixed seed and an
# iteration cap of 500
model = NMF(
    n_components=k,
    init='random',
    random_state=0,
    max_iter=500,
)

# Fit on the ratings matrix with missing entries replaced by 0.
# W is the transformed input returned by fit_transform; H holds the fitted
# model's components.
W = model.fit_transform(ratings_matrix.fillna(0))
H = model.components_

