We import all the necessary modules

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict

Upload the local ratings file

In [None]:
# Colab-only helper: files.upload() lets the user pick a local file to send to
# the notebook session (presumably returning the uploaded contents keyed by
# file name -- verify against the Colab docs).
from google.colab import files
uploaded = files.upload()


Saving ratings.csv to ratings.csv


Define some parameters

In [None]:
import io
# Path of the ratings file that is handed to load_rating_data().
data_path = 'ratings.csv'
# Dimensions of the dense rating matrix. The loader indexes rows by
# (user id - 1) and assigns one column per distinct movie id, so n_users must
# be at least the largest user id and n_movies at least the number of distinct
# movies in the file.
n_users = 6040
n_movies = 9743

We then develop the following function to load the rating data from ratings.csv

In [None]:
def load_rating_data(data_path, n_users, n_movies):
    """Load a ratings CSV file into a dense user-by-movie matrix.

    The file must start with one header line, followed by rows of four
    comma-separated fields: user id, movie id, rating and a fourth field
    that is ignored. Blank lines are skipped.

    Args:
        data_path: Path of the ratings file.
        n_users: Number of rows of the matrix. User ids are treated as
            1-based, so it must be at least the largest user id.
        n_movies: Number of columns of the matrix. It must be at least the
            number of distinct movie ids in the file.

    Returns:
        A tuple ``(data, movie_n_rating, movie_id_mapping)`` where

        * ``data`` is a float32 array of shape ``[n_users, n_movies]``
          holding the ratings (0 where no rating was read),
        * ``movie_n_rating`` maps each movie id (as a string) to the number
          of ratings greater than zero it received,
        * ``movie_id_mapping`` maps each movie id (as a string) to its
          column index, assigned in order of first appearance.

    Raises:
        ValueError: If a non-blank row does not have exactly four fields or
            contains a non-numeric user id or rating.
        IndexError: If a user id or the number of distinct movies exceeds
            the requested matrix dimensions.
    """
    import csv  # local import so the function does not depend on other cells

    data = np.zeros([n_users, n_movies], dtype=np.float32)
    movie_id_mapping = {}
    movie_n_rating = defaultdict(int)
    # newline='' is what the csv module expects from a text file handle.
    with open(data_path, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header line (no-op on an empty file)
        # Stream the rows instead of loading the whole file into memory.
        for row in reader:
            if not any(field.strip() for field in row):
                continue  # tolerate blank lines, e.g. at the end of the file
            user_id, movie_id, rating, _ = row
            row_index = int(user_id) - 1  # user ids are 1-based
            # First time a movie is seen it gets the next free column.
            col_index = movie_id_mapping.setdefault(
                movie_id, len(movie_id_mapping))
            # NOTE(review): int() truncates fractional ratings (3.5 -> 3 and
            # 0.5 -> 0, the latter then being indistinguishable from "no
            # rating"). Kept because display_distribution() reports integer
            # ratings -- confirm that this is intended.
            rating = int(float(rating))
            data[row_index, col_index] = rating
            if rating > 0:
                movie_n_rating[movie_id] += 1
    return data, movie_n_rating, movie_id_mapping

And then we load the data using this function

In [None]:
 data, movie_n_rating, movie_id_mapping = load_rating_data(data_path, n_users, n_movies)

It is always recommended to analyze the data distribution. We do the following

In [None]:
def display_distribution(data):
    """Print one line per distinct value in ``data`` with its occurrence count.

    Values are reported in ascending order and shown as integers.
    """
    for rating, n_occurrences in zip(*np.unique(data, return_counts=True)):
        label = int(rating)
        print(f'Number of rating {label}: {n_occurrences}')
# Distribution over the whole matrix; rating 0 marks cells without a rating.
display_distribution(data)

Number of rating 0: 58748254
Number of rating 1: 4602
Number of rating 2: 13101
Number of rating 3: 33183
Number of rating 4: 35369
Number of rating 5: 13211


Since most ratings are unknown, we take the movie with the most known ratings as our target movie

In [None]:
# Select the movie with the largest number of known ratings in a single pass.
# On ties, the movie that was inserted into the dictionary first wins.
movie_id_most, n_rating_most = max(movie_n_rating.items(), key=lambda item: item[1])
print(f'Movie ID {movie_id_most} has {n_rating_most} ratings.')

Movie ID 356 has 328 ratings.


We construct the dataset accordingly

In [None]:
# Column of the target movie inside the rating matrix.
target_column = movie_id_mapping[movie_id_most]
# Target: the ratings given to the target movie.
Y_raw = data[:, target_column]
# Features: the ratings of all the other movies.
X_raw = np.delete(data, target_column, axis=1)

We discard samples without a rating for movie ID 356

In [None]:
# Keep only the users who actually rated the target movie.
has_rating = Y_raw > 0
X, Y = X_raw[has_rating], Y_raw[has_rating]
print('Shape of X:', X.shape)
print('Shape of Y:', Y.shape)

Shape of X: (328, 9742)
Shape of Y: (328,)


Again, we take a look at the distribution of the target movie ratings

In [None]:
# Distribution of the ratings given to the target movie only.
display_distribution(Y)

Number of rating 1: 1
Number of rating 2: 12
Number of rating 3: 66
Number of rating 4: 133
Number of rating 5: 116


We can consider movies with ratings greater than 3 as being liked (being recommended)

In [None]:
# Ratings strictly above this threshold count as "liked".
recommend = 3
# Binarise the target in place. The zeroing has to come first: if the positives
# were set to 1 before it, they would be wiped out again by the `<= recommend`
# mask whenever recommend >= 1.
Y[Y <= recommend] = 0
Y[Y > recommend] = 1
# Class balance of the resulting binary target.
n_pos = (Y == 1).sum()
n_neg = (Y == 0).sum()
print(f'{n_pos} positive samples and {n_neg} negativesamples.')

249 positive samples and 79 negativesamples.


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.2, random_state=42)

In [None]:
# Number of samples in the training set and in the testing set.
print(len(Y_train), len(Y_test))

262 66


In [None]:
from sklearn.naive_bayes import MultinomialNB
# alpha=1.0 applies additive (Laplace) smoothing; fit_prior=True makes the
# model learn the class priors from the training data.
clf = MultinomialNB(alpha=1.0, fit_prior=True)
clf.fit(X_train, Y_train)

MultinomialNB()

Then, we use the trained model to make predictions on the testing set. We get the predicted probabilities as follows

In [None]:
# One row per test sample and one column per class, ordered as in
# clf.classes_ (presumably class 0 first, then class 1 -- verify).
prediction_prob = clf.predict_proba(X_test)
print(prediction_prob[0:10])

[[5.84898799e-07 9.99999415e-01]
 [2.91964285e-10 1.00000000e+00]
 [1.16699945e-14 1.00000000e+00]
 [1.62143225e-18 1.00000000e+00]
 [1.17778539e-14 1.00000000e+00]
 [5.56378983e-24 1.00000000e+00]
 [5.07106739e-85 1.00000000e+00]
 [2.82229843e-61 1.00000000e+00]
 [8.64057221e-01 1.35942779e-01]
 [1.00000000e+00 1.15572676e-29]]


We get the predicted class as follows

In [None]:
# Predicted class label for every test sample; show the first ten.
prediction = clf.predict(X_test)
print(prediction[:10])

[1. 1. 1. 1. 1. 1. 1. 1. 0. 0.]


Finally, we evaluate the model's performance with classification accuracy, which is the proportion of correct predictions

In [None]:
# score() returns the mean accuracy on the given test data and labels.
accuracy = clf.score(X_test, Y_test)
print(f'The accuracy is: {accuracy*100:.1f}%')

The accuracy is: 69.7%
