In [1]:
import numpy as np
from collections import defaultdict
data_path = 'ratings.dat'
n_users = 6040
n_movies = 3706


In [2]:
def load_rating_data(data_path, n_users, n_movies):
    data = np.zeros([n_users, n_movies], dtype=np.float32)
    movie_id_mapping = {}
    movie_n_rating = defaultdict(int)
    with open(data_path, 'r') as file:
        for line in file.readlines()[1:]:
            user_id, movie_id, rating, _ = line.split("::")
            user_id = int(user_id) - 1
            if movie_id not in movie_id_mapping:
                movie_id_mapping[movie_id] = len(movie_id_mapping)
            rating = int(rating)
            data[user_id, movie_id_mapping[movie_id]] = rating
            if rating > 0:
                movie_n_rating[movie_id] += 1
    return data, movie_n_rating, movie_id_mapping


In [3]:
data, movie_n_rating, movie_id_mapping = load_rating_data(data_path, n_users, n_movies)
print(movie_id_mapping)
print(movie_n_rating)
print(data)

{'661': 0, '914': 1, '3408': 2, '2355': 3, '1197': 4, '1287': 5, '2804': 6, '594': 7, '919': 8, '595': 9, '938': 10, '2398': 11, '2918': 12, '1035': 13, '2791': 14, '2687': 15, '2018': 16, '3105': 17, '2797': 18, '2321': 19, '720': 20, '1270': 21, '527': 22, '2340': 23, '48': 24, '1097': 25, '1721': 26, '1545': 27, '745': 28, '2294': 29, '3186': 30, '1566': 31, '588': 32, '1907': 33, '783': 34, '1836': 35, '1022': 36, '2762': 37, '150': 38, '1': 39, '1961': 40, '1962': 41, '2692': 42, '260': 43, '1028': 44, '1029': 45, '1207': 46, '2028': 47, '531': 48, '3114': 49, '608': 50, '1246': 51, '1357': 52, '3068': 53, '1537': 54, '647': 55, '2194': 56, '648': 57, '2268': 58, '2628': 59, '1103': 60, '2916': 61, '3468': 62, '1210': 63, '1792': 64, '1687': 65, '1213': 66, '3578': 67, '2881': 68, '3030': 69, '1217': 70, '434': 71, '2126': 72, '3107': 73, '3108': 74, '3035': 75, '1253': 76, '1610': 77, '292': 78, '2236': 79, '3071': 80, '902': 81, '368': 82, '1259': 83, '3147': 84, '1544': 85, '12

In [4]:
def display_distribution(data):
    values, counts = np.unique(data, return_counts=True)
    for value, count in zip(values, counts):
        print(f'Number of rating {int(value)}: {count}')
        
display_distribution(data)

Number of rating 0: 21384032
Number of rating 1: 56174
Number of rating 2: 107557
Number of rating 3: 261197
Number of rating 4: 348971
Number of rating 5: 226309


In [5]:
movie_id_most, n_rating_most = sorted(movie_n_rating.items(), key=lambda d: d[1], reverse=True)[0]
print(f'Movie ID {movie_id_most} has {n_rating_most} ratings.')


Movie ID 2858 has 3428 ratings.


In [6]:
X_raw = np.delete(data, movie_id_mapping[movie_id_most],axis=1)
Y_raw = data[:, movie_id_mapping[movie_id_most]]

print(X_raw)
print(Y_raw)

[[3. 3. 4. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [3. 4. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[0. 4. 4. ... 0. 0. 4.]


In [7]:
X = X_raw[Y_raw > 0]
Y = Y_raw[Y_raw > 0]
print('Shape of X:', X.shape)
print('Shape of Y:', Y.shape)
print('Shape of data:', data.shape)


Shape of X: (3428, 3705)
Shape of Y: (3428,)
Shape of data: (6040, 3706)


In [8]:
display_distribution(Y)


Number of rating 1: 83
Number of rating 2: 134
Number of rating 3: 358
Number of rating 4: 890
Number of rating 5: 1963


In [9]:
recommend = 3
Y[Y <= recommend] = 0
Y[Y > recommend] = 1
n_pos = (Y == 1).sum()
n_neg = (Y == 0).sum()
print(f'{n_pos} positive samples and {n_neg} negative samples.')


2853 positive samples and 575 negative samples.


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [11]:
print(len(Y_train), len(Y_test))
print(Y_train)
print(X_train)
print(Y_test)
print(X_test)


2742 686
[1. 1. 1. ... 0. 1. 1.]
[[0. 0. 4. ... 0. 0. 0.]
 [0. 0. 5. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 5. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0.
 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0.
 1. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1.
 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1.
 1. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1

In [12]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=1.0, fit_prior=True)
clf.fit(X_train, Y_train)

MultinomialNB()

In [13]:
prediction_prob = clf.predict_proba(X_test)
print(prediction_prob[0:10])


[[7.50487439e-23 1.00000000e+00]
 [1.01806208e-01 8.98193792e-01]
 [3.57740570e-10 1.00000000e+00]
 [1.00000000e+00 2.94095407e-16]
 [1.00000000e+00 2.49760836e-25]
 [7.62630220e-01 2.37369780e-01]
 [3.47479627e-05 9.99965252e-01]
 [2.66075292e-11 1.00000000e+00]
 [5.88493563e-10 9.99999999e-01]
 [9.71326867e-09 9.99999990e-01]]


In [14]:
prediction = clf.predict(X_test)
print(prediction[:10])


[1. 1. 1. 0. 0. 0. 1. 1. 1. 1.]


In [15]:
accuracy = clf.score(X_test, Y_test)
print(f'The accuracy is: {accuracy*100:.1f}%')


The accuracy is: 71.6%
