In [1]:
import pandas

users = pandas.read_csv('dataset/users.dat', sep='::',
                        engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')
ratings = pandas.read_csv('dataset/ratings.dat', engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
movies_train = pandas.read_csv('dataset/movies_train.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='latin-1', index_col=False).set_index('movieid')
movies_test = pandas.read_csv('dataset/movies_test.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='latin-1', index_col=False).set_index('movieid')
movies_train['genre'] = movies_train.genre.str.split('|')
movies_test['genre'] = movies_test.genre.str.split('|')
users.reset_index(inplace=True)

users.age = users.age.astype('category')
users.gender = users.gender.astype('category')
users.occupation = users.occupation.astype('category')
ratings.movieid = ratings.movieid.astype('category')
ratings.userid = ratings.userid.astype('category')

movies_train.reset_index(inplace=True)
movies_test.reset_index(inplace=True)

movies_train.head()

Unnamed: 0,movieid,title,genre
0,1650,Washington Square (1997),[Drama]
1,185,"Net, The (1995)","[Sci-Fi, Thriller]"
2,1377,Batman Returns (1992),"[Action, Adventure, Comedy, Crime]"
3,3204,"Boys from Brazil, The (1978)",[Thriller]
4,1901,Dear Jesse (1997),[Documentary]


In [2]:
ratings.drop('timestamp', axis=1, inplace=True)
rating_movies_train = ratings.merge(movies_train, on='movieid', how='inner')
rating_movies_test = ratings.merge(movies_test, on='movieid', how='inner')

rating_movies_train.drop('title', axis=1, inplace=True)
rating_movies_test.drop('title', axis=1, inplace=True)

# delete users who have rated less than 20 movies in the training set
delete_users = []
for user in rating_movies_train.userid.unique():
    if len(rating_movies_train[rating_movies_train.userid == user]) < 20:
        delete_users.append(user)

print(len(delete_users))
rating_movies_train = rating_movies_train[~rating_movies_train.userid.isin(delete_users)]
rating_movies_test = rating_movies_test[~rating_movies_test.userid.isin(delete_users)]

# delete movies which have been rated less than 20 times in the training set
delete_movies = []
for movie in rating_movies_train.movieid.unique():
    if len(rating_movies_train[rating_movies_train.movieid == movie]) < 20:
        delete_movies.append(movie)

print(len(delete_movies))
rating_movies_train = rating_movies_train[~rating_movies_train.movieid.isin(delete_movies)]
rating_movies_test = rating_movies_test[~rating_movies_test.movieid.isin(delete_movies)]

# delete movies which have been rated less than 20 times in the test set
delete_movies = []
for movie in rating_movies_test.movieid.unique():
    if len(rating_movies_test[rating_movies_test.movieid == movie]) < 20:
        delete_movies.append(movie)

print(len(delete_movies))
rating_movies_train = rating_movies_train[~rating_movies_train.movieid.isin(delete_movies)]
rating_movies_test = rating_movies_test[~rating_movies_test.movieid.isin(delete_movies)]

# delete users who have rated less than 20 movies in the test set
delete_users = []
for user in rating_movies_test.userid.unique():
    if len(rating_movies_test[rating_movies_test.userid == user]) < 20:
        delete_users.append(user)

print(len(delete_users))
rating_movies_train = rating_movies_train[~rating_movies_train.userid.isin(delete_users)]
rating_movies_test = rating_movies_test[~rating_movies_test.userid.isin(delete_users)]

329
533
132
3039


In [3]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from tqdm import tqdm
import pandas as pd

mlb = MultiLabelBinarizer()
mlb.fit(rating_movies_train.genre)

user_feature = dict()

movie_feature = pd.Series(
    mlb.transform(movies_train.genre).tolist(),
    index=movies_train.movieid
).to_dict()

epoch = 0

for i in range(epoch + 1):
    print('Epoch: {}'.format(i))

    # update user feature
    done_users = []
    user_feature_prev = user_feature.copy()
    for user_id in tqdm(rating_movies_train.userid.unique().tolist()):
        if user_id in done_users:
            continue
        user_ratings = rating_movies_train[rating_movies_train.userid == user_id]
        
        movie_id_X= user_ratings.movieid.values.tolist()
        X = [movie_feature[movie_id] for movie_id in movie_id_X]
        X = np.array(X)

        y = user_ratings['rating'].values.tolist()
        y = np.array(y).reshape(-1, 1)
        y = y / 5

        clf = Ridge(alpha=1.0)
        clf.fit(X, y)
        
        user_feature[user_id] = np.array(clf.coef_, dtype=float).flatten()
    
    # update movie feature
    movie_feature_prev = movie_feature.copy()
    done_movies = []
    for movie_id in tqdm(rating_movies_train.movieid.unique().tolist()):
        if movie_id in done_movies:
            continue

        movie_ratings = rating_movies_train[rating_movies_train.movieid == movie_id]
        user_id_y = movie_ratings.userid.values.tolist()
        y = movie_ratings['rating'].values.tolist()
        y = np.array(y).reshape(-1, 1)
        y = y / 5
        
        X = [user_feature[user_id].flatten() for user_id in user_id_y]
        X = np.array(X)

        clf = Ridge(alpha=1.0)
        clf.fit(X, y)

        movie_feature[movie_id] = np.array(clf.coef_, dtype=float).flatten()

    if i > 0:
        if np.allclose(np.array(list(user_feature.values())), np.array(list(user_feature_prev.values()))):
            print('User feature converged')
            break
        # calculate distance between user feature prev and user feature
        dist = 0
        for user_id in user_feature.keys():
            dist += np.linalg.norm(user_feature[user_id] - user_feature_prev[user_id])
        print('User feature distance: {}'.format(dist))
        

Epoch: 0


100%|██████████| 2672/2672 [00:13<00:00, 205.44it/s]
100%|██████████| 2427/2427 [00:11<00:00, 211.53it/s]


In [4]:
movie_feature = dict()
for movie_id in tqdm(rating_movies_test.movieid.unique().tolist()):
    movie_ratings = rating_movies_test[rating_movies_test.movieid == movie_id]
    y = movie_ratings.rating.values.tolist()
    y = np.array(y)

    X = []
    for user_id in movie_ratings.userid.tolist():
        X.append(user_feature[user_id].flatten())
    X = np.array(X)

    clf = Ridge(alpha=1.0)
    clf.fit(X, y)
    movie_feature[movie_id] = clf.coef_

  0%|          | 0/611 [00:00<?, ?it/s]

100%|██████████| 611/611 [00:01<00:00, 356.55it/s]


In [5]:
import numpy as np

bit_length = 18
all_genres_combine = []

for i in range(2**bit_length):
    binary_representation = [(i >> j) & 1 for j in range(bit_length - 1, -1, -1)]
    all_genres_combine.append(binary_representation)

all_genres_combine = np.array(all_genres_combine)

print(all_genres_combine.shape)

(262144, 18)


In [6]:
import torch
from torchmetrics.classification import MultilabelF1Score, MultilabelPrecision, MultilabelRecall
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'
metric = MultilabelF1Score(num_labels=18, threshold=0.5, average='macro')
metric.to(device)
precision = MultilabelPrecision(num_labels=18, threshold=0.5, average='macro')
precision.to(device)
recall = MultilabelRecall(num_labels=18, threshold=0.5, average='macro')
recall.to(device)

all_pred = []
all_true = []
for key, value in tqdm(movie_feature.items()):    
    genres = torch.tensor(value, dtype=torch.float32, device=device).reshape(1, -1)
    y_true = torch.tensor(mlb.transform([movies_test[movies_test.movieid == key].genre.tolist()[0]]), dtype=torch.float32, device=device).reshape(1, -1)
    all_pred.append(genres)
    all_true.append(y_true)
    
f1_all = metric(torch.cat(all_pred), torch.cat(all_true))
precision_all = precision(torch.cat(all_pred), torch.cat(all_true))
recall_all = recall(torch.cat(all_pred), torch.cat(all_true))

print(f'f1: {f1_all}, precision: {precision_all}, recall: {recall_all}')

100%|██████████| 611/611 [00:00<00:00, 1441.83it/s]

f1: 0.2348705232143402, precision: 0.1643388718366623, recall: 0.8041476607322693



