In [2]:
# preprocess movielenz data
# 1. read data
# 2. split data
# 3. save data
# 4. load data
# 5. get user-item matrix
# 6. get user-user similarity matrix
# 7. get item-item similarity matrix
# 8. get user-item rating matrix

import pandas as pd
import numpy as np
import pickle
import os
import random
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score


# 2. split data
def split_data(data, test_size=0.2):
    test = data.sample(frac=test_size)
    train = data.drop(test.index)
    return train, test


# 5. get user-item matrix

def get_user_item_matrix(data):
    user_item_matrix = data.pivot(index='userId', columns='movieId', values='rating')
    user_item_matrix.fillna(0, inplace=True)
    return user_item_matrix

# 6. get user-user similarity matrix

def get_user_user_similarity_matrix(user_item_matrix):
    user_user_similarity_matrix = cosine_similarity(user_item_matrix)
    return user_user_similarity_matrix

# 7. get item-item similarity matrix

def get_item_item_similarity_matrix(user_item_matrix):
    item_item_similarity_matrix = cosine_similarity(user_item_matrix.T)
    return item_item_similarity_matrix

# 8. get user-item rating matrix




In [3]:
def genre():
    # maps movieId to genres
    movie_genre = {}
    with open('movies.csv', 'r') as f:
        for line in f.readlines()[1:]:
            movie_id, genres = line.strip().split(',')[0], line.strip().split(',')[2]
            movie_genre[movie_id] = genres
    return movie_genre
    

In [4]:
m=pd.read_csv('movies.csv')

### 전처리 1


In [5]:
m

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


### 전처리 2


In [6]:
genre = genre()

In [7]:
genre

{'1': 'Adventure|Animation|Children|Comedy|Fantasy',
 '2': 'Adventure|Children|Fantasy',
 '3': 'Comedy|Romance',
 '4': 'Comedy|Drama|Romance',
 '5': 'Comedy',
 '6': 'Action|Crime|Thriller',
 '7': 'Comedy|Romance',
 '8': 'Adventure|Children',
 '9': 'Action',
 '10': 'Action|Adventure|Thriller',
 '11': ' The (1995)"',
 '12': 'Comedy|Horror',
 '13': 'Adventure|Animation|Children',
 '14': 'Drama',
 '15': 'Action|Adventure|Romance',
 '16': 'Crime|Drama',
 '17': 'Drama|Romance',
 '18': 'Comedy',
 '19': 'Comedy',
 '20': 'Action|Comedy|Crime|Drama|Thriller',
 '21': 'Comedy|Crime|Thriller',
 '22': 'Crime|Drama|Horror|Mystery|Thriller',
 '23': 'Action|Crime|Thriller',
 '24': 'Drama|Sci-Fi',
 '25': 'Drama|Romance',
 '26': 'Drama',
 '27': 'Children|Drama',
 '28': 'Drama|Romance',
 '29': ' The (Cité des enfants perdus',
 '30': 'Crime|Drama',
 '31': 'Drama',
 '32': 'Mystery|Sci-Fi|Thriller',
 '34': 'Children|Drama',
 '36': 'Crime|Drama',
 '38': 'Children|Comedy',
 '39': 'Comedy|Romance',
 '40': ' the

In [8]:

a=pd.read_csv('ratings.csv')

In [9]:
a

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


### 전처리 3

In [10]:
get_user_item_matrix(a)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 전처리 4

In [39]:
#user user similarity matrix
def user_user_similarity_matrix(user_item_matrix):
    user_user_similarity_matrix = cosine_similarity(user_item_matrix)
    return user_user_similarity_matrix

user_user_similarity_matrix(get_user_item_matrix(a)).shape

(610, 610)

### 전처리 5

In [14]:
# item item similarity matrix
def item_item_similarity_matrix(user_item_matrix):
    item_item_similarity_matrix = cosine_similarity(user_item_matrix.T)
    return item_item_similarity_matrix

In [38]:
item_item_similarity_matrix(get_user_item_matrix(a)).shape

(9724, 9724)

### memory based Collaborate filtering

In [15]:
# memory based collaborative filtering
# 1. user-user collaborative filtering

def user_user_collaborative_filtering(user_item_matrix, user_user_similarity_matrix, user_id, k=5):
    # get similarity matrix
    similarity_matrix = user_user_similarity_matrix[user_id]
    # get top k similar users
    top_k_similar_users = similarity_matrix.argsort()[-k:][::-1]
    # get user-item matrix
    user_item_matrix = user_item_matrix.values
    # get user-item rating matrix
    user_item_rating_matrix = user_item_matrix[top_k_similar_users]
    # get user-item rating prediction
    user_item_rating_prediction = np.dot(similarity_matrix[top_k_similar_users], user_item_rating_matrix) / np.sum(similarity_matrix[top_k_similar_users])
    return user_item_rating_prediction

# 2. item-item collaborative filtering

def item_item_collaborative_filtering(user_item_matrix, item_item_similarity_matrix, user_id, k=5):

    # get user-item matrix
    user_item_matrix = user_item_matrix.values
    # get user-item rating matrix
    user_item_rating_matrix = user_item_matrix[:, user_id]
    # get top k similar items
    top_k_similar_items = item_item_similarity_matrix[user_id].argsort()[-k:][::-1]
    # get user-item rating prediction
    user_item_rating_prediction = np.dot(user_item_rating_matrix[top_k_similar_items], item_item_similarity_matrix[user_id][top_k_similar_items]) / np.sum(item_item_similarity_matrix[user_id][top_k_similar_items])
    return user_item_rating_prediction




In [55]:
# recommendation for item-item collaborative filtering

def recommendation_item_item_collaborative_filtering(user_item_matrix, item_item_similarity_matrix, user_id, k=5):
    # get user-item matrix
    user_item_matrix = user_item_matrix.values
    # get user-item rating matrix
    user_item_rating_matrix = user_item_matrix[:, user_id]
    # get top k similar items
    top_k_similar_items = item_item_similarity_matrix[user_id].argsort()[-k:][::-1]
    # get user-item rating prediction
    user_item_rating_prediction = np.dot(user_item_rating_matrix[top_k_similar_items], item_item_similarity_matrix[user_id][top_k_similar_items]) / np.sum(item_item_similarity_matrix[user_id][top_k_similar_items])
    # get recommendation
    recommendation = np.argsort(user_item_rating_prediction)[::-1]
    return top_k_similar_items

In [42]:
# recommendation for user-user collaborative filtering

def recommendation_user_user_collaborative_filtering(user_item_matrix, user_user_similarity_matrix, user_id, k=5):
    # get similarity matrix
    similarity_matrix = user_user_similarity_matrix[user_id]
    # get top k similar users
    top_k_similar_users = similarity_matrix.argsort()[-k:][::-1]
    # get user-item matrix
    user_item_matrix = user_item_matrix.values
    # get user-item rating matrix
    user_item_rating_matrix = user_item_matrix[top_k_similar_users]
    # get user-item rating prediction
    user_item_rating_prediction = np.dot(similarity_matrix[top_k_similar_users], user_item_rating_matrix) / np.sum(similarity_matrix[top_k_similar_users])
    # get recommendation
    recommendation = np.argsort(user_item_rating_prediction)[::-1]
    return recommendation

In [21]:
# 3. model based collaborative filtering
# 1. matrix factorization

def matrix_factorization(user_item_matrix, k=5, lr=0.01, epochs=3):
    # get user-item matrix
    user_item_matrix = user_item_matrix.values
    # get user number and item number
    user_num, item_num = user_item_matrix.shape
    # initialize user latent matrix and item latent matrix
    user_latent_matrix = np.random.rand(user_num, k)
    item_latent_matrix = np.random.rand(k, item_num)
    # train model
    for epoch in range(epochs):
        for i in range(user_num):
            for j in range(item_num):
                if user_item_matrix[i][j] > 0:
                    error = user_item_matrix[i][j] - np.dot(user_latent_matrix[i], item_latent_matrix[:, j])
                    user_latent_matrix[i] += lr * error * item_latent_matrix[:, j]
                    item_latent_matrix[:, j] += lr * error * user_latent_matrix[i]
    # get user-item rating prediction
    user_item_rating_prediction = np.dot(user_latent_matrix, item_latent_matrix)
    return user_item_rating_prediction





In [24]:
matrix_factorization(get_user_item_matrix(a), k=5, lr=0.01, epochs=3)

array([[4.52459777, 4.26963903, 3.66176678, ..., 4.05391876, 3.09043463,
        3.97389748],
       [3.37482546, 3.00851365, 3.06343499, ..., 3.13465971, 2.50764358,
        2.92672338],
       [2.44190231, 2.5244116 , 2.42132618, ..., 2.06714275, 1.73941709,
        2.14965631],
       ...,
       [4.1691965 , 4.10554644, 3.71398089, ..., 3.68999007, 2.87137459,
        3.64106855],
       [2.97980447, 2.93723234, 2.60273509, ..., 2.70888631, 2.15685933,
        2.6781248 ],
       [4.42237679, 4.31945553, 3.87866294, ..., 3.92637636, 3.04483185,
        3.86596892]])

In [16]:
#recommendation

def recommendation(user_item_rating_prediction, user_id, top_n=10):
    # get user-item rating prediction
    user_item_rating_prediction = user_item_rating_prediction[user_id]
    # get top n item index
    top_n_item_index = user_item_rating_prediction.argsort()[-top_n:][::-1]
    # get top n item id
    top_n_item_id = [i for i in range(len(top_n_item_index))]
    for i in range(len(top_n_item_index)):
        top_n_item_id[i] = top_n_item_index[i]
    return top_n_item_id

In [43]:
recommendation(matrix_factorization(get_user_item_matrix(a), k=5, lr=0.01, epochs=3), 1, top_n=10)


[900, 686, 840, 2224, 714, 1493, 2579, 1631, 731, 596]

In [53]:
recommendation_user_user_collaborative_filtering(get_user_item_matrix(a), user_user_similarity_matrix(get_user_item_matrix(a)), 1, k=5)


array([7355, 6693, 8287, ..., 6444, 6443, 6452])

In [56]:
recommendation_item_item_collaborative_filtering(get_user_item_matrix(a), item_item_similarity_matrix(get_user_item_matrix(a)), 1, k=5)

array([  1, 322, 436, 325, 418])

In [57]:
# 4. evaluation

def evaluation(user_item_matrix, user_item_rating_prediction, user_id):

    # get user-item matrix
    user_item_matrix = user_item_matrix.values
    # get user-item rating prediction
    user_item_rating_prediction = user_item_rating_prediction[user_id]
    # get user-item rating prediction index
    user_item_rating_prediction_index = np.where(user_item_matrix[user_id] > 0)
    # get user-item rating prediction
    user_item_rating_prediction = user_item_rating_prediction[user_item_rating_prediction_index]
    # get user-item rating
    user_item_rating = user_item_matrix[user_id][user_item_rating_prediction_index]
    # get rmse
    rmse = np.sqrt(mean_squared_error(user_item_rating, user_item_rating_prediction))
    # get mae
    mae = mean_absolute_error(user_item_rating, user_item_rating_prediction)
    # get accuracy
    accuracy = accuracy_score(np.around(user_item_rating), np.around(user_item_rating_prediction))
    return rmse, mae, accuracy

In [58]:
evaluation(get_user_item_matrix(a), matrix_factorization(get_user_item_matrix(a), k=5, lr=0.01, epochs=3), 1)

(0.8792079015484251, 0.6472124710617574, 0.5517241379310345)