# 關於 Movielens 協同舉薦作業

### 檔案說明
> 第一部分使用用戶協同推薦算法完成

### 一、基於用戶的協同過濾算法

#### 1) 讀取數據

In [1]:
import pandas as pd
import numpy as np
import math

In [14]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./movielens/ml-100k/u.user', sep='|', names=u_cols,encoding='latin-1')

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('./movielens/ml-100k/u.data', sep='\t', names=r_cols,encoding='latin-1')

m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url'] 
movies = pd.read_csv('./movielens/ml-100k/u.item', sep='|', names=m_cols, usecols=range(5),encoding='latin-1') 

movie_ratings = pd.merge(movies, ratings) 
lens = pd.merge(movie_ratings, users)

# print(lens)


#### 2) 整理數據格式

In [2]:
# 943 名用户   1682 部电影


user_list = np.zeros([943, 1682])

for i in range(len(ratings.values)):
    user_list[ratings.values[i][0]-1][ratings.values[i][1]-1] = ratings.values[i][2]



#### 3) 核心推薦算法實現

In [2]:
def user_based_recommend(data, user_num, userK, topK):
    """ 基於用戶user的topK 舉薦
    
    Args:
        data: 數據表
        user: 用戶編號
        userK: 用戶組 topK
        topK: 商品組 topK

    Returns:
        舉薦列表
    """
    user_num -= 1
    user = data[user_num]
    sim_list = []
    # del data[user_num]  # 從用戶數據表中刪除用戶本身
    for i in range(len(data)):
        sim_list.append([cos_sim(user, data[i]), i])
    sim_list.sort()
    sim_list = sim_list[-userK - 1:]

    result = {}
    for i in range(len(sim_list)):
        for a in range(len(data[0])):
            if data[sim_list[i][1]][a] != 0 and user[a] == 0:
                if a in result:
                    result[a] += data[sim_list[i][1]][a]
                else:
                    result[a] = data[sim_list[i][1]][a]

    result = sorted(result.items(), key=lambda x: x[1], reverse=True)
    
    # print(result[:topK])
    return result[:topK]


def cos_sim(x_, y_):
    """ 余憲相似性

    Args:
    - x: mat, 以行向量形式存儲
    - y: mat, 以行向量形式存儲

    Return: x 和 y 之間的余憲相似的度
    """
    x, y = [], []
    for i in range(len(x_)):
        if x_[i] != 0 and y_[i] != 0:
            x.append(x_[i])
            y.append(y_[i])

    x, y = np.array(x), np.array(y)
    numerator = np.sum(x.T * y)  
    denominator = np.sqrt(np.sum(x.T * x)) * np.sqrt(np.sum(y.T * y))
    return numerator / denominator


In [72]:
user_based_recommend(user_list, 1, 3, 3)

[(287, 12.0), (301, 8.0), (306, 8.0)]

#### 說明: 從數據中可以查到，一號用戶未看過的三部電影，算法通過TopK排名為其推薦，下列將其電影的詳細打印出來

In [73]:
li = user_based_recommend(user_list, 1, 3, 3)
# print(movies.values)
for i in li:
    print(movies.values[i[0]])


[288 'Scream (1996)' '20-Dec-1996' nan
 'http://us.imdb.com/M/title-exact?Scream%20(1996)']
[302 'L.A. Confidential (1997)' '01-Jan-1997' nan
 'http://us.imdb.com/M/title-exact?L%2EA%2E+Confidential+(1997)']
[307 "Devil's Advocate, The (1997)" '01-Jan-1997' nan
 "http://us.imdb.com/M/title-exact?Devil's+Advocate,+The+(1997)"]


#### 4) 使用驗證集和測試集進行實驗

In [3]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
train = pd.read_csv('./movielens/ml-100k/u1.base', sep='\t', names=r_cols, encoding='latin-1')
test = pd.read_csv('./movielens/ml-100k/u1.test', sep='\t', names=r_cols, encoding='latin-1')

train_list = np.zeros([943, 1682])
test_list = np.zeros([943, 1682])

for i in range(len(train.values)):
    train_list[train.values[i][0] - 1][train.values[i][1] - 1] = train.values[i][2]

for i in range(len(test.values)):
    test_list[test.values[i][0] - 1][test.values[i][1] - 1] = test.values[i][2]

user_based_recommend(train_list, 1, 10, 5)



[(312, 45.0), (257, 34.0), (299, 30.0), (327, 26.0), (322, 20.0)]

In [11]:
tj = user_based_recommend(train_list, 2, 10, 10)
tr=0
for i in tj:
    if test_list[5-1][i[0]]>0:
        tr+=1
        
print(tr)



[(171, 26.0), (184, 26.0), (426, 26.0), (196, 25.0), (173, 24.0), (482, 22.0), (49, 20.0), (97, 19.0), (602, 19.0), (478, 19.0)]
2
