In [1]:
""" 匯入函式庫，讀取題目給的資料 """
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# 新方法 可以建立一個 key dict 
from collections import defaultdict

# 更改user，num數字可以改變目標user和推薦電影數量 

user = 2
num = 10


# 匯入並查看資料形式 
ratings_small = pd.read_csv('ratings_small.csv')
input_table = ratings_small
ratings_small.info

<bound method DataFrame.info of         userId  movieId  rating   timestamp
0            1       31     2.5  1260759144
1            1     1029     3.0  1260759179
2            1     1061     3.0  1260759182
3            1     1129     2.0  1260759185
4            1     1172     4.0  1260759205
...        ...      ...     ...         ...
99999      671     6268     2.5  1065579370
100000     671     6269     4.0  1065149201
100001     671     6365     4.0  1070940363
100002     671     6385     2.5  1070979663
100003     671     6565     3.5  1074784724

[100004 rows x 4 columns]>

In [2]:
input_table.to_csv('first1000_data.csv')

In [3]:
print(type(ratings_small["userId"][0]))
print(type(ratings_small["movieId"][0]))
print(type(ratings_small["rating"][0]))
print(type(ratings_small["timestamp"][0]))

<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.float64'>
<class 'numpy.int64'>


In [4]:
""" 確認一下電影與使用者的數量 (ps.電影id編號不只9066，是這個資料表中有出現9066種) """

print('unique user in train set:', ratings_small['userId'].nunique())
print('unique movie in train set:', ratings_small['movieId'].nunique())
print('All combinations:',ratings_small['userId'].nunique() * ratings_small['movieId'].nunique())

unique user in train set: 671
unique movie in train set: 9066
All combinations: 6083286


In [5]:
""" 先把每個user最近看的那部電影抓出來做LeaveOneOut的validation """

#  把電影按照每個user分群，同個group裡再做timestamp做rank最近看的timestamp會最大rank會=1
ratings_small["rank"] = ratings_small.groupby("userId")["timestamp"].rank(method='first',ascending=False)
# print(ratings_small)
#  在每個user的group中找到rank=1(也就是最近看的那部電影)的 userId:moiveId
validation_data = ratings_small.loc[ratings_small["rank"]==1,["userId","movieId"]] #.values
print(validation_data)
ratings_small_test = ratings_small.drop(ratings_small[ratings_small["rank"]==1].index)
# print(ratings_small_test)

       userId  movieId
4           1     1172
66          2      405
110         3      736
231         4     1334
414         5     4025
...       ...      ...
99791     667      592
99801     668      296
99839     669     2702
99884     670     2723
99962     671     3386

[671 rows x 2 columns]


In [6]:
""" 選定特定user推薦num數量的電影 """
def recommend_movie(user,num):
    # 首先先計算與目標user有都看過的電影，並計算出其評分相似度 
    # 並用一個2維的list 紀錄對 1.某user對應的2.相似程度 
    user_similarity = []
    # 必須對list做for迴圈，不能對一個整數做for迴圈 
    for other_user in ratings_small.userId.unique():
        # 若是自己則不用計算，直接跳過 
        if other_user == user:
            continue
        common_movies = find_common_movie(user,other_user)
        # 避免出現nan而且similarity因分母是0出現問題 (不確定會不會出問題 ) 
        if(common_movies):
            similarity = cal_user_similarity_with_movie_rating(user,other_user,common_movies)
        else:
            similarity = 0
        user_similarity.append([other_user,similarity])
        
    # print(user_similarity)
    
    user_similarity=np.array(user_similarity)
    # 按照similarity程度sort後，取所有項的第[1]列值(也就是similarity)，取reverse，在切出0~9(top10) 
    # argsort是排序的index 
    sorted_index = np.argsort(user_similarity, axis=0)[:,1][::-1][:10]
    # 所以要再從此index找出對應哪個user (userId) 
    top10_similar_user = user_similarity[:,0][sorted_index]
    top10_similar = user_similarity[:,1][sorted_index]
    # 至此我們找到了前10位與目標user相似的users了 
    
    # print(top10_similar_user)
    # print(top10_similar)
    
    # 接下來我們要找出num部電影推薦給目標user 
    seen_movies = ratings_small.loc[ratings_small["userId"]==user,"movieId"].values
    not_seen_movies = defaultdict(list) 
    for similar_user in top10_similar_user:
        # 某個人所有看過的電影 
        movies = ratings_small.loc[ratings_small.userId==similar_user,["movieId","rating"]].values.tolist()
        if isinstance(movies[0], list):
            for movie in movies:
                # 如果看過的電影就跳過 
                if movie[0] in seen_movies:
                    continue
                """ 
                    movie[0] : movieId --- key
                    movie[1] : rating  --- value
                """
                
                # 沒看過的紀錄其評分到這個dict裡 
                not_seen_movies[movie[0]].append(movie[1])
    
    # print(not_seen_movies)
    
    # 再算出每部沒看過電影的平均分數 
    for movie in not_seen_movies:
        not_seen_movies[movie] = np.mean(not_seen_movies[movie])
    
    # print(not_seen_movies)
    
    # 按照分數高低對推薦電影做排序 
    # key是指要以什麼方式排序lambda是一個特殊的function可以直接寫 
    recommend_rating = sorted(not_seen_movies.items(), key=lambda x: x[1], reverse=True)
    
    
    
    # 將指定數量的Top電影加入推薦名單 
    recommend_list = []
    for i in range(num):
        recommend_list.append(recommend_rating[i][0])
    return recommend_list
    # return [movie for movie, rating in recommend_rating][0:num]
    

""" 找出user1、user2共同看過的電影 """
def find_common_movie(user1,user2):
    # 取所有user1看過的電影 (取 userId == user1 的所有movieId 的值 存成一個list) 
    s1=set(ratings_small.loc[ratings_small["userId"]==user1,"movieId"].values)
    s2=set(ratings_small.loc[ratings_small["userId"]==user2,"movieId"].values)
    # 找出user1和user2看過電影的交集 
    return s1.intersection(s2)

""" 計算餘弦相似度 """
def cosine_similarity(vec1, vec2):
    """
    :param vec1: 向量 a 
    :param vec2: 向量 b
    :return: sim
    """
    vec1 = np.mat(vec1)
    vec2 = np.mat(vec2)
    num = float(vec1 * vec2.T)
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    cos = num / denom
    # 將sim 從[-1:1] 變成 [0:1] 
    sim = 0.5 + 0.5 * cos
    return sim

""" 計算user1 、 user2 對於特定某些電影比對的評分相似程度 """
def cal_user_similarity_with_movie_rating(user1,user2,movies_id):
    u1 = ratings_small[ratings_small["userId"]==user1]
    u2 = ratings_small[ratings_small["userId"]==user2]
    vec1 = u1[u1.movieId.isin(movies_id)].sort_values(by="movieId")["rating"].values
    vec2 = u2[u2.movieId.isin(movies_id)].sort_values(by="movieId")["rating"].values
    return cosine_similarity(vec1, vec2)

In [7]:
hit_number=0
recommend_2={}
recommend_list=[]
for i in range(1,672):    #1~671
    #在validation_data中validation_data的userId==i的列取出movieId的那欄轉成values
    last_seen_movie = validation_data[validation_data['userId'] == i].movieId.values
    result_list = recommend_movie(i,10)
    recommend_2={'user':i,'first':result_list[0],'second':result_list[1]}
    recommend_list.append(recommend_2)
    print(i," : ",result_list[0]," ",result_list[1])
    if last_seen_movie in result_list:
        hit_number+=1

1  :  1035.0   1148.0
2  :  246.0   1231.0
3  :  26.0   58.0
4  :  2231.0   52.0
5  :  32.0   589.0
6  :  1.0   223.0
7  :  6156.0   6754.0
8  :  69.0   1343.0
9  :  18.0   52.0
10  :  246.0   910.0
11  :  260.0   527.0
12  :  904.0   919.0
13  :  704.0   1429.0
14  :  1968.0   2144.0
15  :  3920.0   3925.0
16  :  260.0   904.0
17  :  6156.0   6754.0
18  :  926.0   1207.0
19  :  169.0   2422.0
20  :  2058.0   3897.0
21  :  235.0   348.0
22  :  1035.0   1287.0
23  :  6156.0   6754.0
24  :  904.0   926.0
25  :  527.0   904.0
26  :  2058.0   1194.0
27  :  904.0   919.0
28  :  318.0   741.0
29  :  720.0   50.0
30  :  575.0   1223.0
31  :  47.0   377.0
32  :  260.0   919.0
33  :  260.0   904.0
34  :  169.0   2422.0
35  :  551.0   1198.0
36  :  541.0   912.0
37  :  50.0   318.0
38  :  524.0   908.0
39  :  265.0   750.0
40  :  904.0   919.0
41  :  32.0   524.0
42  :  235.0   348.0
43  :  1376.0   1527.0
44  :  858.0   1221.0
45  :  314.0   785.0
46  :  858.0   1089.0
47  :  919.0   926.0
48  

371  :  135.0   237.0
372  :  260.0   904.0
373  :  69.0   1343.0
374  :  380.0   500.0
375  :  678.0   908.0
376  :  112852.0   1270.0
377  :  500.0   62.0
378  :  2671.0   2724.0
379  :  86.0   494.0
380  :  778.0   1213.0
381  :  260.0   527.0
382  :  671.0   858.0
383  :  50.0   923.0
384  :  899.0   900.0
385  :  802.0   1073.0
386  :  3.0   7.0
387  :  6156.0   6754.0
388  :  1835.0   2132.0
389  :  370.0   589.0
390  :  231.0   1213.0
391  :  900.0   901.0
392  :  858.0   318.0
393  :  36.0   671.0
394  :  1464.0   1641.0
395  :  4993.0   8972.0
396  :  3176.0   3429.0
397  :  1035.0   4306.0
398  :  858.0   924.0
399  :  590.0   1242.0
400  :  6156.0   6754.0
401  :  364.0   1036.0
402  :  903.0   7064.0
403  :  2890.0   3181.0
404  :  101.0   920.0
405  :  6156.0   6754.0
406  :  1015.0   1019.0
407  :  314.0   969.0
408  :  2890.0   3181.0
409  :  175.0   778.0
410  :  246.0   581.0
411  :  110.0   293.0
412  :  858.0   2081.0
413  :  1.0   1729.0
414  :  858.0   1247.0
415  

In [8]:
print("Hit:",hit_number)
print("Hit rate:",hit_number/671)

Hit: 0
Hit rate: 0.0


In [9]:
print("目標user:",user)
print("推薦數量:",num)

result_list = recommend_movie(user,num)
i=0
print("\nTop\t:\tMovieID")
for movie in result_list:
    i=i+1
    print(i,"\t:\t",int(movie))

目標user: 2
推薦數量: 10

Top	:	MovieID
1 	:	 246
2 	:	 1231
3 	:	 1246
4 	:	 2109
5 	:	 2268
6 	:	 2997
7 	:	 1194
8 	:	 969
9 	:	 1221
10 	:	 169


In [10]:
pd.DataFrame(recommend_list).to_csv('user_recommend.csv')

In [11]:
import csv
labels = ['userId', 'first', 'second']
with open('dct.csv', 'w') as f:  
    writer = csv.writer(f)
    for k, v in recommend_2.items():
       writer.writerow([k, v])

In [12]:
# """ LeaveOneOut """
# # 基本上和原來的方法差不多，部過每次要提出一個user不做統計，最後再全部取平均 
# def recommend_movie_LOO(user,num):
#     user_similarity = []
#     for other_user in ratings_small.userId.unique():
#         if other_user == user:
#             continue
#         common_movies = find_common_movie(user,other_user)
#         if(common_movies):
#             similarity = cal_user_similarity_with_movie_rating(user,other_user,common_movies)
#         else:
#             similarity = 0
#         user_similarity.append([other_user,similarity])
    
#     user_similarity=np.array(user_similarity)
#     sorted_index = np.argsort(user_similarity, axis=0)[:,1][::-1][:10]
#     top10_similar_user = user_similarity[:,0][sorted_index]
#     top10_similar = user_similarity[:,1][sorted_index]
    
#     seen_movies = ratings_small.loc[ratings_small["userId"]==user,"movieId"].values
#     not_seen_movies_LOO = defaultdict(list) 
#     for similar_user in top10_similar_user:
#         for similar_other_user in top10_similar_user:
#             if similar_user == similar_other_user:
#                 continue
#             movies = ratings_small.loc[ratings_small.userId==similar_other_user,["movieId","rating"]].values.tolist()
#             if isinstance(movies[0], list):
#                 for movie in movies:
#                     if movie[0] in seen_movies:
#                         continue
#                     """ 
#                         movie[0] : movieId --- key
#                         movie[1] : rating  --- value
#                     """
#                     not_seen_movies_LOO[movie[0]].append(movie[1])

    
#     # print(not_seen_movies_LOO)
#     for movie in not_seen_movies_LOO:
#         not_seen_movies_LOO[movie] = np.mean(not_seen_movies_LOO[movie])
    
#     recommend_rating = sorted(not_seen_movies_LOO.items(), key=lambda x: x[1], reverse=True)
    
#     recommend_list = []
#     for i in range(num):
#         recommend_list.append(recommend_rating[i][0])
#     return recommend_list

In [13]:
# print("目標user:",user)
# print("推薦數量:",num)

# result_list_LOO = recommend_movie_LOO(user,num)
# i=0
# print("\nLeaveOneOut")
# print("Top\t:\tMovieID")
# for movie_LOO in result_list_LOO:
#     i=i+1
#     print(i,"\t:\t",int(movie_LOO))

In [14]:
# print(ratings_small)
print(ratings_small[0:1].userId.values)
print(ratings_small[0:1].movieId.values)
print(ratings_small[0:1].rating.values)
print(ratings_small[0:1].timestamp.values)

[1]
[31]
[2.5]
[1260759144]


In [15]:
# 現在是找跟user相近的user
# 去看不同user共同看過的電影找評分比較相近的

# 可以找看過的電影與其他電影的相似度，可以推薦比較相似的電影
# 看不同movie共同被同user看過且評分較類似的，或許可以代表是同性質的電影，這樣評判標準或許會比較好，但運算量會大很多