<a href="https://colab.research.google.com/github/lunayee/recommender-system/blob/main/%E6%8E%A8%E8%96%A6%E7%B3%BB%E7%B5%B1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 載入套件

In [None]:
import pandas as pd
import numpy as np
#繪圖相關套件
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
plt.style.use('ggplot')
#標籤編碼
from sklearn.preprocessing import LabelEncoder, OneHotEncoder 
#決策樹
from sklearn.tree import DecisionTreeClassifier
#隨機森林(Random Forest)
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display
#餘弦相似度
from sklearn.metrics.pairwise import cosine_similarity

# 載入資料

In [None]:
from google.colab import files

# files.upload has to be *called*: the bare attribute only binds the function
# object to `uploaded` and never runs the upload, so movies.csv / ratings.csv
# would not be available to the read_csv calls in the next cell.
# NOTE(review): per the Colab docs the call returns a mapping of uploaded file
# names to their contents -- confirm against the Colab version in use.
uploaded = files.upload()

In [None]:
# Movie metadata: only movieId and genres are needed, so the title is discarded.
df_movie = pd.read_csv('movies.csv').drop(columns='title')

# Viewing records: only the (userId, movieId) pairs are needed.
df_ratings = pd.read_csv('ratings.csv').drop(columns=['rating', 'timestamp'])

# The machine cannot handle the full table, so keep only the first 20000 records.
df_rating = df_ratings.head(20000)

# Attach each record's genres by joining on movieId.
df_movies = df_rating.merge(df_movie, on='movieId')
df_movies

Unnamed: 0,userId,movieId,genres
0,1,296,Comedy|Crime|Drama|Thriller
1,3,296,Comedy|Crime|Drama|Thriller
2,4,296,Comedy|Crime|Drama|Thriller
3,5,296,Comedy|Crime|Drama|Thriller
4,7,296,Comedy|Crime|Drama|Thriller
...,...,...,...
19995,166,945,Comedy|Musical|Romance
19996,166,948,Drama|Romance|Western
19997,166,982,Drama
19998,166,1009,Adventure|Children|Fantasy


In [None]:
# Sanity-check the merge: show the row(s) where user 2 watched movie 1.
is_movie_1 = df_movies['movieId'].eq(1)
is_user_2 = df_movies['userId'].eq(2)
df_movies[is_movie_1 & is_user_2]


Unnamed: 0,userId,movieId,genres
620,2,1,Adventure|Animation|Children|Comedy|Fantasy


# 特徵工程


## (1)資料整理
將電影中genres的類型做區別，以離散數字0,1表示

In [None]:
# One 0/1 indicator column per genre, split on the '|' separator.
dummies = df_movie['genres'].str.get_dummies(sep='|')

# Swap the raw genres string for the indicator columns and key the rows by movieId.
df_movie_vec = pd.concat(
    [df_movie.drop(columns='genres'), dummies],
    axis='columns',
).set_index('movieId')

print(df_movie_vec.shape)
df_movie_vec.head()

(62423, 20)


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## (2)用戶特徵萃取
先將每位user觀看紀錄中的genres以離散數字0,1表示，再依userId取平均，得到該user在各類型上的觀看比例。

假設user1有10條觀看紀錄，其中動作類型佔4條，則在動作類型這個特徵為0.4分。

In [None]:
# One 0/1 indicator column per genre for every viewing record.
dummies = df_movies['genres'].str.get_dummies(sep='|')

# Keep userId plus the indicators, then average per user: each value becomes
# the share of that user's records that carry the genre.
df_user_vec = (
    pd.concat(
        [df_movies.drop(columns=['genres', 'movieId']), dummies],
        axis='columns',
    )
    .groupby('userId')
    .mean()
)

print(df_user_vec.shape)
df_user_vec.head()

(166, 20)


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,0.057143,0.157143,0.028571,0.042857,0.328571,0.114286,0.014286,0.757143,0.071429,0.014286,0.014286,0.0,0.071429,0.057143,0.257143,0.071429,0.071429,0.071429,0.014286
2,0.0,0.358696,0.407609,0.092391,0.13587,0.342391,0.097826,0.0,0.494565,0.157609,0.0,0.016304,0.032609,0.059783,0.043478,0.184783,0.152174,0.163043,0.081522,0.027174
3,0.001524,0.509146,0.301829,0.07622,0.073171,0.268293,0.20122,0.004573,0.353659,0.118902,0.007622,0.068598,0.123476,0.009146,0.091463,0.091463,0.341463,0.364329,0.039634,0.012195
4,0.0,0.599174,0.471074,0.128099,0.115702,0.334711,0.152893,0.020661,0.202479,0.161157,0.0,0.041322,0.169421,0.028926,0.07438,0.041322,0.35124,0.231405,0.03719,0.03719
5,0.0,0.178218,0.207921,0.039604,0.089109,0.485149,0.138614,0.0,0.445545,0.079208,0.0,0.029703,0.029703,0.069307,0.069307,0.19802,0.108911,0.237624,0.019802,0.049505


## (3)建構用戶-影片相似度矩陣
輸入影片的ID，得到一組影片特徵向量，逐一計算與用戶特徵向量的相似度，列出相似度最高的前幾名即是可能的受眾名單

1.   今日要找出與movie1相似度最高的20個用戶(換言之，即是最喜歡此類型電影的前20個受眾)，方法是得到movie1影片特徵向量，逐一計算與用戶特徵向量的相似度，列出相似度最高的前20名。
2.   今日要找出與movie1相似度最高的100個用戶，方法是得到movie1影片特徵向量，逐一計算與用戶特徵向量的相似度，列出相似度最高的前100名。

很顯然的，無論是前20個還是前100個，真正耗時之處都在於逐一計算影片特徵向量與用戶特徵向量的相似度。倘若事先建構好用戶-影片相似度矩陣，以ID檢索時就能直接排序並取出最相似的名單，降低頻繁計算對伺服器造成的負擔。

In [None]:
# Build the user-movie matrix: entry (i, j) relates user i to movie j.
#
# The user genre columns are derived from the sampled viewing records only,
# while the movie genre columns are derived from the full movie list. Align the
# user columns to the movie columns so both sides always describe the same
# genres in the same order; a genre missing from the sample is a 0 share.
# When the columns already match this is a no-op.
aligned_user_vec = df_user_vec.reindex(columns=df_movie_vec.columns, fill_value=0)

# Cosine similarity: the larger the value, the more related the pair.
df_user_movie_matrix = cosine_similarity(aligned_user_vec.values, df_movie_vec.values)

# Flip to a distance (1 - similarity) so that an ascending argsort lists the
# best matches first.
df_user_movie_matrix = 1 - df_user_movie_matrix
df_user_movie_matrix = pd.DataFrame(
    df_user_movie_matrix,
    index=df_user_vec.index,
    columns=df_movie_vec.index,
)
df_user_movie_matrix

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,209035,209037,209041,209049,209051,209053,209055,209057,209063,209065,209067,209069,209073,209075,209079,209085,209089,209101,209103,209119,209121,209123,209129,209131,209133,209135,209137,209139,209141,209143,209145,209147,209151,209153,209155,209157,209159,209163,209169,209171
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.689687,0.827008,0.542805,0.144145,0.637289,0.845218,0.542805,0.843885,0.936920,0.817903,0.144145,0.732373,0.854323,0.164188,0.699540,0.319783,0.208271,0.637289,0.637289,0.344110,0.672226,0.499267,0.845218,0.353236,0.208271,0.164188,0.375538,0.208271,0.449899,0.319783,0.164188,0.872532,0.735960,0.375538,0.208271,0.319783,0.988849,0.710071,0.542805,0.164188,...,0.721222,1.000000,0.921942,0.984230,1.000000,0.637289,0.152516,0.164188,1.000000,1.000000,1.000000,0.208271,0.164188,0.386689,0.968460,0.716139,1.000000,1.000000,1.000000,0.152516,0.984230,0.984230,0.984230,0.984230,1.000000,0.933093,0.984230,0.164188,0.637289,0.966547,0.164188,0.899640,1.000000,0.152516,0.590282,0.164188,0.984230,0.152516,1.000000,0.380871
2,0.437696,0.551937,0.587365,0.347010,0.620990,0.604038,0.587365,0.574603,0.602942,0.406057,0.347010,0.719238,0.593618,0.452542,0.392163,0.536317,0.468254,0.620990,0.620990,0.278960,0.614458,0.596433,0.604038,0.493777,0.468254,0.452542,0.506539,0.468254,0.378507,0.536317,0.452542,0.770759,0.600564,0.506539,0.468254,0.536317,0.974476,0.625651,0.587365,0.452542,...,0.659682,1.000000,0.804317,1.000000,1.000000,0.620990,0.344888,0.452542,1.000000,1.000000,1.000000,0.468254,0.452542,0.540571,0.897728,0.795455,1.000000,1.000000,1.000000,0.344888,1.000000,1.000000,1.000000,1.000000,1.000000,0.876635,0.969920,0.452542,0.620990,0.927682,0.452542,0.910667,1.000000,0.344888,0.433843,0.452542,1.000000,0.344888,1.000000,0.194182
3,0.605004,0.699600,0.732014,0.566089,0.717364,0.346352,0.732014,0.720659,0.463634,0.285160,0.566089,0.749047,0.725561,0.627434,0.451122,0.586666,0.668424,0.717364,0.717364,0.200671,0.492844,0.491532,0.346352,0.482197,0.668424,0.627434,0.682050,0.668424,0.431206,0.586666,0.627434,0.515096,0.685693,0.682050,0.668424,0.586666,0.904615,0.745640,0.732014,0.627434,...,0.743369,0.998394,0.854652,0.995182,0.998394,0.717364,0.536702,0.627434,0.998394,0.998394,0.998394,0.668424,0.627434,0.679779,0.919706,0.903647,0.998394,0.998394,0.998394,0.536702,0.995182,0.995182,0.995182,0.995182,0.998394,0.908022,0.987153,0.627434,0.717364,0.939817,0.627434,0.799010,0.998394,0.536702,0.471519,0.627434,0.995182,0.536702,0.998394,0.291650
4,0.466920,0.574864,0.738220,0.671166,0.670470,0.440982,0.738220,0.591509,0.410101,0.260123,0.671166,0.738220,0.593655,0.800655,0.368168,0.752604,0.830275,0.670470,0.670470,0.330466,0.591306,0.690705,0.440982,0.614522,0.830275,0.800655,0.778494,0.830275,0.445088,0.752604,0.800655,0.626538,0.612445,0.778494,0.830275,0.752604,0.867672,0.686440,0.738220,0.800655,...,0.677810,1.000000,0.798631,0.979659,1.000000,0.670470,0.626029,0.800655,1.000000,1.000000,1.000000,0.830275,0.800655,0.769864,0.873884,0.959317,1.000000,1.000000,1.000000,0.626029,0.979659,0.979659,0.979659,0.979659,1.000000,0.873425,0.963386,0.800655,0.670470,0.896439,0.800655,0.864795,1.000000,0.626029,0.445679,0.800655,0.979659,0.626029,1.000000,0.276565
5,0.506120,0.733751,0.407894,0.201253,0.405350,0.607633,0.407894,0.742563,0.781557,0.558587,0.201253,0.553775,0.761777,0.453893,0.586613,0.493707,0.442219,0.405350,0.405350,0.185912,0.390430,0.495265,0.607633,0.519450,0.442219,0.453893,0.536613,0.442219,0.500693,0.493707,0.453893,0.705725,0.691712,0.536613,0.442219,0.493707,0.974256,0.502288,0.407894,0.453893,...,0.545194,1.000000,0.897025,1.000000,1.000000,0.405350,0.193363,0.453893,1.000000,1.000000,1.000000,0.442219,0.453893,0.579519,0.951457,0.757286,1.000000,1.000000,1.000000,0.193363,1.000000,1.000000,1.000000,1.000000,1.000000,0.931350,0.939321,0.453893,0.405350,0.965675,0.453893,0.854119,1.000000,0.193363,0.390430,0.453893,1.000000,0.193363,1.000000,0.411449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,0.187324,0.386645,0.453071,0.359743,0.589965,0.870873,0.453071,0.433302,0.916129,0.720224,0.359743,0.690293,0.338222,0.664517,0.564195,0.729830,0.505787,0.589965,0.589965,0.566573,0.682562,0.749946,0.870873,0.723241,0.505787,0.664517,0.413534,0.505787,0.545735,0.729830,0.664517,0.881633,0.601857,0.413534,0.505787,0.729830,0.986821,0.360818,0.453071,0.664517,...,0.466250,1.000000,0.571682,1.000000,1.000000,0.589965,0.472839,0.664517,1.000000,1.000000,1.000000,0.505787,0.664517,0.518966,0.655198,0.636560,1.000000,1.000000,1.000000,0.472839,1.000000,1.000000,1.000000,1.000000,1.000000,0.815494,1.000000,0.664517,0.589965,0.756188,0.664517,0.947284,1.000000,0.472839,0.505011,0.664517,1.000000,0.472839,1.000000,0.580336
163,0.381307,0.530159,0.539652,0.518413,0.511727,0.506667,0.539652,0.539652,0.430348,0.377461,0.518413,0.625967,0.577143,0.816898,0.400953,0.812984,0.755440,0.511727,0.511727,0.317618,0.553651,0.754342,0.506667,0.683511,0.755440,0.816898,0.625967,0.755440,0.572374,0.812984,0.816898,0.706349,0.682857,0.625967,0.755440,0.812984,0.942456,0.410179,0.539652,0.816898,...,0.597195,1.000000,0.827369,1.000000,1.000000,0.511727,0.525266,0.816898,1.000000,1.000000,1.000000,0.755440,0.816898,0.812984,0.918621,0.837242,1.000000,1.000000,1.000000,0.525266,1.000000,1.000000,1.000000,1.000000,1.000000,0.884913,1.000000,0.816898,0.511727,0.942456,0.816898,0.913685,1.000000,0.525266,0.295239,0.816898,1.000000,0.525266,1.000000,0.389207
164,0.477261,0.676633,0.437506,0.184553,0.464264,0.695379,0.437506,0.724493,0.788952,0.601650,0.184553,0.580999,0.732871,0.383092,0.564158,0.477684,0.380108,0.464264,0.464264,0.248563,0.507920,0.535344,0.695379,0.448985,0.380108,0.383092,0.489163,0.380108,0.426440,0.477684,0.383092,0.765676,0.653201,0.489163,0.380108,0.477684,0.913904,0.546561,0.437506,0.383092,...,0.569520,1.000000,0.827808,0.935062,1.000000,0.464264,0.184957,0.383092,1.000000,1.000000,1.000000,0.380108,0.383092,0.512122,0.926945,0.740249,1.000000,1.000000,1.000000,0.184957,0.935062,0.935062,0.935062,0.935062,1.000000,0.833548,1.000000,0.383092,0.464264,0.902424,0.383092,0.873726,1.000000,0.184957,0.418877,0.383092,0.935062,0.184957,1.000000,0.357953
165,0.628147,0.798375,0.506122,0.308714,0.584256,0.385524,0.506122,0.800097,0.634145,0.395125,0.308714,0.670748,0.798375,0.501107,0.500738,0.482604,0.447326,0.584256,0.584256,0.114991,0.356720,0.405036,0.385524,0.576676,0.447326,0.501107,0.600194,0.447326,0.568651,0.482604,0.501107,0.625554,0.683161,0.600194,0.447326,0.482604,0.952964,0.658989,0.506122,0.501107,...,0.658989,1.000000,0.905928,0.983370,1.000000,0.584256,0.353254,0.501107,1.000000,1.000000,1.000000,0.447326,0.501107,0.600194,0.933481,0.717294,1.000000,1.000000,1.000000,0.353254,0.983370,0.983370,0.983370,0.983370,1.000000,0.941205,0.933481,0.501107,0.584256,0.941205,0.501107,0.800097,1.000000,0.353254,0.385524,0.501107,0.983370,0.353254,1.000000,0.375923


## (4)獲取推薦名單
任一用戶與任一影片的相似度皆已事先算出，以ID檢索時只需排序並取出最相似的名單即可。例如要以userId取得相似度最高的前幾部電影，便能使用下列輔助函式。

In [None]:
#找尋與特定user距離最近/最相似的前幾部movie
 
def get_the_most_similar_movies(user_id, user_movie_matrix, num):
    """Return the ``num`` movies closest to the given user.

    Args:
        user_id: Row label of the user in ``user_movie_matrix``.
        user_movie_matrix: DataFrame with users as rows and movies as
            columns; smaller values are ranked first.
        num: Maximum number of movie labels to return.

    Returns:
        A list of column labels ordered from smallest to largest value.
    """
    distances = user_movie_matrix.loc[user_id].to_numpy()
    # argsort is ascending, so the leading positions are the nearest movies.
    nearest_positions = np.argsort(distances)[:num]
    return user_movie_matrix.columns[nearest_positions].tolist()
 
def get_the_most_similar_users(movie_id, user_movie_matrix, num):
    """Return the ``num`` users closest to the given movie.

    Args:
        movie_id: Column label of the movie in ``user_movie_matrix``.
        user_movie_matrix: DataFrame with users as rows and movies as
            columns; smaller values are ranked first.
        num: Maximum number of user labels to return.

    Returns:
        A list of row labels ordered from smallest to largest value.
    """
    distances = user_movie_matrix[movie_id].to_numpy()
    # argsort yields positions in ascending order of value.
    nearest_positions = np.argsort(distances)[:num]
    return user_movie_matrix.index[nearest_positions].tolist()


# The 10 movies most similar to user 1.
print(
    get_the_most_similar_movies(
        user_id=1,
        user_movie_matrix=df_user_movie_matrix,
        num=10,
    )
)

# The 10 users most similar to movie 1 (shown as the cell's result).
get_the_most_similar_users(
    movie_id=1,
    user_movie_matrix=df_user_movie_matrix,
    num=10,
)

[2506, 91741, 130091, 101018, 89347, 71804, 179931, 39231, 898, 122282]


[93, 67, 162, 77, 125, 111, 98, 114, 17, 49]