In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

1.读取电影数据集（用户信息、电影信息、评分行为信息）

In [2]:
df_user = pd.read_csv("./ml-1m/users.dat", 
                      sep="::",header=None, engine="python",
                     encoding='iso-8859-1',
                     names="UserID::Gender::Age::Occupation::Zip-code".split("::"))

In [3]:
df_user.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
df_movie = pd.read_csv("./ml-1m/movies.dat", sep="::",
                      header=None, engine="python",encoding="iso-8859-1",
                      names="MovieID::Title::Genres".split("::"))

In [5]:
df_movie.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
df_movie.shape

(3883, 3)

In [7]:
df_rating = pd.read_csv("./ml-1m/ratings.dat", sep="::",
                       header=None, engine="python", encoding="iso-8859-1",
                       names="UserID::MovieID::Rating::Timestamp".split("::"))

In [8]:
df_rating.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
df_rating.shape

(1000209, 4)

2.计算电影中每个题材的次数

In [10]:
import collections
genre_count = collections.defaultdict(int)
for genres in df_movie["Genres"].str.split("|"):
    for genre in genres :
        genre_count[genre] += 1


In [11]:
genre_count

defaultdict(int,
            {'Animation': 105,
             "Children's": 251,
             'Comedy': 1200,
             'Adventure': 283,
             'Fantasy': 68,
             'Romance': 471,
             'Drama': 1603,
             'Action': 503,
             'Crime': 211,
             'Thriller': 492,
             'Horror': 343,
             'Sci-Fi': 276,
             'Documentary': 127,
             'War': 143,
             'Musical': 114,
             'Mystery': 106,
             'Film-Noir': 44,
             'Western': 68})

3.特征处理

3.1 每个电影只保留频率最高（代表性）的电影题材标签

sorted函数，其中，iterable 表示指定的序列，key 参数可以自定义排序规则；reverse 参数指定以升序（False，默认）还是降序（True）进行排序。sorted() 函数会返回一个排好序的列表。 

In [12]:
def get_highrate_genre(x):
    sub_value = {}
    for genre in x.split("|") :
        sub_value[genre] = genre_count[genre]
    return sorted(sub_value.items(), key=lambda x:x[1],reverse=True)[0][0]

In [13]:
df_movie["Genres"] = df_movie["Genres"].map(get_highrate_genre)

In [14]:
df_movie["Genres"].head()

0       Comedy
1    Adventure
2       Comedy
3        Drama
4       Comedy
Name: Genres, dtype: object

3.2 给特征做序列编码
python教程
http://c.biancheng.net/view/2239.html
3.2.1 enumerate(sequence, [start=0])

    sequence -- 一个序列、迭代器或其他支持迭代对象。
    start -- 下标起始位置。
    返回 enumerate(枚举) 对象。
    >>>seasons = ['Spring', 'Summer', 'Fall', 'Winter']
    >>> list(enumerate(seasons))
    [(0, 'Spring'), (1, 'Summer'), (2, 'Fall'), (3, 'Winter')]
    >>> list(enumerate(seasons, start=1))       # 下标从 1 开始
    [(1, 'Spring'), (2, 'Summer'), (3, 'Fall'), (4, 'Winter')]
    >>> tuple(enumerate(seasons, start=1))
    ((1, 'Spring'), (2, 'Summer'), (3, 'Fall'), (4, 'Winter'))


In [15]:
def add_index_column(param_df, colum_name):
    values = list(param_df[colum_name].unique())
    #得到dict词表
    value_index_dict = {v:idx for idx, v in enumerate(values)}
    param_df[f"{colum_name}_idx"] = param_df[colum_name].map(value_index_dict)

In [16]:
add_index_column(df_user, "UserID")
add_index_column(df_user, "Gender")
add_index_column(df_user, "Age")
add_index_column(df_user, "Occupation")
add_index_column(df_movie, "MovieID")
add_index_column(df_movie, "Genres")

In [17]:
df_user.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code,UserID_idx,Gender_idx,Age_idx,Occupation_idx
0,1,F,1,10,48067,0,0,0,0
1,2,M,56,16,70072,1,1,1,1
2,3,M,25,15,55117,2,1,2,2
3,4,M,45,7,2460,3,1,3,3
4,5,M,25,20,55455,4,1,2,4


In [18]:
df_movie.head()

Unnamed: 0,MovieID,Title,Genres,MovieID_idx,Genres_idx
0,1,Toy Story (1995),Comedy,0,0
1,2,Jumanji (1995),Adventure,1,1
2,3,Grumpier Old Men (1995),Comedy,2,0
3,4,Waiting to Exhale (1995),Drama,3,2
4,5,Father of the Bride Part II (1995),Comedy,4,0


In [19]:
df_rating.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


3.3 合并为一个表格

In [20]:
df = pd.merge(pd.merge(df_rating, df_user), df_movie)

In [21]:
df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,UserID_idx,Gender_idx,Age_idx,Occupation_idx,Title,Genres,MovieID_idx,Genres_idx
0,1,1193,5,978300760,F,1,10,48067,0,0,0,0,One Flew Over the Cuckoo's Nest (1975),Drama,1176,2
1,2,1193,5,978298413,M,56,16,70072,1,1,1,1,One Flew Over the Cuckoo's Nest (1975),Drama,1176,2
2,12,1193,4,978220179,M,25,12,32793,11,1,2,7,One Flew Over the Cuckoo's Nest (1975),Drama,1176,2
3,15,1193,4,978199279,M,25,7,22903,14,1,2,3,One Flew Over the Cuckoo's Nest (1975),Drama,1176,2
4,17,1193,5,978158471,M,50,1,95350,16,1,4,6,One Flew Over the Cuckoo's Nest (1975),Drama,1176,2


In [22]:
df.drop(columns=["Timestamp","Zip-code","Title"], inplace=True)

https://blog.csdn.net/Daisy4/article/details/121548990
DataFrame.sample方法主要是用来对DataFrame进行简单随机抽样的。
参数
--n  设置抽样数量,这个参数不能与frac参数同时使用，而且如果没有指定frac参数，n参数的默认值是1。
--frac   设置抽样比例
--replace   设置是否放回, replace接收一个bool类型数据，False表示执行无放回抽样，True表示执行有放回抽样。默认值为False，即执行无放回抽样。

--weights   设置样本权重
--random_state   设置随机数种子
--axis sample方法可以对行进行抽样，也可以对列进行抽样。控制这一行为的参数就是axis。当axis指定为0或者'index'时，对行进行抽样，当axis指定为1或者'col'时，对列进行抽样。默认执行的是行抽样。

In [23]:
df.sample(frac=1).head(3)

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Occupation,UserID_idx,Gender_idx,Age_idx,Occupation_idx,Genres,MovieID_idx,Genres_idx
1683,5916,1193,3,M,50,20,5915,1,4,4,Drama,1176,2
105129,3539,1690,4,F,25,4,3538,0,2,12,Action,1644,3
527501,1522,2011,3,M,35,20,1521,1,5,4,Comedy,1942,0


In [24]:
num_users = df["UserID_idx"].max() + 1
num_movies = df["MovieID_idx"].max() + 1
num_genders = df["Gender_idx"].max() + 1
num_ages = df["Age_idx"].max() + 1
num_occupations = df["Occupation_idx"].max() + 1
num_genres = df["Genres_idx"].max() + 1

In [25]:
num_users, num_movies, num_genders, num_ages, num_occupations, num_genres

(6040, 3883, 2, 7, 21, 18)

3.4 评分的归一化

In [26]:
min_rating = df["Rating"].min()
max_rating = df["Rating"].max()
df["Rating"] = df["Rating"].map(lambda x : (x-min_rating)/(max_rating - min_rating))

In [27]:
df.sample(frac=1).head(3)

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Occupation,UserID_idx,Gender_idx,Age_idx,Occupation_idx,Genres,MovieID_idx,Genres_idx
604228,4400,1608,0.75,F,18,4,4399,0,6,12,Action,1566,3
694855,6007,2160,0.25,M,35,17,6006,1,5,8,Thriller,2091,5
841158,5861,965,0.75,F,50,1,5860,0,4,6,Thriller,953,5


4.构建训练数据集

In [28]:
df_sample = df.sample(frac=0.1)
X = df_sample[["UserID_idx","Gender_idx","Age_idx","Occupation_idx","MovieID_idx","Genres_idx"]]
Y = df_sample["Rating"]

5.搭建双塔模型并训练
内外积的实现
https://blog.csdn.net/Young824/article/details/105049099

In [29]:
def get_model():
    #输入
    user_id = keras.layers.Input(shape=(1,),name="user_id")
    gender = keras.layers.Input(shape=(1,),name="gender")
    age = keras.layers.Input(shape=(1,),name="age")
    occupation = keras.layers.Input(shape=(1,), name="occupation")
    movie_id = keras.layers.Input(shape=(1,), name="movie_id")
    genre = keras.layers.Input(shape=(1,), name="genre")
    
    #user 塔
    user_vector = tf.keras.layers.concatenate([
        layers.Embedding(num_users,100)(user_id),
        layers.Embedding(num_genders, 2)(gender), 
        layers.Embedding(num_ages, 2)(age), 
        layers.Embedding(num_occupations, 2)(occupation)
    ])
    user_vector = layers.Dense(32,activation='relu')(user_vector)
    user_vector = layers.Dense(8, activation='relu',
                             name='user_embedding',
                             kernel_regularizer='l2')(user_vector)
    
    #item 塔
    movie_vector = tf.keras.layers.concatenate([
        layers.Embedding(num_movies,100)(movie_id),
        layers.Embedding(num_genres,2)(genre)
    ])
    movie_vector = layers.Dense(32, activation='relu')(movie_vector)
    movie_vector = layers.Dense(8,activation='relu', 
                               name='movie_embedding',
                               kernel_regularizer='l2')(movie_vector)
    #计算点击
    dot_user_movie = tf.keras.layers.Dot(axes=1)([movie_vector, user_vector])
    output = layers.Dense(1, activation='sigmoid')(dot_user_movie)
    return keras.models.Model(inputs=[user_id, gender, age, occupation, movie_id, genre],
                             outputs=[output])

In [30]:
model = get_model()
model.compile(loss=tf.keras.losses.MeanSquaredError(),
             optimizer=tf.keras.optimizers.RMSprop())

2022-11-24 12:32:10.612331: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [31]:
fit_x_train = [
        X["UserID_idx"], 
        X["Gender_idx"],
        X["Age_idx"],
        X["Occupation_idx"],
        X["MovieID_idx"],
        X["Genres_idx"]
    ]

history = model.fit(x=fit_x_train,y=Y,batch_size=32, epochs=5,verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


6.利用模型进行预测


In [32]:
inputs = df[["UserID_idx","Gender_idx","Age_idx","Occupation_idx","MovieID_idx", "Genres_idx"]].head(5)

In [33]:
inputs

Unnamed: 0,UserID_idx,Gender_idx,Age_idx,Occupation_idx,MovieID_idx,Genres_idx
0,0,0,0,0,1176,2
1,1,1,1,1,1176,2
2,11,1,2,7,1176,2
3,14,1,2,3,1176,2
4,16,1,4,6,1176,2


In [34]:
model.predict([ inputs["UserID_idx"], 
        inputs["Gender_idx"],
        inputs["Age_idx"],
        inputs["Occupation_idx"],
        inputs["MovieID_idx"],
        inputs["Genres_idx"]])



array([[[0.9223587 ],
        [0.9223587 ],
        [0.9223587 ],
        [0.9223587 ],
        [0.9223587 ],
        [0.9223587 ],
        [0.9223587 ],
        [0.9223587 ]],

       [[0.80911   ],
        [0.80911   ],
        [0.80911   ],
        [0.80911   ],
        [0.80911   ],
        [0.80911   ],
        [0.80911   ],
        [0.80911   ]],

       [[0.88282526],
        [0.88282526],
        [0.88282526],
        [0.88282526],
        [0.88282526],
        [0.88282526],
        [0.88282526],
        [0.8828252 ]],

       [[0.81284165],
        [0.81284165],
        [0.81284165],
        [0.81284165],
        [0.81284165],
        [0.81284165],
        [0.81284165],
        [0.81284165]],

       [[0.9044121 ],
        [0.9044121 ],
        [0.9044121 ],
        [0.9044121 ],
        [0.9044121 ],
        [0.9044121 ],
        [0.9044121 ],
        [0.90441203]]], dtype=float32)

7.可以提取模型中的user embedding
7.1 得到user 的embedding

In [35]:
model.input

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'user_id')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'gender')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'age')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'occupation')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'movie_id')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'genre')>]

In [36]:
user_layer_model = keras.models.Model(
    inputs=[model.input[0],model.input[1],model.input[2],model.input[3]],
    outputs=model.get_layer("user_embedding").output
)

In [37]:
df_user.loc[0]["UserID"].shape

()

In [38]:
np.reshape(df_user.loc[0]["UserID"],[1,1])

array([[1]])

In [39]:
user_input = [
        np.reshape(df_user.loc[0]["UserID_idx"], [1,1]),
        np.reshape(df_user.loc[0]["Gender_idx"], [1,1]),
        np.reshape(df_user.loc[0]["Age_idx"], [1,1]),
        np.reshape(df_user.loc[0]["Occupation_idx"], [1,1])
    ]

In [40]:
user_input

[array([[0]]), array([[0]]), array([[0]]), array([[0]])]

join函数的语法及用法 (1)语法:'sep'.join(sep_object)

In [41]:
user_embeddings = []
for index, row in df_user.iterrows():
    user_id = row["UserID"]
    user_input = [
        np.reshape(row["UserID_idx"],[1,1]),
        np.reshape(row["Gender_idx"],[1,1]),
        np.reshape(row["Age_idx"],[1,1]),
        np.reshape(row["Occupation_idx"],[1,1])
    ]
    user_embedding = user_layer_model(user_input)
    embedding_str = ",".join([str(x) for x in user_embedding.numpy().flatten()])
    user_embeddings.append([user_id,embedding_str])
df_user_embedding = pd.DataFrame(user_embeddings, columns = ["user_id","user_embedding"])

In [42]:
df_user_embedding.head()

Unnamed: 0,user_id,user_embedding
0,1,"0.0,0.6361018,0.0,0.0,0.8173893,0.69815946,0.8..."
1,2,"0.0,0.5119589,0.0,0.0,0.5495044,0.5283283,0.54..."
2,3,"0.0,0.62886083,0.0,0.0,0.8003235,0.6877961,0.8..."
3,4,"0.0,0.53765434,0.0,0.0,0.60572064,0.5631246,0...."
4,5,"0.0,0.43246138,0.0,0.0,0.37417445,0.41909945,0..."


7.2 得到movie的embedding

In [45]:
movie_layer_model = tf.keras.models.Model(
    inputs=[model.input[4], model.input[5]],
    outputs=model.get_layer("movie_embedding").output
)
movie_embeddings = []
for index, row in df_movie.iterrows():
    movie_id = row["MovieID"]
    movie_input = [
        np.reshape(row["MovieID_idx"],[1,1]),
        np.reshape(row["Genres_idx"],[1,1])
    ]
    movie_embedding = movie_layer_model(movie_input)
    embedding_str = ",".join([str(x) for x in movie_embedding.numpy().flatten()])
    movie_embeddings.append([movie_id, embedding_str])

In [47]:
df_movie_embedding=pd.DataFrame(movie_embeddings,columns = ["movie_id","movie_embedding"])
df_movie_embedding.head()

Unnamed: 0,movie_id,movie_embedding
0,1,"0.77566123,0.7756612,0.77566123,0.77566123,0.7..."
1,2,"0.44748273,0.44748273,0.44748276,0.44748273,0...."
2,3,"0.3875825,0.3875825,0.3875825,0.3875825,0.3875..."
3,4,"0.32614362,0.32614362,0.3261436,0.32614362,0.3..."
4,5,"0.36609992,0.36609992,0.36609992,0.36609992,0...."
