##  生成Embedding的几种方法

In [3]:
import findspark
# Raw string: the Windows path contains backslashes (e.g. "\s"), which are
# invalid escape sequences in a normal string literal and trigger a
# DeprecationWarning/SyntaxWarning on recent Python versions. The resulting
# path value is unchanged.
findspark.init(r"D:\software\spark-2.4.4-bin-hadoop2.7")

In [5]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# getOrCreate() reuses an already-running session/context, so this cell can be
# re-executed safely. Constructing SparkContext('local') a second time in the
# same kernel fails with "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master('local').getOrCreate()
# Keep `sc` available for any code that expects the raw SparkContext.
sc = spark.sparkContext

### 1. 内容向量word2vec

In [6]:
from pyspark.ml.feature import Word2Vec

# Toy corpus: every sentence becomes one row holding its list of tokens.
sentences = [
    "Hi I heard about Spark",
    "I wish Java could use case classes",
    "Logistic regression models are neat",
]
documentDF = spark.createDataFrame(
    [(sentence.split(" "), ) for sentence in sentences],
    ["text"],
)

# Fit a 3-dimensional Word2Vec model; minCount=0 keeps every token.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)

# transform() appends a "result" column with one vector per input row.
result = model.transform(documentDF)
for text, vector in result.collect():
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [Hi, I, heard, about, Spark] => 
Vector: [-0.09118835926055908,0.024794609472155574,-0.0023326151072978973]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [-0.004019131617886679,0.0024854108674584752,0.003071522439963051]

Text: [Logistic, regression, models, are, neat] => 
Vector: [0.05315629169344902,0.02378393579274416,-0.059764140844345094]



把输入从（文档ID，词语列表）换成（用户ID，播放电影ID列表），再输入到word2vec，就能得到每个电影的Embedding向量。

### 2. 使用Spark ALS的矩阵分解的行为Embedding

In [12]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# Fixed seed so the train/test split and the ALS training run -- and therefore
# the reported RMSE -- are repeatable across executions of this cell.
SEED = 42

# Each input line has the form userId::movieId::rating::timestamp.
lines = spark.read.text("D:/workbench/ant-learn-recsys/datas/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2], seed=SEED)

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5,
          regParam=0.01,
          userCol="userId",
          itemCol="movieId",
          ratingCol="rating",
          coldStartStrategy="drop",
          seed=SEED)
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Generate top 10 user recommendations for a specified set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)

Root-mean-square error = 1.7636180291295112


In [14]:
# Preview the first 10 rows of the training split.
training.show(n=10)

+-------+------+----------+------+
|movieId|rating| timestamp|userId|
+-------+------+----------+------+
|      0|   1.0|1424380312|     3|
|      0|   1.0|1424380312|     5|
|      0|   1.0|1424380312|     6|
|      0|   1.0|1424380312|     8|
|      0|   1.0|1424380312|    11|
|      0|   1.0|1424380312|    13|
|      0|   1.0|1424380312|    15|
|      0|   1.0|1424380312|    19|
|      0|   1.0|1424380312|    20|
|      0|   1.0|1424380312|    21|
+-------+------+----------+------+
only showing top 10 rows



In [16]:
# Each row pairs an item (movie) id with the latent-factor vector ALS learned
# for it; truncate=False prints the vectors in full instead of cutting them off.
model.itemFactors.show(n=10, truncate=False)

+---+------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                |
+---+------------------------------------------------------------------------------------------------------------------------+
|0  |[1.2830094, -0.5151567, 0.18020844, -0.71568125, -2.535846, 0.52966636, -0.9128374, -1.1174475, 0.19547723, 0.698292]   |
|10 |[1.0180752, -2.7507377, 1.0930991, -1.134008, -0.5343898, -1.9026369, -1.1077534, -0.8857196, -0.39179775, -0.01721368] |
|20 |[1.4512534, -3.517298, 0.7868661, -1.4977869, -0.24221556, -2.037162, 1.0100238, -1.0118681, -0.3201244, -0.18585977]   |
|30 |[1.4444151, -3.6120615, 1.6103011, 0.17859526, 0.15473363, -0.7841998, 3.4736896, -0.54864204, -0.19071166, 1.2209741]  |
|40 |[1.8021005, -2.4846869, -1.0012007, -0.11796358, -3.8910062, 0.6172575, 0.46259242, 0.20520537, -0.7537476

### 3. DNN中的Embedding

In [23]:
import tensorflow as tf
import numpy as np

In [24]:
# Note: the only layer in this model is the Embedding layer.
# It maps each of 1000 possible ids to a 64-dim vector, 10 ids per sequence.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(1000, 64, input_length=10),
])
model.compile(optimizer='rmsprop', loss='mse')

# A batch of 32 sequences, each holding 10 random ids drawn from [0, 1000).
input_array = np.random.randint(1000, size=(32, 10))
output_array = model.predict(input_array)
# One 64-dim vector per id, so the shape is (batch, sequence, embedding).
print(output_array.shape)

(32, 10, 64)


In [25]:
# Once the model has been trained, the weights of the Embedding layer are the
# embedding vectors themselves (one row per id).
# NOTE(review): no fit() call is visible in this notebook, so the values shown
# here are presumably the layer's initial weights -- confirm.
embedding_layer = model.layers[0]
embedding_layer.get_weights()

[array([[ 0.00781213,  0.03940525, -0.00024771, ...,  0.03611508,
          0.02551547, -0.03192703],
        [-0.03161997, -0.02198304,  0.03973298, ..., -0.02881846,
         -0.03093203, -0.01212269],
        [ 0.00935531, -0.01970615,  0.03177864, ...,  0.04194124,
         -0.02666444,  0.02423222],
        ...,
        [-0.04647785,  0.01175867,  0.02346585, ..., -0.00246744,
         -0.01744302, -0.00606211],
        [-0.01508133, -0.00510512, -0.02035259, ...,  0.04146155,
         -0.00624609,  0.03074067],
        [ 0.02103826, -0.01366248,  0.01829243, ..., -0.03217832,
          0.02095122, -0.03056069]], dtype=float32)]