In [1]:
from gensim.models import Word2Vec
import numpy as np
from elasticsearch import Elasticsearch

In [2]:
# Create an Elasticsearch client
es = Elasticsearch(hosts='http://localhost:9200')
# Index name and type
index_name = "movie_keywords"

# movies_data represent movies keywords


In [3]:
movies_data = [
    {'id': 1, 'keywords': ['surfer', 'surfboard', 'surfing']},
    {'id': 2, 'keywords': ['hotel', 'beach', 'vacation']},
    {'id': 3, 'keywords': ['action', 'thriller', 'suspense activity']},
    {'id': 4, 'keywords': ['playground', 'school', 'football match']},
    {'id': 5, 'keywords': ['basketball match', 'university', 'sport','fun']},
    
    {'id': 6, 'keywords': ['drama', 'romance', 'love story']},
    {'id': 7, 'keywords': ['comedy', 'laughter', 'fun','future']},
    {'id': 8, 'keywords': ['adventure', 'treasure', 'exploration']},
    {'id': 9, 'keywords': ['sci-fi', 'technology', 'future']},
]

# Input keywords

In [4]:
# Input keywords
input_keywords = ['drama', 'school','fun','love story']

# Split phrases into words for both input keywords and movie keywords
input_words = [word for phrase in input_keywords for word in phrase.split()]
movies_words = [word for movie in movies_data for keyword in movie['keywords'] for word in keyword.split()]

# Build Word2Vec model
wmodel = Word2Vec([movies_words], min_count=1, vector_size=100)

# Calculate the average vector for input keywords
input_vector = np.mean([wmodel.wv[word] for word in input_words if word in wmodel.wv], axis=0)

# Calculate cosine similarity between input vector and movie keyword vectors
similarities = []
for movie in movies_data:
    movie_id = movie['id']
    movie_keywords = movie['keywords']
    keyword_vectors = [wmodel.wv[word] for keyword in movie_keywords for word in keyword.split() if word in wmodel.wv]
    if keyword_vectors:
        movie_vector = np.mean(keyword_vectors, axis=0)
        similarity = np.dot(input_vector, movie_vector) / (np.linalg.norm(input_vector) * np.linalg.norm(movie_vector))
        similarities.append((movie_id, similarity))

# Sort by similarity
similarities.sort(key=lambda x: x[1], reverse=True)

# Output the top three movie IDs
if similarities:
    top_three_movies = similarities[:3]
    for movie_id, similarity in top_three_movies:
        print(f"Movie ID: {movie_id} - Similarity: {similarity}")
else:
    print("No movies found matching the input keywords.")

Movie ID: 6 - Similarity: 0.680613100528717
Movie ID: 4 - Similarity: 0.2695939540863037
Movie ID: 9 - Similarity: 0.24582575261592865


In [9]:
wmodel.save("word2vec.model")
print("Word2Vec is saved")

Word2Vec is saved


In [11]:
w2v = Word2Vec.load("word2vec.model")

print("Word2Vec is loaded")
w2v

Word2Vec is loaded


<gensim.models.word2vec.Word2Vec at 0x1adf8005660>

In [5]:
similarities[0]

(6, 0.6806131)

# ES for storage

In [6]:
# 删除已存在的索引
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)


# 创建新的索引并指定映射
es.indices.create(index=index_name, body={
    "mappings": {
        "properties": {
            "movie_id": {"type": "integer"},
#             "keywords": {"type": "nested", "properties": {"vector": {"type": "dense_vector", "dims": 100}}}
            "keywords": {"type": "dense_vector", "dims": 100}
        }
    }
})

# 将关键词向量索引到 Elasticsearch
for movie in movies_data:
    movie_id = movie['id']
    movie_keywords = movie['keywords']
    keyword_vectors = [wmodel.wv[word].tolist() for keyword in movie_keywords for word in keyword.split() if word in wmodel.wv]
    if keyword_vectors:
        nested_keywords = [{'vector': vector} for vector in keyword_vectors]
        
        movie_vector = np.mean(keyword_vectors, axis=0)
        
        doc = {
            'movie_id': movie_id,
            'keywords': movie_vector
        }
        es.index(index=index_name,  body=doc)

# 刷新索引以确保数据可搜索
# es.indices.refresh(index_name)


  es.indices.create(index=index_name, body={
  es.index(index=index_name,  body=doc)


In [7]:
input_vector.shape

(100,)

# Query

In [8]:
query = {
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'keywords') + 1.0",
                    "params": {"query_vector": input_vector.tolist()}
                }
            }
        },
        "size": 5
    }


# 执行搜索查询
results = es.search(index=index_name, body=query)

# 输出匹配结果
if results['hits']['total']['value'] > 0:
    for hit in results['hits']['hits']:
        movie_id = hit['_source']['movie_id']
        similarity = hit['_score']
        print(f"Movie ID: {movie_id} - Similarity: {similarity}")
else:
    print("No movies found matching the input keywords.")


No movies found matching the input keywords.


  results = es.search(index=index_name, body=query)
