In [2]:
import jieba
import re
import json
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import argparse
import numpy as np

In [4]:
def vectorize(Text):
    """Turn a collection of documents into a dense TF-IDF matrix.

    Args:
        Text: Iterable of strings, one per document. Each string is handed
            to ``TfidfVectorizer`` as-is.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per document and one column per
        vocabulary term learned from ``Text``.

    Note:
        A fresh ``TfidfVectorizer`` with default settings is fitted on every
        call, so the vocabulary depends on exactly the documents passed in.
        The default token pattern only keeps tokens of two or more word
        characters, which means single-character words are ignored.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(Text)
    return tfidf_matrix.toarray()


def seg(text):
    """Segment ``text`` with jieba and drop tokens that are not plain words.

    A token is kept only when it matches the pattern below, i.e. it is made
    up of ASCII letters, ASCII digits, and/or characters in the range
    U+4E00..U+9FFF. Punctuation, whitespace, and mixed tokens are discarded.

    Args:
        text: The string to segment.

    Returns:
        A list of the kept tokens, in the order jieba produced them.
    """
    keep = re.compile(r'^[a-zA-Z0-9\u4e00-\u9fff]+$')
    kept_tokens = []
    for token in jieba.cut(text):
        if keep.match(token):
            kept_tokens.append(token)
    return kept_tokens


def data_process(json_content):
    """Flatten one parsed JSON file into a single space-joined token string.

    For every item in ``json_content`` the element at index 0 is read; its
    ``'devices'`` list contributes the segmented ``'device'`` and
    ``'description'`` text of each device, followed by the segmented
    ``'category'`` text of that element.

    Args:
        json_content: Iterable of indexable items whose first element is a
            mapping with ``'devices'`` and ``'category'`` keys.

    Returns:
        A one-element list holding all tokens joined by single spaces, so the
        result can be concatenated directly onto a list of documents.
    """
    tokens = []
    for entry in json_content:
        record = entry[0]
        for dev in record['devices']:
            # Device name tokens first, then its description tokens.
            tokens += seg(dev['device'])
            tokens += seg(dev['description'])
        tokens += seg(record['category'])
    return [" ".join(tokens)]

In [36]:
def text_similarity(input_files=None,
                    query_file="./example_data/query.json",
                    top_k=3):
    """Rank candidate JSON files by TF-IDF cosine similarity to a query file.

    Every file is loaded, flattened to one token string with
    ``data_process``, and vectorized together with the query so that all
    documents share one vocabulary. The query is always the last document.

    Args:
        input_files: Sequence of paths to the candidate JSON files. Relative
            paths are resolved against the current working directory.
            Defaults to the two bundled example files.
        query_file: Path to the JSON file to compare against the candidates.
        top_k: How many of the highest similarity scores to print.

    Raises:
        ValueError: If ``input_files`` is empty.

    Prints:
        The full (n_files, 1) similarity matrix, the name of the best
        matching file, and the ``top_k`` highest scores in descending order.
    """
    if input_files is None:
        input_files = ["./example_data/1.json", "./example_data/2.json"]
    file_names = list(input_files)
    if not file_names:
        raise ValueError("input_files must contain at least one file")

    def _load_document(path):
        # Distinct handle name: the original rebound the loop variable here.
        with open(path, 'r', encoding='utf-8') as handle:
            return data_process(json.load(handle))

    all_list = []
    for name in file_names:
        all_list += _load_document(name)
    # The query goes last so it can be split off by position below.
    all_list += _load_document(query_file)

    vectors = vectorize(all_list)
    key_vectors = vectors[:-1]
    query_vectors = vectors[-1].reshape(1, -1)
    similarity_scores = cosine_similarity(key_vectors, query_vectors)
    # Scores have shape (n_files, 1), so the flat argmax is the file index.
    max_index = np.argmax(similarity_scores)
    print('相似度：', similarity_scores)
    print("最匹配的文件名：", file_names[max_index])

    # Sort descending before truncating; slicing the unsorted array would
    # return the first scores in file order rather than the highest ones.
    top_similarity_scores = np.sort(similarity_scores.flatten())[::-1][:top_k]
    print(top_similarity_scores)

In [39]:
# Run the similarity comparison on the example data and print the results.
text_similarity()

相似度： [[0.20156099]
 [0.12939368]]
最匹配的文件名： ./example_data/1.json
[0.20156099 0.12939368]


In [41]:
# Scratch check of slicing semantics: [:-1] is everything but the last
# element and [-1] is the last element alone. This is the same split
# text_similarity applies to the vector matrix (candidates vs. query).
v = [1, 2, 3, 4]
print(v[:-1])
print(v[-1])

[1, 2, 3]
4
