In [54]:
import sqlite3

# SQL fragment for one tag filter; the ESCAPE clause lets literal '%' and '_'
# inside a tag be matched as ordinary characters.
_TAG_LIKE_CLAUSE = "tags LIKE ? ESCAPE '\\'"


def _escape_like(text):
    """Escape LIKE metacharacters so *text* is matched literally."""
    return text.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')


def find_top_50_by_tag(tags=None, db_filename='web_info.db', limit=50):
    """Return stats of the most-viewed works whose tags contain every given tag.

    Rows are read from the ``web_info`` table, ordered by ``view_count``
    descending, and capped at ``limit`` rows.

    Args:
        tags: ``None`` or an empty collection to search the whole table, a
            single tag string, or an iterable of tag strings that must ALL
            occur in the row's ``tags`` column. Matching is a substring
            match, so ``'R-18'`` also matches a row tagged ``'R-18G'``.
        db_filename: Path of the SQLite database file.
        limit: Maximum number of rows to return (default 50).

    Returns:
        A 4-tuple of parallel lists
        ``(like_counts, bookmark_counts, view_counts, tags)`` with one entry
        per matching row.
    """
    # A bare string would otherwise be iterated character by character.
    if tags is None:
        tag_list = []
    elif isinstance(tags, str):
        tag_list = [tags]
    else:
        tag_list = list(tags)

    query = 'SELECT like_count, bookmark_count, view_count, tags FROM web_info'
    params = []
    if tag_list:
        # One parameterized LIKE per tag; all of them must match.
        query += ' WHERE ' + ' AND '.join([_TAG_LIKE_CLAUSE] * len(tag_list))
        params.extend('%' + _escape_like(tag) + '%' for tag in tag_list)
    query += ' ORDER BY view_count DESC LIMIT ?'
    params.append(limit)

    conn = sqlite3.connect(db_filename)
    try:
        rows = conn.execute(query, params).fetchall()
    finally:
        # Close the connection even when the query raises.
        conn.close()

    if not rows:
        return [], [], [], []
    # Transpose the row tuples into one list per selected column.
    return tuple(list(column) for column in zip(*rows))

# Tags that every returned work must carry.
tag = ('R-18', '女の子')

# Fetch the per-work statistics of the most-viewed matching works.
(
    matching_like,
    matching_bookmark,
    matching_view,
    matching_tags,
) = find_top_50_by_tag(tag, db_filename='web_info.db')

In [57]:
import random
from collections import Counter

def normalize_tag(tag):
    """Normalize a tag for counting.

    Full-width (ideographic, U+3000) spaces are converted to ASCII spaces,
    then leading and trailing whitespace is removed, so tags that differ
    only in spacing style compare equal.
    """
    return tag.replace('\u3000', ' ').strip()


def find_top_10_tags(matching_tags, n=10):
    """Return up to ``n`` of the most frequent tags across all tag strings.

    Args:
        matching_tags: Iterable of comma-separated tag strings, one per work.
            ``None`` or empty entries are skipped.
        n: Maximum number of tags to return (default 10).

    Returns:
        A list of at most ``n`` tags, grouped from the highest frequency
        downwards. Tags sharing a frequency appear in random order, and when
        a tie group does not fit entirely, a random subset of it is taken.
    """
    tag_count = Counter()
    for tag_string in matching_tags:
        if not tag_string:
            continue
        for raw_tag in tag_string.split(','):
            tag = normalize_tag(raw_tag)
            # Ignore empty fragments such as those produced by "a,,b" or a
            # trailing comma; they are not real tags.
            if tag:
                tag_count[tag] += 1

    # Bucket tags by frequency once instead of rescanning for every tie group.
    tags_by_freq = {}
    for tag, freq in tag_count.items():
        tags_by_freq.setdefault(freq, []).append(tag)

    top_tags = []
    for freq in sorted(tags_by_freq, reverse=True):
        remaining = n - len(top_tags)
        if remaining <= 0:
            break
        tied_tags = tags_by_freq[freq]
        # Random choice among equally frequent tags keeps tie-breaking unbiased.
        top_tags.extend(random.sample(tied_tags, min(remaining, len(tied_tags))))

    return top_tags

# Example usage: rank the tags of the works fetched above and show the result.
top_10_tags = find_top_10_tags(matching_tags)
print(f'前 10 个最常出现的 tags: {top_10_tags}。')


前 10 个最常出现的 tags: ['R-18', '女の子', '裸体', '巨乳', '魅惑の谷間', '3D', '釉瑚', '鸣潮', '全裸', 'epic7']。


In [58]:
import numpy as np
# Mean and standard deviation (NumPy's default `std`) of each metric over the
# matched works, rounded to whole numbers.
mean_like = np.round(np.mean(matching_like), 0)
std_like = np.round(np.std(matching_like), 0)

mean_bookmark = np.round(np.mean(matching_bookmark), 0)
std_bookmark = np.round(np.std(matching_bookmark), 0)

mean_view = np.round(np.mean(matching_view), 0)
std_view = np.round(np.std(matching_view), 0)

# Report each metric as "mean +- std".
print(f'匹配"{tag}"的like : {mean_like} +- {std_like }')
print(f'匹配"{tag}"的bookmark : {mean_bookmark} +- {std_bookmark }')
print(f'匹配"{tag}"的view : {mean_view} +- {std_view }')

匹配"('R-18', '女の子')"的like : 129.0 +- 166.0
匹配"('R-18', '女の子')"的bookmark : 226.0 +- 284.0
匹配"('R-18', '女の子')"的view : 1902.0 +- 2332.0
