In [28]:
#work_ID = 121188719

In [1]:
import requests
import random
import time
from tqdm import tqdm
import numpy as np

def check_pixiv_work_ids(start_id, max_attempts=1000):
    valid_ids = []
    
    # 創建tqdm進度條
    with tqdm(total=50, desc="Finding valid work IDs", unit="ID") as pbar:
        for attempt in range(max_attempts):
            work_id = start_id + attempt
            url = f"https://www.pixiv.net/artworks/{work_id}"
            
            try:
                response = requests.head(url)
                if response.status_code == 200:
                    valid_ids.append(work_id)
                    pbar.update(1)  # 更新進度條
                    pbar.set_postfix_str(f"Latest ID: {work_id}")  # 顯示最新找到的ID
                
                if len(valid_ids) == 50:
                    break
                
                # 添加延遲以避免過於頻繁的請求
                time.sleep(3)
            
            except requests.RequestException as e:
                tqdm.write(f"Error checking work ID {work_id}: {e}")
    
    if len(valid_ids) == 50:
        print("\nSuccessfully found 10 valid work IDs.")
    else:
        print(f"\nReached maximum attempts. Found {len(valid_ids)} valid work IDs.")
    
    return valid_ids

# 使用函數
start_id = 121188700  # 隨機選擇起始ID
print(f"Starting search from ID: {start_id}")
result = check_pixiv_work_ids(start_id)
print("Valid work IDs:", result)

Starting search from ID: 121188700


Finding valid work IDs: 100%|█| 50/50 [03:05<00:00,  3.72s/ID, Latest ID: 121188


Reached maximum attempts. Found 50 valid work IDs.
Valid work IDs: [121188700, 121188701, 121188702, 121188703, 121188704, 121188706, 121188707, 121188708, 121188709, 121188710, 121188711, 121188712, 121188713, 121188714, 121188716, 121188718, 121188719, 121188721, 121188722, 121188723, 121188724, 121188725, 121188726, 121188727, 121188728, 121188729, 121188730, 121188731, 121188732, 121188733, 121188734, 121188736, 121188738, 121188739, 121188740, 121188741, 121188742, 121188743, 121188744, 121188745, 121188747, 121188748, 121188749, 121188751, 121188752, 121188753, 121188754, 121188755, 121188756, 121188758]





In [13]:
import requests
from bs4 import BeautifulSoup
import os

def save_webpage_as_single_file(url, filename):
    try:
        # 发送 GET 请求获取网页内容
        response = requests.get(url)
        response.raise_for_status()  # 检查请求是否成功
        time.sleep(np.random.randint(3,7))

        # 解析网页内容
        soup = BeautifulSoup(response.text, 'html.parser')

        # 获取网页的标题
        title = soup.title.string if soup.title else 'webpage'
        
        # 创建 .mhtml 文件内容
        mhtml_content = f"<!DOCTYPE html>\n<html>\n<head>\n<title>{title}</title>\n</head>\n<body>\n"
        mhtml_content += str(soup)  # 添加网页内容
        mhtml_content += "\n</body>\n</html>"

        # 保存为 .mhtml 文件
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(mhtml_content)

        print(f"网页内容已成功保存为 {filename}")
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
    except Exception as e:
        print(f"保存文件时出错: {e}")

# 输入 URL 和文件名




In [17]:
import pandas as pd
import os
import sqlite3
import re

# 提取信息的函数
def extract_info_from_mhtml(mhtml_file,work_ID):
    with open(mhtml_file, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # 使用 BeautifulSoup 解析 HTML 内容
    soup = BeautifulSoup(content, 'html.parser')

    # 提取 <title> 标签的内容
    title = soup.title.string if soup.title else '无标题'
    
     # 使用正则表达式提取多个 "tag"
    tag_pattern = re.findall(r'"tag"\s*:\s*"([^"]+)"', content)
    
    # 使用正则表达式提取 "likeCount", "bookmarkCount", "viewCount"
    like_count_pattern = re.search(r'"likeCount"\s*:\s*(\d+)', content)
    bookmark_count_pattern = re.search(r'"bookmarkCount"\s*:\s*(\d+)', content)
    view_count_pattern = re.search(r'"viewCount"\s*:\s*(\d+)', content)
    image_pattern = re.search(r'"regular"\s*:\s*"([^"]+)"', content)

    # 获取正则表达式的匹配结果
    tags = tag_pattern if tag_pattern else ['无标签']
    like_count = like_count_pattern.group(1) if like_count_pattern else '无点赞数'
    bookmark_count = bookmark_count_pattern.group(1) if bookmark_count_pattern else '无收藏数'
    view_count = view_count_pattern.group(1) if view_count_pattern else '无浏览数'
    image_count = image_pattern.group(1) if image_pattern else '無影像連結'
    return {
        'work_ID': str(work_ID),
        'title': title.split(' - ')[0],  # 清理标题内容
        'tags': ', '.join(tags),  # 标签列表转为字符串
        'like_count': like_count,
        'bookmark_count': bookmark_count,
        'view_count': view_count,
        'image url': image_count
        
    }



# 将数据追加到 SQLite 数据库的函数
def append_to_sqlite(info, db_filename='web_info.db'):
    # 连接 SQLite 数据库（如果文件不存在，会自动创建）
    conn = sqlite3.connect(db_filename)
    cursor = conn.cursor()

    # 创建表格（如果不存在）
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS web_info (
            work_ID TEXT PRIMARY KEY,
            title TEXT,
            tags TEXT,
            like_count INTEGER,
            bookmark_count INTEGER,
            view_count INTEGER,
            image_url TEXT
        )
    ''')

    # 插入数据，如果 work_ID 已存在则忽略插入
    cursor.execute('''
        INSERT OR REPLACE INTO web_info (work_ID, title, tags, like_count, bookmark_count, view_count, image_url)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    ''', (
        info['work_ID'],
        info['title'],
        info['tags'],
        int(info['like_count']),
        int(info['bookmark_count']),
        int(info['view_count']),
        info['image url']
    ))

    # 提交更改并关闭连接
    conn.commit()
    conn.close()





In [21]:
db_filename = 'web_info.db'
for work_ID in result:
    url = f'https://www.pixiv.net/artworks/{work_ID}'
    filename = f'{work_ID}.mhtml'
    save_webpage_as_single_file(url, filename)
    # 示例：提取和追加数据
    mhtml_file = f'{work_ID}.mhtml'  # 替换为实际的 .mhtml 文件路径
    info = extract_info_from_mhtml(mhtml_file,work_ID)

    # 将提取的信息追加到 'web_info.xlsx'
    append_to_sqlite(info)

    print(f"新信息已成功追加到 {db_filename}")



网页内容已成功保存为 121188700.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188701.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188702.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188703.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188704.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188706.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188707.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188708.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188709.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188710.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188711.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188712.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188713.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188714.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188716.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188718.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188719.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188721.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188722.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188723.mhtml
新信息已成功追加到 web_info.db
网页内容已成功保存为 121188724

In [27]:
import sqlite3

con = sqlite3.connect('web_info.db')
cur = con.cursor()

for row in cur.execute("SELECT work_ID,tags FROM web_info ORDER BY work_ID"):
    print(row)

con.close()

('121188700', 'VTuber, ファンアート, 美少女, ショートカット, ロック少女, 夏, web, YouTube, YouTuber, VTuber')
('121188701', 'R-18, Naruto, Tsunade, 綱手, うずまきナルト, NarutoUzumaki, NARUTO, creampie, bikini')
('121188702', 'R-18, pokemon, pikachulibre, pikachu, furry, ポケモン, 盛るペコ')
('121188703', 'R-18, マギアレコード, 環いろは')
('121188704', '女の子, 綺麗, かわいい, 風景, 獣耳, ファンタジー, 悪魔, 美少女, 悪魔娘, サキュバス')
('121188706', 'R-18, 種付けプレス, ビッチ, 痴女, チン負け, わからせ')
('121188707', 'R-18, オリジナル, 眼鏡, マスク, ローター, 調教, JK, たくしあげ, ノーブラ, 貧乳')
('121188708', 'R-18, Fate/GrandOrder, Fate, FGO, 藤丸立香, ジャンヌ・オルタ, 邪ンヌ, ジャンヌオルタ')
('121188709', '人形, 天使, オリジナルイラスト, オリジナル, イラスト')
('121188710', 'pokemon, ハクリュー, Dragonair, ポケモン, pokemon')
('121188711', 'R-18, 咲-Saki-, 原村和, 敗北, NTR, 肉便器, ビキニ, 爆乳, AIイラスト, NovelAI')
('121188712', 'R-18, EpicSeven, epic7, luna, 女の子, 熟女, 魅惑の谷間, 巨乳, AIイラスト, 3D')
('121188713', '手描き, 東方, 東方Project, 水着, 純狐, 東方水着娘')
('121188714', 'R-18, Texas, テキサス(アークナイツ), 明日方舟, 德克萨斯, 博德, 缄默德克萨斯, 内射, 受精')
('121188716', 'ケモノ, メスケモ, furry, anthro, ドワ子, 剣と魔法と学園モノ

In [16]:
'''
import requests

# 图片的 URL
url = 'https://i.pximg.net/img-master/img/2024/08/05/22/47/33/121215791_p0_master1200.jpg'

# 图片保存的本地文件名
filename = 'downloaded_image.png'

# 请求头，模拟完整的浏览器请求
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
    'Referer': 'https://www.pixiv.net/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive'
}

# 发起 GET 请求以下载图片
response = requests.get(url, headers=headers)

# 检查请求是否成功
if response.status_code == 200:
    # 将图片数据写入本地文件
    with open(filename, 'wb') as file:
        file.write(response.content)
    print(f"图片已成功保存为 {filename}")
else:
    print(f"图片下载失败，状态码：{response.status_code}")

'''


图片已成功保存为 downloaded_image.png
