In [28]:
#work_ID = 121188719

In [45]:
import html
import random
import re
import time

import numpy as np
import requests
from tqdm import tqdm

def check_pixiv_work_ids(start_id, max_attempts=1000, target_count=10,
                         delay=1, timeout=10):
    """Probe consecutive pixiv artwork IDs and collect the ones that answer 200.

    IDs ``start_id``, ``start_id + 1``, ... are checked with an HTTP HEAD
    request against ``https://www.pixiv.net/artworks/<id>`` until either
    ``target_count`` IDs have been collected or ``max_attempts`` IDs have been
    tried.

    Args:
        start_id: First artwork ID to probe.
        max_attempts: Maximum number of consecutive IDs to try.
        target_count: Number of valid IDs to collect before stopping.
        delay: Seconds to wait between two consecutive requests.
        timeout: Per-request timeout in seconds passed to ``requests.head``.

    Returns:
        list: The valid IDs found, in ascending order. May hold fewer than
        ``target_count`` entries when ``max_attempts`` is exhausted.
    """
    valid_ids = []

    # Progress bar counts valid IDs found, not requests made.
    with tqdm(total=target_count, desc="Finding valid work IDs", unit="ID") as pbar:
        for attempt in range(max_attempts):
            if len(valid_ids) >= target_count:
                break

            # Pause between requests, including after a failed one, so that a
            # run of errors does not turn into a tight request loop.
            if attempt:
                time.sleep(delay)

            work_id = start_id + attempt
            url = f"https://www.pixiv.net/artworks/{work_id}"

            try:
                # Without a timeout a stalled connection would block forever.
                response = requests.head(url, timeout=timeout)
            except requests.RequestException as e:
                tqdm.write(f"Error checking work ID {work_id}: {e}")
                continue

            if response.status_code == 200:
                valid_ids.append(work_id)
                pbar.update(1)
                # Show the most recently found ID next to the bar.
                pbar.set_postfix_str(f"Latest ID: {work_id}")

    if len(valid_ids) >= target_count:
        print(f"\nSuccessfully found {target_count} valid work IDs.")
    else:
        print(f"\nReached maximum attempts. Found {len(valid_ids)} valid work IDs.")

    return valid_ids

# Run the search and keep the valid IDs in `result`
start_id = 121188700  # starting ID, hard-coded here (original note: chosen at random)
print(f"Starting search from ID: {start_id}")
result = check_pixiv_work_ids(start_id)
print("Valid work IDs:", result)

Starting search from ID: 121188700


Finding valid work IDs: 100%|████████████████████████████████████| 10/10 [00:12<00:00,  1.26s/ID, Latest ID: 121188710]


Successfully found 10 valid work IDs.
Valid work IDs: [121188700, 121188701, 121188702, 121188703, 121188704, 121188706, 121188707, 121188708, 121188709, 121188710]





In [46]:
import requests
from bs4 import BeautifulSoup
import os

def save_webpage_as_single_file(url, filename, timeout=30):
    """Download ``url`` and write its HTML to ``filename`` as one UTF-8 file.

    The response is parsed with BeautifulSoup, re-serialised, and wrapped in a
    small HTML shell whose ``<title>`` repeats the page title. The output is
    plain HTML text whatever extension ``filename`` carries.

    Errors are reported with ``print`` instead of being raised, so on failure
    ``filename`` is simply not written.

    Args:
        url: Address of the page to fetch.
        filename: Path of the file to create or overwrite.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn HTTP error statuses into exceptions

        soup = BeautifulSoup(response.text, 'html.parser')

        # ``.string`` is None when <title> is empty or contains nested tags;
        # fall back to the default instead of writing the text "None".
        page_title = soup.title.string if soup.title else None
        # Escape so characters such as < or & in the title cannot break the markup.
        title = html.escape(page_title) if page_title else 'webpage'

        # Assemble the file content: wrapper head, page markup, wrapper tail.
        mhtml_content = f"<!DOCTYPE html>\n<html>\n<head>\n<title>{title}</title>\n</head>\n<body>\n"
        mhtml_content += str(soup)
        mhtml_content += "\n</body>\n</html>"

        with open(filename, 'w', encoding='utf-8') as file:
            file.write(mhtml_content)

        print(f"网页内容已成功保存为 {filename}")
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
    except Exception as e:
        print(f"保存文件时出错: {e}")

# 输入 URL 和文件名




In [47]:
import pandas as pd
import os

# Extract artwork metadata from a saved page
def extract_info_from_mhtml(mhtml_file, work_ID):
    """Read a saved page and pull out title, tags, counters and image URL.

    The title comes from the first ``<title>`` element; everything else is
    found by regular-expression search over the raw file text for
    ``"tag"``, ``"likeCount"``, ``"bookmarkCount"``, ``"viewCount"`` and
    ``"regular"`` key/value pairs.

    Args:
        mhtml_file: Path of the UTF-8 text file to read.
        work_ID: Identifier stored (as a string) in the returned record.

    Returns:
        dict: Keys ``work_ID``, ``title``, ``tags``, ``like_count``,
        ``bookmark_count``, ``view_count`` and ``image url``. All values are
        strings; a field that was not found holds a placeholder text instead
        of a number or URL.

    Raises:
        OSError: If ``mhtml_file`` cannot be opened.
    """
    with open(mhtml_file, 'r', encoding='utf-8') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'html.parser')

    # ``.string`` is None for an empty <title> or one with nested tags; use the
    # placeholder in that case so the ``split`` below cannot fail.
    page_title = soup.title.string if soup.title else None
    title = page_title if page_title else '无标题'

    def _first_group(pattern, default):
        # First capture group of ``pattern`` in the page text, or ``default``.
        match = re.search(pattern, content)
        return match.group(1) if match else default

    # Every "tag": "..." occurrence, in document order.
    tags = re.findall(r'"tag"\s*:\s*"([^"]+)"', content) or ['无标签']

    like_count = _first_group(r'"likeCount"\s*:\s*(\d+)', '无点赞数')
    bookmark_count = _first_group(r'"bookmarkCount"\s*:\s*(\d+)', '无收藏数')
    view_count = _first_group(r'"viewCount"\s*:\s*(\d+)', '无浏览数')
    image_url = _first_group(r'"regular"\s*:\s*"([^"]+)"', '無影像連結')

    return {
        'work_ID': str(work_ID),
        'title': title.split(' - ')[0],  # keep only the part before the first " - "
        'tags': ', '.join(tags),  # flatten the tag list into one string
        'like_count': like_count,
        'bookmark_count': bookmark_count,
        'view_count': view_count,
        'image url': image_url
    }



# Append one artwork record to the Excel workbook
def append_to_excel(info, excel_filename='web_info.xlsx'):
    """Add one record as a new row of ``excel_filename``.

    The workbook is created when it does not exist yet; otherwise its current
    rows are read, the new row is appended, and the file is rewritten.

    Args:
        info: Mapping with the keys ``work_ID``, ``title``, ``tags``,
            ``like_count``, ``bookmark_count``, ``view_count`` and
            ``image url``.
        excel_filename: Path of the workbook to create or extend.

    Returns:
        None
    """
    def _to_count(value):
        # extract_info_from_mhtml returns placeholder text when a counter is
        # missing from the page; store an empty cell rather than crash in int().
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    # Build the single-row frame once; both branches below share it.
    new_df = pd.DataFrame({
        'work_ID': [info['work_ID']],
        'title': [info['title']],
        'tags': [info['tags']],
        'like_count': [_to_count(info['like_count'])],
        'bookmark_count': [_to_count(info['bookmark_count'])],
        'view_count': [_to_count(info['view_count'])],
        'image url': [info['image url']]
    })

    if os.path.exists(excel_filename):
        # Read IDs back as text so old and new rows keep the same type.
        existing_df = pd.read_excel(excel_filename, dtype={'work_ID': str})
        new_df = pd.concat([existing_df, new_df], ignore_index=True)

    new_df.to_excel(excel_filename, index=False)




In [48]:
# Workbook that collects the records. It must be defined here because the
# status message in the loop reports it; the default inside append_to_excel
# is not visible at this scope.
excel_filename = 'web_info.xlsx'

for work_ID in result:
    url = f'https://www.pixiv.net/artworks/{work_ID}'
    mhtml_file = f'{work_ID}.mhtml'
    save_webpage_as_single_file(url, mhtml_file)

    # save_webpage_as_single_file reports failures by printing, not raising,
    # so confirm the page file exists before trying to parse it.
    if not os.path.exists(mhtml_file):
        print(f"跳过 {work_ID}：未找到已保存的页面 {mhtml_file}")
        continue

    # Parse the saved page and append its record to the workbook.
    info = extract_info_from_mhtml(mhtml_file, work_ID)
    append_to_excel(info, excel_filename)

    print(f"新信息已成功追加到 {excel_filename}")



网页内容已成功保存为 121188700.mhtml
新信息已成功追加到 web_info.xlsx
网页内容已成功保存为 121188701.mhtml
新信息已成功追加到 web_info.xlsx
网页内容已成功保存为 121188702.mhtml
新信息已成功追加到 web_info.xlsx
网页内容已成功保存为 121188703.mhtml
新信息已成功追加到 web_info.xlsx
网页内容已成功保存为 121188704.mhtml
新信息已成功追加到 web_info.xlsx
网页内容已成功保存为 121188706.mhtml
新信息已成功追加到 web_info.xlsx
网页内容已成功保存为 121188707.mhtml
新信息已成功追加到 web_info.xlsx
网页内容已成功保存为 121188708.mhtml
新信息已成功追加到 web_info.xlsx
网页内容已成功保存为 121188709.mhtml
新信息已成功追加到 web_info.xlsx
网页内容已成功保存为 121188710.mhtml
新信息已成功追加到 web_info.xlsx


In [16]:
# Disabled cell: everything below sits inside a triple-quoted string, so none
# of it executes. The text is a standalone script that downloads one image
# with browser-like request headers (including a pixiv Referer) and writes
# the response bytes to a local file.
# NOTE(review): the output recorded under this cell presumably dates from a
# run made before the code was wrapped in quotes; confirm or clear it.
'''
import requests

# 图片的 URL
url = 'https://i.pximg.net/img-master/img/2024/08/05/22/47/33/121215791_p0_master1200.jpg'

# 图片保存的本地文件名
filename = 'downloaded_image.png'

# 请求头，模拟完整的浏览器请求
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
    'Referer': 'https://www.pixiv.net/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive'
}

# 发起 GET 请求以下载图片
response = requests.get(url, headers=headers)

# 检查请求是否成功
if response.status_code == 200:
    # 将图片数据写入本地文件
    with open(filename, 'wb') as file:
        file.write(response.content)
    print(f"图片已成功保存为 {filename}")
else:
    print(f"图片下载失败，状态码：{response.status_code}")

'''


图片已成功保存为 downloaded_image.png
