预处理第一步

In [2]:
from __future__ import annotations

import asyncio as aio
import io
import shutil
from pathlib import Path

import httpx
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm

加载数据

In [3]:
RAW_DATASET_DIR = Path("../crawler/data")  # directory holding the raw (not yet preprocessed) CSV files

# Sort the paths so the concatenated row order -- and therefore the row index
# that later names each review's image folder -- is identical on every run
# (glob order depends on the filesystem).
files = sorted(RAW_DATASET_DIR.glob("*.csv"))

# Read every file first and concatenate once: calling pd.concat inside the loop
# re-copies all rows accumulated so far on each iteration.
frames = [pd.read_csv(filename) for filename in files]
# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory contains no CSV files.
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print("共加载", data.shape[0], "条数据")

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

过滤数据

In [None]:
MIN_IMAGES_NUM = 1  # minimum number of photos; reviews with fewer are dropped
MIN_TEXT_LENGTH = 20  # minimum review length, counted in whitespace-separated words

photo_columns = [f"photo_{i} link" for i in range(1, 11)]

# Count the links that are actually present rather than probing a single
# column: this stays correct when MIN_IMAGES_NUM is 0 (index -1 would otherwise
# select the last photo column) and when links are not filled left to right.
photo_counts = data[photo_columns].notna().sum(axis=1)

# A missing review is treated as zero words instead of raising on NaN.
word_counts = data["review text"].fillna("").astype(str).str.split().str.len()

# ">=" keeps reviews that have exactly the minimum length, matching the
# constant's meaning and the message printed below.
keep = (photo_counts >= MIN_IMAGES_NUM) & (word_counts >= MIN_TEXT_LENGTH)
data = data[keep].drop_duplicates().reset_index(drop=True)

print(f"过滤少于 {MIN_IMAGES_NUM} 张图片、评论字数少于 {MIN_TEXT_LENGTH} 字的数据")
print(f"过滤后剩余 {data.shape[0]} 条数据")

下载图片

In [None]:
async def download(url, to_file, client, sem, t, size):
    """Fetch one image, resize it and write it to ``to_file``.

    Args:
        url: Address of the image to fetch.
        to_file: Destination path; missing parent directories are created.
        client: Object exposing an awaitable ``get(url)`` whose result has a
            ``content`` attribute holding the raw bytes.
        sem: Async context manager held for the whole call to bound concurrency.
        t: Progress bar; ``update()`` is called exactly once per invocation.
        size: Target size handed to ``Image.resize``.

    The request is attempted up to three times. A download that keeps failing,
    or a payload that cannot be identified as an image, is reported with
    ``print`` and skipped rather than raised.
    """
    async with sem:
        try:
            for _ in range(3):
                try:
                    resp = await client.get(url)
                    break
                except httpx.TransportError:
                    # Parent of TimeoutException and NetworkError; also covers
                    # protocol/proxy failures, which would otherwise escape and
                    # abort every other download awaited alongside this one.
                    pass
            else:
                print(f"图片下载失败：{url}")
                return

            try:
                to_file.parent.mkdir(exist_ok=True, parents=True)
                image = Image.open(io.BytesIO(resp.content))
                # The JPEG writer rejects modes such as RGBA or P, so
                # normalise to RGB before resizing and saving.
                image = image.convert("RGB").resize(size)
                image.save(to_file)
            except Image.UnidentifiedImageError:
                print(f"不支持的文件格式，请检查：{url}")
        finally:
            # Advance the bar on every exit path so it still reaches its total.
            t.update()

async def download_images(data, to_dir, sem, size):
    """Download every photo referenced in ``data`` into ``to_dir``.

    Args:
        data: DataFrame with the columns ``photo_1 link`` .. ``photo_10 link``.
        to_dir: Root output directory; row ``i`` is saved under ``to_dir/str(i)``.
        sem: Concurrency limiter passed through to ``download``.
        size: Target image size passed through to ``download``.

    Within a row, the string-valued links are saved as ``1.jpg``, ``2.jpg``, ...
    in column order; non-string cells (e.g. NaN) are skipped.
    """
    photo_columns = [f"photo_{i} link" for i in range(1, 11)]

    async with httpx.AsyncClient(
        timeout=30,
        limits=httpx.Limits(max_keepalive_connections=40, max_connections=100)
    ) as client:
        t = tqdm(desc="下载图片", mininterval=1, dynamic_ncols=True)
        # Build every download coroutine up front so the total is known
        # before anything starts running.
        tasks = []
        for i, row in data.iterrows():
            parent = to_dir / str(i)

            # Iterate the row's values directly: Series.iteritems() was removed
            # in pandas 2.0 and the labels were never used here.
            urls = [url for url in row[photo_columns] if isinstance(url, str)]

            tasks.extend(
                download(url, parent / f"{n}.jpg", client, sem, t, size)
                for n, url in enumerate(urls, start=1)
            )

        t.total = len(tasks)
        await aio.gather(*tasks)

    t.refresh()

In [None]:
RAW_IMAGE_DIR = Path("images/")  # 图片下载目录
IMAGE_SIZE = (512, 512)  # 图片下载后的大小（数据量太大，需要提前调整图片大小）
LIMIT = 20  # 并发量

RAW_IMAGE_DIR.mkdir(exist_ok=True, parents=True)
shutil.rmtree(RAW_IMAGE_DIR)  # 清空原有内容

await download_images(data, RAW_IMAGE_DIR, aio.Semaphore(LIMIT), IMAGE_SIZE)

保存数据

In [None]:
# Pack the `images` directory into a gzip-compressed tarball.
# Linux only (the --remove-files option, which deletes the source files once
# they have been archived, is a GNU tar extension); meant to be run on Kaggle.
!tar -czf raw_images.tar.gz images --remove-files

# Show a link to the archive as the cell output
# (presumably so it can be downloaded from the hosted session -- confirm).
from IPython.display import FileLink
FileLink('raw_images.tar.gz')

In [None]:
# Where the preprocessed data set is written.
DATASET_PATH = Path("dataset.csv")

# The row index is written as the first column (to_csv default); it is the
# same index that names each row's image folder during the download step.
data.to_csv(path_or_buf=DATASET_PATH)