In [2]:

!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --upgrade


!pip install "datasets" "Pillow"

Looking in indexes: https://download.pytorch.org/whl/cu121


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
zip_file_path = '/content/drive/My Drive/hackathon_data.zip' # 确保路径正确
!unzip -q "{zip_file_path}" -d /content/
print("✅ 数据已从 Google Drive 成功解压！")

✅ 数据已从 Google Drive 成功解压！


In [None]:

import pandas as pd


csv_path = '/content/hackathon_data/reviews.csv'

try:
    df = pd.read_csv(csv_path, encoding='utf-8', encoding_errors='ignore')
    print("数据加载成功！")
    print(df.head())
except FileNotFoundError:
    print(f"错误：在 '{csv_path}' 找不到文件。请检查你的 zip 包里的文件夹结构和 CSV 文件名是否正确。")

In [6]:
!pip install "unsloth[colab-new]"

Collecting unsloth[colab-new]
  Using cached unsloth-2025.8.10-py3-none-any.whl.metadata (52 kB)
Collecting unsloth_zoo>=2025.8.9 (from unsloth[colab-new])
  Using cached unsloth_zoo-2025.8.9-py3-none-any.whl.metadata (9.5 kB)
Collecting xformers>=0.0.27.post2 (from unsloth[colab-new])
  Using cached xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes (from unsloth[colab-new])
  Using cached bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting tyro (from unsloth[colab-new])
  Using cached tyro-0.9.30-py3-none-any.whl.metadata (11 kB)
Collecting datasets<4.0.0,>=3.4.1 (from unsloth[colab-new])
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.19.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth[colab-new])
  Using cached trl-0.22.1-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy (from unsloth_zoo>=2025.8.9->unsloth[colab-new])
  Using cached cut_cro

In [7]:
import torch
import pandas as pd
import json
import os
from PIL import Image
from unsloth import FastLanguageModel
from transformers import AutoProcessor
from tqdm import tqdm


max_seq_length = 2048
dtype = None
load_in_4bit = True

MODEL_ID = "google/gemma-3-12b-it"
print(f"正在加载多模态模型: {MODEL_ID}")
print("如果是第一次加载，需要下载模型文件，请耐心等待...")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_ID,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = "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"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
FastLanguageModel.for_inference(model)
print("✅ 模型加载并准备就绪！")


def classify_review_multimodal(review_row: pd.Series) -> dict:

    review_text = str(review_row.get('text', ''))
    rating = review_row.get('rating', 'N/A')
    image_path = review_row.get('photo')

    image = None
    has_photo = False
    if pd.notna(image_path) and os.path.exists(image_path):
        try:
            image = Image.open(image_path).convert("RGB")
            has_photo = True
        except Exception as e:
            print(f"警告：加载图片失败: {image_path}, 错误: {e}")
            pass


    instruction = (
        f"Analyze this review based on the text and, if provided, the image. "
        f"Review text: \"{review_text}\". User rating: {rating}/5. "
        f"Is this review an advertisement, irrelevant, or a rant without a visit? "
        f"Respond ONLY with a valid JSON object with three boolean fields: "
        f"\"is_advertisement\", \"is_irrelevant\", \"is_rant_without_visit\"."
    )
    content = [{"type": "text", "text": instruction}]
    if has_photo and image is not None:
        content.insert(0, {"type": "image"})

    messages = [{"role": "user", "content": content}]


    input_text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    inputs = processor(images=image, text=input_text, return_tensors="pt").to("cuda")


    try:
        output = model.generate(**inputs, max_new_tokens=150)
        response_text = processor.batch_decode(output, skip_special_tokens=True)[0]
        json_part = response_text[response_text.find('{'):response_text.rfind('}')+1]

        if json_part:
            return json.loads(json_part)
        else:
            return {"is_advertisement": False, "is_irrelevant": False, "is_rant_without_visit": False}
    except Exception as e:
        print(f"处理第 {review_row.name} 行时发生错误: {e}")
        return {"is_advertisement": False, "is_irrelevant": False, "is_rant_without_visit": False}


input_csv_path = '/content/hackathon_data/reviews.csv'

progress_csv_path = '/content/drive/My Drive/gemma3_classified_reviews_progress.csv'


try:

    df = pd.read_csv(input_csv_path,encoding='utf-8', encoding_errors='ignore')
    print(f"✅ 成功从 '{input_csv_path}' 加载了 {len(df)} 行数据。")



    try:

        progress_df = pd.read_csv(progress_csv_path)

        processed_indices = progress_df['original_index'].unique()
        print(f"✅ 成功加载进度文件，已处理 {len(processed_indices)} / {len(df)} 条评论。")
    except FileNotFoundError:

        print("ℹ️ 未发现进度文件，将从头开始处理。")
        progress_df = pd.DataFrame()
        processed_indices = []


    df_to_process = df[~df.index.isin(processed_indices)]


    if not df_to_process.empty:
        print(f"\n🚀 准备处理 {len(df_to_process)} 条剩余评论...")
        print("（您的进度将在每处理完一行后自动保存到 Google Drive）")

        for index, row in tqdm(df_to_process.iterrows(), total=len(df_to_process)):

            classification_result = classify_review_multimodal(row)

            new_row_df = pd.DataFrame([classification_result])
            new_row_df['original_index'] = index


            progress_df = pd.concat([progress_df, new_row_df], ignore_index=True)


            progress_df.to_csv(progress_csv_path, index=False)
    else:
        print("\n🎉 恭喜！所有评论都已处理完毕！")


    print("\n✅ 批量处理完成！正在整合最终结果...")


    final_progress_df = pd.read_csv(progress_csv_path)

    df['original_index'] = df.index
    final_progress_df['original_index'] = final_progress_df['original_index'].astype(int)


    final_df = pd.merge(df, final_progress_df, on='original_index', how='left')
    final_df = final_df.drop(columns=['original_index']) # 移除辅助列


    final_output_path = '/content/drive/My Drive/gemma3_classified_reviews_FINAL.csv'
    final_df.to_csv(final_output_path, index=False)
    print(f"🎉 最终的完整合并文件已成功保存到您的 Google Drive！路径为: {final_output_path}")

except FileNotFoundError:
    print(f"❌ 错误：在 '{input_csv_path}' 找不到数据文件。请确保文件已解压且路径正确。")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
正在加载多模态模型: google/gemma-3-12b-it
如果是第一次加载，需要下载模型文件，请耐心等待...
==((====))==  Unsloth 2025.8.10: Fast Gemma3 patching. Transformers: 4.55.4.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

✅ 模型加载并准备就绪！
✅ 成功从 '/content/hackathon_data/reviews.csv' 加载了 1100 行数据。
ℹ️ 未发现进度文件，将从头开始处理。

🚀 准备处理 1100 条剩余评论...
（您的进度将在每处理完一行后自动保存到 Google Drive）


100%|██████████| 1100/1100 [2:00:48<00:00,  6.59s/it]


✅ 批量处理完成！正在整合最终结果...
🎉 最终的完整合并文件已成功保存到您的 Google Drive！路径为: /content/drive/My Drive/gemma3_classified_reviews_FINAL.csv





In [18]:

import pandas as pd
from sklearn.metrics import classification_report


validation_file_path = '/content/drive/My Drive/gemma3_classified_reviews_FINAL.csv' # 确保路径和文件名正确

try:
    validation_df = pd.read_csv(validation_file_path)
    print("✅ 成功加载已标注的验证集文件！")

    labels = ['is_advertisement', 'is_irrelevant', 'is_rant_without_visit']


    for label in labels:
        print("\n" + "="*40)
        print(f"       评估报告: {label}")
        print("="*40)


        pred_col = label
        true_col = f"human_{label}"


        validation_df[pred_col] = validation_df[pred_col].fillna(False)
        validation_df[true_col] = validation_df[true_col].fillna(False)


        y_pred = validation_df[pred_col].astype(bool)
        y_true = validation_df[true_col].astype(bool)


        report = classification_report(y_true, y_pred, target_names=[f'Not {label}', label], zero_division=0)
        print(report)

except FileNotFoundError:
    print(f"❌ 错误：在 Google Drive 中找不到文件 '{validation_file_path}'。")
except KeyError as e:
    print(f"❌ 错误：找不到列 {e}。请检查您的 validation_set.csv 文件中的列名是否完全正确。")

✅ 成功加载已标注的验证集文件！

       评估报告: is_advertisement
                      precision    recall  f1-score   support

Not is_advertisement       1.00      1.00      1.00      1096
    is_advertisement       0.00      0.00      0.00         4

            accuracy                           1.00      1100
           macro avg       0.50      0.50      0.50      1100
        weighted avg       0.99      1.00      0.99      1100


       评估报告: is_irrelevant
                   precision    recall  f1-score   support

Not is_irrelevant       0.99      0.98      0.99      1093
    is_irrelevant       0.00      0.00      0.00         7

         accuracy                           0.97      1100
        macro avg       0.50      0.49      0.49      1100
     weighted avg       0.99      0.97      0.98      1100


       评估报告: is_rant_without_visit
                           precision    recall  f1-score   support

Not is_rant_without_visit       1.00      1.00      1.00      1095
    is_rant_without_v

  validation_df[true_col] = validation_df[true_col].fillna(False)
  validation_df[true_col] = validation_df[true_col].fillna(False)
  validation_df[true_col] = validation_df[true_col].fillna(False)
