In [11]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import seaborn as sns
os.makedirs("outputs", exist_ok=True)

def read_file():
    """
    Reads a CSV file and returns a DataFrame.
    """
    csv_dir = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "data", "csv")
    print("CSV directory:", csv_dir)
    csv_files = glob.glob(os.path.join(csv_dir, "sf_crime_*.csv.gz"))
    print("CSV files found:", csv_files)

    if csv_files:
        file_path = max(csv_files, key=os.path.getctime)
    else:
        return None

    try:
        df = pd.read_csv(file_path, compression='gzip', low_memory=False)

        # Convert 'incident_datetime' to datetime and extract hour, weekday, and month
        df['incident_datetime'] = pd.to_datetime(df['incident_datetime'], errors='coerce')
        df['incident_hour'] = df['incident_datetime'].dt.hour
        df['incident_month'] = df['incident_datetime'].dt.month
        df = df.dropna(subset=['latitude', 'longitude', 'incident_datetime', 'incident_category'])


        print(f"df shape: {df.shape}")
        print(f"df columns: {df.columns.tolist()}")
        return df
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

df = read_file()

CSV directory: /Users/chichichen/Code/ecs273-team08-CrimeForecast/data/csv
CSV files found: ['/Users/chichichen/Code/ecs273-team08-CrimeForecast/data/csv/sf_crime_20240530_20250524.csv.gz']
df shape: (65988, 30)
df columns: ['incident_datetime', 'incident_date', 'incident_time', 'incident_year', 'incident_day_of_week', 'report_datetime', 'row_id', 'incident_id', 'incident_number', 'cad_number', 'report_type_code', 'report_type_description', 'incident_code', 'incident_category', 'incident_subcategory', 'incident_description', 'resolution', 'intersection', 'cnn', 'police_district', 'analysis_neighborhood', 'supervisor_district', 'supervisor_district_2012', 'latitude', 'longitude', 'point', 'filed_online', 'location', 'incident_hour', 'incident_month']


In [7]:

import google.generativeai as genai
genai.configure(api_key="AIzaSyDdMVZdEXxbuRyJTPh6oG4hp8ubRPBCDW8")

for model in genai.list_models():
    print(model.name, model.supported_generation_methods)

models/embedding-gecko-001 ['embedText', 'countTextTokens']
models/gemini-1.0-pro-vision-latest ['generateContent', 'countTokens']
models/gemini-pro-vision ['generateContent', 'countTokens']
models/gemini-1.5-pro-latest ['generateContent', 'countTokens']
models/gemini-1.5-pro-001 ['generateContent', 'countTokens', 'createCachedContent']
models/gemini-1.5-pro-002 ['generateContent', 'countTokens', 'createCachedContent']
models/gemini-1.5-pro ['generateContent', 'countTokens']
models/gemini-1.5-flash-latest ['generateContent', 'countTokens']
models/gemini-1.5-flash-001 ['generateContent', 'countTokens', 'createCachedContent']
models/gemini-1.5-flash-001-tuning ['generateContent', 'countTokens', 'createTunedModel']
models/gemini-1.5-flash ['generateContent', 'countTokens']
models/gemini-1.5-flash-002 ['generateContent', 'countTokens', 'createCachedContent']
models/gemini-1.5-flash-8b ['createCachedContent', 'generateContent', 'countTokens']
models/gemini-1.5-flash-8b-001 ['createCachedCon

In [8]:
import google.generativeai as genai

genai.configure(api_key="AIzaSyDdMVZdEXxbuRyJTPh6oG4hp8ubRPBCDW8")
model = genai.GenerativeModel("models/gemini-1.5-flash-002")

def generate_features(row):
    prompt = f"""
    You are a crime analyst. Given the following incident description, categorize the event using a concise, meaningful category, and summarize it in one sentence.

    Incident Category: {row['incident_category']}
    Incident Description: {row['incident_description']}

    Return:
    1. Refined Category:
    2. Summary:
    3. Tags:
    """
    response = model.generate_content(prompt)
    result = response.text.split("\n")
    return pd.Series({
        "refined_category_llm": result[0].split(":")[-1].strip(),
        "llm_summary": result[1].split(":")[-1].strip(),
        "llm_tags": result[2].split(":")[-1].strip()
    })

df[["refined_category_llm", "llm_summary", "llm_tags"]] = df.apply(generate_features, axis=1)


ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).

429 Resource has been exhausted (Reach the limit of free plan)

In [12]:
import os
import pandas as pd
import google.generativeai as genai
import time

# 初始化 Gemini
genai.configure(api_key="你的_API_KEY")
model = genai.GenerativeModel("models/gemini-1.5-flash-002")

# 設定
batch_size = 3000
sleep_sec = 60
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)


# 若沒有欄位，先補一個空的分類欄位
if "llm_balanced_category" not in df.columns:
    df["llm_balanced_category"] = None

total_len = len(df)
num_batches = (total_len + batch_size - 1) // batch_size

# 偵測已完成的批次
completed_batches = {
    int(f.split("_")[-1].split(".")[0])
    for f in os.listdir(output_dir)
    if f.startswith("llm_augmented_partial_batch_") and f.endswith(".csv")
}

print(f"✅ 已完成批次：{sorted(completed_batches)}")

def process_batch(batch_descriptions, batch_start_idx):
    formatted = "\n".join([f"{i+1}. {desc}" for i, desc in enumerate(batch_descriptions)])

    prompt = f"""
以下是 {len(batch_descriptions)} 筆犯罪描述：
{formatted}

請你幫我分類。規則如下：
1. 每筆回傳格式：<編號>. <分類名稱>（例如：1. Property Crime）
2. 分類數目請控制在 4-8 類之間。
3. 請避免使用模糊分類如 Other、Misc。
4. 請確保分類平衡。

請只回傳分類結果，不要包含說明。
"""

    try:
        response = model.generate_content(prompt)
        lines = response.text.strip().split("\n")
        categories = [line.split(". ", 1)[1].strip() for line in lines if ". " in line]
        return categories
    except Exception as e:
        print(f"[❌ Error @ batch {batch_start_idx}] {e}")
        return [None] * len(batch_descriptions)

# 執行批次處理
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, total_len)
    
    if (i + 1) in completed_batches:
        print(f"⏭️ 略過已完成批次 {i+1}")
        continue

    print(f"▶ 處理第 {i+1}/{num_batches} 批次：{start_idx} ~ {end_idx - 1}")

    batch_df = df.iloc[start_idx:end_idx]
    batch_desc = batch_df["incident_description"].fillna("").tolist()
    labels = process_batch(batch_desc, start_idx)

    # 更新到主 DataFrame
    df.loc[start_idx:end_idx - 1, "llm_balanced_category"] = labels

    # 儲存該批次 CSV
    batch_out_path = os.path.join(output_dir, f"llm_augmented_partial_batch_{i+1}.csv")
    df.iloc[start_idx:end_idx].to_csv(batch_out_path, index=False)
    print(f"✅ 批次 {i+1} 儲存至 {batch_out_path}")

    # 睡眠保護
    if i < num_batches - 1:
        print(f"😴 等待 {sleep_sec} 秒避免觸發 Gemini 限制")
        time.sleep(sleep_sec)

# 最後合併所有資料儲存總檔案
final_path = os.path.join(output_dir, "llm_augmented_full.csv")
df.to_csv(final_path, index=False)
print(f"🎉 所有批次完成，結果儲存於：{final_path}")




✅ 已完成批次：[]
▶ 處理第 1/22 批次：0 ~ 2999


E0000 00:00:1748321655.067908 27483568 plugin_credentials.cc:81] validate_metadata_from_plugin: INTERNAL:Illegal header value
E0000 00:00:1748321655.068121 27483568 plugin_credentials.cc:82] Plugin added invalid metadata value.
E0000 00:00:1748321655.713883 27474693 plugin_credentials.cc:81] validate_metadata_from_plugin: INTERNAL:Illegal header value
E0000 00:00:1748321655.713900 27474693 plugin_credentials.cc:82] Plugin added invalid metadata value.
E0000 00:00:1748321656.524160 27474693 plugin_credentials.cc:81] validate_metadata_from_plugin: INTERNAL:Illegal header value
E0000 00:00:1748321656.524176 27474693 plugin_credentials.cc:82] Plugin added invalid metadata value.
E0000 00:00:1748321657.550079 27474693 plugin_credentials.cc:81] validate_metadata_from_plugin: INTERNAL:Illegal header value
E0000 00:00:1748321657.550155 27474693 plugin_credentials.cc:82] Plugin added invalid metadata value.
E0000 00:00:1748321658.069139 27474693 plugin_credentials.cc:81] validate_metadata_from_

[❌ Error @ batch 0] Timeout of 600.0s exceeded, last exception: 503 Illegal metadata


E0000 00:00:1748324163.435019 27474693 plugin_credentials.cc:81] validate_metadata_from_plugin: INTERNAL:Illegal header value
E0000 00:00:1748324163.435083 27474693 plugin_credentials.cc:82] Plugin added invalid metadata value.


ValueError: Must have equal len keys and value when setting with an iterable