In [1]:
import glob
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Create the "outputs" directory (relative to the current working directory);
# exist_ok=True makes re-running this cell a no-op when it already exists.
os.makedirs("outputs", exist_ok=True)

def read_file(csv_dir=None, pattern="sf_crime_*.csv.gz"):
    """
    Load the most recently created crime CSV export into a DataFrame.

    Parameters
    ----------
    csv_dir : str or os.PathLike, optional
        Directory to search. When omitted, ``data/csv`` under the directory
        two levels above the current working directory is used.
    pattern : str, optional
        Glob pattern (relative to ``csv_dir``) selecting candidate files.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned frame with ``incident_hour`` and ``incident_month``
        appended, or ``None`` when no file matches or loading fails (the
        error is printed rather than raised).
    """
    if csv_dir is None:
        csv_dir = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "data", "csv")
    print("CSV directory:", csv_dir)
    csv_files = glob.glob(os.path.join(csv_dir, pattern))
    print("CSV files found:", csv_files)

    if not csv_files:
        return None

    # When several exports match, use the one with the latest ctime.
    file_path = max(csv_files, key=os.path.getctime)

    try:
        df = pd.read_csv(file_path, compression='gzip', low_memory=False)

        # Unparseable timestamps become NaT and are removed by the dropna below.
        df['incident_datetime'] = pd.to_datetime(df['incident_datetime'], errors='coerce')
        df = df.dropna(subset=['latitude', 'longitude', 'incident_datetime', 'incident_category'])

        # Derive hour and month only after the NaT rows are gone, so both
        # columns stay integer-typed instead of being upcast to float by NaN.
        timestamps = df['incident_datetime'].dt
        df = df.assign(incident_hour=timestamps.hour, incident_month=timestamps.month)

        print(f"df shape: {df.shape}")
        print(f"df columns: {df.columns.tolist()}")
        return df
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller see None.
        print(f"Error reading {file_path}: {e}")
        return None

# Load the dataset once for the cells below. read_file() returns None when no
# file matches or loading fails, so `df` may be None here.
df = read_file()

CSV directory: /Users/chichichen/Code/ecs273-team08-CrimeForecast/data/csv
CSV files found: ['/Users/chichichen/Code/ecs273-team08-CrimeForecast/data/csv/sf_crime_20240530_20250524.csv.gz']
df shape: (65988, 30)
df columns: ['incident_datetime', 'incident_date', 'incident_time', 'incident_year', 'incident_day_of_week', 'report_datetime', 'row_id', 'incident_id', 'incident_number', 'cad_number', 'report_type_code', 'report_type_description', 'incident_code', 'incident_category', 'incident_subcategory', 'incident_description', 'resolution', 'intersection', 'cnn', 'police_district', 'analysis_neighborhood', 'supervisor_district', 'supervisor_district_2012', 'latitude', 'longitude', 'point', 'filed_online', 'location', 'incident_hour', 'incident_month']


In [7]:

import google.generativeai as genai

# The API key is read from the environment so that no credential is stored in
# the notebook. NOTE(review): any key that was previously committed in this
# file should be treated as leaked and revoked.
gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not gemini_api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=gemini_api_key)

# List the available models and the generation methods each one supports.
# The loop variable is not called `model` so it cannot be confused with the
# generative model object used elsewhere in the notebook.
for model_info in genai.list_models():
    print(model_info.name, model_info.supported_generation_methods)

models/embedding-gecko-001 ['embedText', 'countTextTokens']
models/gemini-1.0-pro-vision-latest ['generateContent', 'countTokens']
models/gemini-pro-vision ['generateContent', 'countTokens']
models/gemini-1.5-pro-latest ['generateContent', 'countTokens']
models/gemini-1.5-pro-001 ['generateContent', 'countTokens', 'createCachedContent']
models/gemini-1.5-pro-002 ['generateContent', 'countTokens', 'createCachedContent']
models/gemini-1.5-pro ['generateContent', 'countTokens']
models/gemini-1.5-flash-latest ['generateContent', 'countTokens']
models/gemini-1.5-flash-001 ['generateContent', 'countTokens', 'createCachedContent']
models/gemini-1.5-flash-001-tuning ['generateContent', 'countTokens', 'createTunedModel']
models/gemini-1.5-flash ['generateContent', 'countTokens']
models/gemini-1.5-flash-002 ['generateContent', 'countTokens', 'createCachedContent']
models/gemini-1.5-flash-8b ['createCachedContent', 'generateContent', 'countTokens']
models/gemini-1.5-flash-8b-001 ['createCachedCon

In [8]:
import google.generativeai as genai

# The API key is read from the environment so that no credential is stored in
# the notebook. NOTE(review): any key that was previously committed in this
# file should be treated as leaked and revoked.
gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not gemini_api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=gemini_api_key)

# Generative model used by generate_features().
model = genai.GenerativeModel("models/gemini-1.5-flash-002")

def _parse_llm_fields(text):
    """Pull the three labelled fields out of the model's reply; missing ones are None."""
    labels = {
        "refined category": "refined_category_llm",
        "summary": "llm_summary",
        "tags": "llm_tags",
    }
    fields = dict.fromkeys(labels.values())
    for line in text.splitlines():
        # Split on the FIRST colon only, so values that themselves contain a
        # colon (times, ratios, ...) are kept whole.
        head, sep, tail = line.partition(":")
        if not sep:
            continue
        head = head.lower()
        for label, column in labels.items():
            if label in head and fields[column] is None:
                # Drop surrounding whitespace and markdown bold markers.
                value = tail.strip().strip("*").strip()
                fields[column] = value or None
                break
    return fields


def generate_features(row, llm=None):
    """
    Ask the LLM for a refined category, a one-sentence summary and tags.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``incident_category`` and ``incident_description``.
    llm : object, optional
        Any object with a ``generate_content(prompt)`` method whose result
        exposes ``.text``. Defaults to the module-level ``model``.

    Returns
    -------
    pandas.Series
        Indexed by ``refined_category_llm``, ``llm_summary`` and ``llm_tags``.
        A field the reply does not contain is ``None``. Fields are located by
        their label rather than by line position, so blank lines, a preamble
        or markdown formatting in the reply do not shift or break them.
    """
    prompt = f"""
    You are a crime analyst. Given the following incident description, categorize the event using a concise, meaningful category, and summarize it in one sentence.

    Incident Category: {row['incident_category']}
    Incident Description: {row['incident_description']}

    Return:
    1. Refined Category:
    2. Summary:
    3. Tags:
    """
    client = model if llm is None else llm
    response = client.generate_content(prompt)
    try:
        reply = response.text
    except ValueError:
        # NOTE(review): the SDK's `.text` accessor appears to raise ValueError
        # when the reply has no usable content (e.g. a blocked response) —
        # confirm against the SDK docs. Treat that as "no fields found".
        reply = ""
    return pd.Series(_parse_llm_fields(reply))

# Query the LLM once per distinct (category, description) pair instead of once
# per row: the reply depends only on those two fields, and a per-row loop over
# the whole frame exhausts the API quota (HTTP 429).
LLM_KEY_COLS = ["incident_category", "incident_description"]
LLM_FEATURE_COLS = ["refined_category_llm", "llm_summary", "llm_tags"]
LLM_REQUEST_INTERVAL_S = 4.0  # pause between requests; tune to the rate limit of your API plan


def _generate_features_throttled(row):
    """Call generate_features, then pause so consecutive requests are spaced out."""
    features = generate_features(row)
    time.sleep(LLM_REQUEST_INTERVAL_S)
    return features


unique_incidents = df[LLM_KEY_COLS].drop_duplicates().reset_index(drop=True)
llm_features = pd.DataFrame(
    [_generate_features_throttled(incident) for _, incident in unique_incidents.iterrows()],
    columns=LLM_FEATURE_COLS,
)
llm_lookup = pd.concat([unique_incidents, llm_features], axis=1)

# A many-to-one left merge keeps one output row per input row, in the same
# order, so the original index can be restored before assigning the columns.
llm_per_row = df[LLM_KEY_COLS].merge(
    llm_lookup, on=LLM_KEY_COLS, how="left", validate="many_to_one"
)
llm_per_row.index = df.index
df[LLM_FEATURE_COLS] = llm_per_row[LLM_FEATURE_COLS]


ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).

429 Resource has been exhausted (Reach the limit of free plan)