# In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

def generate_dirty_data(num_rows=1000, seed=None):
    """Build a deliberately messy customer DataFrame for data-cleaning practice.

    A clean table with the columns ``Cust_ID``, ``Name``, ``Income``,
    ``Join_Date`` and ``Credit_Score`` is generated first, then dirtied:

    * A. about 10% of the rows of every column are set to NaN;
    * B. about 10% of ``Income`` values get a ``$`` prefix and about 10%
      get a trailing ``,`` (the two samples may overlap);
    * C. about 10% of ``Join_Date`` values use ``/`` instead of ``-``;
    * D. up to 50 randomly chosen rows are appended as duplicates;
    * E. up to 5 rows get the outlier ``Credit_Score`` 9999.

    Args:
        num_rows: Number of clean rows generated before duplicates are added.
        seed: Optional integer seed (0 .. 2**32 - 1). When given, the output
            is fully reproducible and the global NumPy random state is left
            untouched. When ``None``, the global ``np.random`` state is used.

    Returns:
        A ``pandas.DataFrame`` with ``num_rows + min(50, num_rows)`` rows.

    Raises:
        ValueError: If ``num_rows`` is negative.
    """
    if num_rows < 0:
        raise ValueError("num_rows must be non-negative")

    # A private RandomState keeps seeded runs reproducible without touching
    # the global state; without a seed, fall back to the global np.random.
    rng = np.random.RandomState(seed) if seed is not None else None
    randint = np.random.randint if rng is None else rng.randint

    def pick(frame, **how):
        # Index labels of a random sample of `frame`, drawn from `rng`
        # (random_state=None makes pandas use the global NumPy state).
        return frame.sample(random_state=rng, **how).index

    # 1. Clean base data
    ids = range(1, num_rows + 1)
    names = [f"User_{i}" for i in ids]

    # object dtype so ints, strings and NaN can share the column later
    incomes = randint(22000, 150000, size=num_rows).astype(object)

    # Random join dates within 365 days of 2024-01-01, ISO formatted
    base_date = datetime(2024, 1, 1)
    dates = [
        (base_date + timedelta(days=int(randint(0, 365)))).strftime("%Y-%m-%d")
        for _ in range(num_rows)
    ]

    df = pd.DataFrame({
        'Cust_ID': ids,
        'Name': names,
        'Income': incomes,
        'Join_Date': dates,
        'Credit_Score': randint(300, 850, size=num_rows)
    })

    # 2. Injecting dirt

    # A. Random missing values in every column
    for col in df.columns:
        rows = pick(df, frac=0.1)
        if rows.empty:
            continue
        # Integer columns cannot hold NaN; cast explicitly instead of relying
        # on silent upcasting, which newer pandas versions deprecate/reject.
        if df[col].dtype.kind in "iu":
            df[col] = df[col].astype("float64")
        df.loc[rows, col] = np.nan

    def mangle(col, fn):
        # Rewrite the non-null values of a random ~10% of `col` with `fn`.
        rows = pick(df, frac=0.1)
        if len(rows):
            df.loc[rows, col] = df.loc[rows, col].apply(
                lambda x: fn(x) if pd.notnull(x) else x
            )

    # B. Inconsistent income formats: '$' prefix, then trailing ','
    mangle('Income', lambda x: f"${x}")
    mangle('Income', lambda x: f"{x},")

    # C. Mixed date separators (YYYY/MM/DD alongside YYYY-MM-DD)
    mangle('Join_Date', lambda x: x.replace('-', '/'))

    # D. Duplicates - capped so that small tables do not make sample() raise
    n_dupes = min(50, len(df))
    if n_dupes:
        dupes = df.sample(n=n_dupes, random_state=rng)
        df = pd.concat([df, dupes], ignore_index=True)

    # E. Outliers - a credit score of 9999 on a few rows (capped likewise)
    n_outliers = min(5, len(df))
    if n_outliers:
        df.loc[pick(df, n=n_outliers), 'Credit_Score'] = 9999

    return df

# Run this once a day: it generates a fresh dirty dataset and saves it as a
# CSV file (without the index column) to practise cleaning on.
df_dirty = generate_dirty_data()
df_dirty.to_csv(path_or_buf="daily_challenge.csv", index=False)
print("今日挑戰已生成：daily_challenge.csv")

# Output: 今日挑戰已生成：daily_challenge.csv
