In [1]:
import pandas as pd

In [2]:
def clean_and_convert_price(df, price_column='price_usd', output_column='price_php', usd_to_php=58):
    """Drop rows whose price is not numeric and add a converted price column.

    Args:
        df: Input DataFrame. It is not modified; a filtered copy is returned.
        price_column: Name of the column holding the source price.
        output_column: Name of the column to create (or overwrite) with the
            converted price.
        usd_to_php: Multiplier applied to the numeric price.

    Returns:
        A new DataFrame containing only the rows whose price parsed as a
        number, with ``output_column`` set to ``price * usd_to_php`` as floats.
        The original ``price_column`` values and index labels are kept.

    Raises:
        KeyError: If ``price_column`` is not a column of ``df``.
    """
    # Parse once and reuse the result for both the row filter and the
    # conversion, so the two steps can never disagree about what is numeric.
    parsed = pd.to_numeric(df[price_column], errors='coerce')
    is_valid = parsed.notna()

    cleaned = df.loc[is_valid].copy()

    # to_numpy() assigns positionally, which sidesteps index alignment (and
    # therefore any trouble with duplicate index labels).
    cleaned[output_column] = parsed[is_valid].to_numpy(dtype=float) * usd_to_php

    return cleaned


In [30]:
# Load the capped GPU list. NOTE: 'filtered_gpu_dataset.csv' is written by the
# per-generation filtering cell that appears later in this notebook, so that
# cell must have been run at least once before this one.
gpu_df = pd.read_csv('filtered_gpu_dataset.csv')

In [31]:
# Keep only rows with a numeric 'price' and add the converted price column.
gpu_df = clean_and_convert_price(gpu_df, price_column='price', output_column='item_price_in_php', usd_to_php=58)

# Persist the converted data.
# NOTE(review): no index=False here, so the DataFrame index is written as an
# unnamed first column -- confirm downstream readers expect that.
gpu_df.to_csv('data/processed/gpu_test.csv')

# Preview goes last: a notebook cell only displays its final expression, so a
# bare .head() in the middle of the cell would be silently discarded.
gpu_df.head()

In [16]:
def get_gpu_generation(chipset: str) -> str:
    """Classify a chipset name as a supported or unsupported GPU generation.

    Matching is case-insensitive. NVIDIA RTX 30/40/50 series and AMD RX
    6000/7000/9000 series count as supported.

    Args:
        chipset: Chipset / model name, e.g. "GeForce RTX 4070" or
            "Radeon RX 7900 XTX". Non-string values (such as the NaN pandas
            uses for missing cells) are treated as unsupported.

    Returns:
        "Supported" or "Old/Unsupported".
    """
    # Missing cells arrive as float NaN when applied over a pandas column;
    # calling .lower() on them would raise AttributeError.
    if not isinstance(chipset, str):
        return "Old/Unsupported"

    name = chipset.lower()

    # AMD model names carry a concrete number ("RX 7900", "RX 6600"), so the
    # family labels "rx 7000" / "rx 6000" / "rx 9000" are not substrings of
    # them. Match on the leading digit of the series instead, which also
    # covers the literal family labels.
    supported_markers = (
        "rtx 50", "rtx 40", "rtx 30",
        "rx 9", "rx 7", "rx 6",
    )
    if any(marker in name for marker in supported_markers):
        return "Supported"
    return "Old/Unsupported"

# Keep only cards whose chipset is classified as "Supported". A boolean mask is
# used so no helper column has to be added to the frame and dropped again.
is_supported = gpu_df['chipset'].apply(get_gpu_generation) == "Supported"
gpu_df = gpu_df[is_supported]

# Overwrites the file written by the price-conversion cell (same path).
gpu_df.to_csv('data/processed/gpu_test.csv')

In [25]:
# Start over from the raw video-card listing (replaces any earlier gpu_df).
gpu_df = pd.read_csv('data/raw/ppp/video-card.csv')

# Maximum number of rows kept per GPU generation. Keys are the labels returned
# by detect_generation(); the dict's iteration order is also the order in which
# the generations are appended to the filtered output below.
GEN_LIMITS = {
    "RTX 50": 25,
    "RTX 40": 30,
    "RTX 30": 8,
    "RX 9000": 20,
    "RX 7000": 25,
    "RX 6000": 5
}

def detect_generation(chipset):
    """Map a chipset name to its GPU generation label.

    Matching is a case-sensitive substring test, checked in a fixed order
    (RTX 50, RTX 40, RTX 30, then RX 9xxx, RX 7xxx, RX 6xxx); the first hit
    wins.

    Args:
        chipset: Chipset / model name, e.g. "GeForce RTX 4070". Non-string
            values (such as NaN for a missing cell, or None) yield None
            instead of raising.

    Returns:
        One of "RTX 50", "RTX 40", "RTX 30", "RX 9000", "RX 7000", "RX 6000",
        or None when the chipset belongs to none of them.
    """
    # `"RTX 50" in <float NaN>` raises TypeError, so bail out early for
    # anything that is not text.
    if not isinstance(chipset, str):
        return None

    # (substring to look for, generation label). "RX 9" already covers the
    # literal "RX 9000", so a separate check for the family label is redundant.
    markers = (
        ("RTX 50", "RTX 50"),
        ("RTX 40", "RTX 40"),
        ("RTX 30", "RTX 30"),
        ("RX 9", "RX 9000"),
        ("RX 7", "RX 7000"),
        ("RX 6", "RX 6000"),
    )
    for marker, generation in markers:
        if marker in chipset:
            return generation
    return None

# Tag every card with its generation label (None when unsupported).
gpu_df["generation"] = gpu_df["chipset"].apply(detect_generation)

# Drop unsupported generations. The explicit copy makes the result an
# independent frame, so adding columns below does not trigger
# SettingWithCopyWarning.
gpu_df = gpu_df[gpu_df["generation"].notnull()].copy()

# Extract model core (e.g., RTX 4060, RX 6600)
gpu_df["chip_core"] = gpu_df["chipset"].str.extract(r"(RTX\s?\d{4,5}|RX\s?\d{4,5})", expand=False)

# Sort by generation > chip_core.
# NOTE(review): price is not a sort key, so the rows kept per model are simply
# the first ones in this order, not the cheapest -- confirm that is intended.
gpu_df = gpu_df.sort_values(by=["generation", "chip_core"])

# === Apply per-generation cap ===
# Collect the per-generation slices and concatenate once at the end, instead of
# re-copying a growing frame on every iteration.
capped_parts = []
for gen, limit in GEN_LIMITS.items():
    gen_df = gpu_df[gpu_df["generation"] == gen]
    # group by chip_core (e.g., RTX 4070, RTX 5080) and keep at most 5 rows per model
    per_model = gen_df.groupby("chip_core").head(5)
    # then keep only the first `limit` rows for the whole generation
    capped_parts.append(per_model.head(limit))
filtered_df = pd.concat(capped_parts)

# Clean up the helper columns before saving.
filtered_df = filtered_df.drop(columns=["generation", "chip_core"])
filtered_df.to_csv("filtered_gpu_dataset.csv", index=False)

print(f"Filtered GPU dataset saved. Final count: {len(filtered_df)}")


Filtered GPU dataset saved. Final count: 91
