In [123]:
import os
import numpy as np
import pandas as pd

# Read the five raw CSV tables in one pass; the order of the paths below
# matches the order of the names being bound.
(
    cpu_df,
    cooler_df,
    storage_df,
    memory_df,
    motherboard_df,
) = map(
    pd.read_csv,
    (
        "./data/cpu.csv",
        "./data/cpu-cooler.csv",
        "./data/internal-hard-drive.csv",
        "./data/memory.csv",
        "./data/motherboard.csv",
    ),
)

In [124]:
def take_average(l, suffix):
    """Collapse "low - high<suffix>" range strings into their numeric mean.

    Every entry of ``l`` is rewritten in place:

    * a range such as ``"20 - 30 dB"`` becomes the mean of its parts,
      rounded to 3 decimals;
    * a single value such as ``"25 dB"`` has ``suffix`` removed and is
      converted to ``float`` so the result has one numeric type; if the
      remainder is not numeric, the suffix-stripped string is kept;
    * non-string entries (e.g. NaN for a missing cell) and ranges with a
      non-numeric part are left untouched.

    Args:
        l: Mutable list of raw cell values. Modified in place.
        suffix: Unit text to strip from the end of each entry, e.g. ``" dB"``.

    Returns:
        The same list object ``l``, after the in-place rewrite.
    """
    for i, entry in enumerate(l):
        if not isinstance(entry, str):
            # Missing cells arrive as float NaN; there is nothing to parse.
            continue

        parts = entry.split("-")

        if len(parts) == 1:
            stripped = parts[0].removesuffix(suffix)
            try:
                l[i] = float(stripped)
            except ValueError:
                # Not a number even without the unit: keep the stripped text.
                l[i] = stripped
            continue

        # Only the last part of a range carries the unit.
        parts[-1] = parts[-1].removesuffix(suffix)
        try:
            values = [float(part.strip()) for part in parts]
        except ValueError:
            # Best effort: an unparseable range keeps its original text.
            continue
        l[i] = round(np.average(np.array(values)), 3)

    return l

def capacity_to_gb(l):
    """Convert capacity strings such as "2 TB" or "500 GB" to gigabytes.

    Entries are rewritten in place. A value ending in ``TB`` is multiplied
    by 1000 (decimal units); a value ending in ``GB`` is converted as-is.
    The space between number and unit is optional and surrounding
    whitespace is ignored. Non-string entries (e.g. NaN for a missing cell)
    and strings with any other unit are left unchanged.

    Args:
        l: Mutable list of raw capacity values. Modified in place.

    Returns:
        The same list object ``l``, with recognised entries replaced by floats.

    Raises:
        ValueError: If an entry ends in ``TB``/``GB`` but the part before the
            unit is not a number.
    """
    for i, entry in enumerate(l):
        if not isinstance(entry, str):
            # Missing cells arrive as float NaN; leave them for later handling.
            continue

        text = entry.strip()
        # Strip exactly the suffix that was tested for; float() tolerates the
        # whitespace that may remain between the number and the unit.
        if text.endswith("TB"):
            l[i] = float(text.removesuffix("TB")) * 1000
        elif text.endswith("GB"):
            l[i] = float(text.removesuffix("GB"))

    return l

def process_modules(l, mb_per_gb=1000):
    """Split "<count> x <size>" module strings into count, size and total.

    Each string entry such as ``"2 x 8GB"`` or ``"1 x 512MB"`` is parsed
    into the number of modules, the size of one module in GB, and their
    product. Sizes given in MB are divided by ``mb_per_gb`` to express them
    in GB. Non-string entries (e.g. NaN for a missing cell) yield NaN in all
    three outputs so the results stay aligned with the input.

    Args:
        l: Iterable of raw module descriptions. Not modified.
        mb_per_gb: Megabytes per gigabyte used for MB sizes. Defaults to
            1000 (decimal); pass 1024 for binary units.

    Returns:
        A tuple ``(counts, gbs, totals)`` of three lists, each with one item
        per input entry.

    Raises:
        ValueError: If a count or size is not an integer.
        IndexError: If a string entry contains no ``"x"`` separator.
    """
    counts = []
    gbs = []
    totals = []

    for entry in l:
        if not isinstance(entry, str):
            # Keep a placeholder so the outputs stay aligned with the input.
            missing = float("nan")
            counts.append(missing)
            gbs.append(missing)
            totals.append(missing)
            continue

        parts = entry.split("x")
        count = int(parts[0].strip())
        size = parts[1].strip()

        if size.endswith("MB"):
            # MB -> GB is a division; multiplying inflated small modules
            # by a factor of a million.
            gb = int(size.removesuffix("MB")) / mb_per_gb
        else:
            gb = int(size.removesuffix("GB"))

        counts.append(count)
        gbs.append(gb)
        totals.append(count * gb)

    return counts, gbs, totals

In [127]:
# CPU preprocessing: strip the currency sign and unit text so only the
# number remains in each column (values stay strings).
cpu_df["price"] = cpu_df["price"].str.removeprefix("$")
cpu_df["performance_core_clock"] = cpu_df["performance_core_clock"].str.removesuffix(" GHz")
cpu_df["performance_core_boost_clock"] = cpu_df["performance_core_boost_clock"].str.removesuffix(" GHz")
cpu_df["tdp"] = cpu_df["tdp"].str.removesuffix(" W")

# cooler preprocessing
cooler_df["price"] = cooler_df["price"].str.removeprefix("$")
cooler_df["average_noise_level"] = take_average(list(cooler_df["noise_level"]), " dB")
# The suffix used here was " mm", which looks copied from the radiator_size
# line below and would leave the fan_rpm text unparsed.
# NOTE(review): assumes fan_rpm values end in " RPM" (e.g. "600 - 1500 RPM")
# -- confirm against cpu-cooler.csv.
cooler_df["average_fan_rpm"] = take_average(list(cooler_df["fan_rpm"]), " RPM")
cooler_df["radiator_size"] = cooler_df["radiator_size"].str.removesuffix(" mm")

# storage preprocessing
storage_df["price"] = storage_df["price"].str.removeprefix("$")
storage_df["capacity_gb"] = capacity_to_gb(list(storage_df["capacity"]))
storage_df["price_per_gb"] = storage_df["price_per_gb"].str.removeprefix("$")

# memory preprocessing
memory_df["price"] = memory_df["price"].str.removeprefix("$")
memory_df["module_count"], memory_df["gb_per_module"], memory_df["total_ram"] = process_modules(list(memory_df["modules"]))
memory_df["price_per_gb"] = memory_df["price_per_gb"].str.removeprefix("$")
# " ns" is a trailing unit, so it has to be removed as a suffix;
# removeprefix only matches text at the start of the value.
memory_df["first_word_latency"] = memory_df["first_word_latency"].str.removesuffix(" ns")

# motherboard preprocessing
motherboard_df["price"] = motherboard_df["price"].str.removeprefix("$")
motherboard_df["max_memory_gb"] = capacity_to_gb(list(motherboard_df["max_memory"]))

In [129]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (e.g. on a fresh checkout).
os.makedirs('./clean_data', exist_ok=True)

# Write each cleaned table; the DataFrame index is written as the first column.
cpu_df.to_csv('./clean_data/cpu.csv')
cooler_df.to_csv('./clean_data/cooler.csv')
storage_df.to_csv('./clean_data/storage.csv')
memory_df.to_csv('./clean_data/memory.csv')
motherboard_df.to_csv('./clean_data/motherboard.csv')