In [75]:
import os
import numpy as np
import pandas as pd

cpu_df = pd.read_csv("./data/cpu.csv")
cooler_df = pd.read_csv("./data/cpu-cooler.csv")
storage_df = pd.read_csv("./data/internal-hard-drive.csv")
memory_df = pd.read_csv("./data/memory.csv")
motherboard_df = pd.read_csv("./data/motherboard.csv")

In [76]:
def take_average(l, suffix):
    for i in range(len(l)):
        entry = l[i]
        try:
            noises = entry.split("-")
            if len(noises) == 1:
                l[i] = noises[0].removesuffix(suffix)
            else:
                noises[-1] = noises[-1].removesuffix(suffix)
                noises = list(map(str.strip, noises))
                noises = list(map(float, noises))
                l[i] = round(np.average(np.array(noises)), 3)
        except: 
            continue

    return l

def capacity_to_gb(l):
    for i in range(len(l)):
        try:
            entry = l[i]
            if entry.endswith("TB"):
                l[i] = float(entry.removesuffix(" TB")) * 1000
            elif entry.endswith("GB"):
                l[i] = float(entry.removesuffix(" GB"))
            elif entry.endswith("MB"):
                l[i] = float(entry.removesuffix(" MB")) * 0.001
        except:
            continue

    return l

def process_modules(l):
    counts = []
    gbs = []
    totals = []
    for i in range(len(l)):
        entry = l[i].split("x")
        count = int(entry[0].strip())
        gb = entry[1].strip()
        if gb.endswith("MB"):
            gb = int(gb.removesuffix("MB"))*1000
        else:
            gb = int(gb.removesuffix("GB"))
        total = count * gb 

        counts.append(count)
        gbs.append(gb)
        totals.append(total)

    return counts, gbs, totals

In [77]:
# CPU preprocessing 
cpu_df["price"] = cpu_df["price"].str.extract(r'(\d+\.?\d*)').astype(float)
cpu_df["performance_core_clock"] = cpu_df["performance_core_clock"].str.removesuffix(" GHz")
cpu_df["performance_core_boost_clock"] = cpu_df["performance_core_boost_clock"].str.removesuffix(" GHz")
cpu_df["tdp"] = cpu_df["tdp"].str.removesuffix(" W")

# cooler preprocessing
cooler_df["price"] = cooler_df["price"].str.extract(r'(\d+\.?\d*)').astype(float)
cooler_df["average_noise_level"] = take_average(list(cooler_df["noise_level"]), " dB")
cooler_df["average_fan_rpm"] = take_average(list(cooler_df["fan_rpm"]), " RPM")
cooler_df["radiator_size"] = cooler_df["radiator_size"].str.removesuffix(" mm")

# storage preprocessing 
storage_df["price"] = storage_df["price"].str.extract(r'(\d+\.?\d*)').astype(float)
storage_df["cache_gb"] = capacity_to_gb(list(storage_df["cache"]))
storage_df["capacity_gb"] = capacity_to_gb(list(storage_df["capacity"]))
storage_df["price_per_gb"] = storage_df["price_per_gb"].str.extract(r'(\d+\.?\d*)').astype(float)

# memory preprocessing
memory_df["price"] = memory_df["price"].str.extract(r'(\d+\.?\d*)').astype(float)
memory_df["module_count"], memory_df["gb_per_module"], memory_df["total_ram"] = process_modules(list(memory_df["modules"]))
memory_df["price_per_gb"] = memory_df["price_per_gb"].str.extract(r'(\d+\.?\d*)').astype(float)
memory_df["first_word_latency"] = memory_df["first_word_latency"].str.removeprefix(" ns")

# motherboard preprocessing 
motherboard_df["price"] = motherboard_df["price"].str.extract(r'(\d+\.?\d*)').astype(float)
motherboard_df["max_memory_gb"] = capacity_to_gb(list(motherboard_df["max_memory"]))

In [78]:
cpu_df

Unnamed: 0,title,core_count,performance_core_clock,performance_core_boost_clock,microarchitecture,tdp,integrated_graphics,rating,price
0,AMD Ryzen 7 9800X3D,8,4.7,5.2,Zen 5,120,Radeon,4.5,499.00
1,AMD Ryzen 7 7800X3D,8,4.2,5,Zen 4,120,Radeon,4.5,391.12
2,AMD Ryzen 5 7600X,6,4.7,5.3,Zen 4,105,Radeon,4.5,206.30
3,AMD Ryzen 5 5600X,6,3.7,4.6,Zen 3,65,,4.5,149.00
4,AMD Ryzen 7 9700X,8,3.8,5.5,Zen 5,65,Radeon,4.5,303.58
...,...,...,...,...,...,...,...,...,...
1407,AMD Phenom II X6 1065T,6,2.7,,K10,95,,,
1408,Intel Core i5-6600T,4,2.7,3.5,Skylake,35,Intel HD Graphics 530,,
1409,Intel Xeon E3-1225 V5,4,3.3,3.7,Skylake,80,Intel HD Graphics P530,,
1410,Intel Xeon E3-1245 V5,4,3.5,3.9,Skylake,80,Intel HD Graphics P530,,


In [79]:
cooler_df

Unnamed: 0,title,rating,price,fan_rpm,noise_level,color,radiator_size,average_noise_level,average_fan_rpm
0,Thermalright Peerless Assassin 120 SE,4.5,34.90,1550 RPM,25.6 dB,Black / Silver,,25.6,1550
1,Cooler Master Hyper 212 Black Edition,4.5,29.99,650 - 2000 RPM,6.5 - 26 dB,Black,,16.25,1325.0
2,ARCTIC Liquid Freezer III 360,4.5,143.44,200 - 1800 RPM,,Black,360,,1000.0
3,Thermalright Phantom Spirit 120 SE ARGB,4.5,37.90,1500 RPM,25.6 dB,Black / Silver,,25.6,1500
4,ARCTIC Liquid Freezer III 360 A-RGB,4.5,149.99,200 - 2000 RPM,,Black,360,,1100.0
...,...,...,...,...,...,...,...,...,...
2764,ID-COOLING FX120 ARGB,,,500 - 1800 RPM,35.2 dB,Black,120,35.2,1150.0
2765,ID-COOLING SE-206-XT,,,700 - 1800 RPM,35.2 dB,Black / Silver,,35.2,1250.0
2766,ID-COOLING SE-206-XT ARGB,,,700 - 1800 RPM,35.2 dB,Black,,35.2,1250.0
2767,Ocypus Delta L24 ARGB,,,500 - 2000 RPM,29 dB,Black,240,29,1250.0


In [80]:
storage_df

Unnamed: 0,title,rating,price,capacity,price_per_gb,type,cache,form_factor,interface,cache_mb,capacity_gb
0,Samsung 990 Pro,4.5,169.99,2 TB,0.085,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
1,Crucial P3 Plus,4.5,61.99,1 TB,0.062,SSD,,M.2-2280,M.2 PCIe 4.0 X4,,1000.0
2,Crucial P3 Plus,4.5,121.99,2 TB,0.061,SSD,,M.2-2280,M.2 PCIe 4.0 X4,,2000.0
3,Samsung 990 Pro,4.5,302.00,4 TB,0.075,SSD,4096 MB,M.2-2280,M.2 PCIe 4.0 X4,4.096,4000.0
4,Kingston NV3,4.5,62.99,1 TB,0.063,SSD,,M.2-2280,M.2 PCIe 4.0 X4,,1000.0
...,...,...,...,...,...,...,...,...,...,...,...
3695,Hitachi Deskstar,,,3 TB,,5400 RPM,32 MB,"3.5""",SATA 6.0 Gb/s,0.032,3000.0
3696,Seagate SV35,,,1 TB,,5400 RPM,64 MB,"3.5""",SATA 6.0 Gb/s,0.064,1000.0
3697,Seagate SV35,,,2 TB,,5400 RPM,64 MB,"3.5""",SATA 6.0 Gb/s,0.064,2000.0
3698,Seagate Enterprise NAS,,,3 TB,,7200 RPM,128 MB,"3.5""",SATA 6.0 Gb/s,0.128,3000.0


In [81]:
memory_df

Unnamed: 0,title,rating,price,speed,modules,price_per_gb,color,first_word_latency,cas_latency,module_count,gb_per_module,total_ram
0,G.Skill Ripjaws V 32 GB,4.5,52.99,DDR4-3600,2 x 16GB,1.656,Black,10 ns,18.0,2,16,32
1,G.Skill Aegis 16 GB,4.5,29.99,DDR4-3200,2 x 8GB,1.874,Red / Black,10 ns,16.0,2,8,16
2,Corsair Vengeance RGB 64 GB,4.5,239.99,DDR5-6000,2 x 32GB,3.750,White / Silver,10 ns,30.0,2,32,64
3,Silicon Power SP016GLLTU160N22 16 GB,4.5,15.99,DDR3-1600,2 x 8GB,0.999,Green,13.75 ns,11.0,2,8,16
4,G.Skill Flare X5 32 GB,,114.99,DDR5-6000,2 x 16GB,3.593,Black,9.333 ns,28.0,2,16,32
...,...,...,...,...,...,...,...,...,...,...,...,...
5595,TEAMGROUP T-Force Vulcan\xce\xb1 16 GB,,,DDR5-5200,2 x 8GB,,Red,15.385 ns,40.0,2,8,16
5596,TEAMGROUP T-Force Vulcan\xce\xb1 32 GB,,,DDR5-5200,2 x 16GB,,Red,15.385 ns,40.0,2,16,32
5597,TEAMGROUP T-Force Vulcan\xce\xb1 64 GB,,,DDR5-5200,2 x 32GB,,Red,15.385 ns,40.0,2,32,64
5598,TEAMGROUP T-Force Vulcan\xce\xb1 64 GB,,,DDR5-5600,2 x 32GB,,Red,12.857 ns,36.0,2,32,64


In [82]:
motherboard_df

Unnamed: 0,title,rating,price,cpu_socket,form_factor,max_memory,memory_slots,color,max_memory_gb
0,MSI B650 GAMING PLUS WIFI,4.5,169.99,AM5,ATX,192 GB,4,Black,192.0
1,Asus PRIME B650-PLUS WIFI,5.0,149.99,AM5,ATX,192 GB,4,Black / Silver,192.0
2,MSI MAG B650 TOMAHAWK WIFI,4.5,199.62,AM5,ATX,256 GB,4,Black,256.0
3,Asus PRIME B550M-A WIFI II,4.0,119.99,AM4,Micro ATX,128 GB,4,Blue / Silver,128.0
4,Gigabyte X870E AORUS ELITE WIFI7,4.0,315.08,AM5,ATX,256 GB,4,Black,256.0
...,...,...,...,...,...,...,...,...,...
4788,MSI PRO B860-VC WIFI6E,,,LGA1851,ATX,256 GB,4,Black,256.0
4789,Biostar A620MHC,,,AM5,Micro ATX,128 GB,2,Black,128.0
4790,Asus PRIME B850M-A-CSM,,,AM5,Micro ATX,256 GB,4,Black / Silver,256.0
4791,Asus PRIME B850M-A,,,AM5,Micro ATX,256 GB,4,Black / Silver,256.0


In [83]:
cpu_df.to_csv('../data/cpu.csv', index=False)
cooler_df.to_csv('../data/cooler.csv', index=False)
storage_df.to_csv('../data/storage.csv', index=False)
memory_df.to_csv('../data/memory.csv', index=False)
motherboard_df.to_csv('../data/motherboard.csv', index=False)