In [51]:
import pandas as pd
import string
import numpy as np

In [52]:
laptop_data = pd.read_csv('clean_data/clean_data.csv')

In [53]:
class DebugFeatureProcess:
    def __init__(self, features):
        self.features = features
    def printTokens(self, idx):
        feature = self.features[idx].replace('\xa0', ' ')
        feature = feature.replace('-', ' ')
        feature = feature.replace(',', ' ')
        tokens = [token.strip(string.punctuation).lower() for token in feature.split(' ')]
        tokens = [token for token in tokens if token != '']
        print(tokens)
    def printFeature(self, idx):
        print(self.features[idx])
    def printAll(self, indices):
        for idx in indices:
            print(str(idx), end = ": ")
            self.printTokens(idx)
            print(str(idx), end = ": ")
            self.printFeature(idx)

In [54]:
def isfloat(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

In [55]:
def memoryProcess(memories):
    anomalies = [902, 4815, 3315]
    '''
    3315: 8 GB  ,, DDR6
    902: 16 GB  ,, DDR4 2133/2400 MHz SO-DIMM
    4815: 8 GB  ,, DDRL3
    '''
    index = memories.index
    ram_capacity = pd.Series(index = index, dtype = 'float64') 
    bus = pd.Series(index = index, dtype = 'float64')
    ddr = pd.Series(index = index, dtype = 'int32')
    for idx, memory in zip(index, memories):
        try:
            if type(memory) != str:
                continue
            if idx in anomalies:
                continue
            memory = memory.lower()
            memory = memory.replace('\xa0', ' ')
            memory = memory.replace('-', ' ')
            memory = memory.replace(',', ' ')
            tokens = [token.strip(string.punctuation) for token in memory.split()]
            tokens = [token for token in tokens if token != '']
            for i in range(len(tokens)):

                if np.isnan(ram_capacity[idx]):
                    if tokens[i] == 'gb':
                        ram_capacity[idx] = round(float(tokens[i - 1]))
                    elif tokens[i] == 'mb':
                        ram_capacity[idx] = round(float(tokens[i - 1]) / 1024)

                if np.isnan(bus[idx]):    
                    if tokens[i] == 'mhz':
                        bus[idx] = float(tokens[i-1])
                    elif 'mhz' in tokens[i]:
                        num = tokens[i][:-3]
                        if isfloat(num):
                            bus[idx] = float(tokens[i][:-3])
                        else:
                            bus[idx] = float(tokens[i-1])

                if len(tokens[i]) > 2:
                    if tokens[i][:3] == "ddr":
                        if len(tokens[i]) > 3:
                            ddr[idx] = int(tokens[i][3])
                        else:
                            ddr[idx] = 1
        except Exception as e:
                print(idx)
                print(str(e))
    return pd.DataFrame({'ram_capacity_gb' : ram_capacity,
                         'bus_mhz': bus,
                         'ddr': ddr})

In [56]:
processed_memory = memoryProcess(laptop_data["Memory"])

In [57]:
'''
    3315: 8 GB  ,, DDR6
    902: 16 GB  ,, DDR4 2133/2400 MHz SO-DIMM
    4815: 8 GB  ,, DDRL3
'''
processed_memory["ram_capacity_gb"][[3315, 902, 4815]] = [8, 16, 8]
processed_memory["ddr"][902] = 4

In [58]:
for capacity in [32, 64, 3, 0, 48, 40, 7]:
    processed_memory["ram_capacity_gb"][processed_memory['ram_capacity_gb'] == capacity] = np.nan

In [59]:
def storageProcess(storages):
    anomalies = [1970, 4928, 6169, 1398]
    '''
    1398:  3x512 GB NVMe SSD
    1970: Ramaxel RTNTE256PCA8EADL, M.2 GB
    4928: Ramaxel RTNTE256PCA8EADL, M.2 GB
    6169:  TB SSD
    '''
    index = storages.index
    storage_capacity = pd.Series(index = index, dtype = 'float64') #gb
    rpm = pd.Series(index = index, dtype = 'float64') #rpm
    ssd = pd.Series(0, index = index, dtype = 'int32')
    hdd = pd.Series(0, index = index, dtype = 'int32')
    for idx, storage in zip(index, storages):
        try:
            if type(storage) != str:
                continue
            if idx in anomalies:
                continue
            if '+' in storage:
                continue
            storage = storage.lower()
            storage = storage.replace('\xa0', ' ')
            storage = storage.replace('-', ' ')
            storage = storage.replace(',', ' ')
            tokens = [token.strip(string.punctuation) for token in storage.split()]
            tokens = [token for token in tokens if token != '']
            
            for i in range(len(tokens)):
                
                if np.isnan(storage_capacity[idx]):
                    if tokens[i] == 'gb':
                        storage_capacity[idx] = float(tokens[i-1])
                    elif 'gb' in tokens[i]:
                        if isfloat(tokens[i][:-2]):
                            storage_capacity[idx] = float(tokens[i][:-2])

                    if tokens[i] == 'tb':
                        storage_capacity[idx] = float(tokens[i-1]) * 1024
                    elif 'tb' in tokens[i]:
                        if isfloat(tokens[i][:-2]):
                            storage_capacity[idx] = float(tokens[i][:-2])
                    
                
            if 'hdd' in storage or 'rpm' in storage:
                hdd[idx] = 1
            elif 'ssd' in storage:
                ssd[idx] = 1
                    
        except Exception as e:
            DebugFeatureProcess(storages).printAll([idx])
            print(str(e))
            print()
    return pd.DataFrame({'storage_capacity_gb':storage_capacity,
                         'hdd': hdd,
                         'ssd': ssd})

In [60]:
processed_storage = storageProcess(laptop_data["Storage"])

In [61]:
#6169:  TB SSD
#1398:  3x512 GB NVMe SSD
processed_storage.loc[6169] = [1024, 0, 1]
processed_storage.loc[1398] = [3 * 512, 0, 1]

In [62]:
for idx, capacity in enumerate(processed_storage["storage_capacity_gb"]):
    if capacity not in (128, 256, 512, 1024, 1536, 2048):
        processed_storage["storage_capacity_gb"][idx] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_storage["storage_capacity_gb"][idx] = np.nan


In [63]:
def displayProcess(displays):
    anomalies = [1597, 2736]
    '''
    1597:  16:9, 1920 x 1080 pixel, IPS, glossy: no
    2736: 4 USB 3.0 / 3.1 Gen1, 1 USB 3.1 Gen2, 1 Thunderbolt, 1 HDMI, 1 DisplayPort, 1 Kensington Lock, Audio Connections: 3.5 mm audio jack, Card Reader: SD
    '''
    index = displays.index
    inch = pd.Series(index = index, dtype = 'float64') 
    pixel_height = pd.Series(index = index, dtype = 'int32')
    pixel_width = pd.Series(index = index, dtype = 'int32')
    for idx, display in zip(index, displays):
        if idx in anomalies:
            continue
        if type(display) != str:
            continue
        try:
            display = display.lower()
            tokens = [token.strip(string.punctuation + ' ') for token in display.split(',')]
            tokens = [token for token in tokens if token != '']
            inch_part, pixel_part = tokens[0], tokens[1]
            inch[idx] = float(inch_part.split()[0])
            tokens = pixel_part.split()
            pixel_height[idx], _, pixel_width[idx] = int(tokens[0]), tokens[1], int(tokens[2])
        except Exception as e:
            print(str(idx), end = ": ")
            DebugFeatureProcess(displays).printFeature(idx)
            print(str(e))
    return pd.DataFrame({'inch': inch,
                         'pixel_height': pixel_height,
                         'pixel_width': pixel_width})

In [64]:
processed_display = displayProcess(laptop_data["Display"])

In [65]:
# 1597:  16:9, 1920 x 1080 pixel, IPS, glossy: no
processed_display.loc[1597] = [np.nan, 1920, 1080]

In [66]:
laptop_data["Memory"]

0                16 GB  ,, 4x 4096 MB DDR3-RAM (1333 MHz)
1                          4 GB  ,, A-Data AD73I1C1674EV 
2       8 GB  ,, 2x 4096 MB DDR3-RAM (1333 MHz), max. ...
3                                                  4 GB  
4                               1024 MB  ,, DDR2, max 2GB
                              ...                        
7682                                       16 GB  ,, DDR4
7683                                        8 GB  ,, DDR4
7684    8 GB  ,, DDR4-3200, Single-Channel-Mode, one m...
7685                                  16 GB  ,, DDR4-3200
7686                   8 GB  ,, DDR4 SDRAM single-channel
Name: Memory, Length: 7687, dtype: object

In [67]:
processed_memory

Unnamed: 0,ram_capacity_gb,bus_mhz,ddr
0,16.0,1333.0,3.0
1,4.0,,
2,8.0,1333.0,3.0
3,4.0,,
4,1.0,,2.0
...,...,...,...
7682,16.0,,4.0
7683,8.0,,4.0
7684,8.0,,4.0
7685,16.0,,4.0


In [68]:
laptop_data["Storage"]

0       2x WDC Scorpio Blue WD5000BEVT-22A0RT0 (RAID 0...
1                           Toshiba MK5059GSXP , 500 GB  
2         WDC Scorpio Black WD7500BPKT-75PKA4T0, 750 GB  
3                                                     NaN
4                          Fujitsu MJA2250BH G2, 250 GB  
                              ...                        
7682                                              1TB SSD
7683                                            512GB SSD
7684                                           , 256 GB  
7685                      1 TB PCIe 3x4 NVMe M.2 2280 SSD
7686                                           , 512 GB  
Name: Storage, Length: 7687, dtype: object

In [69]:
processed_storage

Unnamed: 0,storage_capacity_gb,hdd,ssd
0,,0,0
1,,0,0
2,,0,0
3,,0,0
4,,0,0
...,...,...,...
7682,,0,1
7683,512.0,0,1
7684,256.0,0,0
7685,1024.0,0,1


In [70]:
laptop_data["Display"]

0       17.30 inch 16:9, 1920 x 1080 pixel, CMO1720, F...
1       14.00 inch 16:9, 1366 x 768 pixel, LG LP140WH2...
2       15.60 inch 16:9, 1920 x 1080 pixel, LG Philips...
3          13.30 inch 16:9, 1366 x 768 pixel, glossy: yes
4       10.10 inch 16:9, 1024 x 600 pixel, WSVGA LED T...
                              ...                        
7682    15.60 inch 16:9, 1920 x 1080 pixel 141 PPI, IP...
7683    15.60 inch 16:9, 1920 x 1080 pixel 141 PPI, IP...
7684    15.60 inch 16:9, 1920 x 1080 pixel 141 PPI, AU...
7685           14.00 inch 16:9, 1920 x 1080 pixel 157 PPI
7686    15.60 inch 16:9, 1920 x 1080 pixel, CEC PANDA ...
Name: Display, Length: 7687, dtype: object

In [71]:
processed_display

Unnamed: 0,inch,pixel_height,pixel_width
0,17.3,1920.0,1080.0
1,14.0,1366.0,768.0
2,15.6,1920.0,1080.0
3,13.3,1366.0,768.0
4,10.1,1024.0,600.0
...,...,...,...
7682,15.6,1920.0,1080.0
7683,15.6,1920.0,1080.0
7684,15.6,1920.0,1080.0
7685,14.0,1920.0,1080.0


In [72]:
laptop_data[processed_memory.columns] = processed_memory

In [73]:
laptop_data[processed_storage.columns] = processed_storage

In [74]:
laptop_data[processed_display.columns] = processed_display

In [75]:
laptop_data.to_csv('clean_data/clean_data_K.csv', index=False)