In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import neweggutils as nu
import re

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
df = pd.read_csv('final.csv')

In [4]:
df.drop(labels=['Unnamed: 0'], inplace=True, axis=1)

In [6]:
df_clean = df.copy()

# Drop duplicates and unnecessary columns

In [9]:
df_clean = df_clean[~df_clean.duplicated()]

In [12]:
df_clean.columns.values

array(['price', 'link', 'Brand', 'Series', 'Model', 'Part Number', 'Type',
       'Form Factor', 'Usage', 'Colors', 'Processor',
       'Processor Main Features', 'Cache Per Processor', 'Memory',
       'Storage', 'Optical Drive', 'Graphics', 'Power Supply',
       'Operating System', 'CPU Type', 'CPU Speed', 'L3 Cache Per CPU',
       'CPU Main Features', 'GPU/VGA Type', 'Video Memory',
       'Virtual Reality Ready', 'Memory Capacity', 'Memory Speed',
       'Memory Spec', 'Memory Slot (Total)', 'Maximum Memory Supported',
       'Optane Memory', 'HDD', 'HDD RPM', 'Optical Drive Type',
       'Screen Size', 'LAN Speed', 'WLAN', 'Bluetooth', 'Mouse Type',
       'Keyboard Type', 'Dimensions (H x W x D)', 'Weight',
       'Graphics Interface', 'SSD', 'Keyboard', 'Storage Spec',
       'Optical Drive Spec', 'Media Slots', 'Display Feature',
       'Front USB', 'Mouse', 'Special Features', 'Package Contents',
       'Software Included', 'Audio Chipset', 'LAN Chipset', 'Video Ports',
    

In [15]:
df_clean = df_clean.drop(labels=['Series', 'Model', 'Part Number', 'Model', 'Colors', 'Cache Per Processor',
                                 'Optical Drive', 'L3 Cache Per CPU', 'Virtual Reality Ready', 'Optane Memory',
                                 'LAN Speed', 'WLAN', 'Bluetooth', 'Mouse Type', 'Keyboard Type',
                                 'Dimensions (H x W x D)', 'Keyboard', 'Mouse', 'Special Features',
                                 'Package Contents', 'Software Included', 'Audio Chipset', 'LAN Chipset',
                                 'Speaker Configuration', 'Front Audio Ports', 'L2 Cache Per CPU', 
                                 'COM', 'Speaker Features', 'FIR', 'Card Reader', 'Installed Qty', 'Touchscreen',
                                 'LPT', 'External Bays', 'Internal Bays', 'Package Type', 'Optical Drive 2',
                                 'Optical Drive 2 Type', 'Front IEEE 1394'], axis=1)

In [17]:
df_clean = df_clean.drop(labels=['Power Supply', 'Memory Speed', 'Memory Spec', 'Memory Slot (Total)',
                                 'Maximum Memory Supported', 'HDD RPM', 'Weight', 'Media Slots', 'Display Feature',
                                 'HDD Interface', 'Ethernet', 'Chipset', 'Audio Features', 'Memory Slot (Available)',
                                 'Graphics Card', 'Screen Type', 'PCI Slots (Available/Total)', 'Monitor',
                                 'Audio', 'Speaker', 'SLI / Crossfire', 'Max Supported Qty'], axis=1)

In [26]:
df_clean.to_csv('final_pared.csv')

In [42]:
df_clean.reset_index(inplace=True, drop=True)

## Grab processor brand, cores, and speed

In [44]:
df_clean['proc_brand'] = df_clean['Processor'].str.extract(r'^(INTEL|AMD)', flags=re.IGNORECASE)

In [74]:
df_clean.drop(labels=['Type', 'Usage'], axis=1, inplace=True)

In [88]:
df_clean['Processor Main Features'].str.extract(r'(-Core|Core)', flags=re.IGNORECASE)

0    0.252858
dtype: float64

In [141]:
df_clean['proc_cores'] = df_clean['Processor'].apply(lambda x: nu.num_cores(str(x)))

In [211]:
pat = re.compile('([0-9]\.[0-9]?)')
df_clean['proc_speed'] = df_clean['CPU Speed'].str.extract(pat)

In [159]:
df = df_clean.copy()

In [161]:
df.to_csv('grab_procs.csv')

## Grab RAM capacity and type (ddr2 vs ddr3 vs ddr4)

In [290]:
df_clean = df.copy()

In [291]:
df_clean['mem_type'] = df_clean['Memory Capacity'].apply(lambda x: nu.ram_type(str(x)))

In [292]:
df_clean['mem_type'].notna().sum()

2500

In [293]:
df_clean['mem_cap'] = df_clean['Memory Capacity'].apply(lambda x: nu.ram_cap(str(x)))

In [294]:
df_clean.to_csv('final_dirty1.csv')

## Get number of USB ports

In [295]:
df_clean['Front USB'] = df_clean['Front USB'].str.extract(r'(\d)')
df_clean['Rear USB'] = df_clean['Rear USB'].str.extract(r'(\d)')

AttributeError: Can only use .str accessor with string values, which use np.object_ dtype in pandas

In [296]:
df_clean['Front USB'].fillna(0, inplace=True)
df_clean['Rear USB'].fillna(0, inplace=True)

In [297]:
df_clean['Front USB'] = df_clean['Front USB'].astype(int)
df_clean['Rear USB'] = df_clean['Rear USB'].astype(int)

In [298]:
df_clean['num_USB'] = df_clean['Front USB'] + df_clean['Rear USB']

In [299]:
df_clean['num_USB'] = df_clean['num_USB'].replace(0, np.nan)

In [303]:
df_clean['num_USB'].fillna(df_clean['num_USB'].median(), inplace=True)

In [304]:
df_clean['num_USB'].describe()

count    2974.000000
mean        7.888702
std         1.624129
min         1.000000
25%         8.000000
50%         8.000000
75%         8.000000
max        10.000000
Name: num_USB, dtype: float64

In [307]:
df_clean.to_csv('cleaning.csv')

## Get operating system

In [509]:
df_clean = pd.read_csv('cleaning.csv')

In [510]:
df_clean['Operating System'] = df_clean['Operating System'].str.extract(r'(Windows \d*)')

In [511]:
df_clean['Operating System'].unique()

array(['Windows 10', nan, 'Windows 8', 'Windows 7'], dtype=object)

## Get storage capacity and type

In [578]:
df_clean['storage_type'] = df_clean['SSD'].apply(lambda x: nu.ssd_or_hdd(str(x)))

In [583]:
df_clean[['Storage', 'storage_type', 'storage_cap', 'SSD']].sample(50)

Unnamed: 0,Storage,storage_type,storage_cap,SSD
1769,1 TB,hdd,1024.0,No
2195,320 GB SATA 7200 RPM,hdd,320.0,No
1738,320 GB,hdd,320.0,No
622,250 GB HDD,hdd,250.0,
342,360 GB SSD,ssd,360.0,Brand New 360 GB
2187,160 GB SATA 7200 RPM,hdd,160.0,No
1208,250 GB,hdd,250.0,
211,256 GB SSD M.2 PCIe NVMe Opal,ssd,256.0,256 GB
984,,hdd,1024.0,No
1308,,hdd,1024.0,No


In [513]:
df_clean['storage_cap'] = df_clean['Storage'].apply(lambda x: nu.storage_cap(str(x)))

In [582]:
df_clean['storage_cap'].fillna(df_clean['SSD'].apply(lambda x: nu.storage_cap(str(x))), inplace=True)

In [514]:
df_clean['HDD'].unique()

array(['2 TB', '1 TB', nan, '500 GB', '250 GB', 'No', '320 GB', '3 TB',
       '2TB', '500G', '3TB', '250G', '750 GB', '160 GB', '1TB', '500GB',
       '80 GB', '320G', '1.5 TB'], dtype=object)

In [515]:
df_clean['HDD'] = df_clean['HDD'].apply(lambda x: nu.storage_cap(str(x)))

In [516]:
df_clean['storage_cap'].fillna(df_clean['HDD'], inplace=True)

## Get graphics type

In [529]:
df_clean['graphics'] = df_clean['GPU/VGA Type'].apply(lambda x: nu.graphics_type(str(x)))

In [518]:
df_clean['Graphics Interface'] = df_clean['Graphics Interface'].apply(lambda x: nu.graphics_type(str(x)))

In [531]:
df_clean['graphics'] = df_clean['graphics'].fillna(df_clean['Graphics Interface'])

## Clean up brand column

In [520]:
df_clean['Brand'] = df_clean['Brand'].apply(lambda x: nu.get_brand(str(x)))

## Exclude computers that come with a monitor

In [526]:
df_clean = df_clean[(df_clean['Screen Size'] == 'No Screen') | df_clean['Screen Size'].isna()]

In [527]:
df_clean.reset_index(inplace=True, drop=True)

In [548]:
df_clean['Form Factor'] = df_clean['Form Factor'].apply(lambda x: nu.form_factor(str(x)))

In [551]:
df_clean['form'] = df_clean['Form Factor']

In [553]:
df_clean['brand'] = df_clean['Brand']

In [584]:
df_clean.to_csv('cleaning.csv')

In [687]:
df_clean.head()

Unnamed: 0,price,link,Brand,Form Factor,Processor,Processor Main Features,Memory,Storage,Graphics,Operating System,...,proc_cores,proc_speed,mem_type,mem_cap,num_USB,storage_type,storage_cap,graphics,form,brand
0,930,https://www.newegg.com/Product/Product.aspx?Item=N82E16883794893R&ignorebbr=1,LENOVO,TOWER,Intel Core i7-8700 3.20 GHz,64 bit 6-Core Processor,16 GB DDR4 2666 + 16 GB Optane Memory,2 TB 7200 RPM HDD,NVIDIA GeForce GTX 1050 Ti 4 GB GDDR5,Windows 10,...,6.0,3.2,ddr4,16.0,8.0,hdd,2048.0,NVIDIA,TOWER,LENOVO
1,1050,https://www.newegg.com/Product/Product.aspx?Item=N82E16883794892R&ignorebbr=1,LENOVO,TOWER,Intel Core i7-8700 3.20 GHz,64 bit 6-Core Processor,16 GB DDR4 2666 + 16 GB Optane Memory,2 TB 7200 RPM HDD,NVIDIA GeForce GTX 1060 6 GB GDDR5,Windows 10,...,6.0,3.2,ddr4,16.0,8.0,hdd,2048.0,NVIDIA,TOWER,LENOVO
2,379,https://www.newegg.com/Product/Product.aspx?Item=N82E16883794897R&ignorebbr=1,LENOVO,TOWER,Intel Core i3-8100 3.60 GHz,64 bit Quad-Core Processor,4 GB DDR4 2666,1 TB 7200 RPM HDD,Intel UHD Graphics 630,Windows 10,...,2.0,3.6,ddr4,4.0,8.0,hdd,1024.0,INTEGRATED,TOWER,LENOVO
3,674,https://www.newegg.com/Product/Product.aspx?Item=N82E16883165597&ignorebbr=1,DELL,SMALL,Intel Core i5-8600 3.10 GHz,64 bit 6-Core Processor,8 GB,256 GB SSD,Intel UHD Graphics 630,Windows 10,...,4.0,3.1,ddr4,8.0,8.0,ssd,256.0,INTEGRATED,SMALL,DELL
4,935,https://www.newegg.com/Product/Product.aspx?Item=1VK-001E-18MT6&ignorebbr=1,,,,,,,,,...,,,,,8.0,hdd,,,,


In [688]:
df_clean.columns.values

array(['price', 'link', 'Brand', 'Form Factor', 'Processor',
       'Processor Main Features', 'Memory', 'Storage', 'Graphics',
       'Operating System', 'CPU Type', 'CPU Speed', 'CPU Main Features',
       'GPU/VGA Type', 'Memory Capacity', 'HDD', 'Screen Size',
       'Graphics Interface', 'SSD', 'Front USB', 'Video Ports',
       'Rear USB', 'proc_brand', 'proc_cores', 'proc_speed', 'mem_type',
       'mem_cap', 'num_USB', 'storage_type', 'storage_cap', 'graphics',
       'form', 'brand'], dtype=object)

## Get dataframe ready for modeling

Note: I'm excluding processor brand because there are only 7 AMD processors.

In [696]:
df_mod = df_clean[['price', 'proc_cores', 'proc_speed', 'mem_type', 'mem_cap',
                   'num_USB', 'storage_type', 'storage_cap', 'graphics', 'form', 'brand']]

In [698]:
df_mod = df_mod[df_mod['proc_cores'].notna()]

In [701]:
df_mod = df_mod[df_mod['price'].notna()]

## Fill nulls

General strategy is going to be to fill categorical nulls with the mode of that category, and to fill continuous nulls with the median.

In [705]:
df_mod['proc_speed'] = df_mod['proc_speed'].fillna(df_mod['proc_speed'].median())

In [709]:
df_mod['mem_type'] = df_mod['mem_type'].fillna('ddr3')

In [712]:
df_mod['storage_cap'] = df_mod['storage_cap'].fillna(df_mod['storage_cap'].median())

In [718]:
df_mod['graphics'] = df_mod['graphics'].fillna('INTEGRATED')

In [721]:
df_mod['form'] = df_mod['form'].fillna('SMALL')

In [724]:
df_mod['brand'] = df_mod['brand'].fillna('DELL')

## Fix data types

Make price an int.

All other columns are in their correct form.

In [728]:
df_mod['price'] = df_mod['price'].str.replace(',', '')
df_mod['price'] = df_mod['price'].astype(int)

In [731]:
df_mod.to_csv('model.csv')