In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import neweggutils as nu
import re

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
df = pd.read_csv('final.csv')

In [4]:
df.drop(labels=['Unnamed: 0'], inplace=True, axis=1)

In [6]:
df_clean = df.copy()

# Drop duplicates and unnecessary columns

In [9]:
df_clean = df_clean[~df_clean.duplicated()]

In [12]:
df_clean.columns.values

array(['price', 'link', 'Brand', 'Series', 'Model', 'Part Number', 'Type',
       'Form Factor', 'Usage', 'Colors', 'Processor',
       'Processor Main Features', 'Cache Per Processor', 'Memory',
       'Storage', 'Optical Drive', 'Graphics', 'Power Supply',
       'Operating System', 'CPU Type', 'CPU Speed', 'L3 Cache Per CPU',
       'CPU Main Features', 'GPU/VGA Type', 'Video Memory',
       'Virtual Reality Ready', 'Memory Capacity', 'Memory Speed',
       'Memory Spec', 'Memory Slot (Total)', 'Maximum Memory Supported',
       'Optane Memory', 'HDD', 'HDD RPM', 'Optical Drive Type',
       'Screen Size', 'LAN Speed', 'WLAN', 'Bluetooth', 'Mouse Type',
       'Keyboard Type', 'Dimensions (H x W x D)', 'Weight',
       'Graphics Interface', 'SSD', 'Keyboard', 'Storage Spec',
       'Optical Drive Spec', 'Media Slots', 'Display Feature',
       'Front USB', 'Mouse', 'Special Features', 'Package Contents',
       'Software Included', 'Audio Chipset', 'LAN Chipset', 'Video Ports',
    

In [15]:
df_clean = df_clean.drop(labels=['Series', 'Model', 'Part Number', 'Model', 'Colors', 'Cache Per Processor',
                                 'Optical Drive', 'L3 Cache Per CPU', 'Virtual Reality Ready', 'Optane Memory',
                                 'LAN Speed', 'WLAN', 'Bluetooth', 'Mouse Type', 'Keyboard Type',
                                 'Dimensions (H x W x D)', 'Keyboard', 'Mouse', 'Special Features',
                                 'Package Contents', 'Software Included', 'Audio Chipset', 'LAN Chipset',
                                 'Speaker Configuration', 'Front Audio Ports', 'L2 Cache Per CPU', 
                                 'COM', 'Speaker Features', 'FIR', 'Card Reader', 'Installed Qty', 'Touchscreen',
                                 'LPT', 'External Bays', 'Internal Bays', 'Package Type', 'Optical Drive 2',
                                 'Optical Drive 2 Type', 'Front IEEE 1394'], axis=1)

In [17]:
df_clean = df_clean.drop(labels=['Power Supply', 'Memory Speed', 'Memory Spec', 'Memory Slot (Total)',
                                 'Maximum Memory Supported', 'HDD RPM', 'Weight', 'Media Slots', 'Display Feature',
                                 'HDD Interface', 'Ethernet', 'Chipset', 'Audio Features', 'Memory Slot (Available)',
                                 'Graphics Card', 'Screen Type', 'PCI Slots (Available/Total)', 'Monitor',
                                 'Audio', 'Speaker', 'SLI / Crossfire', 'Max Supported Qty'], axis=1)

In [26]:
df_clean.to_csv('final_pared.csv')

In [42]:
df_clean.reset_index(inplace=True, drop=True)

## Grab processor brand, cores, and speed

In [44]:
df_clean['proc_brand'] = df_clean['Processor'].str.extract(r'^(INTEL|AMD)', flags=re.IGNORECASE)

In [74]:
df_clean.drop(labels=['Type', 'Usage'], axis=1, inplace=True)

In [88]:
df_clean['Processor Main Features'].str.extract(r'(-Core|Core)', flags=re.IGNORECASE)

0    0.252858
dtype: float64

In [141]:
df_clean['proc_cores'] = df_clean['Processor'].apply(lambda x: nu.num_cores(str(x)))

In [211]:
pat = re.compile('([0-9]\.[0-9]?)')
df_clean['proc_speed'] = df_clean['CPU Speed'].str.extract(pat)

In [159]:
df = df_clean.copy()

In [161]:
df.to_csv('grab_procs.csv')

## Grab RAM capacity and type (ddr2 vs ddr3 vs ddr4)

In [290]:
df_clean = df.copy()

In [291]:
df_clean['mem_type'] = df_clean['Memory Capacity'].apply(lambda x: nu.ram_type(str(x)))

In [292]:
df_clean['mem_type'].notna().sum()

2500

In [293]:
df_clean['mem_cap'] = df_clean['Memory Capacity'].apply(lambda x: nu.ram_cap(str(x)))

In [294]:
df_clean.to_csv('final_dirty1.csv')

## Get number of USB ports

In [295]:
df_clean['Front USB'] = df_clean['Front USB'].str.extract(r'(\d)')
df_clean['Rear USB'] = df_clean['Rear USB'].str.extract(r'(\d)')

AttributeError: Can only use .str accessor with string values, which use np.object_ dtype in pandas

In [296]:
df_clean['Front USB'].fillna(0, inplace=True)
df_clean['Rear USB'].fillna(0, inplace=True)

In [297]:
df_clean['Front USB'] = df_clean['Front USB'].astype(int)
df_clean['Rear USB'] = df_clean['Rear USB'].astype(int)

In [298]:
df_clean['num_USB'] = df_clean['Front USB'] + df_clean['Rear USB']

In [299]:
df_clean['num_USB'] = df_clean['num_USB'].replace(0, np.nan)

In [303]:
df_clean['num_USB'].fillna(df_clean['num_USB'].median(), inplace=True)

In [304]:
df_clean['num_USB'].describe()

count    2974.000000
mean        7.888702
std         1.624129
min         1.000000
25%         8.000000
50%         8.000000
75%         8.000000
max        10.000000
Name: num_USB, dtype: float64

In [307]:
df_clean.to_csv('cleaning.csv')

## Get operating system

In [462]:
df_clean = pd.read_csv('cleaning.csv')

In [463]:
df_clean['Operating System'] = df_clean['Operating System'].str.extract(r'(Windows \d*)')

In [464]:
df_clean['Operating System'].unique()

array(['Windows 10', nan, 'Windows 8', 'Windows 7'], dtype=object)

## Get storage capacity and type

In [465]:
df_clean['storage_type'] = df_clean['SSD'].apply(lambda x: nu.ssd_or_hdd(str(x)))

In [466]:
df_clean['storage_cap'] = df_clean['Storage'].apply(lambda x: nu.storage_cap(str(x)))

In [469]:
df_clean['HDD'].unique()

array(['2 TB', '1 TB', nan, '500 GB', '250 GB', 'No', '320 GB', '3 TB',
       '2TB', '500G', '3TB', '250G', '750 GB', '160 GB', '1TB', '500GB',
       '80 GB', '320G', '1.5 TB'], dtype=object)

In [472]:
df_clean['HDD'] = df_clean['HDD'].apply(lambda x: nu.storage_cap(str(x)))

In [473]:
df_clean['storage_cap'].fillna(df_clean['HDD'], inplace=True)

## Get graphics type

In [479]:
df_clean['graphics'] = df_clean['GPU/VGA Type'].apply(lambda x: nu.graphics_type(str(x)))

## Clean up brand column

In [419]:
df_clean['Brand'] = df_clean['Brand'].apply(lambda x: nu.get_brand(str(x)))

In [421]:
df_clean.to_csv('cleaning.csv')

In [461]:
df_clean['storage_cap'].apply(lambda x: nu.clean_store_cap(str(x)))

714

In [457]:
s = '500GB'

pat = re.compile('\d*')

pat.findall(s)[0]

'500'