We got 16 pages of data in one scrape. We'll go ahead with this as our main dataframe. If necessary we can add another 15 pages of data, for a total of around 3,000 datapoints. 

In [194]:
import pandas as pd
import numpy as np
import bs4
import requests
import neweggutils
import re

from collections import OrderedDict

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [348]:
# Show up to 200 characters per cell when displaying dataframes. The option
# is addressed by its full name: the bare 'max_colwidth' shorthand is resolved
# by pattern matching and can stop working if pandas adds a similarly named
# option.
pd.set_option('display.max_colwidth', 200)

In [155]:
def extract_load(components, prices):
    '''
        Parses the HTML held in two pandas objects and merges the results in
        to a single dataframe.

        The inputs are left untouched: helper columns are dropped on copies.
        ====Parameters====
        components: html of individual computer pages (dataframe, parsed row
                    by row from the first column left after the drop)
        prices: html of product array page with 96 products
        ====Returns====
        merged: dataframe with prices corresponding to components. It is a
                left join of the (price, link) rows with the parsed
                components on 'link', so every price row is kept even when no
                component record matches.
    '''
    # Drop bookkeeping columns on copies instead of in place, so the caller's
    # objects are not modified. errors='ignore' skips absent labels: a single
    # missing column (e.g. no 'Unnamed: 0' when the csv was read with
    # index_col=0) no longer aborts the remaining drops.
    prices = prices.drop(columns=['Unnamed: 0', 'component_html'], errors='ignore')
    components = components.drop(columns=['Unnamed: 0', 'price_html'], errors='ignore')

    # Row-wise parse of the component html. .iloc[0] selects the first column
    # by position; a bare integer key on a row labelled by column names relies
    # on a positional fallback that pandas has deprecated.
    components = components.apply(
        lambda row: neweggutils.get_components(row.iloc[0]), axis=1
    )
    # NOTE(review): no axis is passed here, so for a dataframe the lambda
    # receives whole columns rather than rows. The original author's note says
    # to pass axis=1 if prices is a dataframe. Left unchanged -- confirm which
    # input type callers actually pass.
    prices = prices.apply(lambda x: neweggutils.get_prices_and_links(x[0]))

    # Discard entries for which parsing produced a missing value.
    components = components.dropna()
    prices = prices.dropna()

    # presumably each parsed component is a dict-like record that includes a
    # 'link' field (the merge below needs that column) -- TODO confirm
    components = pd.DataFrame.from_records(components.values)

    # Concatenate all rows in to one list of prices and links
    concat_prices = [pair for page in prices for pair in page]

    prices = pd.DataFrame(concat_prices)
    prices.columns = ['price', 'link']

    return prices.merge(components, on='link', how='left')

## Get all available data into one dataframe

First load the available HTML dataframes.

In [511]:
# First batch of scraped HTML. The *c name holds the component pages and the
# *p name holds the price pages, matching the (components, prices) argument
# order of extract_load and the pairing used for s5c/s5p and last_c/last_p.
f5c = pd.read_csv('first_5_components_backup.csv', index_col=0)
f5p = pd.read_csv('first_5_prices_backup.csv', index_col=0)

In [512]:
# Replace the index read from the csv with a fresh 0..n-1 index, in place;
# drop=True discards the old index instead of keeping it as a column.
f5c.reset_index(inplace=True, drop=True)

In [513]:
# Replace the index read from the csv with a fresh 0..n-1 index, in place;
# drop=True discards the old index instead of keeping it as a column.
f5p.reset_index(inplace=True, drop=True)

In [514]:
# Second batch of scraped HTML (files named for pages 6 to 11): price pages
# in s5p, component pages in s5c. The first csv column becomes the index.
s5p = pd.read_csv('6_to_11_price.csv', index_col=0)
s5c = pd.read_csv('6_to_11_component.csv', index_col=0)

In [516]:
# Third batch of scraped HTML. The *c name holds the component pages and the
# *p name holds the price pages, matching the (components, prices) argument
# order of extract_load and the pairing used for s5c/s5p and last_c/last_p.
t5c = pd.read_csv('16_comp.csv', index_col=0)
t5p = pd.read_csv('16_price.csv', index_col=0)

In [517]:
# Last batch of scraped HTML: component pages in last_c, price pages in
# last_p. The first csv column becomes the index.
last_c = pd.read_csv('32-on-comp.csv', index_col=0)
last_p = pd.read_csv('32-on-prices.csv', index_col=0)

# Replace the indexes read from the csv files with fresh 0..n-1 indexes.
last_c.reset_index(inplace=True, drop=True)
last_p.reset_index(inplace=True, drop=True)

In [508]:
# Parse and merge the first batch. This calls the extract_load provided by
# the neweggutils module, not the function defined in this notebook.
df1 = neweggutils.extract_load(f5c, f5p)

In [518]:
# Persist the first merged batch (the index is written as the first column).
df1.to_csv('df1_final.csv')

In [519]:
# Parse and merge the second batch via the neweggutils module's extract_load.
df2 = neweggutils.extract_load(s5c, s5p)

In [520]:
# Persist the second merged batch (the index is written as the first column).
df2.to_csv('df2_final.csv')

In [521]:
# Replace the indexes read from the csv files with fresh 0..n-1 indexes, in
# place; drop=True discards the old index instead of keeping it as a column.
t5c.reset_index(inplace=True, drop=True)
t5p.reset_index(inplace=True, drop=True)

In [None]:
# Parse and merge the third batch via the neweggutils module's extract_load.
df3 = neweggutils.extract_load(t5c, t5p)

In [None]:
# Persist the third merged batch (the index is written as the first column).
df3.to_csv('df3_final.csv')

In [None]:
# Parse and merge the last batch via the neweggutils module's extract_load.
df4 = neweggutils.extract_load(last_c, last_p)

In [None]:
# Persist the last merged batch (the index is written as the first column).
df4.to_csv('master4.csv')

In [None]:
# Stack the merged batches row-wise; sort=False leaves the column order
# unsorted when the batches' columns differ. ignore_index=True gives the
# result a fresh 0..n-1 index: each batch presumably carries its own 0-based
# index from the merge, which would otherwise leave duplicate row labels.
# NOTE(review): df4 is not part of this concat -- confirm that is intentional.
final = pd.concat([df1, df2, df3], sort=False, ignore_index=True)

In [None]:
# Print a summary of the combined dataframe (columns, non-null counts, dtypes).
final.info()

In [None]:
# Persist the combined dataframe (the index is written as the first column).
final.to_csv('final.csv')