# Python Web Scraping - Newegg's Website

This project provides the laptop specifications from [newegg.com](newegg.com) using ```BeautifulSoup4``` and ```Selenium```.

By: __Naufal Hilmiaji__

In [1]:
from bs4 import BeautifulSoup as bs
from selenium import webdriver

import pandas as pd
import numpy as np
import requests
import re

Prevent Selenium to open new chrome window.

In [None]:
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('headless')

To avoid captcha and robot checkings.

In [None]:
from fake_useragent import UserAgent

# ua = UserAgent()
# user_agents = ua.data_randomize()
# print(ua.random)

## Getting Laptop Lists Based on Brands

The cell below will produce the links of every laptop based on their brands as follows:
* ACER
* ASUS
* DELL
* HP
* LENOVO
* MSI

The program will scrape the sources of ```href``` tags as URLs and store them in a text file.

In [None]:
laptop_ids = {
    'acer': 2050001146,
    'asus': 2050001315,
    'hp': 2050001186,
    'lenovo': 2050010418,
    'dell': 2050010772,
    'msi': 2050001312
}

# Get Page Numbers
def get_page_numbers(doc):
    page_text = doc.find(class_="list-tool-pagination-text").strong
    pages = int(str(page_text).split("/")[-2].split(">")[-1][:-1])
    return pages

def get_laptops(page_url, brand):
    lists = []

    driver = webdriver.Chrome(
        "C:/Users/cilac/OneDrive/Documents/chromedriver.exe")
    driver.get(page_url)
    soup = bs(driver.page_source, 'lxml')
    page_text = soup.find(class_="list-tool-pagination-text").strong
    pages = int(str(page_text).split("/")[-2].split(">")[-1][:-1])
    page_numbers = get_page_numbers(soup)

    for page in range(1, page_numbers + 1):
        new_url = f"{page_url}&page={page}"
        driver = webdriver.Chrome(
            "C:/Users/cilac/OneDrive/Documents/chromedriver.exe")
        driver.get(new_url)
        new_soup = bs(driver.page_source, 'lxml')

        div = new_soup.find(class_='item-cells-wrap border-cells items-grid-view four-cells expulsion-one-cell')
        products = div.find_all(text=re.compile(brand))

        for product in products:
            parent = product.parent
            if parent.name != 'a':
                continue
            # link = parent['href']
            lists.append(parent['href'])
    return lists

In [None]:
url1 = f'https://www.newegg.com/p/pl?N=100006740%{list(laptop_ids.values())[0]}&Order=6&PageSize=60'
url2 = f'https://www.newegg.com/p/pl?N=100006740%{list(laptop_ids.values())[1]}&Order=6&PageSize=60'
url3 = f'https://www.newegg.com/p/pl?N=100006740%{list(laptop_ids.values())[2]}&Order=6&PageSize=60'
url4 = f'https://www.newegg.com/p/pl?N=100006740%{list(laptop_ids.values())[3]}&Order=6&PageSize=60'
url5 = f'https://www.newegg.com/p/pl?N=100006740%{list(laptop_ids.values())[4]}&Order=6&PageSize=60'
url6 = f'https://www.newegg.com/p/pl?N=100006740%{list(laptop_ids.values())[5]}&Order=6&PageSize=60'
# print(url)
acer_links = get_laptops(url1, 'Acer')
asus_links = get_laptops(url2, 'ASUS')
hp_links = get_laptops(url3, 'HP')
lenovo_links = get_laptops(url4, 'Lenovo')
dell_links = get_laptops(url5, 'Dell')
msi_links = get_laptops(url6, 'MSI')

Saving URLs to text files.

In [None]:
f = open("text_files/ACER.txt","w+")
for link in acer_links:
    f.write(link + '\n')

f = open("text_files/ASUS.txt","w+")
for link in asus_links:
    f.write(link + '\n')

f = open("text_files/DELL.txt","w+")
for link in dell_links:
    f.write(link + '\n')

f = open("text_files/HP.txt","w+")
for link in hp_links:
    f.write(link + '\n')

f = open("text_files/LENOVO.txt","w+")
for link in lenovo_links:
    f.write(link + '\n')

f = open("text_files/MSI.txt","w+")
for link in msi_links:
    f.write(link + '\n')

## Extracting Information Based on the Links Scrapped

The program will scrape through every links that have been scrapped from the website and get the necessary data, such as laptop brands, types, specifications, pricings, and the URL. The data will be formatted as ```pandas``` DataFrame and then saved as CSV files.

In [None]:
laptop_specifications = pd.DataFrame(columns=['brand', 'type', 'specifications', 'price', 'link'])

In [None]:
def get_links(brand):
    df = pd.read_csv(f'text_files/{brand}.txt', names=['links'])
    return df

def new_row(df, brand, type, specifications, price, link):
    data = {
        'brand': brand,
        'type': type,
        'specifications': specifications,
        'price': price,
        'link': link
    }
    df = df.append(data, ignore_index=True)
    return df

def get_specifications(url, brand, df):
    bullet_lists = []

    ua = UserAgent()
    user_agents = ua.random
    options.add_argument(f'user-agent={user_agents}')

    driver = webdriver.Chrome(
        "C:/Users/cilac/OneDrive/Documents/chromedriver.exe", options=options)
    driver.get(url)

    soup = bs(driver.page_source, 'lxml')
    product_wrap = soup.find('div', class_='product-wrap')

    item_type = product_wrap.h1.get_text()
    price_div = soup.find('div', class_='product-price').ul
    price_was = price_div.find('li', class_='price-was').get_text()
    price_current = price_div.find('li', class_='price-current').get_text()
    product_bullets = product_wrap.find('div', class_='product-bullets').ul

    if product_bullets:
        item_lists = product_bullets.find_all('li', text=True)

        for lst in item_lists:
            bullet_lists.append(
                ''.join([bullet for bullet in lst.find_all(text=True)]))
        item_details = ', '.join([item for item in bullet_lists]).replace(
            '\t', ": ").replace(' | ', ', ').replace(' \r', '').replace('\n', '')
        if price_was:
            return new_row(df, brand, item_type, item_details, price_was, url)
        else:
            return new_row(df, brand, item_type, item_details, price_current, url)
    else:
        if price_was:
            return new_row(df, brand, item_type, 'Not Available', price_was, url)
        else:
            return new_row(df, brand, item_type, 'Not Available', price_current, url)

In [None]:
import time

# brands = ['ACER', 'ASUS', 'DELL', 'HP', 'LENOVO', 'MSI']
brands = ['MSI']
for brand in brands:
    df_links = get_links(brand)

    for i in range(454, len(df_links)):
        laptop_specifications = get_specifications(df_links['links'][i], brand, laptop_specifications)
        time.sleep(10)

In [None]:
laptop_specifications

In [None]:
laptop_specifications.to_csv('datasets/msi_laptops.csv', mode='a', header=False)

In [None]:
for i in range(len(laptop_specifications)):
    print(str(i) + ". " + laptop_specifications['specifications'][i] + '\n')

## Check Duplicated Rows from CSV files

In [2]:
check_msi = pd.read_csv('datasets/msi_laptops.csv', sep=',')
check_msi = check_msi.drop_duplicates()
check_msi

Unnamed: 0.1,Unnamed: 0,brand,type,specifications,price,link
0,0,MSI,"MSI Alpha 17 17.3"" 144Hz Gaming Laptop AMD Ryz...","AMD Ryzen 7 5800H Octa-core, AMD Radeon RX6600...","$1,099.99",https://www.newegg.com/p/1TS-0016-02ZW2
1,1,MSI,"MSI - GF65 15.6"" 144hz Gaming Laptop - Intel C...","Features, Intel 10th Generation Core i5, Windo...","$1,199.99",https://www.newegg.com/p/1TS-0016-02ZW3
2,2,MSI,MSI Laptop Modern 15 A5M-221 AMD Ryzen 7 5000 ...,"AMD Ryzen 7 5000 Series 5700U (1.80GHz), 8 GB ...",$749.00,https://www.newegg.com/carbon-gray-msi-modern-...
3,3,MSI,MSI Laptop Modern 14 B11MO-241 Intel Core i7 1...,"Intel Core i7 11th Gen 1165G7 (2.80 GHz), 8 GB...",$799.00,https://www.newegg.com/blue-stone-msi-modern-1...
4,4,MSI,MSI Laptop Modern 14 B11MO-209 Intel Core i5 1...,"Intel Core i5 11th Gen 1135G7 (2.40 GHz), 8 GB...",$799.00,https://www.newegg.com/beige-mousse-msi-modern...
...,...,...,...,...,...,...
675,310,MSI,MSI GS66 Stealth 10SF Home and Entertainment L...,Upgraded - Seal is opened for Hardware/Softwar...,"$3,270.00",https://www.newegg.com/p/1TS-0016-02NY2
676,311,MSI,"Newest MSI GF63 Premium Gaming Laptop, 15.6"" F...","Processor - Intel Core i5-10300H 10th Gen, 2.5...","$1,549.00",https://www.newegg.com/p/1TS-0016-02NT4
677,312,MSI,MSI Prestige 14 EVO Home and Business Laptop (...,Upgraded - Seal is opened for Hardware/Softwar...,"$1,608.00",https://www.newegg.com/p/1TS-0016-02P46
678,313,MSI,MSI Prestige 14 A10SC-021 Home and Entertainme...,Upgraded - Seal is opened for Hardware/Softwar...,"$3,146.00",https://www.newegg.com/p/1TS-0016-02P50


In [3]:
check_msi.drop('Unnamed: 0', axis=1, inplace=True)
check_msi

Unnamed: 0,brand,type,specifications,price,link
0,MSI,"MSI Alpha 17 17.3"" 144Hz Gaming Laptop AMD Ryz...","AMD Ryzen 7 5800H Octa-core, AMD Radeon RX6600...","$1,099.99",https://www.newegg.com/p/1TS-0016-02ZW2
1,MSI,"MSI - GF65 15.6"" 144hz Gaming Laptop - Intel C...","Features, Intel 10th Generation Core i5, Windo...","$1,199.99",https://www.newegg.com/p/1TS-0016-02ZW3
2,MSI,MSI Laptop Modern 15 A5M-221 AMD Ryzen 7 5000 ...,"AMD Ryzen 7 5000 Series 5700U (1.80GHz), 8 GB ...",$749.00,https://www.newegg.com/carbon-gray-msi-modern-...
3,MSI,MSI Laptop Modern 14 B11MO-241 Intel Core i7 1...,"Intel Core i7 11th Gen 1165G7 (2.80 GHz), 8 GB...",$799.00,https://www.newegg.com/blue-stone-msi-modern-1...
4,MSI,MSI Laptop Modern 14 B11MO-209 Intel Core i5 1...,"Intel Core i5 11th Gen 1135G7 (2.40 GHz), 8 GB...",$799.00,https://www.newegg.com/beige-mousse-msi-modern...
...,...,...,...,...,...
675,MSI,MSI GS66 Stealth 10SF Home and Entertainment L...,Upgraded - Seal is opened for Hardware/Softwar...,"$3,270.00",https://www.newegg.com/p/1TS-0016-02NY2
676,MSI,"Newest MSI GF63 Premium Gaming Laptop, 15.6"" F...","Processor - Intel Core i5-10300H 10th Gen, 2.5...","$1,549.00",https://www.newegg.com/p/1TS-0016-02NT4
677,MSI,MSI Prestige 14 EVO Home and Business Laptop (...,Upgraded - Seal is opened for Hardware/Softwar...,"$1,608.00",https://www.newegg.com/p/1TS-0016-02P46
678,MSI,MSI Prestige 14 A10SC-021 Home and Entertainme...,Upgraded - Seal is opened for Hardware/Softwar...,"$3,146.00",https://www.newegg.com/p/1TS-0016-02P50


In [23]:
msi_datasets = check_msi.copy()

In [24]:
for col in msi_datasets.columns:
    msi_datasets[col] = msi_datasets[col].str.lower()

In [33]:
msi_datasets['specifications'].str.split()

0      amd ryzen 7 5800h octa-core, amd radeon rx6600...
1      features, intel 10th generation core i5, windo...
2      amd ryzen 7 5000 series 5700u (1.80ghz), 8 gb ...
3      intel core i7 11th gen 1165g7 (2.80 ghz), 8 gb...
4      intel core i5 11th gen 1135g7 (2.40 ghz), 8 gb...
                             ...                        
675    upgraded - seal is opened for hardware/softwar...
676    processor - intel core i5-10300h 10th gen, 2.5...
677    upgraded - seal is opened for hardware/softwar...
678    upgraded - seal is opened for hardware/softwar...
679    15.6" fhd, ips-level 144hz 45%ntsc ,  1920x108...
Name: specifications, Length: 594, dtype: object

In [35]:
msi_specifications = pd.DataFrame(msi_datasets['specifications'].str.split(', ').values.tolist())
msi_specifications.to_excel('datasets/msi_specifications.xlsx')

In [37]:
msi_combined = msi_datasets.join(msi_specifications)
msi_combined

Unnamed: 0,brand,type,specifications,price,link,0,1,2,3,4,...,27,28,29,30,31,32,33,34,35,36
0,msi,"msi alpha 17 17.3"" 144hz gaming laptop amd ryz...","amd ryzen 7 5800h octa-core, amd radeon rx6600...","$1,099.99",https://www.newegg.com/p/1ts-0016-02zw2,amd ryzen 7 5800h octa-core,amd radeon rx6600m 8gb gddr6,windows 11 os,in-plane switching (ips) technology,144 hz refresh rate,...,,,,,,,,,,
1,msi,"msi - gf65 15.6"" 144hz gaming laptop - intel c...","features, intel 10th generation core i5, windo...","$1,199.99",https://www.newegg.com/p/1ts-0016-02zw3,features,intel 10th generation core i5,windows 10 operating system,windows 10 brings back the start menu from win...,like the edge web browser that lets you markup...,...,,,,,,,,,,
2,msi,msi laptop modern 15 a5m-221 amd ryzen 7 5000 ...,"amd ryzen 7 5000 series 5700u (1.80ghz), 8 gb ...",$749.00,https://www.newegg.com/carbon-gray-msi-modern-...,amd ryzen 7 5000 series 5700u (1.80ghz),8 gb memory 512 gb nvme ssd,amd radeon graphics,1920 x 1080,windows 11 home 64-bit,...,,,,,,,,,,
3,msi,msi laptop modern 14 b11mo-241 intel core i7 1...,"intel core i7 11th gen 1165g7 (2.80 ghz), 8 gb...",$799.00,https://www.newegg.com/blue-stone-msi-modern-1...,intel core i7 11th gen 1165g7 (2.80 ghz),8 gb memory 512 gb nvme ssd,intel iris xe graphics,1920 x 1080,windows 10 home 64-bit\r,...,,,,,,,,,,
4,msi,msi laptop modern 14 b11mo-209 intel core i5 1...,"intel core i5 11th gen 1135g7 (2.40 ghz), 8 gb...",$799.00,https://www.newegg.com/beige-mousse-msi-modern...,intel core i5 11th gen 1135g7 (2.40 ghz),8 gb memory 512 gb nvme ssd,intel iris xe graphics,1920 x 1080,windows 10 home 64-bit\r,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
675,msi,msi gs66 stealth 10sf home and entertainment l...,upgraded - seal is opened for hardware/softwar...,"$3,270.00",https://www.newegg.com/p/1ts-0016-02ny2,,,,,,...,,,,,,,,,,
676,msi,"newest msi gf63 premium gaming laptop, 15.6"" f...","processor - intel core i5-10300h 10th gen, 2.5...","$1,549.00",https://www.newegg.com/p/1ts-0016-02nt4,,,,,,...,,,,,,,,,,
677,msi,msi prestige 14 evo home and business laptop (...,upgraded - seal is opened for hardware/softwar...,"$1,608.00",https://www.newegg.com/p/1ts-0016-02p46,,,,,,...,,,,,,,,,,
678,msi,msi prestige 14 a10sc-021 home and entertainme...,upgraded - seal is opened for hardware/softwar...,"$3,146.00",https://www.newegg.com/p/1ts-0016-02p50,,,,,,...,,,,,,,,,,


In [38]:
msi_combined.to_excel('datasets/msi_excel.xlsx', index=False)

In [8]:
for i in range(10):
    print(msi_datasets['specifications'][i].split(', '))

['amd ryzen 7 5800h octa-core', 'amd radeon rx6600m 8gb gddr6', 'windows 11 os', 'in-plane switching (ips) technology', '144 hz refresh rate']
['features', 'intel 10th generation core i5', 'windows 10 operating system', 'windows 10 brings back the start menu from windows 7 and introduces new features', 'like the edge web browser that lets you markup web pages on your scree', '15.6" full hd display', 'the 1920 x 1080 resolution with 144hz boasts impressive color and clarity. energy-efficient led backlight.', '8gb system memory for intense multitasking and gaming', 'reams of high-bandwidth ddr4 ram to smoothly run your graphics-heavy pc games and video-editing applications', 'as well as numerous programs and browser tabs all at once.', '512gb solid-state drive (pci-e)', 'save files fast and store more data. with massive amounts of storage and advanced communication power', 'pci-e nvme ssds are great for major gaming applications', 'multiple servers', 'daily backups', 'and more.', 'nvidia

In [None]:
check_msi.to_csv('datasets/msi_laptops_cleaned.csv')