In [1]:

from pathlib import Path
import re
import requests
import time
import datetime

import pandas as pd
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [2]:
# All paths are resolved relative to the current working directory.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.joinpath("data")

# Create the output directory only when it is not there yet.
if DATA_DIR.exists() is False:
    DATA_DIR.mkdir(exist_ok=True)

# CSV holding the product links collected from the category pages.
product_category_links_output = DATA_DIR.joinpath("category-products.csv")
# CSV holding the per-product scrape results.
product_output = DATA_DIR.joinpath("products.csv")

In [3]:

# Chrome is started headless, in incognito mode, and told to ignore
# certificate errors.
options = Options()
options.add_argument("--headless")
options.add_argument('--incognito')
options.add_argument('--ignore-certificate-errors')
# Module-level browser session used by the scraping functions in later cells.
# NOTE(review): no driver.quit() appears in the visible cells — confirm the
# browser process is shut down somewhere.
driver = webdriver.Chrome(options=options)

In [12]:
# Search-result pages to harvest product links from; each entry carries a
# display name and the URL that gets loaded in the browser.
categories = [
    dict(name="Laptops", url="https://www.amazon.in/s?k=laptop"),
    dict(name="N95 Mask", url="https://www.amazon.in/s?k=n95+mask"),
    dict(name="Notebook", url="https://www.amazon.in/s?k=notebook"),
]

In [5]:

# Recognised product URL layouts. Every pattern is applied with re.match, so it
# is anchored at the start of the URL, and each one exposes a named
# ``product_id`` group. The dots in the host name are escaped so that they only
# match a literal "." (an unescaped "." would accept any character there).
regex_options = [
    r"https://www\.amazon\.in/gp/product/(?P<product_id>[\w-]+)/",
    r"https://www\.amazon\.in/dp/(?P<product_id>[\w-]+)/",
    r"https://www\.amazon\.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/",
]


def extract_product_id_from_url(url):
    """Return the product id embedded in ``url``, or ``None``.

    Args:
        url: Absolute URL to inspect. Anything that is not a ``str`` is
            treated as "no product id".

    Returns:
        The text captured by the ``product_id`` group of the last pattern in
        ``regex_options`` that matches ``url``; ``None`` when no pattern
        matches.
    """
    if not isinstance(url, str):
        return None
    # When several patterns match, the one listed last takes precedence, so
    # walk the list back-to-front and stop at the first hit.
    for regex_str in reversed(regex_options):
        match = re.match(regex_str, url)
        if match is None:
            continue
        product_id = match.groupdict().get("product_id")
        if product_id is not None:
            return product_id
    return None

In [6]:
def clean_page_links(page_links=[], category=None):
    """Keep only the links from which a product id can be extracted.

    Every URL in ``page_links`` is passed to ``extract_product_id_from_url``;
    URLs for which that returns ``None`` are dropped.

    Args:
        page_links: Iterable of URLs to filter.
        category: Value stored unchanged under the ``"category"`` key of
            every returned record.

    Returns:
        A list of dicts with the keys ``"url"``, ``"product_id"`` and
        ``"category"``, in the order the URLs were given.
    """
    candidates = ((link, extract_product_id_from_url(link)) for link in page_links)
    return [
        {"url": link, "product_id": found_id, "category": category}
        for link, found_id in candidates
        if found_id is not None
    ]

In [7]:
def scrace_category_product_links(categories=[]):
    """Collect product links from every category page.

    For each category the page at ``category["url"]`` is loaded in the
    module-level ``driver``, the relative links found in its ``<body>`` are
    turned into absolute ``https://www.amazon.in`` URLs, and the result is
    filtered through ``clean_page_links``.

    Args:
        categories: Iterable of dict-like objects with a ``"url"`` entry.

    Returns:
        A single list with the cleaned link records of all categories.
    """
    all_product_links = []
    for category in categories:
        # Pause before every page load (presumably to throttle requests).
        time.sleep(1.5)
        url = category.get("url")
        driver.get(url)
        # find_element(by, value) with the "css selector" strategy replaces
        # find_element_by_css_selector, which was removed in Selenium 4.3;
        # this form is accepted by Selenium 3 as well.
        body_el = driver.find_element("css selector", "body")
        html_str = body_el.get_attribute("innerHTML")
        html_obj = HTML(html=html_str)
        # Only site-relative links are kept and made absolute.
        page_links = [f"https://www.amazon.in{x}" for x in html_obj.links if x.startswith("/")]
        all_product_links.extend(clean_page_links(page_links=page_links, category=category))
    return all_product_links

In [8]:
def extract_categories_and_save(categories=[]):
    """Scrape the category pages and store the collected links as CSV.

    The records returned by ``scrace_category_product_links`` are loaded into
    a DataFrame and written to ``product_category_links_output`` without the
    index column.
    """
    pd.DataFrame(scrace_category_product_links(categories)).to_csv(
        product_category_links_output, index=False
    )

In [13]:
# Scrape all configured category pages and write the link table to
# product_category_links_output.
extract_categories_and_save(categories=categories)

In [14]:
def scrape_product_page(url, title_lookup="#productTitle", price_lookup="#priceblock_ourprice"):
    """Load a product page and read its title and price text.

    Args:
        url: Product page to open in the module-level ``driver``.
        title_lookup: CSS selector of the element holding the title.
        price_lookup: CSS selector of the element holding the price.

    Returns:
        A ``(title, price)`` tuple of the matched elements' text. An entry is
        ``None`` when its selector matches nothing, so a missing price no
        longer discards a title that was found (and vice versa).
    """
    driver.get(url)
    # Wait after the page load before reading the DOM.
    time.sleep(1.5)
    # find_element(by, value) with the "css selector" strategy replaces
    # find_element_by_css_selector, which was removed in Selenium 4.3;
    # this form is accepted by Selenium 3 as well.
    body_el = driver.find_element("css selector", "body")
    html_obj = HTML(html=body_el.get_attribute("innerHTML"))
    # find(..., first=True) yields None when nothing matches the selector.
    title_el = html_obj.find(title_lookup, first=True)
    price_el = html_obj.find(price_lookup, first=True)
    product_title = title_el.text if title_el is not None else None
    product_price = price_el.text if price_el is not None else None
    return product_title, product_price

In [15]:
def perform_scrape(cleaned_items=[]):
    """Scrape title and price for every link record.

    Args:
        cleaned_items: Iterable of mappings with ``"url"`` and
            ``"product_id"`` entries.

    Returns:
        A list with one dict per input item, holding ``"url"``,
        ``"product_id"``, ``"title"`` and ``"price"``. Title and price are
        ``None`` when scraping that page raised an error.
    """
    data_extracted = []
    for obj in cleaned_items:
        link = obj['url']
        product_id = obj['product_id']
        title, price = (None, None)
        try:
            title, price = scrape_product_page(link)
        except Exception:
            # Best effort: a page that fails to scrape is recorded with empty
            # fields. Only Exception is caught so that KeyboardInterrupt and
            # SystemExit can still stop a long run.
            pass
        if title is not None and price is not None:
            print(link, title, price)
        data_extracted.append({
            "url": link,
            "product_id": product_id,
            "title": title,
            "price": price
        })
    return data_extracted

In [16]:

def row_scrape_event(row, *args, **kwargs):
    """Scrape one row of the link table unless it is already marked as done.

    Args:
        row: Mapping-like row (a pandas Series or a dict) with a ``"url"``
            entry and an optional ``"scraped"`` flag.
        *args, **kwargs: Accepted and ignored.

    Returns:
        The same ``row`` object. Rows whose ``"scraped"`` flag is ``1`` or
        ``"1"`` are returned untouched; all others get ``"title"``,
        ``"price"``, ``"scraped"`` (set to 1) and ``"timestamp"`` filled in.
    """
    link = row['url']
    # A missing flag counts as "not scraped yet".
    scraped = row.get('scraped', 0)
    if scraped == 1 or scraped == "1":
        print("skipped")
        return row
    title, price = (None, None)
    try:
        title, price = scrape_product_page(link)
    except Exception:
        # Best effort: keep going with empty fields. Only Exception is caught
        # so that KeyboardInterrupt and SystemExit can still stop a long run.
        pass
    row['title'] = title
    row['price'] = price
    # NOTE(review): the flag is set even when the scrape failed, so failed
    # rows are skipped on a re-run — confirm this is intended.
    row['scraped'] = 1
    row['timestamp'] = datetime.datetime.now().timestamp()
    print(link, title, price)
    return row

In [17]:
# Load the link table written by extract_categories_and_save and preview it.
df = pd.read_csv(product_category_links_output)
df.head()

Unnamed: 0,url,product_id,category
0,https://www.amazon.in/LifeDigital-Zed-Note-CX3...,B0854QKMFY,"{'name': 'Laptops', 'url': 'https://www.amazon..."
1,https://www.amazon.in/LifeDigital-Zed-AIR-Wind...,B0854QFQFV,"{'name': 'Laptops', 'url': 'https://www.amazon..."
2,https://www.amazon.in/LifeDigital-Zed-AIR-Wind...,B0854QFQFV,"{'name': 'Laptops', 'url': 'https://www.amazon..."
3,https://www.amazon.in/Lenovo-ThinkPad-11-6-Bus...,B0848RB868,"{'name': 'Laptops', 'url': 'https://www.amazon..."
4,https://www.amazon.in/Lenovo-ThinkPad-11-6-Bus...,B0848RB868,"{'name': 'Laptops', 'url': 'https://www.amazon..."


In [18]:
# (rows, columns) of the loaded link table.
df.shape

(211, 3)

In [19]:
# Work on a copy so df stays as loaded; swap in df.head(n=10) for a short trial run.
df_sub = df.copy()

In [20]:
# Scrape row by row. row_scrape_event returns rows already flagged
# scraped == 1 unchanged, and every other row costs one page load plus the
# 1.5 s wait inside scrape_product_page.
df_sub = df_sub.apply(row_scrape_event, axis=1)

https://www.amazon.in/LifeDigital-Zed-Note-CX3-Touchscreen/dp/B0854QKMFY/ref=sr_1_20?dchild=1&keywords=laptop&qid=1595401659&sr=8-20#customerReviews LifeDigital Zed Note CX3 13.3-inch Touchscreen Convertible Laptop (Intel Core i3/8GB/256GB SSD/Windows 10/Intel HD Graphics), Silver ₹ 33,205.00
https://www.amazon.in/LifeDigital-Zed-Note-CX3-Touchscreen/dp/B0854QKMFY/ref=sr_1_20?dchild=1&keywords=laptop&qid=1595401659&sr=8-20#customerReviews LifeDigital Zed Note CX3 13.3-inch Touchscreen Convertible Laptop (Intel Core i3/8GB/256GB SSD/Windows 10/Intel HD Graphics), Silver ₹ 33,205.00
https://www.amazon.in/LifeDigital-Zed-AIR-Windows-Graphics/dp/B0854QFQFV/ref=sr_1_6?dchild=1&keywords=laptop&qid=1595401659&sr=8-6 LifeDigital Zed AIR X 2020 15.6 inch Laptop (Intel Core i3 5005U/8GB/256GB SSD/Windows 10/Intel HD 5500 Graphics), Black ₹ 27,788.00
https://www.amazon.in/LifeDigital-Zed-AIR-Windows-Graphics/dp/B0854QFQFV/ref=sr_1_6?dchild=1&keywords=laptop&qid=1595401659&sr=8-6#customerReviews L

https://www.amazon.in/Boldfit-Pollution-protective-manufacturer-Ministry/dp/B08BFYYVYR/ref=sr_1_24?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-24#customerReviews Boldfit N95 mask for face (Pack of 10) Anti Pollution, protective. Third Party Tested by manufacturer at SGS & Ministry of Textiles ₹ 1,090.00
https://www.amazon.in/Callas-Re-usable-Pollution-Filtration-Respirator/dp/B089CPP26Y/ref=sr_1_33?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-33#customerReviews Callas Premium Quality N95 Re-usable Mask Anti Pollution, High Filtration Capacity 5 Layered; With Respirator ₹ 619.00 - ₹ 1,300.00
https://www.amazon.in/ORILEY-Certified-Layer-Disposable-Respirator/dp/B088QWV35K/ref=sr_1_43?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-43#customerReviews ORILEY CE & ISO Certified 5 Layer Disposable Face Mask with Nose Pin & Respirator for Men & Women (1 PC) ₹ 85.00
https://www.amazon.in/Multilayered-Washable-Reusable-Anti-Pollution-protection/dp/B08942WH8V/ref=sr_1_19?dchild=1&keywords=n9

https://www.amazon.in/NOVEX-ALTEK-Face-Mask-Respirator/dp/B089S25S44/ref=sr_1_22?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-22 NOVEX ALTEK Face Mask with Respirator (Grey) - Pack of 5 ₹ 240.98
https://www.amazon.in/CAREVIEWTM-CV1221H-Protective-Layered-Filtration/dp/B08951PPT1/ref=sr_1_32?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-32#customerReviews CAREVIEW™ CV1221H N95 FFP2 Protective Face Mask, 6 Layered Filtration with Head Band Strap- (Pack of 3) ₹ 360.00
https://www.amazon.in/Filtration-Capacity-Reusable-Particulate-Certified/dp/B08BJ245SV/ref=sr_1_21?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-21 Trendy N95 Face Mask | 5 ply High Filtration Capacity | Five Layer Reusable Particulate Mask and Washable | FDA,CE, GMP, ISO Approved and Certified (Pack of 10) ₹ 499.00
https://www.amazon.in/Xtore-Washable-Pollution-Breathing-Replaceable/dp/B07MCPKLZP/ref=sr_1_30?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-30 None None
https://www.amazon.in/Standard-Certified-5-Layered-Pr

https://www.amazon.in/Reusable-washable-certified-droplets-pollution/dp/B08C67MQG5/ref=sr_1_6?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-6 Swadesi Stuff N95 Filter Multi color Filter Face Mask, Reusable, washable & CE certified to protect Mouth droplets, Dust and pollution, Pack of 5 mask ₹ 499.00
https://www.amazon.in/DALUCI-Reusable-Pollution-Layer-Women/dp/B0893WWSG5/ref=sr_1_11?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-11#customerReviews DALUCI N95 Reusable Mask Anti Air Pollution Face Mask With 5 Layer For Men Women (White, 2 Pack) ₹ 199.00
https://www.amazon.in/Xtore-Comfortable-Pollution-Breathing-Perticulate/dp/B07LDVP7DV/ref=sr_1_5?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-5 None None
https://www.amazon.in/Kurtzy-Washable-Anti-Pollution-Anti-Dust-Respiratory/dp/B088D8M4B1/ref=sr_1_45?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-45#customerReviews Kurtzy Washable Reusable 3 Layers CN95 Coatex Anti-Pollution Anti-Dust Face Mask with Respiratory Valve (Pack of 2) 

https://www.amazon.in/Bodyguard-Anti-Pollution-Face-Mask/dp/B075XN3KML/ref=sr_1_46?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-46#customerReviews BodyGuard Reusable Anti Pollution Face Mask with Activated Carbon, N95 + PM2.5 for kids - Small (Black) ₹ 199.00
https://www.amazon.in/Urbangabru-Pollution-protective-filters-system/dp/B0851H66G4/ref=sr_1_38?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-38#customerReviews Urbangabru N99 Anti Pollution Mask with 4 layer protective filters PM 2.5 system (valve color may vary) ₹ 299.00
https://www.amazon.in/Reusable-washable-certified-droplets-pollution/dp/B08C3Z7D69/ref=sr_1_10?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-10 Swadesi Stuff N95 Face Mask, Reusable, washable & CE certified to protect Mouth droplets, Dust and pollution, Pack of 5 mask ₹ 499.00
https://www.amazon.in/Venus-V-4200N95-Respirator-Butterfly-Temperature/dp/B085TSNCZQ/ref=sr_1_29?dchild=1&keywords=n95+mask&qid=1595401665&sr=8-29 None None
https://www.amazon.in/Gear-S

https://www.amazon.in/Factor-Notes-Journal-Diary-Notebook/dp/B07S8N89YD/ref=sr_1_42?dchild=1&keywords=notebook&qid=1595401671&sr=8-42 Factor Notes Ruled 90 GSM Natural Shade Paper Notebook/ 96 Pages / B6-120mm X 180mm Journal Diary/ B6R- Idea ₹ 120.00
https://www.amazon.in/Classmate-ITC-Wir-200Pg-Ruld/dp/B07KCSKCB3/ref=sr_1_8?dchild=1&keywords=notebook&qid=1595401671&sr=8-8#customerReviews Classmate Notebook - Single line, Spiral Binding, 240mm x 180mm, 200 Pages ₹ 60.00
https://www.amazon.in/Drapvision-Majestic-Notebook-Enriched-Design/dp/B084TVJMCM/ref=sr_1_40?dchild=1&keywords=notebook&qid=1595401671&sr=8-40 Drapvision Majestic Series Notebook, A5, Enriched Bond Pages, Plain, Design: Pure Black ₹ 99.00
https://www.amazon.in/Luxor-Subject-Single-Ruled-Notebook/dp/B00LHZWD0C/ref=sr_1_55?dchild=1&keywords=notebook&qid=1595401671&sr=8-55#customerReviews Luxor 5 Subject Single Ruled Notebook - A4, 70 GSM, 300 pages ₹ 289.00
https://www.amazon.in/Lauret-Notebook-Journal-Natural-Shade/dp/B

https://www.amazon.in/Drapvision-Majestic-Notebook-Enriched-Design/dp/B084TVJMCM/ref=sr_1_40?dchild=1&keywords=notebook&qid=1595401671&sr=8-40#customerReviews Drapvision Majestic Series Notebook, A5, Enriched Bond Pages, Plain, Design: Pure Black ₹ 99.00
https://www.amazon.in/Luxor-Subject-Premium-Exercise-Notebook/dp/B07QHNDF4C/ref=sr_1_24?dchild=1&keywords=notebook&qid=1595401671&sr=8-24 Luxor 1 Subject Spine Taped Premium Exercise Notebook, Single Ruled - (21cm x 29.7cm), 180 Pages ₹ 95.00
https://www.amazon.in/Classmate-Pulse-Single-5-Subject-Notebook/dp/B075FR13TK/ref=sr_1_29?dchild=1&keywords=notebook&qid=1595401671&sr=8-29#customerReviews Classmate 2100128 Soft Cover 5 Subject Spiral Binding Notebook, Single Line, 250 Pages (Assorted cover design) ₹ 110.00
https://www.amazon.in/Luxor-Subject-Premium-Exercise-Notebook/dp/B07Q6XTPP9/ref=sr_1_11?dchild=1&keywords=notebook&qid=1595401671&sr=8-11#customerReviews Luxor 6 Subject Spiral Premium Exercise Notebook, Single Ruled - (18cm x

In [35]:
# Write the scraped rows to product_output, leaving out the index column.
df_sub.to_csv(product_output,index = False)