In [1]:
# importing the required libraries

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import regex as re
import json
from time import time
import sqlalchemy

In [2]:
# read the data from csv file
df = pd.read_csv("Amazon Scraping - Sheet1.csv")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,id,Asin,country
0,0,1,1015,de
1,1,2424796,1015,fr
2,2,2,000004458X,de
3,3,2424797,000004458X,fr
4,4,3,1002198,de
...,...,...,...,...
995,995,2425286,4484924,fr
996,996,449,4485742,de
997,997,2425287,4485742,fr
998,998,450,4486072,de


In [4]:
# configure driver for selenium

# path to driver
DRIVER_PATH = 'C:/Users/OMEN/Downloads/chromedriver.exe'
options = Options()
# without gui mode
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

In [5]:
times = []
st = time()
data = []
for row in df.iterrows():
    ind, val = row
    
    # update parameters
    country = val['country']
    asin = val['Asin']
    
    # get webpage
    url = f"https://www.amazon.{country}/dp/{asin}"
    driver.get(url)
    title = driver.title
    
    # check if page exists
    # Amazon gives title as 404 - Not found, Page not found or Amazon.de search
    # Checking for those conditions
    if title[:7] != "Amazon." and title[:6] != '404 - ' and title != "Page introuvable":
        
        # there are two main product types
        # product or songs
        # get details according to it
        product_title = driver.find_element_by_id("productTitle").text
        image = None
        try:
            image = driver.find_element_by_id("landingImage").get_attribute('src')
        except NoSuchElementException as e:
            pass
        
        if not image:
            try:
                image = driver.find_element_by_id("imgBlkFront").get_attribute('src')
            except NoSuchElementException as e:
                pass
                
        price = None
        try:
            price_w = driver.find_element_by_class_name('a-price-whole').text
            price_f = driver.find_element_by_class_name('a-price-fraction').text
            price = price_w + "." + price_f
            if not re.findall('[0-9]+', price):
                price = None
        except NoSuchElementException as e:
            pass
        
        if not price:
            try:
                price = driver.find_element_by_xpath("//span[@class='a-color-base']").text
                price = ".".join(re.findall('[0-9]+', price))
            except NoSuchElementException as e:
                pass
                
        description = None
        try:
            description = driver.find_element_by_id("productDescription").text
            description = description.split("\n")[-1]
        except NoSuchElementException as e:
            pass
        
        if not description:
            try:
                description = driver.find_element_by_id("detailBulletsWrapper_feature_div").text
            except NoSuchElementException as e:
                pass
        
        data.append({
            "ProductTitle": product_title,
            "ImageURL": image,
            "Price": price,
            "ProductDescription": description
        })
    
    else:
        print(url, "not available")
        
    # add time after every 100 urls
    if (ind+1)%100 == 0:
        times.append(time()-st)
        st = time()

https://www.amazon.de/dp/1015 not available
https://www.amazon.fr/dp/1015 not available
https://www.amazon.fr/dp/000004458X not available
https://www.amazon.de/dp/1002198 not available
https://www.amazon.fr/dp/1002198 not available
https://www.amazon.fr/dp/1002791 not available
https://www.amazon.it/dp/1002791 not available
https://www.amazon.de/dp/1002864 not available
https://www.amazon.fr/dp/1002864 not available
https://www.amazon.de/dp/1003704 not available
https://www.amazon.fr/dp/1003704 not available
https://www.amazon.de/dp/1003763 not available
https://www.amazon.fr/dp/1003763 not available
https://www.amazon.fr/dp/1004271 not available
https://www.amazon.it/dp/1004271 not available
https://www.amazon.de/dp/1017519 not available
https://www.amazon.fr/dp/1017519 not available
https://www.amazon.fr/dp/000102163X not available
https://www.amazon.fr/dp/1022369 not available
https://www.amazon.it/dp/1022369 not available
https://www.amazon.fr/dp/1022857 not available
https://www.a

https://www.amazon.fr/dp/1085727 not available
https://www.amazon.fr/dp/1085840 not available
https://www.amazon.it/dp/1085840 not available
https://www.amazon.fr/dp/1086413 not available
https://www.amazon.it/dp/1086413 not available
https://www.amazon.de/dp/1086553 not available
https://www.amazon.fr/dp/1086553 not available
https://www.amazon.fr/dp/000108657X not available
https://www.amazon.it/dp/000108657X not available
https://www.amazon.fr/dp/000108660X not available
https://www.amazon.fr/dp/000108688X not available
https://www.amazon.de/dp/1087061 not available
https://www.amazon.fr/dp/1087061 not available
https://www.amazon.de/dp/1087126 not available
https://www.amazon.fr/dp/1087126 not available
https://www.amazon.fr/dp/1087177 not available
https://www.amazon.it/dp/1087177 not available
https://www.amazon.de/dp/1087320 not available
https://www.amazon.fr/dp/1087320 not available
https://www.amazon.de/dp/1087495 not available
https://www.amazon.fr/dp/1087495 not available
h

https://www.amazon.fr/dp/1133705 not available
https://www.amazon.de/dp/1137727 not available
https://www.amazon.fr/dp/1137727 not available
https://www.amazon.de/dp/1137999 not available
https://www.amazon.fr/dp/1137999 not available
https://www.amazon.de/dp/1138235 not available
https://www.amazon.fr/dp/1138235 not available
https://www.amazon.de/dp/1139371 not available
https://www.amazon.fr/dp/1139371 not available
https://www.amazon.de/dp/1139398 not available
https://www.amazon.fr/dp/1139398 not available
https://www.amazon.fr/dp/1140795 not available
https://www.amazon.it/dp/1140795 not available
https://www.amazon.de/dp/1141341 not available
https://www.amazon.es/dp/1141341 not available
https://www.amazon.fr/dp/1141341 not available
https://www.amazon.de/dp/1144553 not available
https://www.amazon.fr/dp/1144553 not available
https://www.amazon.de/dp/1144634 not available
https://www.amazon.fr/dp/1144634 not available
https://www.amazon.de/dp/1144685 not available
https://www.a

https://www.amazon.de/dp/4164938 not available
https://www.amazon.fr/dp/4164938 not available
https://www.amazon.de/dp/4165829 not available
https://www.amazon.fr/dp/4165829 not available
https://www.amazon.de/dp/4165845 not available
https://www.amazon.fr/dp/4165845 not available
https://www.amazon.de/dp/4166345 not available
https://www.amazon.fr/dp/4166345 not available
https://www.amazon.fr/dp/4166523 not available
https://www.amazon.it/dp/4166523 not available
https://www.amazon.de/dp/4167082 not available
https://www.amazon.fr/dp/4167082 not available
https://www.amazon.de/dp/4167384 not available
https://www.amazon.fr/dp/4167384 not available
https://www.amazon.de/dp/4167465 not available
https://www.amazon.fr/dp/4167465 not available
https://www.amazon.de/dp/4167848 not available
https://www.amazon.fr/dp/4167848 not available
https://www.amazon.fr/dp/000416816X not available
https://www.amazon.it/dp/000416816X not available
https://www.amazon.de/dp/4168240 not available
https:/

https://www.amazon.de/dp/4173376 not available
https://www.amazon.fr/dp/4173376 not available
https://www.amazon.de/dp/4173384 not available
https://www.amazon.fr/dp/4173384 not available
https://www.amazon.de/dp/4173406 not available
https://www.amazon.fr/dp/4173406 not available
https://www.amazon.de/dp/4173430 not available
https://www.amazon.fr/dp/4173430 not available
https://www.amazon.de/dp/4173473 not available
https://www.amazon.fr/dp/4173473 not available
https://www.amazon.de/dp/4173538 not available
https://www.amazon.fr/dp/4173538 not available
https://www.amazon.de/dp/4173589 not available
https://www.amazon.fr/dp/4173589 not available
https://www.amazon.de/dp/4173643 not available
https://www.amazon.fr/dp/4173643 not available
https://www.amazon.de/dp/4173899 not available
https://www.amazon.fr/dp/4173899 not available
https://www.amazon.de/dp/4174194 not available
https://www.amazon.fr/dp/4174194 not available
https://www.amazon.de/dp/4174348 not available
https://www.a

https://www.amazon.de/dp/4211715 not available
https://www.amazon.fr/dp/4211715 not available
https://www.amazon.de/dp/4211723 not available
https://www.amazon.fr/dp/4211723 not available
https://www.amazon.de/dp/4211782 not available
https://www.amazon.fr/dp/4211782 not available
https://www.amazon.de/dp/4211812 not available
https://www.amazon.fr/dp/4211812 not available
https://www.amazon.de/dp/4211855 not available
https://www.amazon.fr/dp/4211855 not available
https://www.amazon.de/dp/4211863 not available
https://www.amazon.fr/dp/4211863 not available
https://www.amazon.de/dp/4212029 not available
https://www.amazon.fr/dp/4212029 not available
https://www.amazon.de/dp/4480007 not available
https://www.amazon.fr/dp/4480007 not available
https://www.amazon.de/dp/4480422 not available
https://www.amazon.fr/dp/4480422 not available
https://www.amazon.fr/dp/4480570 not available
https://www.amazon.it/dp/4480570 not available
https://www.amazon.de/dp/4481127 not available
https://www.a

In [6]:
# save dictionary as json
out_file = open("amazon_data.json", "w")  
json.dump(data, out_file)
out_file.close()

In [7]:
# calculate total and average times
total_time = sum(times)
avg_time = sum(times)/len(times)

In [8]:
print(f"Total time taken: {round(total_time,2)} seconds")
print(f"Average time taken for 100 urls: {round(avg_time,2)} seconds")

Total time taken: 495.48 seconds
Average time taken for 100 urls: 49.55 seconds


In [9]:
# convert dictionary to df
data_df = pd.DataFrame(data)

In [11]:
# login to SQL server and create database
user = 'root'
password = 'master'
dbname = 'amazondb'
engine = sqlalchemy.create_engine(f'mysql://{user}:{password}@localhost')
engine.execute(f"CREATE DATABASE IF NOT EXISTS {dbname}")
engine.execute(f"USE {dbname}") 

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1916489fd08>

In [None]:
# save to database
data_df.to_sql(con=engine, name='amazon_products', if_exists='replace')