In [15]:
from seleniumwire import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from lxml import etree
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
from pymongo import MongoClient
from datetime import datetime
import json
import os

from functions.getProxy import *
from functions.getUserAgent import *

In [16]:
#proxy = getProxy()
domain = 'shein.com' # For checking if the URL is from the same domain
debug = True # Set to True to limit to 1 page
db_mode = False # True = MongoDB, False = JSON

In [17]:
with open('shein_categories.txt', 'r') as file: # Read URLs from file
    urls = file.readlines()

if db_mode:
    mongo_host = os.environ.get('MONGO_HOST', 'localhost')
    client = MongoClient(f'mongodb://{mongo_host}:27017/')
    db = client['shein']
    collection = db['product_urls']

    # Setup Index
    try:
        collection.create_index('url', unique=True)
    except Exception as e:
        print('Index already exists')

blacklistedWords = [
    'javascript:',
    'mailto:',
    'tel:',
    'facebook.com',
    'twitter.com',
    'instagram.com',
    'youtube.com',
    'pinterest.com',
    'tiktok.com',
    'Copyright',
    'copyright',
    'Privacy',
    'privacy',
    'Terms',
    'terms',
    'Imprint',
    'imprint',
    'bonus',
    'campaign',
    'campaigns',
    'sale',
    'refund',
    'track',
    'How-to',
    'how-to',
    'shein.com/women',
    'shein.com/other',
    'shein.com/Return-Policy',
    'shein.com/men',
    'shein.com/plussize',
    'shein.com/curve-plus-size',
    'promotion',
    'shein.com/home',
    'shein.com/cart',
    'contact',
    'About',
    'SUPPLY-CHAIN-TRANSPARENCY',
    'prime',
    'shein.com/kids',
    'shein.com/beauty',
    'shein.com/flashsale',
    'Shipping-Info',
    'coupon-a',
    '/user/auth/login',
    'daily-new',
    'New-in-Trends',
    'shein.com/style',
    'New-in-Trends',
    'shein.com/member-image-list',
]

In [18]:
# Function to check if any word from a list is included in a string
def included_in_string(string, word_list):
    for word in word_list:
        if word in string:
            return True
    return False


In [19]:
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--user-agent=' + GET_UA())
options.add_argument('--incognito')
options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
chrome_drvier_binary = "C:\\chromedriver-win64\\chromedriver.exe"

driver = webdriver.Chrome(service=Service(chrome_drvier_binary), options=options)

for url in urls:
    url = url.strip()
    print('Processing ' + url)

    driver.get(url)

    try:
        # Use WebDriverWait for better handling
        pagination_text = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'sui-pagination__total'))
        ).text
        pagination_number = re.sub(r"\D", "", pagination_text)  # Fix: Raw string
        max_pages = int(pagination_number)
        if debug:
            max_pages = min(1, max_pages)
        print(f'Found {max_pages} pages')
    except Exception as e:
        print('Error getting pagination: ' + str(e))
        max_pages = 1  # Default to 1 page if pagination fails

    # Initialize array for product URLs
    product_urls = []

    for i in range(1, max_pages + 1):
        try:
            print(f'Processing page {i} of {max_pages}')
            driver.get(url + '?page=' + str(i))

            product_elements = driver.find_elements(By.CLASS_NAME, 'product-list__item')
            for product in product_elements:
                href = product.find_element(By.TAG_NAME, 'a').get_attribute('href')
                if not href or included_in_string(href, blacklistedWords):
                    continue
                parsed_url = urlparse(href)
                cleaned_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
                if domain in parsed_url.netloc and cleaned_url not in product_urls:
                    try:
                        if db_mode:
                            if collection.find_one({'url': cleaned_url}):
                                print('URL already exists in MongoDB')
                                continue
                            print('Adding ' + cleaned_url + ' to MongoDB')
                            collection.insert_one({'url': cleaned_url, 'status': 'pending', 'timestamp': datetime.now()})
                        else:
                            product_urls.append(cleaned_url)
                    except Exception as e:
                        print('Error adding URL to MongoDB: ' + str(e))
                        pass
        except Exception as e:
            print('Error processing page: ' + str(e))
            continue

    if not db_mode:
        print('Writing to JSON file')
        with open('product_urls.json', 'w') as outfile:
            json.dump(product_urls, outfile)

driver.quit()
if db_mode:
    client.close()

Processing https://us.shein.com/Plus-Size-Blouses-c-1891.html
Error getting pagination: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF6C3A06CF5+28821]
	(No symbol) [0x00007FF6C3973880]
	(No symbol) [0x00007FF6C381578A]
	(No symbol) [0x00007FF6C38691BE]
	(No symbol) [0x00007FF6C38694AC]
	(No symbol) [0x00007FF6C38B2647]
	(No symbol) [0x00007FF6C388F33F]
	(No symbol) [0x00007FF6C38AF412]
	(No symbol) [0x00007FF6C388F0A3]
	(No symbol) [0x00007FF6C385A778]
	(No symbol) [0x00007FF6C385B8E1]
	GetHandleVerifier [0x00007FF6C3D3FCED+3408013]
	GetHandleVerifier [0x00007FF6C3D5745F+3504127]
	GetHandleVerifier [0x00007FF6C3D4B63D+3455453]
	GetHandleVerifier [0x00007FF6C3ACBDFB+835995]
	(No symbol) [0x00007FF6C397EB9F]
	(No symbol) [0x00007FF6C397A854]
	(No symbol) [0x00007FF6C397A9ED]
	(No symbol) [0x00007FF6C396A1D9]
	BaseThreadInitThunk [0x00007FFD1AED259D+29]
	RtlUserThreadStart [0x00007FFD1C48AF38+40]

Processing page 1 of 1
Writing to JSON file


In [20]:
# Skip pagination check and directly set max_pages to 1
max_pages = 1
print(f'Processing the first page only (max_pages set to {max_pages})')

# Initialize array for product URLs
product_urls = []

for i in range(1, max_pages + 1):
    try:
        print(f'Processing page {i} of {max_pages}')
        driver.get(url + f'?page={i}')

        product_elements = driver.find_elements(By.CLASS_NAME, 'product-list__item')
        for product in product_elements:
            href = product.find_element(By.TAG_NAME, 'a').get_attribute('href')
            if not href or included_in_string(href, blacklistedWords):
                continue
            parsed_url = urlparse(href)
            cleaned_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
            if domain in parsed_url.netloc and cleaned_url not in product_urls:
                try:
                    if db_mode:
                        if collection.find_one({'url': cleaned_url}):
                            print('URL already exists in MongoDB')
                            continue
                        print('Adding ' + cleaned_url + ' to MongoDB')
                        collection.insert_one({'url': cleaned_url, 'status': 'pending', 'timestamp': datetime.now()})
                    else:
                        product_urls.append(cleaned_url)
                except Exception as e:
                    print('Error adding URL to MongoDB: ' + str(e))
                    pass
    except Exception as e:
        print('Error processing page: ' + str(e))
        continue

if not db_mode:
    print('Writing to JSON file')
    with open('product_urls.json', 'w') as outfile:
        json.dump(product_urls, outfile)

Processing the first page only (max_pages set to 1)
Processing page 1 of 1
Error processing page: HTTPConnectionPool(host='localhost', port=53256): Max retries exceeded with url: /session/40a0337a7d1afa82df76ac653cee1502/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000218FF4A11C0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))
Writing to JSON file
