In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium_stealth import stealth
from selenium.webdriver.common.by import By
import time
import random
import requests as req
import json
import json
import re



In [8]:
def extract_first_number(quantity_str):
    pattern = re.compile(r'\d+(\.\d+)?')
    match = pattern.search(quantity_str)
    return float(match.group()) if match else None

def extract_unit_of_measure(quantity_str):
    pattern = re.compile(r'[a-zA-Z]+')
    match = pattern.search(quantity_str)
    return match.group() if match else None

def extract_score(company_info_str):
    pattern = re.compile(r'(\d+\.\d+)/5')
    match = pattern.search(company_info_str[0])
    return float(match.group(1)) if match else None

def extract_sales_value(company_info_str):
    pattern = re.compile(r'\$(\d{1,3}(?:,\d{3})*(?:\.\d+)?)')
    match = pattern.search(company_info_str[0])
    if match:
        return float(match.group(1).replace(',', ''))
    else:
        return None

In [4]:
def clean_json(json_data):
    result = [v for v in json_data.values()]
    result = result[1:3]
    for i, j in zip(result[0], result[1]):
        if len(i) == len(j) + 1:
            i.remove(i[0])
        elif len(i) == len(j) + 2:
            i.remove(i[0])
            i.remove(i[-1])

    json_data['prices'] = result[0]

    new_json = {
        'title': [], 'prices': [], 'quantities': [], 'unit_of_measure': [], 
        'product_type': [], 'score': [], 'sales_value': []
    }
    
    for idx, (title, a, b, c, d) in enumerate(zip(json_data['title'], json_data['prices'], json_data['quantities'], json_data['product_type'], json_data['company_info'])):
        if a == 'N/A' or b == 'N/A':
            new_json['title'].append(title)
            new_json['prices'].append(a)
            new_json['quantities'].append(b)
            new_json['unit_of_measure'].append('N/A')
            new_json['product_type'].append(c[10:])
            new_json['score'].append(extract_score(d))
            new_json['sales_value'].append(extract_sales_value(d))
        else:
            for a_value, b_value in zip(a, b):
                new_json['title'].append(title)
                new_json['prices'].append(float(a_value[1:]))
                new_json['quantities'].append(extract_first_number(b_value))
                new_json['unit_of_measure'].append(extract_unit_of_measure(b_value))
                new_json['product_type'].append(c[10:])
                new_json['score'].append(extract_score(d))
                new_json['sales_value'].append(extract_sales_value(d))

    return new_json

In [5]:


def get_random_headers():
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
        
    ]
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "en-US,en;q=0.9",
        "User-Agent": random.choice(user_agents)
    }
    return headers

def create_driver():
    options = Options()
    #options.add_argument("--headless")  # Consider running in non-headless mode for debugging
    options.add_argument("--disable-blink-features=AutomationControlled")
    
    # Add random headers
    headers = get_random_headers()
    for key, value in headers.items():
        options.add_argument(f"{key}={value}")
    
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    return driver

def random_delay(start=1, end=3):
    time.sleep(random.uniform(start, end))
raw_materials_categories = ["https://www.alibaba.com/Fabric-Textile-Raw-Material_p4", "https://www.alibaba.com/Chemicals_p8", 
                            "https://www.alibaba.com/Metals-Alloys_p9", "https://www.alibaba.com/Rubber-Plastics_p80", 
                            "https://www.alibaba.com/Agriculture_p1", "https://www.alibaba.com/Business-Services_p28"]

if __name__ == "__main__":
    driver = create_driver()
    raw_materials = {'Fabrics': [], 'Chemicals': [], 'Metals': [], 'Plastics': [], 'Agriculture': [], 'Business_Services': []}
    for category, material in zip(raw_materials_categories, raw_materials.keys()):
        URL = category
        try:
            driver.get(URL)
            random_delay()
            
            fabrics = driver.find_elements(By.XPATH, '//a[@class="hugo-dotelement leaf-nav-item"]')
            links_fabrics = [element.get_attribute('href') for element in fabrics if element.get_attribute('href')]
            random_delay()

            

        except Exception as e:
            print("Error fetching the URL:", e)
                    
        page = 1
        a = 0

        try:
            group_list = {'sub_categories' : []}
            while a < len(links_fabrics):
                try:
                    driver.get(links_fabrics[a] + f"&page={page}")
                    random_delay()

                except Exception as e:
                    print("Error fetching the URL:", e)
                    break

                try: 
                    product_type = driver.find_element(By.XPATH, '//span[@class="seb-refine-result-tag__label"]').text
                except Exception as e:
                    print("Error getting product type:")
                    product_type = "N/A"

                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                random_delay()
                
                links_elements = driver.find_elements(By.XPATH, '//a[@class="search-card-e-detail-wrapper"]')
                links = [element.get_attribute('href') for element in links_elements if element.get_attribute('href')]
                links = list(set(links))

                if page == 1:
                    final_products = {'title': [], 'prices': [], 'quantities': [], 'product_type': [], 'company_info': []}
                for link in links:
                    driver.get(link)
                    
                    random_delay()
                    try:
                        title_element = driver.find_element(By.XPATH, '//h1')
                        final_products["title"].append(title_element.text if title_element else "N/A")
                    except Exception as e:
                        print("Error getting product title:")

                    try:
                        quantity_element = driver.find_elements(By.XPATH, '//div[@class="quality"]')
                        final_products["quantities"].append([i.text for i in quantity_element] if quantity_element else "N/A")
                        
                    except Exception as e:
                        final_products["quantities"].append("N/A")
                        
                    
                    try:
                        price_element = driver.find_elements(By.XPATH, '//div[@class="price"]')
                        final_products["prices"].append([g.text for g in price_element] if price_element else "N/A")
                    
                    except Exception as e:
                        final_products["prices"].append("N/A")
                    try:
                        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                        company_info = driver.find_elements(By.XPATH, '//div[@class="attr-content"]')
                        final_products["company_info"].append([i.text for i in company_info] if company_info else "N/A")

                        
                    
                    except Exception as e:
                        print("Error getting product rating:")
                        final_products["rating"].append("N/A")
                        
                    final_products["product_type"].append(product_type)
                    
                    #driver.back()
                    random_delay()
                page += 1

                if page == 3:
                    print('daje')
                    group_list["sub_categories"].append(clean_json(final_products))
                    file_path = 'products.json'
                    with open(file_path, 'w') as file:
                        json.dump(group_list, file)
                    a += 1
                    page = 1
        
    
                  
        finally:
            driver.quit()
        print(material, "done")
        raw_materials[material] = group_list
        with open(file_path, 'w') as file:
            json.dump(group_list, file)

    file_path = 'products_stratified.json'
    with open(file_path, 'w') as file:
        json.dump(raw_materials, file)


Error getting product title:
Error getting product rating:


KeyError: 'rating'

In [6]:
final_products

{'title': ['Custom Floral Pattern Satin Chiffon Fabric 97% Polyester 3% Spandex Matte Silk for Dresses-Mixed Genre',
  "Premium Cotton Elastic Jersey Fabric - Ultra-Soft And Highly Flexible - The Perfect Choice For Children's Loungewear",
  'Single Jersey Fabric 95% Cotton %5 Elastan Wholesale Factory Knitted Fabric for lining clothing making Interlock supreme t-shirt',
  'wholesale woven plain style 8 Wale Bubble elasticity Corduroy 96% cotton 4% spandex mix fabric for Pants, dresses, coats',
  'Dress dubai garment monofilament textile soft moss stripe crepe woven viscose 80% rayon 20% nylon mixed fabric for women',
  'Wholesale 175GSM 70% Cotton 26% Nylon 3% Spandex Blend Weft Spandex Stretch Fabric For Pants',
  'COTTON WITH SPANDEX',
  'Wholesale Sales of Clothing Fabric Silk Interwoven Fragrant Gauze Pentagon Flower Fabric',
  'The most popular rayon bamboo cotton plum pattern jacquard satin brocade fabric for clothing',
  'mixed silk cotton jacquard 140 cm 22 mm silk 40% cotton 6

In [9]:
cleaned_df = pd.DataFrame(clean_json(final_products))
cleaned_df

Unnamed: 0,title,prices,quantities,unit_of_measure,product_type,score,sales_value
0,Custom Floral Pattern Satin Chiffon Fabric 97%...,3.5,6.0,meters,Mixed Fabric,,
1,Custom Floral Pattern Satin Chiffon Fabric 97%...,2.9,50.0,meters,Mixed Fabric,,
2,Custom Floral Pattern Satin Chiffon Fabric 97%...,2.5,1000.0,meters,Mixed Fabric,,
3,Custom Floral Pattern Satin Chiffon Fabric 97%...,2.0,5000.0,meters,Mixed Fabric,,
4,Premium Cotton Elastic Jersey Fabric - Ultra-S...,16.2,35.0,meters,Mixed Fabric,,
5,Premium Cotton Elastic Jersey Fabric - Ultra-S...,12.15,70.0,meters,Mixed Fabric,,
6,Premium Cotton Elastic Jersey Fabric - Ultra-S...,8.1,140.0,meters,Mixed Fabric,,
7,Single Jersey Fabric 95% Cotton %5 Elastan Who...,6.52,100.0,kilograms,Mixed Fabric,,
8,Single Jersey Fabric 95% Cotton %5 Elastan Who...,6.45,1000.0,kilograms,Mixed Fabric,,
9,Single Jersey Fabric 95% Cotton %5 Elastan Who...,6.4,15000.0,kilograms,Mixed Fabric,,
