In [6]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium_stealth import stealth
from selenium.webdriver.common.by import By
import time
import random
import requests as req
import json
import json
import re



In [2]:
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

def transform_link(old_link):
    parsed_url = urlparse(old_link)
    query_params = parse_qs(parsed_url.query)
    
    # Extract necessary parameters from the original link
    product_id_value = query_params.get('productId', [''])[0]
    category_id_value = query_params.get('categoryId', [''])[0]
    keywords_value = query_params.get('SearchText', [''])[0]
    
    # Construct new query string
    new_query_params = {
        'assessmentCompany': 'true',
        'categoryId': category_id_value,
        'keywords': keywords_value,
        'productId': product_id_value,
        'spm': 'a2700.galleryofferlist.leftFilter.d_filter'
    }
    new_query_string = urlencode(new_query_params, doseq=True)
    
    # Construct new URL
    new_parsed_url = parsed_url._replace(query=new_query_string)
    new_link = urlunparse(new_parsed_url)
    
    return new_link

In [3]:
def extract_first_number(quantity_str):
    pattern = re.compile(r'\d+(\.\d+)?')
    match = pattern.search(quantity_str)
    return float(match.group()) if match else None

def extract_unit_of_measure(quantity_str):
    pattern = re.compile(r'[a-zA-Z]+')
    match = pattern.search(quantity_str)
    return match.group() if match else None

def extract_score(company_info_str):
    pattern = re.compile(r'(\d+\.\d+)/5')
    match = pattern.search(company_info_str[0])
    return float(match.group(1)) if match else None


def extract_total_annual_revenues(company_info_list):
    pattern = re.compile(r'\$([\d,]+)')
    for info_str in company_info_list:
        match = pattern.search(info_str)
        if match:
            return float(match.group(1).replace(',', ''))
    return None



In [4]:
def clean_json(json_data):
    result = [v for v in json_data.values()]
    result = result[1:3]
    for i, j in zip(result[0], result[1]):
        if len(i) == len(j) + 1:
            i.remove(i[0])
        elif len(i) == len(j) + 2:
            i.remove(i[0])
            i.remove(i[-1])

    json_data['prices'] = result[0]

    new_json = {
        'ID': [], 'title': [], 'product_type': [], 'macro_category': [], 'prices': [], 
        'quantities': [], 'unit_of_measure': [], 
         'score': [], 'total_annual_revenues': [], 
        
    }
    
    for idx, (title, a, b, c, d, e) in enumerate(zip(json_data['title'], json_data['prices'], json_data['quantities'], json_data['product_type'], json_data['company_info'], json_data['macro_category'])):
        if a == 'N/A' or b == 'N/A':
            try:
                new_json['macro_category'].append(e)
            except Exception as e:
                new_json['macro_category'].append('N/A')
            new_json['ID'].append(hash(title))
            new_json['title'].append(title)
            new_json['prices'].append(a)
            new_json['quantities'].append(b)
            new_json['unit_of_measure'].append('N/A')
            new_json['product_type'].append(c[10:])
            new_json['score'].append(extract_score(d))
            new_json['total_annual_revenues'].append(extract_total_annual_revenues(d))
            
        else:
            for a_value, b_value in zip(a, b):
                try:
                    new_json['macro_category'].append(e)
                except Exception as e:
                    new_json['macro_category'].append('N/A')
                new_json['ID'].append(hash(title))
                new_json['title'].append(title)
                try:
                    new_json['prices'].append(float(a_value[1:]))
                except Exception as e:
                    new_json['prices'].append('N/A')
                new_json['quantities'].append(extract_first_number(b_value))
                new_json['unit_of_measure'].append(extract_unit_of_measure(b_value))
                new_json['product_type'].append(c[10:])
                new_json['score'].append(extract_score(d))
                new_json['total_annual_revenues'].append(extract_total_annual_revenues(d))
                

    return new_json


In [5]:
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import json

def get_random_headers():
    user_agents = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"
    ]
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "en-US,en;q=0.9",
        "User-Agent": random.choice(user_agents)
    }
    return headers

def create_driver():
    options = Options()
    options.add_argument("--headless")  # Consider running in non-headless mode for debugging
    options.add_argument("--disable-blink-features=AutomationControlled")
    
    # Add random headers
    headers = get_random_headers()
    for key, value in headers.items():
        options.add_argument(f"{key}={value}")
    
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    return driver

def random_delay(start=1, end=3):
    time.sleep(random.uniform(start, end))

def fetch_url_with_retries(driver, url, retries=5, delay=10):
    attempt = 0
    while attempt < retries:
        try:
            driver.get(url)
            return
        except Exception as e:
            print(f"Error fetching the URL: {e}")
            attempt += 1
            if attempt < retries:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print("Max retries reached. Moving to the next URL.")
                raise e

raw_materials_categories = [ "https://www.alibaba.com/Business-Services_p28"]
#"https://www.alibaba.com/Fabric-Textile-Raw-Material_p4"
#, "https://www.alibaba.com/Chemicals_p8", "https://www.alibaba.com/Metals-Alloys_p9", 
# "https://www.alibaba.com/Rubber-Plastics_p80", "https://www.alibaba.com/Agriculture_p1", 
# "https://www.alibaba.com/Business-Services_p28"

if __name__ == "__main__":
    driver = create_driver()
    raw_materials = {'Plastics': [], 'Agriculture': [], 'Business_Services': []}
    for category, material in zip(raw_materials_categories, raw_materials.keys()):
        URL = category
        try:
            fetch_url_with_retries(driver, URL)
            random_delay()
            
            fabrics = driver.find_elements(By.XPATH, '//a[@class="hugo-dotelement leaf-nav-item"]')
            links_fabrics = [element.get_attribute('href') for element in fabrics if element.get_attribute('href')]
            random_delay()

        except Exception as e:
            print("Error fetching the initial URL:", e)
            continue  
        
        page = 1
        a = 0

        try:
            group_list = {'sub_categories' : []}
            while a < len(links_fabrics):
                try:
                    fetch_url_with_retries(driver, transform_link(links_fabrics[a]) + f"&page={page}")
                    random_delay()
                    print(a, page)

                except Exception as e:
                    print("No more pages or error fetching the URL:", e)
                    group_list["sub_categories"].append(final_products)
                    a += 1
                    page = 1
                    continue

                try: 
                    product_type = driver.find_element(By.XPATH, '//span[@class="seb-refine-result-tag__label"]').text
                except Exception as e:
                    print("Error getting product type:", e)
                    product_type = "N/A"

                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                random_delay()
                
                links_elements = driver.find_elements(By.XPATH, '//a[@class="search-card-e-detail-wrapper"]')
                links = [element.get_attribute('href') for element in links_elements if element.get_attribute('href')]
                links = list(set(links))

                if page == 1:
                    final_products = {'title': [], 'prices': [], 'quantities': [], 'product_type': [], 'company_info': [], 'macro_category': []}
                for link in links:
                    final_products["macro_category"].append(material)
                    fetch_url_with_retries(driver, link)
                    
                    random_delay()

                    try:
                        title_element = driver.find_element(By.XPATH, '//h1')
                        final_products["title"].append(title_element.text if title_element else "N/A")
                    except Exception as e:
                        print("Error getting product title:", e)
                        final_products["title"].append("N/A")

                    try:
                        quantity_element = driver.find_elements(By.XPATH, '//div[@class="quality"]')
                        final_products["quantities"].append([i.text for i in quantity_element] if quantity_element else "N/A")
                    except Exception as e:
                        print("Error getting product quantities:", e)
                        final_products["quantities"].append("N/A")
                        
                    try:
                        price_element = driver.find_elements(By.XPATH, '//div[@class="price"]')
                        final_products["prices"].append([g.text for g in price_element] if price_element else "N/A")
                    except Exception as e:
                        print("Error getting product prices:", e)
                        final_products["prices"].append("N/A")

                    try:
                        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                        company_info = driver.find_elements(By.XPATH, '//div[@class="attr-content"]')
                        final_products["company_info"].append([i.text for i in company_info] if company_info else "N/A")
                    except Exception as e:
                        print("Error getting company info:", e)
                        final_products["company_info"].append("N/A")
                        
                    final_products["product_type"].append(product_type)
                    random_delay()

                if page >= 5:
                    group_list["sub_categories"].append(final_products)
                    a += 1
                    page = 1
                else:
                    page += 1
        finally:
            print(material, "done")
        raw_materials[material] = group_list
        with open('products.json', 'w') as file:
            json.dump(group_list, file)

    file_path = 'products_stratified.json'
    with open(file_path, 'w') as file:
        json.dump(raw_materials, file)


0 1
0 2
0 3
0 4
0 5
1 1
1 2
1 3
1 4
1 5
2 1
2 2
2 3
2 4
2 5
3 1
3 2
3 3
3 4
3 5
4 1
4 2
4 3
4 4
4 5
5 1
5 2
5 3
5 4
5 5
6 1
6 2
6 3
6 4
6 5
7 1
7 2
7 3
7 4
7 5
8 1
8 2
8 3
8 4
8 5
9 1
9 2
9 3
9 4
9 5
Plastics done
0 1
0 2
0 3
0 4
0 5
1 1
1 2
1 3
1 4
1 5
2 1
2 2
2 3
2 4
2 5
3 1
3 2
3 3
3 4
3 5
4 1
4 2
4 3
4 4
4 5
5 1
5 2
5 3
5 4
5 5
6 1
Error getting product title: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//h1"}
  (Session info: chrome-headless-shell=125.0.6422.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x00000001033864c8 chromedriver + 4302024
1   chromedriver                        0x000000010337ee10 chromedriver + 4271632
2   chromedriver                        0x0000000102fb019c chromedriver + 278940
3   chromedriver                        0x0000000102ff22c4 chromedriver + 549572
4   c

In [12]:
final_products

{'title': ['Factory direct artificial long hair pile faux fur fabric',
  'new design fashion dalmatian print faux fur fabric china faux rabbit fur Leopard animal printed fabric',
  'High quality new goods with various fur fabrics acrylic faux fur fabrics for clothing'],
 'prices': [['$5.00/meter', '$7.50', '$6.90', '$6.70', ''],
  ['$10.00/meter', '$1.86', '$1.75', ''],
  ['$3.50', '$3.40', '$3.38'],
  'N/A'],
 'quantities': [['300 - 999 meters', '1000 - 2999 meters', '>= 3000 meters'],
  ['1000 - 4999 meters', '>= 5000 meters'],
  ['500 - 2999 meters', '3000 - 11999 meters', '>= 12000 meters'],
  'N/A'],
 'product_type': ['Category: Faux Fur',
  'Category: Faux Fur',
  'Category: Faux Fur',
  'Category: Faux Fur'],
 'company_info': [['4.8/5',
   '100.0%',
   '≤2h',
   '$170,000+',
   'Southeast Asia,Domestic Market,Eastern Asia',
   '220m²',
   'Design-based customization',
   'Sample-based customization',
   'Raw-material traceability identification',
   'Finished product inspection'

In [8]:
with open('products_stratified.json', 'r') as file:
    json_fabrics = json.load(file)


In [7]:
import pandas as pd
import json

with open('products_stratified.json', 'r') as file:
    json_fabrics = json.load(file)

fabrics_chemicals = pd.read_csv('final_fabrics_chemicals_metals.csv')
dfs = []
dfs.append(fabrics_chemicals)

for sub_category in json_fabrics['Plastics']['sub_categories']:
    dfs.append(pd.DataFrame(clean_json(sub_category)))

for sub_category in json_fabrics['Agriculture']['sub_categories']:
    
    dfs.append(pd.DataFrame(clean_json(sub_category)))
final_fabrics = pd.concat(dfs, ignore_index=True)


final_fabrics.reset_index(drop=True, inplace=True)







In [8]:
final_fabrics

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ID,title,product_type,macro_category,prices,quantities,unit_of_measure,score,total_annual_revenues
0,0.0,0.0,-3.943321e+18,SHYZ Dye Pearlescent Color Pigment Soap Making...,Pigment,Chemicals,1.5,100.0,pieces,5.0,10000.0
1,1.0,1.0,-3.943321e+18,SHYZ Dye Pearlescent Color Pigment Soap Making...,Pigment,Chemicals,1.35,1000.0,pieces,5.0,10000.0
2,2.0,2.0,-3.943321e+18,SHYZ Dye Pearlescent Color Pigment Soap Making...,Pigment,Chemicals,1.2,2000.0,pieces,5.0,10000.0
3,3.0,3.0,3.997857e+18,Food Grade Pigment Edible Luster Dust Gold Gli...,Pigment,Chemicals,39.99,1.0,kilograms,4.9,70000.0
4,4.0,4.0,3.997857e+18,Food Grade Pigment Edible Luster Dust Gold Gli...,Pigment,Chemicals,27.98,25.0,kilograms,4.9,70000.0
...,...,...,...,...,...,...,...,...,...,...,...
29558,,,4.931048e+18,Mushroom growing raw material wood chip sawdus...,Agricultural Waste,Agriculture,80.0,1.0,tons,4.4,100000.0
29559,,,7.491994e+18,Factory Hot Sale Animal Feeding Corn Cob Cornc...,Agricultural Waste,Agriculture,332.4,1.0,tons,4.9,20000.0
29560,,,7.491994e+18,Factory Hot Sale Animal Feeding Corn Cob Cornc...,Agricultural Waste,Agriculture,316.6,11.0,tons,4.9,20000.0
29561,,,7.491994e+18,Factory Hot Sale Animal Feeding Corn Cob Cornc...,Agricultural Waste,Agriculture,332.4,1.0,tons,4.9,20000.0


In [9]:
fabrics_chemicals_metals_agriculture_plastics = final_fabrics.to_csv('fabrics_chemicals_metals_agriculture_plastics.csv')