In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium_stealth import stealth
from selenium.webdriver.common.by import By
import time
import random
import json
import re


In [2]:
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

def transform_link(old_link):
    parsed_url = urlparse(old_link)
    query_params = parse_qs(parsed_url.query)
    
    # Extract necessary parameters from the original link
    product_id_value = query_params.get('productId', [''])[0]
    category_id_value = query_params.get('categoryId', [''])[0]
    keywords_value = query_params.get('SearchText', [''])[0]
    
    # Construct new query string
    new_query_params = {
        'assessmentCompany': 'true',
        'categoryId': category_id_value,
        'keywords': keywords_value,
        'productId': product_id_value,
        'spm': 'a2700.galleryofferlist.leftFilter.d_filter'
    }
    new_query_string = urlencode(new_query_params, doseq=True)
    
    # Construct new URL
    new_parsed_url = parsed_url._replace(query=new_query_string)
    new_link = urlunparse(new_parsed_url)
    
    return new_link

In [3]:
def extract_first_number(quantity_str):
    pattern = re.compile(r'\d+(\.\d+)?')
    match = pattern.search(quantity_str)
    return float(match.group()) if match else None

def extract_unit_of_measure(quantity_str):
    pattern = re.compile(r'[a-zA-Z]+')
    match = pattern.search(quantity_str)
    return match.group() if match else None

def extract_score(company_info_str):
    pattern = re.compile(r'(\d+\.\d+)/5')
    match = pattern.search(company_info_str[0])
    return float(match.group(1)) if match else None


def extract_total_annual_revenues(company_info_list):
    pattern = re.compile(r'\$([\d,]+)')
    for info_str in company_info_list:
        match = pattern.search(info_str)
        if match:
            return float(match.group(1).replace(',', ''))
    return None



In [4]:
def clean_json(json_data):
    result = [v for v in json_data.values()]
    result = result[1:3]
    for i, j in zip(result[0], result[1]):
        if len(i) == len(j) + 1:
            i.remove(i[0])
        elif len(i) == len(j) + 2:
            i.remove(i[0])
            i.remove(i[-1])

    json_data['prices'] = result[0]

    new_json = {
        'ID': [], 'title': [], 'product_type': [], 'macro_category': [], 'prices': [], 
        'quantities': [], 'unit_of_measure': [], 
         'score': [], 'total_annual_revenues': [], 
        
    }
    
    for idx, (title, a, b, c, d, e) in enumerate(zip(json_data['title'], json_data['prices'], json_data['quantities'], json_data['product_type'], json_data['company_info'], json_data['macro_category'])):
        if a == 'N/A' or b == 'N/A':
            new_json['macro_category'].append(e)
            new_json['ID'].append(hash(title))
            new_json['title'].append(title)
            new_json['prices'].append(a)
            new_json['quantities'].append(b)
            new_json['unit_of_measure'].append('N/A')
            new_json['product_type'].append(c[10:])
            new_json['score'].append(extract_score(d))
            new_json['total_annual_revenues'].append(extract_total_annual_revenues(d))
            
        else:
            for a_value, b_value in zip(a, b):
                new_json['macro_category'].append(e)
                new_json['ID'].append(hash(title))
                new_json['title'].append(title)
                try:
                    new_json['prices'].append(float(a_value[1:]))
                except Exception as e:
                    new_json['prices'].append('N/A')
                new_json['quantities'].append(extract_first_number(b_value))
                new_json['unit_of_measure'].append(extract_unit_of_measure(b_value))
                new_json['product_type'].append(c[10:])
                new_json['score'].append(extract_score(d))
                new_json['total_annual_revenues'].append(extract_total_annual_revenues(d))
                

    return new_json


In [6]:
def get_random_headers():
    user_agents = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"
    ]
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "en-US,en;q=0.9",
        "User-Agent": random.choice(user_agents)
    }
    return headers

def create_driver():
    options = Options()
    #options.add_argument("--headless")  # Consider running in non-headless mode for debugging
    options.add_argument("--disable-blink-features=AutomationControlled")
    
    # Add random headers
    headers = get_random_headers()
    for key, value in headers.items():
        options.add_argument(f"{key}={value}")
    
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    return driver

def random_delay(start=1, end=3):
    time.sleep(random.uniform(start, end))

def fetch_url_with_retries(driver, url, retries=5, delay=10):
    attempt = 0
    while attempt < retries:
        try:
            driver.get(url)
            return
        except Exception as e:
            print(f"Error fetching the URL: {e}")
            attempt += 1
            if attempt < retries:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print("Max retries reached. Moving to the next URL.")
                raise e

raw_materials_categories = ["https://www.alibaba.com/Metals-Alloys_p9", "https://www.alibaba.com/Rubber-Plastics_p80", 
                            "https://www.alibaba.com/Agriculture_p1", "https://www.alibaba.com/Business-Services_p28"]

if __name__ == "__main__":
    driver = create_driver()
    raw_materials = {'Metals': [], 'Plastics': [], 'Agriculture': [], 'Business_Services': []}
    for category, material in zip(raw_materials_categories, raw_materials.keys()):
        URL = category
        try:
            fetch_url_with_retries(driver, URL)
            random_delay()
            
            fabrics = driver.find_elements(By.XPATH, '//a[@class="hugo-dotelement leaf-nav-item"]')
            links_fabrics = [element.get_attribute('href') for element in fabrics if element.get_attribute('href')]
            random_delay()

        except Exception as e:
            print("Error fetching the initial URL:", e)
            continue  
        
        page = 1
        a = 0

        try:
            group_list = {'sub_categories' : []}
            while a < len(links_fabrics):
                try:
                    fetch_url_with_retries(driver, transform_link(links_fabrics[a]) + f"&page={page}")
                    random_delay()

                except Exception as e:
                    print("No more pages or error fetching the URL:", e)
                    group_list["sub_categories"].append(final_products)
                    a += 1
                    page = 1
                    continue

                try: 
                    product_type = driver.find_element(By.XPATH, '//span[@class="seb-refine-result-tag__label"]').text
                except Exception as e:
                    print("Error getting product type:", e)
                    product_type = "N/A"

                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                random_delay()
                
                links_elements = driver.find_elements(By.XPATH, '//a[@class="search-card-e-detail-wrapper"]')
                links = [element.get_attribute('href') for element in links_elements if element.get_attribute('href')]
                links = list(set(links))

                if page == 1:
                    final_products = {'title': [], 'prices': [], 'quantities': [], 'product_type': [], 'company_info': [], 'macro_category': []}
                for link in links:
                    final_products["macro_category"].append(material)
                    fetch_url_with_retries(driver, link)
                    
                    random_delay()

                    try:
                        title_element = driver.find_element(By.XPATH, '//h1')
                        final_products["title"].append(title_element.text if title_element else "N/A")
                    except Exception as e:
                        print("Error getting product title:", e)
                        final_products["title"].append("N/A")

                    try:
                        quantity_element = driver.find_elements(By.XPATH, '//div[@class="quality"]')
                        final_products["quantities"].append([i.text for i in quantity_element] if quantity_element else "N/A")
                    except Exception as e:
                        print("Error getting product quantities:", e)
                        final_products["quantities"].append("N/A")
                        
                    try:
                        price_element = driver.find_elements(By.XPATH, '//div[@class="price"]')
                        final_products["prices"].append([g.text for g in price_element] if price_element else "N/A")
                    except Exception as e:
                        print("Error getting product prices:", e)
                        final_products["prices"].append("N/A")

                    try:
                        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                        company_info = driver.find_elements(By.XPATH, '//div[@class="attr-content"]')
                        final_products["company_info"].append([i.text for i in company_info] if company_info else "N/A")
                    except Exception as e:
                        print("Error getting company info:", e)
                        final_products["company_info"].append("N/A")
                        
                    final_products["product_type"].append(product_type)
                    random_delay()

                if page >= 5:
                    group_list["sub_categories"].append(final_products)
                    a += 1
                    page = 1
                else:
                    page += 1
        finally:
            driver.quit()
        print(material, "done")
        raw_materials[material] = group_list
        with open('products.json', 'w') as file:
            json.dump(group_list, file)

    file_path = 'products_stratified.json'
    with open(file_path, 'w') as file:
        json.dump(raw_materials, file)


Error getting product type: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=125.0.6422.142)
Stacktrace:
	GetHandleVerifier [0x00C7B8E3+45827]
	(No symbol) [0x00C0DCC4]
	(No symbol) [0x00B0150F]
	(No symbol) [0x00ADE133]
	(No symbol) [0x00B6949F]
	(No symbol) [0x00B7B8E6]
	(No symbol) [0x00B62B96]
	(No symbol) [0x00B36998]
	(No symbol) [0x00B3751D]
	GetHandleVerifier [0x00F34513+2899763]
	GetHandleVerifier [0x00F8793D+3240797]
	GetHandleVerifier [0x00D013B4+593364]
	GetHandleVerifier [0x00D082DC+621820]
	(No symbol) [0x00C170A4]
	(No symbol) [0x00C137A8]
	(No symbol) [0x00C13947]
	(No symbol) [0x00C059FE]
	BaseThreadInitThunk [0x75C87BA9+25]
	RtlInitializeExceptionChain [0x76EEBE3B+107]
	RtlClearBits [0x76EEBDBF+191]



NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=125.0.6422.142)
Stacktrace:
	GetHandleVerifier [0x00C7B8E3+45827]
	(No symbol) [0x00C0DCC4]
	(No symbol) [0x00B0150F]
	(No symbol) [0x00ADE133]
	(No symbol) [0x00B6949F]
	(No symbol) [0x00B7B8E6]
	(No symbol) [0x00B62B96]
	(No symbol) [0x00B36998]
	(No symbol) [0x00B3751D]
	GetHandleVerifier [0x00F34513+2899763]
	GetHandleVerifier [0x00F8793D+3240797]
	GetHandleVerifier [0x00D013B4+593364]
	GetHandleVerifier [0x00D082DC+621820]
	(No symbol) [0x00C170A4]
	(No symbol) [0x00C137A8]
	(No symbol) [0x00C13947]
	(No symbol) [0x00C059FE]
	BaseThreadInitThunk [0x75C87BA9+25]
	RtlInitializeExceptionChain [0x76EEBE3B+107]
	RtlClearBits [0x76EEBDBF+191]


In [12]:
final_products

{'title': ['Factory direct artificial long hair pile faux fur fabric',
  'new design fashion dalmatian print faux fur fabric china faux rabbit fur Leopard animal printed fabric',
  'High quality new goods with various fur fabrics acrylic faux fur fabrics for clothing'],
 'prices': [['$5.00/meter', '$7.50', '$6.90', '$6.70', ''],
  ['$10.00/meter', '$1.86', '$1.75', ''],
  ['$3.50', '$3.40', '$3.38'],
  'N/A'],
 'quantities': [['300 - 999 meters', '1000 - 2999 meters', '>= 3000 meters'],
  ['1000 - 4999 meters', '>= 5000 meters'],
  ['500 - 2999 meters', '3000 - 11999 meters', '>= 12000 meters'],
  'N/A'],
 'product_type': ['Category: Faux Fur',
  'Category: Faux Fur',
  'Category: Faux Fur',
  'Category: Faux Fur'],
 'company_info': [['4.8/5',
   '100.0%',
   '≤2h',
   '$170,000+',
   'Southeast Asia,Domestic Market,Eastern Asia',
   '220m²',
   'Design-based customization',
   'Sample-based customization',
   'Raw-material traceability identification',
   'Finished product inspection'

In [79]:
transform_link(links_fabrics[0])

'https://www.alibaba.com/trade/search?assessmentCompany=true&categoryId=190000216&keywords=Mixed+Fabric&productId=10000014167447&spm=a2700.galleryofferlist.leftFilter.d_filter'

In [13]:
cleaned_df = pd.DataFrame(clean_json(final_products))
cleaned_df

Unnamed: 0,ID,title,product_type,macro_category,prices,quantities,unit_of_measure,score,total_annual_revenues
0,-1962552057033446486,Factory direct artificial long hair pile faux ...,Faux Fur,Fabrics,7.5,300.0,meters,4.8,170000.0
1,-1962552057033446486,Factory direct artificial long hair pile faux ...,Faux Fur,Fabrics,6.9,1000.0,meters,4.8,170000.0
2,-1962552057033446486,Factory direct artificial long hair pile faux ...,Faux Fur,Fabrics,6.7,3000.0,meters,4.8,170000.0
3,564609232169099207,new design fashion dalmatian print faux fur fa...,Faux Fur,Fabrics,1.86,1000.0,meters,,7804957.0
4,564609232169099207,new design fashion dalmatian print faux fur fa...,Faux Fur,Fabrics,1.75,5000.0,meters,,7804957.0
5,-5164580511461344495,High quality new goods with various fur fabric...,Faux Fur,Fabrics,3.5,500.0,meters,5.0,1000.0
6,-5164580511461344495,High quality new goods with various fur fabric...,Faux Fur,Fabrics,3.4,3000.0,meters,5.0,1000.0
7,-5164580511461344495,High quality new goods with various fur fabric...,Faux Fur,Fabrics,3.38,12000.0,meters,5.0,1000.0


In [8]:
with open('products_stratified.json', 'r') as file:
    json_fabrics = json.load(file)


In [11]:
import pandas as pd
import json

with open('products_stratified.json', 'r') as file:
    json_fabrics = json.load(file)


dfs = []


for sub_category in json_fabrics['Fabrics']['sub_categories']:
    dfs.append(pd.DataFrame(sub_category))


final_fabrics = pd.concat(dfs, ignore_index=True)


final_fabrics.reset_index(drop=True, inplace=True)







  final_fabrics = pd.concat(dfs, ignore_index=True)


In [12]:
final_fabrics

Unnamed: 0,title,prices,quantities,unit_of_measure,product_type,score,sales_value
0,100% Polyester T800 stretch nylon spandex micr...,0.35,20.0,pieces,100% Polyester Fabric,,
1,100% Polyester T800 stretch nylon spandex micr...,0.31,1000.0,pieces,100% Polyester Fabric,,
2,100% Polyester T800 stretch nylon spandex micr...,0.25,10000.0,pieces,100% Polyester Fabric,,
3,Hot sale 100% polyester colourful net tulle me...,0.38,1.0,meters,100% Polyester Fabric,4.8,
4,Hot sale 100% polyester colourful net tulle me...,0.36,3000.0,meters,100% Polyester Fabric,4.8,
...,...,...,...,...,...,...,...
2489,wholesale pure linen fabric In stock high qual...,5.6,500.0,meters,100% Linen Fabric,,
2490,wholesale pure linen fabric In stock high qual...,4.8,3000.0,meters,100% Linen Fabric,,
2491,soft Enzyme wash plain yarn dyed stone washed ...,5.85,1500.0,meters,100% Linen Fabric,,
2492,soft Enzyme wash plain yarn dyed stone washed ...,5.58,3000.0,meters,100% Linen Fabric,,


In [13]:
fabrics = final_fabrics.to_csv('final_fabrics.csv')

In [4]:
fabrics = pd.read_csv('final_fabrics.csv')

In [6]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [7]:
fabrics

Unnamed: 0.1,Unnamed: 0,title,prices,quantities,unit_of_measure,product_type,score,sales_value
0,0,100% Polyester T800 stretch nylon spandex micr...,0.35,20.0,pieces,100% Polyester Fabric,,
1,1,100% Polyester T800 stretch nylon spandex micr...,0.31,1000.0,pieces,100% Polyester Fabric,,
2,2,100% Polyester T800 stretch nylon spandex micr...,0.25,10000.0,pieces,100% Polyester Fabric,,
3,3,Hot sale 100% polyester colourful net tulle me...,0.38,1.0,meters,100% Polyester Fabric,4.8,
4,4,Hot sale 100% polyester colourful net tulle me...,0.36,3000.0,meters,100% Polyester Fabric,4.8,
5,5,Hot sale 100% polyester colourful net tulle me...,0.35,5000.0,meters,100% Polyester Fabric,4.8,
6,6,Hot sale 100% polyester colourful net tulle me...,0.34,10000.0,meters,100% Polyester Fabric,4.8,
7,7,Factory Custom 100% Polyester Microfiber Peach...,0.55,3000.0,meters,100% Polyester Fabric,,
8,8,Factory Custom 100% Polyester Microfiber Peach...,0.53,5000.0,meters,100% Polyester Fabric,,
9,9,Factory Custom 100% Polyester Microfiber Peach...,0.5,10000.0,meters,100% Polyester Fabric,,
