In [6]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium_stealth import stealth
from selenium.webdriver.common.by import By
import time
import random
import requests as req
import json
import json
import re



In [2]:
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

def transform_link(old_link):
    """Rebuild a listing URL with a fixed, filtered query string.

    The scheme, host, path and fragment of ``old_link`` are kept. The query
    string is replaced by one that carries over ``productId``, ``categoryId``
    and ``SearchText`` (renamed to ``keywords``) and adds the constant
    ``assessmentCompany`` and ``spm`` parameters. A parameter missing from
    ``old_link`` is emitted with an empty value; any other parameter of the
    original link is dropped.

    Args:
        old_link: URL string to transform.

    Returns:
        The transformed URL string.
    """
    parts = urlparse(old_link)
    original_params = parse_qs(parts.query)

    def first_value(name):
        # parse_qs maps every name to a list of values; only the first is used.
        return original_params.get(name, [''])[0]

    # Pairs are listed in the order they must appear in the new query string.
    rebuilt_query = urlencode(
        [
            ('assessmentCompany', 'true'),
            ('categoryId', first_value('categoryId')),
            ('keywords', first_value('SearchText')),
            ('productId', first_value('productId')),
            ('spm', 'a2700.galleryofferlist.leftFilter.d_filter'),
        ],
        doseq=True,
    )
    return urlunparse(parts._replace(query=rebuilt_query))

In [3]:
def extract_first_number(quantity_str):
    """Return the first integer or decimal in ``quantity_str`` as a float.

    Returns ``None`` when the string contains no digit. Thousands separators
    are not understood: the match stops at a comma, so ``'1,000'`` gives 1.0.
    """
    found = re.search(r'\d+(\.\d+)?', quantity_str)
    if found is None:
        return None
    return float(found.group())

def extract_unit_of_measure(quantity_str):
    """Return the first run of ASCII letters in ``quantity_str``.

    Returns ``None`` when the string contains no ASCII letter.
    """
    found = re.search(r'[a-zA-Z]+', quantity_str)
    if found is None:
        return None
    return found.group()

def extract_score(company_info_str):
    """Return the rating written as ``X.Y/5`` in the first company-info entry.

    Only the first element of ``company_info_str`` is searched; later entries
    are ignored. The rating must contain a decimal point (``4.9/5``); a bare
    ``5/5`` does not match.

    Args:
        company_info_str: Sequence whose first item is the text to search.
            Despite the name, ``clean_json`` passes each product's whole
            ``company_info`` value here.

    Returns:
        The rating as a float, or ``None`` when the input is empty/missing or
        its first entry carries no ``X.Y/5`` rating.
    """
    # An empty or missing company-info value has no first entry to inspect;
    # treat it like "no rating found" instead of raising on the [0] lookup.
    if not company_info_str:
        return None
    found = re.search(r'(\d+\.\d+)/5', company_info_str[0])
    return float(found.group(1)) if found else None


def extract_total_annual_revenues(company_info_list):
    """Return the first ``$``-prefixed amount found in the entries, as a float.

    Entries are scanned in order and the scan stops at the first one that
    contains ``$`` followed by digits/commas; commas are stripped before the
    conversion. Returns ``None`` when no entry contains such an amount.
    """
    # The generator is lazy, so entries after the first hit are never searched.
    hits = (re.search(r'\$([\d,]+)', entry) for entry in company_info_list)
    first_hit = next((hit for hit in hits if hit), None)
    if first_hit is None:
        return None
    return float(first_hit.group(1).replace(',', ''))



In [4]:
def _append_product_row(columns, title, macro_category, price, quantity,
                        unit, product_type, company_info):
    """Append one output row to the column lists held in ``columns``."""
    columns['macro_category'].append(macro_category)
    # NOTE(review): hash() of a str is salted per interpreter session unless
    # PYTHONHASHSEED is fixed, so these IDs are only comparable within a run.
    columns['ID'].append(hash(title))
    columns['title'].append(title)
    columns['prices'].append(price)
    columns['quantities'].append(quantity)
    columns['unit_of_measure'].append(unit)
    # The first 10 characters of the raw product type are discarded
    # (presumably a fixed label prefix in the scraped text -- TODO confirm).
    columns['product_type'].append(product_type[10:])
    columns['score'].append(extract_score(company_info))
    columns['total_annual_revenues'].append(
        extract_total_annual_revenues(company_info)
    )


def clean_json(json_data):
    """Flatten one scraped sub-category into equal-length column lists.

    The second and third values of ``json_data`` (in insertion order) are
    taken as the per-product price tiers and quantity tiers. When a price
    list is one element longer than its quantity list, its first element is
    dropped; when it is two longer, its first and last elements are dropped.

    Each product then yields either a single row (when its prices or
    quantities are the string ``'N/A'``, which are copied through as-is with
    unit ``'N/A'``) or one row per (price, quantity) tier pair. Tier prices
    are converted to float after removing their first character and fall back
    to ``'N/A'`` when that conversion fails.

    Side effects (kept for backward compatibility): the price lists inside
    ``json_data`` are trimmed in place and ``json_data['prices']`` is rebound
    to the trimmed collection.

    Args:
        json_data: dict with at least the keys ``title``, ``quantities``,
            ``product_type``, ``company_info`` and ``macro_category``, each
            holding one item per product.

    Returns:
        dict mapping ``ID``, ``title``, ``product_type``, ``macro_category``,
        ``prices``, ``quantities``, ``unit_of_measure``, ``score`` and
        ``total_annual_revenues`` to lists of equal length.
    """
    # Positional lookup is intentional: it mirrors how the scraped dict is
    # laid out (assumes values 2 and 3 are prices and quantities -- verify
    # against the scraper if the JSON layout ever changes).
    second_and_third = list(json_data.values())[1:3]
    price_lists, quantity_lists = second_and_third[0], second_and_third[1]

    for tier_prices, tier_quantities in zip(price_lists, quantity_lists):
        # Placeholders such as the 'N/A' string have nothing to trim, and
        # calling list methods on them would raise.
        if not isinstance(tier_prices, list):
            continue
        surplus = len(tier_prices) - len(tier_quantities)
        if surplus == 1:
            del tier_prices[0]
        elif surplus == 2:
            del tier_prices[0]
            # Delete by position: list.remove(value) would delete the first
            # element *equal* to the last one, which is the wrong tier
            # whenever a price is repeated.
            del tier_prices[-1]

    json_data['prices'] = price_lists

    new_json = {
        'ID': [], 'title': [], 'product_type': [], 'macro_category': [],
        'prices': [], 'quantities': [], 'unit_of_measure': [],
        'score': [], 'total_annual_revenues': [],
    }

    products = zip(
        json_data['title'], json_data['prices'], json_data['quantities'],
        json_data['product_type'], json_data['company_info'],
        json_data['macro_category'],
    )
    for title, prices, quantities, product_type, company_info, macro_category in products:
        if prices == 'N/A' or quantities == 'N/A':
            # No usable tiers: keep one row with the raw values.
            _append_product_row(
                new_json, title, macro_category, prices, quantities,
                'N/A', product_type, company_info,
            )
            continue

        for raw_price, raw_quantity in zip(prices, quantities):
            try:
                # Skip the leading character before parsing (presumably a
                # currency symbol -- TODO confirm against the scraped data).
                price = float(raw_price[1:])
            except Exception:
                # Deliberate best effort: an unparseable price is recorded
                # as 'N/A' rather than aborting the whole sub-category.
                price = 'N/A'
            _append_product_row(
                new_json, title, macro_category, price,
                extract_first_number(raw_quantity),
                extract_unit_of_measure(raw_quantity),
                product_type, company_info,
            )

    return new_json


In [8]:
# Parse the scraped catalogue from disk into plain Python objects.
with open('products_stratified.json', 'r') as file:
    json_fabrics = json.loads(file.read())


In [7]:

# Reload the scraped catalogue here so this cell can run on its own.
with open('products_stratified.json', 'r') as file:
    json_fabrics = json.load(file)

fabrics_chemicals = pd.read_csv('final_fabrics_chemicals_metals.csv')

# Start from the previously exported frame, then add one cleaned frame per
# sub-category, walking the macro categories in a fixed order.
dfs = [fabrics_chemicals]
for macro_name in ('Plastics', 'Agriculture'):
    for sub_category in json_fabrics[macro_name]['sub_categories']:
        dfs.append(pd.DataFrame(clean_json(sub_category)))

# ignore_index=True already produces a fresh 0..n-1 index, so no separate
# reset_index step is needed afterwards.
final_fabrics = pd.concat(dfs, ignore_index=True)







In [8]:
# Show the combined frame; pandas elides the middle rows of a long frame.
# NOTE(review): the "Unnamed: 0*" columns in the output look like index
# columns written into final_fabrics_chemicals_metals.csv by earlier saves --
# confirm, then drop them or export that CSV with index=False.
final_fabrics

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ID,title,product_type,macro_category,prices,quantities,unit_of_measure,score,total_annual_revenues
0,0.0,0.0,-3.943321e+18,SHYZ Dye Pearlescent Color Pigment Soap Making...,Pigment,Chemicals,1.5,100.0,pieces,5.0,10000.0
1,1.0,1.0,-3.943321e+18,SHYZ Dye Pearlescent Color Pigment Soap Making...,Pigment,Chemicals,1.35,1000.0,pieces,5.0,10000.0
2,2.0,2.0,-3.943321e+18,SHYZ Dye Pearlescent Color Pigment Soap Making...,Pigment,Chemicals,1.2,2000.0,pieces,5.0,10000.0
3,3.0,3.0,3.997857e+18,Food Grade Pigment Edible Luster Dust Gold Gli...,Pigment,Chemicals,39.99,1.0,kilograms,4.9,70000.0
4,4.0,4.0,3.997857e+18,Food Grade Pigment Edible Luster Dust Gold Gli...,Pigment,Chemicals,27.98,25.0,kilograms,4.9,70000.0
...,...,...,...,...,...,...,...,...,...,...,...
29558,,,4.931048e+18,Mushroom growing raw material wood chip sawdus...,Agricultural Waste,Agriculture,80.0,1.0,tons,4.4,100000.0
29559,,,7.491994e+18,Factory Hot Sale Animal Feeding Corn Cob Cornc...,Agricultural Waste,Agriculture,332.4,1.0,tons,4.9,20000.0
29560,,,7.491994e+18,Factory Hot Sale Animal Feeding Corn Cob Cornc...,Agricultural Waste,Agriculture,316.6,11.0,tons,4.9,20000.0
29561,,,7.491994e+18,Factory Hot Sale Animal Feeding Corn Cob Cornc...,Agricultural Waste,Agriculture,332.4,1.0,tons,4.9,20000.0
