In [None]:
import ray
# Start Ray for this notebook session. The runtime_env "env_vars" mapping carries
# the Chomsky/Krylov application settings along with the Ray runtime environment.
ray.init(
    runtime_env={
        "env_vars": dict(
            ENV_CHOMSKY_KRYLOV_WORKSPACE='1',
            APP_INST_NAME='krylovtessapp-instance',
            APP_ENV='pre-production',
            APP_NAME='krylovtessapp',
        )
    }
)

In [2]:
import warnings
warnings.filterwarnings("ignore")
import os

from pychomsky.chchat import EbayLLMChatWrapper, AzureOpenAIChatWrapper, GoogleGenAIWrapper

In [4]:
import pyarrow.fs as fs

# HDFS path (without scheme) of the input items parquet.
file_path = '/apps/b_perso/generative_signals/items_aspects_pdf_250206_fashion_1.parquet'

# One filesystem handle is created here and reused by the later read/write cells.
hdfs = fs.HadoopFileSystem("default")

# Lazily-read Ray Dataset over the input file.
ds = ray.data.read_parquet('hdfs://' + file_path, filesystem=hdfs)

2025-02-20 09:04:49,914 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-02-20 09:04:54,655 WARN shortcircuit.DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.


Parquet Files Sample 0:   0%|          | 0.00/1.00 [00:00<?, ? file/s]

In [6]:
# Maximum number of characteristics requested per item; also drives how many
# characteristic_N / explanation_N pairs the JSON skeleton below contains.
num_signals_per_item = 3
# Quote character used for keys and values in the JSON skeleton.
json_quotes_replacer = '"'

# Instruction block appended verbatim after the per-item prompt. It is never passed
# through str.format(), so the JSON skeleton must render with single, balanced braces:
# inside the f-string part `{{` renders as `{`; the closing brace lives in a plain
# (non-f) string and is therefore written as a single `}`.
PROMPT_SUFFIX = f'''
Given the product's details above, extract **up to {num_signals_per_item} distinct and compelling product characteristics** that would convince a buyer to make a purchase.
If fewer than {num_signals_per_item} compelling characteristics are found, generate as many as possible.

### **Before extracting, carefully evaluate each product characteristic step by step using the following rules:**  
- **Each characteristic must be unique** and distinct from the others (no repetitions, no near-synonyms).  
- **Each characteristic must appear exactly as written in the product's details** as a continuous sequence of words, n-gram, or permutation.  
- **STRICT RULE:** Do not use any words or partial phrases from the title in the characteristic. If it appears in the title, discard it completely.  
- **Prioritize valuable, informative, unique, and clear product features** that strongly encourage purchase (e.g.,"High Quality Stainless Steel", "Premium Satin Paper", "Strong Durability").   
- **Ensure each characteristic provides new information** and avoids redundancy.  
- **Avoid phrases that are vague, incomplete, open-ended, or missing context** (e.g., "For Multiple", "Long Big").    
- **Avoid overly technical characteristics overloaded with numbers** (e.g., "Model XZ-500435 4.3GHz", "150W 21V 65.80"), unless they represent key product information. key numerical specifications that add product value are allowed (e.g., "100%", "4mm", "2-Pack", "50cm").  
- **Avoid vague, redundant, or ambiguous characteristics** that lack specific product value (e.g., "Handmade Yes", "For Multiple","Actually Fits").  
- **Do NOT include product category words (e.g., "Leash", "Towel", "Fabric") if they are already in the title.** Instead, extract only the key descriptive characteristic (e.g., "Very Durable" instead of "Very Durable Leash").
- **Each characteristic should be concise (up to 3 words)- longer characteristics are discouraged. **  

---

### **Examples of Good & Bad Characteristics**  
:white_check_mark: **Good Characteristics (Concise, Informative, and Unique)**:  
  - **Extra-Wide Elastic Edges** :white_check_mark: *(Describes a unique design feature for a better fit.)*  
  - **Deep Wall Fitted Design** :white_check_mark: *(Highlights a functional aspect for mattress compatibility.)*  
  - **100% Breathable Cotton** :white_check_mark: *(Emphasizes material quality for comfort.)*  
  - **USDA Organic Certified** :white_check_mark: *(Adds credibility and trust to the product.)*  
  - **Premium Printing Technologies** :white_check_mark: *(Showcases a high-quality production method.)* 
  - **Strong Durability** :white_check_mark: *(Emphasizes long-lasting quality, making the product a reliable investment, incentivizes purchase.)*


:x: **Bad Characteristics → :white_check_mark: Better Alternatives**  
- :x: **"Actually Fits"** *(Vague, lacks a specific feature.)* → :white_check_mark: **"Extra-Wide Elastic Edges"**  
- :x: **"Four Sizes And Options"** *(Unclear, not a feature.)* → :white_check_mark: **"Includes Bonus Small Bag"** *(Highlights a tangible product benefit.)*  
- :x: **"Luxuriously Soft Towel"** *(Reduandant as repeats "Towel" from the title.)* → :white_check_mark: **"Luxuriously Soft"** *(Concise, add new information.)*
- :x: **"Genuine OEM Replacement"** *(Redaundant as repeats title words.)* → :white_check_mark: **"Genuine Part"** *(More concise and avoids title repetition.)*  

---
### **Output Format:**  
Generate your response in **JSON format**, ensuring that each characteristic is returned separately with an explanation.

{{  
    {json_quotes_replacer}characteristic_1{json_quotes_replacer}: {json_quotes_replacer}first short characteristic{json_quotes_replacer}, 
    {json_quotes_replacer}explanation_1{json_quotes_replacer}: {json_quotes_replacer}Explanation of how this characteristic follows the extraction rules and where it appears in the product details{json_quotes_replacer},
    ''' + ",\n    ".join([
        # One characteristic/explanation pair for every signal after the first.
        f"{json_quotes_replacer}characteristic_{i}{json_quotes_replacer}: {json_quotes_replacer}{i} short characteristic{json_quotes_replacer},\n"
        f"{json_quotes_replacer}explanation_{i}{json_quotes_replacer}: {json_quotes_replacer}Explanation of how this characteristic follows the extraction rules and where it appears in the product details{json_quotes_replacer}"
        for i in range(2, num_signals_per_item + 1)
    ]) + '''
}  
'''

# Per-item prompt header. The {title}, {features} and {desc} placeholders are
# filled with str.format(); the 12-space indentation is part of the prompt text.
ITEM_PROMPT_TEMPLATE = (
    "\n"
    "            You are an expert salesperson that always excels in providing the best buying tips to buyers given a product of interest they wish to buy. \n"
    "\n"
    "            Product details:\n"
    "            * Product title: {title}\n"
    "            * Product features: {features}\n"
    "            * Product description: {desc}\n"
    "            "
)

In [None]:
# Chat model to call. Sandbox alternative kept for reference:
#   'ebay-internal-chat-completions-sandbox-lilium-2-mix-7b-instruct'
MODEL_NAME = 'azure-chat-completions-gpt-4o-mini-2024-07-18-sandbox'
CHOMSKY_ENDPOINT = 'https://chomskygw6cont.pp.vip.ebay.com/api/v1/genai'  # pre-production


class LLMRunner:
    """Row-level callable that sends one item prompt to the chat model.

    The instance holds a chat wrapper in ``self.elm``; calling the instance with
    a row fills in the row's ``result`` and ``error`` fields and returns the row.
    """

    def __init__(self):
        """Create the chat wrapper and point its gateway client at CHOMSKY_ENDPOINT."""
        # For the lilium sandbox model, EbayLLMChatWrapper was used here instead.
        chat = AzureOpenAIChatWrapper(model_name=MODEL_NAME, temperature=0.1)
        chat.chgw_client.chgw_endpoint = CHOMSKY_ENDPOINT
        self.elm = chat

    def __call__(self, row):
        """Query the model for one row.

        Reads ``row['title']``, ``row['aspects']`` and ``row['desc']``, builds the
        prompt and invokes the model. On success ``row['result']`` holds the
        response content and ``row['error']`` is ``''``; on any exception the
        exception is printed, ``row['result']`` is ``''`` and ``row['error']`` is
        the exception text. The same (mutated) row object is returned.
        """
        try:
            item_part = ITEM_PROMPT_TEMPLATE.format(
                title=row['title'],
                features=row['aspects'],
                desc=row['desc'],
            )
            prompt = f"{item_part}{PROMPT_SUFFIX}"
            answer = self.elm.invoke(prompt).content
        except Exception as exc:
            # Best effort: keep the row and record the failure instead of aborting the job.
            print(exc)
            answer, failure = '', str(exc)
        else:
            failure = ''

        # Empty strings rather than None to avoid PyArrow issues.
        row['result'] = answer
        row['error'] = failure
        return row

In [None]:
# Actor sizing. Normally only the number of actors (processes) and the CPU each one
# reserves matter; here the per-actor CPU share is derived from the gateway rate
# limit so that the number of concurrent requests stays under it.
NUMBER_OF_NODES = 10
CPU_PER_NODE = 4
AVG_INFER_TIME_SECS = 5
# chomsky rate limit for lilium per IP
# NOTE(review): MODEL_NAME currently points at an Azure model - confirm the same limit applies.
RATE_PER_MINUTE = 300

# (requests per second) * (seconds per request) = requests in flight per node
CONCURRENCY_PER_NODE = int(RATE_PER_MINUTE / 60 * AVG_INFER_TIME_SECS)  # 25
NUM_CPUS_PER_ACTOR = CPU_PER_NODE / CONCURRENCY_PER_NODE  # 0.16

# 250 actors in total; 250 * 0.16 = 40 CPUs == 10 nodes
ds_results = ds.map(
    LLMRunner,
    concurrency=CONCURRENCY_PER_NODE * NUMBER_OF_NODES,
    num_cpus=NUM_CPUS_PER_ACTOR,
)

In [9]:
# Writing triggers execution of the dataset and stores the annotated rows on HDFS.
ds_results.write_parquet(
    'hdfs:///apps/b_perso/generative_signals/ray_test.parquet',
    filesystem=hdfs,
)

2025-02-20 11:15:23,454	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2025-02-20_07-37-03_341740_1/logs/ray-data
2025-02-20 11:15:23,455	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> ActorPoolMapOperator[Map(LLMRunner)] -> TaskPoolMapOperator[Write]


Running 0: 0.00 row [00:00, ? row/s]

- ReadParquet->SplitBlocks(49) 1: 0.00 row [00:00, ? row/s]

- Map(LLMRunner) 2: 0.00 row [00:00, ? row/s]

- Write 3: 0.00 row [00:00, ? row/s]



In [10]:
# Re-open the written results, keeping only the columns needed for inspection.
saved_results = ray.data.read_parquet(
    'hdfs:///apps/b_perso/generative_signals/ray_test.parquet',
    filesystem=hdfs,
    columns=['title', 'desc', 'result', 'error'],
)

Metadata Fetch Progress 0:   0%|          | 0.00/8.00 [00:00<?, ? task/s]

Parquet Files Sample 0:   0%|          | 0.00/2.00 [00:00<?, ? file/s]

In [13]:
# Pull a small sample of rows to the driver for manual inspection.
results_list = saved_results.take(limit=50)

2025-02-20 11:27:09,868	INFO dataset.py:2409 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2025-02-20 11:27:09,876	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2025-02-20_07-37-03_341740_1/logs/ray-data
2025-02-20 11:27:09,876	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=50]


Running 0: 0.00 row [00:00, ? row/s]

- ReadParquet->SplitBlocks(2) 1: 0.00 row [00:00, ? row/s]

- limit=50 2: 0.00 row [00:00, ? row/s]

In [15]:
import re

# Captures the value of every "characteristic_N" key in the model's JSON answer.
# The value pattern is escape-aware: a JSON-escaped quote (\") inside the value no
# longer ends the match early, so e.g. `6\" Heel` is captured whole rather than
# being cut at the backslash. The captured text is the raw (still escaped) JSON string body.
regex = r'"characteristic_\d+":\s*"((?:[^"\\]|\\.)*)"'
    
# Show the extracted characteristics next to the title for the first ten sampled rows.
for record in results_list[:10]:
    signals = re.findall(regex, record['result'])
    print(f"Title: {record['title']}\n Signals: {signals}\n")

Title: fashnzfab women's faux straw wood frame clutch
 Signals: ['High-Quality Polyurethane', 'Unique Faux Straw Frame', 'Versatile Styling']

Title: disneyland sherpa fleece spirit jersey for adults size xl
 Signals: ['Puff ink logo', 'Heathered rib cuffs', 'Fleecy sherpa style']

Title: pinup sailor hat mens one size black strapback cap dickie walker embroidered
 Signals: ['Good condition', 'Lightweight', 'Casual occasion']

Title: women's slip on pointed toe pumps chunky heel lady court ol shoes plus size
 Signals: ['Non-slip', 'Comfort', 'Faux suede']

Title: very vintage prada jacket nylon waterproof bag jacket size l
 Signals: ['Heavyweight', 'Nylon Waterproof', 'Insulation Material: Polyester']

Title: vintage madye's glamour scuffs womens slippers sleep white size small (5.5-6.5)
 Signals: ['Condition: New', 'Original Box', 'Vintage']

Title: dim supplement - dim - immune system supplement 2bottle
 Signals: ['Boost Immune System', 'Balances Hormone Levels', 'Supports Prostate H