In [121]:
import pandas as pd
import openai
import pickle as pkl
import json
from tqdm.notebook import tqdm
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [122]:
# System prompt for the validation step. OpenAIShooter sends it as the
# "system" message of every request; the per-row offer and bundle description
# built by product_desc() go in the "user" message. The model is asked to reply
# with a JSON object holding exactly two fields, "Label" and "Comment".
PROMPT = """You are a quality control officer at a marketplace. You have received a product offer from a seller and a product description in JSON format. A "bundle" is defined as an inseparable set of items being sold together, with a price provided for the entire set. Prices in the offer should not be divided.

Your task is to determine whether the offer includes the described bundle. You also need to handle the specified case, where the "OFFER" lacks sufficient detail about the product and makes it impossible for the customer to understand or verify the product without additional context and set "invalid" response.

The expected JSON fields are:
- **"Title" (string)**: A brief title of the bundle, including information about what is being sold. Important general details, such as product category, should be included, if provided. The title must give customers enough information to understand what the bundle contains without any external context or needing to perform additional research.
- **"Count" (string)**: The minimal inseparable amount of product in the bundle. For countable items, this is the number of items. For non-countable products (like liquids or materials), this should reflect the volume, length, or other relevant metrics (default is '1').
- **"Price" (number)**: The cost for the entire bundle, based on the 'Count' specified.
- **"Currency" (string)**: The currency for the 'Price' (default is 'RUB').

The JSON product description should represent just ONE bundle.

Bundles must not be split up, and separate products must not be aggregated into a single bundle.

Respond with JSON with ONLY 2 fields:
- **"Label"**: should be one of ("valid", "invalid"):
    - 'valid' if the product represented by the bundle is valid, and the OFFER contains the described bundle, meaning it is possible to purchase the described bundle in the 'Count' for the 'Price' specified in 'Currency'.
    - 'invalid' - if the OFFER does not provide enough information for a customer to understand what the product bundle contains without additional context, or if the product described seems unreal or cannot be verified based on the information provided.
- **"Comment"**: Explain your decision in one sentence.

Note: This instruction emphasizes the need for clarity and verifiability in the product title and details, ensuring that customers can understand and verify what they are purchasing without needing to search for more information.

Answer with one JSON only, no extra information should be provided."""

In [123]:
# The key is read from the environment so the secret never lives in the notebook.
# API_KEY is None when DIPLOMA_OPENAI_API_KEY is not set.
API_KEY = os.environ.get('DIPLOMA_OPENAI_API_KEY')
openai_client = openai.OpenAI(api_key=API_KEY)

In [124]:
# Notebook-level response cache. It is passed to OpenAIShooter below, which
# stores each raw API response in it keyed by the request text.
cache = {}

In [125]:
def product_desc(row):
  """Render one labelled row as the user message sent to the model.

  Args:
    row: object exposing ``Text``, ``Title``, ``Count``, ``Price`` and
      ``Currency`` attributes (e.g. a DataFrame row).

  Returns:
    A multi-line string holding the raw offer text followed by the bundle
    description fields. The leading newline and the trailing indentation
    of the template are part of the returned text.
  """
  return f'''
OFFER: {row.Text}

Bundle description
Title: {row.Title}
Count: {row.Count}
Price: {row.Price}
Currency: {row.Currency}
  '''

In [137]:
def _json_candidates(text):
    """Yield substrings of ``text`` that may hold a JSON object, strictest first."""
    stripped = text.strip()
    variants = [stripped]
    # The outermost {...} span drops Markdown code fences (```json ... ```)
    # and any chatter the model put around the object.
    braces = re.search(r'\{.*\}', stripped, re.DOTALL)
    if braces and braces.group(0) != stripped:
        variants.append(braces.group(0))
    yield from variants
    # Last resort: the model sometimes quotes bundle field names inside the
    # comment without escaping them ("The "Title" is vague"), which breaks the
    # JSON. Swapping those quotes for apostrophes repairs that case. It is only
    # tried after the untouched text failed, because the same rewrite corrupts
    # valid JSON in which one of these names is a real key.
    for variant in variants:
        relaxed = variant
        for key in ('Title', 'Count', 'Price', 'Currency'):
            relaxed = relaxed.replace(f'"{key}"', f"'{key}'")
        if relaxed != variant:
            yield relaxed


def extract_json(content):
    """Parse the JSON object contained in a model reply.

    Args:
        content: raw reply text; may be wrapped in a Markdown code fence,
            surrounded by extra text, or ``None``.

    Returns:
        A dict that always contains ``'full'`` (the reply exactly as received)
        plus every key of the first JSON object that could be parsed. When
        nothing parses, only ``'full'`` is present; the function does not
        raise on malformed replies.
    """
    result = {'full': content}
    if not isinstance(content, str):
        return result

    for candidate in _json_candidates(content):
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        # Only objects carry named fields; skip bare lists, strings or numbers.
        if isinstance(parsed, dict):
            result.update(parsed)
            break
    return result

In [138]:
from time import sleep

class OpenAIShooter:
  """Sends offer/bundle pairs to a chat model and records its verdicts.

  Raw API responses are memoised in ``storage`` keyed by the request text, so
  repeating a request with the same text does not hit the API again.
  """

  def __init__(self, client, prompt, dump_path='<default_path_to_save_checkpoints>', dump_freq=200, cache=None, max_retries=3, retry_delay=0.5):
    """
    Args:
      client: object exposing ``chat.completions.create`` (an OpenAI client).
      prompt: system prompt sent with every request.
      dump_path: CSV path the DataFrame is checkpointed to.
      dump_freq: checkpoint every ``dump_freq`` rows; 0 or None disables the
        periodic checkpoints (the final dump still happens).
      cache: optional dict used as the response cache and mutated in place;
        a fresh dict is created per instance when omitted.
      max_retries: total number of attempts per request (at least one is made).
      retry_delay: seconds to wait between attempts.
    """
    self.client = client
    self.cache = {}  # not used by this class; kept in case outside code reads it
    self.model = "gpt-4-turbo-preview"
    self.prompt = prompt
    # A `cache={}` default would be one dict shared by every instance.
    self.storage = {} if cache is None else cache
    self.answers = []
    self.dump_path = dump_path
    self.dump_freq = dump_freq
    self.max_retries = max_retries
    self.retry_delay = retry_delay

  def request(self, text):
    """Return the raw API response for ``text``, calling the API on a cache miss.

    A failed call is retried up to ``max_retries`` attempts in total, waiting
    ``retry_delay`` seconds in between; the last error is re-raised.
    """
    if text not in self.storage:
      attempts = max(1, int(self.max_retries))
      for attempt in range(1, attempts + 1):
        try:
          response = self.client.chat.completions.create(
            model=self.model,
            messages=[
              {"role": "system", "content": self.prompt},
              {"role": "user", "content": text}
            ],
            # NOTE(review): JSON mode is only requested for this one model
            # name, which differs from self.model above, so it is never
            # enabled with the default model.
            **({'response_format':{"type": "json_object"}} if self.model == "gpt-4-1106-preview" else {})
          )
        except Exception:
          if attempt == attempts:
            raise
          sleep(self.retry_delay)
        else:
          self.storage[text] = response
          break
    return self.storage[text]

  def shoot_all(self, df, limit=1e9):
    """Label every still-unlabelled row of ``df`` in place.

    Args:
      df: DataFrame with the columns read by ``product_desc`` plus a
        ``Label_gpt`` column. Rows whose ``Label_gpt`` is an empty string or
        NaN are processed; all other rows are skipped.
      limit: maximum number of leading rows to visit.

    Returns:
      Dict mapping row position to the parsed reply of each processed row.
      The same dict is appended to ``self.answers``. ``Label_gpt`` and
      ``Comment_gpt`` are written into ``df``, and ``df`` is saved to
      ``dump_path`` periodically and once at the end.
    """
    answer = {}
    # `limit` defaults to a float (1e9) but range() only accepts ints.
    total = int(min(len(df), limit))
    for i in tqdm(range(total)):
      item = df.iloc[i]
      # iloc is positional while .loc is label-based; resolve the label so the
      # write-back hits the row that was read even on a non-default index.
      row_label = df.index[i]
      current = item.Label_gpt
      # Empty strings become NaN after a CSV round trip, so NaN also counts as
      # "not labelled yet"; otherwise resuming from a checkpoint skips all rows.
      if not (pd.isna(current) or current == ''):
        continue
      text = product_desc(item)
      raw_resp = self.request(text)
      content = None
      try:
        content = raw_resp.choices[0].message.content
        parsed = extract_json(content)
      except Exception as e:
        print(i, raw_resp, e)
        # Record an entry anyway so one bad reply does not abort the whole run.
        parsed = {'full': content}
      answer[i] = parsed
      df.loc[row_label, 'Label_gpt'] = parsed.get('Label', '')
      df.loc[row_label, 'Comment_gpt'] = parsed.get('Comment', '')
      if self.dump_freq and not i % self.dump_freq:
        df.to_csv(self.dump_path)
        with open('answer.pkl', 'wb') as f:
          pkl.dump(answer, f)
        print(f'Iter {i}: dumped')
    df.to_csv(self.dump_path)
    self.answers.append(answer)
    return answer

In [139]:
# Build the labeller on top of the notebook-level cache defined above.
shooter = OpenAIShooter(client=openai_client, prompt=PROMPT, cache=cache)

In [129]:
# Load the parsed bundles and add empty output columns; shoot_all() treats an
# empty Label_gpt as "not labelled yet".
df = pd.read_csv('<path_to_parsed_labeled_data>').assign(Label_gpt='', Comment_gpt='')
df.head(5)

Unnamed: 0,Text,WallId,PostId,Source,Title,Price,Currency,Count,Label_gpt,Comment_gpt
0,Распродажа \nКоллагеновый консилер от Enough\n...,-206817857,86686,vk,Коллагеновый консилер от Enough,80,RUB,1,,
1,#продаю в тбилиси focusrite scatlett 2i2. полн...,1332882905,439179,tg,focusrite scatlett 2i2,190,₾,1,,
2,"Ботильоны Vagabond, натуральная кожа/мех, 39 р...",-48332904,713045,vk,"Ботильоны Vagabond, натуральная кожа/мех, разм...",5000,RUB,1,,
3,"Цена: 700р\nраспродажа \nРазмеры: 42,44,46,48,...",-206817857,120264,vk,флисовая одежда,700,RUB,1,,
4,"продам кухонный стол. стол удобный, основатель...",1121942433,699812,tg,кухонный стол и 4 стула,150,EUR,1,,


In [140]:
# Label every row of df in place (one API call per uncached row); the returned
# dict maps row position to the parsed model reply.
answers = shooter.shoot_all(df)

  0%|          | 0/5153 [00:00<?, ?it/s]

Iter 200: dumped
Iter 400: dumped
Iter 600: dumped
Iter 800: dumped
Iter 1000: dumped
Iter 1200: dumped
Iter 1400: dumped
Iter 1600: dumped
Iter 1800: dumped
Iter 2000: dumped
Iter 2200: dumped
Iter 2400: dumped
Iter 2600: dumped
Iter 2800: dumped
Iter 3000: dumped
Iter 3200: dumped
Iter 3400: dumped
Iter 3600: dumped
Iter 3800: dumped
Iter 4000: dumped
Iter 4200: dumped
Iter 4400: dumped
Iter 4600: dumped
Iter 4800: dumped
Iter 5000: dumped


In [142]:
# Persist the frame, including the Label_gpt / Comment_gpt columns, to its final location.
df.to_csv('<path_to_labeled_and_validated_data>')

In [81]:
# Rows the model accepted as valid bundles.
# NOTE(review): the saved output of this cell shows row indices above 14000,
# while the labelling run above processed 5153 rows. The output appears to come
# from an earlier session; re-run after Restart & Run All.
df.loc[df['Label_gpt'] == 'valid']

Unnamed: 0,Text,WallId,PostId,Source,Title,Price,Currency,Count,Label_gpt,Comment_gpt
0,"1. монитор, 250 лари\n2. мольберт, 50 лари\n3....",1332882905,686748,tg,"рюкзак, 40 литров, идеальное состояние",100,лари,1,valid,The offer includes a backpack bundle that matc...
1,#БНИ_ПродамБУ_Москва\n#БНИ_ПродамНовая_Москва\...,-114967596,592027,vk,"Street Fighter (база, стретчи, доп)",26000,RUB,1,valid,The offer clearly describes the bundle's conte...
2,Продам стульчик для кормления Happy baby. В хо...,-82544098,661957,vk,Стульчик для кормления Happy baby,3000,RUB,1,valid,The offer clearly describes a single feeding c...
3,"Продаю компьютер Цена 55000 Все почти новое ,е...",-90442795,353383,vk,Компьютер с комплектующими и аксессуарами,55000,RUB,1,valid,The offer clearly describes a computer with co...
5,"продам часы galaxy watch\nпокупали на амазоне,...",1121942433,119858,tg,часы galaxy watch с стальным ремешком,200,EUR,1,valid,The offer clearly describes the bundle includi...
...,...,...,...,...,...,...,...,...,...,...
14054,"стол, 4 стула. 180 евро, лимассол",1121942433,437040,tg,стол и 4 стула,180,евро,1,valid,The offer provides clear details about the bun...
14055,1. Черное платье выше колен с открытыми плечам...,-48332904,725391,vk,Блуза MOHITO,500,RUB,1,valid,The offer includes a specific item matching th...
14057,#БНИ_ПродамБУ_Москва \n#БНИ_СДЭК \n#БНИ_Авито ...,-114967596,573450,vk,Пробуждение Медведя с дополнением Тяжелые танки,3500,RUB,1,valid,The offer clearly includes the described bundl...
14058,"Продаю;Ботинки 34р,1000р.Ботинки утеплённые 34...",-82544098,642652,vk,Куртка демисезонная 92 размер,600,RUB,1,valid,The OFFER clearly states the product matching ...


In [82]:
# NOTE(review): hard-coded counts, presumably (rows labelled 'valid') / (total rows)
# from an earlier 14061-row run. TODO confirm and compute from df instead,
# e.g. (df.Label_gpt == 'valid').mean().
11736 / 14061

0.8346490292297845