In [4]:
import numpy as np
import pandas as pd
import re

import json
from tqdm.notebook import tqdm

import openai
import tiktoken

In [5]:
np.random.seed(1337)

 ## Данные

In [10]:
data = pd.read_csv("../data/merged_final.csv")
print(data.shape)
data.head()

(100000, 4)


Unnamed: 0,Text,WallId,PostId,Source
0,iРhonе 11 64гб в крacном цвeте с ЧЕКОM из магa...,-151775338,537819,vk
1,"Kx-tca285\nБ/у недолго. В рабочем состоянии, н...",-48332904,704790,vk
2,🌹ПОЛУЧИЛИ ДАЧНИ КОСТЮМ \n🌹КАЧЕСТВО СУПЕР \n🌹РА...,-192083416,12439,vk
3,Продам 1000р. Размер 46. (В идеальном состояни...,-90442795,349691,vk
4,мини-сумка charles&keith яичного цвета. новая....,1332882905,429778,tg


In [11]:
data.drop_duplicates(subset=["Text"], inplace=True)
print(data.shape)
data.head()

(94611, 4)


Unnamed: 0,Text,WallId,PostId,Source
0,iРhonе 11 64гб в крacном цвeте с ЧЕКОM из магa...,-151775338,537819,vk
1,"Kx-tca285\nБ/у недолго. В рабочем состоянии, н...",-48332904,704790,vk
2,🌹ПОЛУЧИЛИ ДАЧНИ КОСТЮМ \n🌹КАЧЕСТВО СУПЕР \n🌹РА...,-192083416,12439,vk
3,Продам 1000р. Размер 46. (В идеальном состояни...,-90442795,349691,vk
4,мини-сумка charles&keith яичного цвета. новая....,1332882905,429778,tg


In [42]:
data_20k = data.sample(n=20000)
print(data_20k.shape)
data_20k.head()

(20000, 4)


Unnamed: 0,Text,WallId,PostId,Source
14868,б/у:\n∙ сумка дорожная на колесах (~высота 70с...,1332882905,570967,tg
33066,davines - мусс-объем \nthis is a volume boosti...,1332882905,618057,tg
32282,👌Новая поступления 🥳🥳🥳\n. tencel \n Ц...,-192083416,13589,vk
71425,"клининг/фея чистоты/ уборка 🧚\nофис, квартира,...",1121942433,400700,tg
37440,Распродажа \nЦена: 550р шт \nРазмер: 41.42.43....,-206817857,82815,vk


In [43]:
data_20k["Source"].value_counts()

Source
tg    10548
vk     9452
Name: count, dtype: int64

In [44]:
data_20k.isna().sum()

Text      0
WallId    0
PostId    0
Source    0
dtype: int64

In [47]:
data_20k.to_csv("<tmp_path_to_20k_sample_from_raw_dataset>", index=False)

## Constants

In [22]:
model = "gpt-4-turbo-2024-04-09"

In [23]:
token = "your_openai_token"  # None
with open("<path_to_token>", "r") as file:
    token = file.readline().strip()

In [24]:
prompt = """You are an AI trainer at a marketplace company. Your role is to convert product offer posts from plain text into structured JSON format. A "bundle" is defined as a set of items being sold together, which cannot be separated.
The JSON structure must include:
"place": The location where the buyer can pick up the product or delivery information, if available.
"bundles": A list of dictionaries, each representing a valid bundle.
Each bundle dictionary should contain ONLY the following information for ONE bundle:
"name" (string): A brief title of the bundle, including enough details for the customer to understand what is being sold. Ensure to include important general details, such as product category, if provided.
"quantity" (string): The minimal unseparable amount of product in the bundle. For countable items, this is the number of items. For non-countable products (like liquids or materials), this should reflect the volume, length, or other relevant metrics (default is '1').
"price" (number): The cost for the entire bundle, based on the 'quantity' specified.
"currency" (string): The currency for the 'price' (default is 'RUB').

Regarding product variations, such as size or color, these should NOT be treated as separate bundles. Variations should be considered additional descriptions and are not required in the JSON. The focus is on distinct products or bundles as defined by their essential characteristics, not their variations.
Exclude any fields with null values from the JSON. Do not add descriptions or any fields other than those specified. If no price information is provided, exclude the bundle from the JSON. Answer in the language of the original offer.
Your response should be a JSON only, with no additional text required.
"""

## Functions

In [25]:
def count_tokens(text):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

In [26]:
client = openai.OpenAI(api_key=token)

In [37]:
class OpenAIMarkup:
    def __init__(self, openai_client, model, prompt, checkpoint_frequency=200):
        self.client = openai_client
        self.model = model
        self.prompt = prompt
        self.checkpoint_frequency = checkpoint_frequency
        self.save_data = None

    def process_ad(self, text):
        return self.client.chat.completions.create(
            messages=[
                {"role": "system", "content": self.prompt},
                {"role": "user", "content": text},
            ],
            model=self.model,
        ).choices[0].message.content

    def save_checkpoint(self, dataset, responses, save_path):
        self.save_data = dataset.iloc[:len(responses), :].copy()
        self.save_data["GptResponse"] = responses
        self.save_data.to_csv(save_path, index=False)

    def markup_dataset(self, dataset, text_column="Text", limit=10000, save_path="<default_path_for_checkpoints>"):
        json_responses = []
        save_data = None
        failed = 0
        for i, ind in tqdm(enumerate(dataset.index)):
            try:
                json_response = self.process_ad(dataset.loc[ind, text_column])
            except Exception as e:
                print(f"Failed to process ad: '{dataset.loc[ind, text_column]}'")
                failed += 1
                continue
            json_responses.append(json_response)

            if i % self.checkpoint_frequency == 0:
                self.save_checkpoint(dataset, json_responses, save_path)
                print(f"checkpoint of size {self.save_data.shape} saved!")

            if i - failed >= limit:
                break

        self.save_checkpoint(dataset, json_responses, save_path)
        return self.save_data

## Small test

In [38]:
markupper = OpenAIMarkup(client, model, prompt, 1)

test_data = pd.DataFrame({
    "text": [
        "все футболки в идеальном  состоянии  все оригинальные цена одной футболки 40 лари  . 1. levi’s  ( размер м ) 2. lyle scott ( размер m ) 3. the north face ( размер xl ) 4. lyle scott ( новая размер м ) 5. lyle scott ( размер s ) 6. new balance ( размер l )  7. nike ( размер l )  8. tommy hilfiger ( размер м ) 9. nike ( размер l ) 10. nike ( размер xl )",
        "книги на английском  blue moon 4€ the good fight 5€ the book thief 4€ the pillars of the earth 4€ peaky blinders 5€ all the light we cannot see 5€ the last letter from your lover 5€ this is going to hurt в твердой обложке 9€ самовывоз пафос, юниверсал",
        "Распродажа Цена: 350р Размер:104-110- 116-122-128 Без выбора цвета Арт: 1-4-10 корпус А",
    ] * 10
})

test_result = markupper.markup_dataset(test_data, text_column="text", limit=3, save_path="test_markup.csv")

0it [00:00, ?it/s]

checkpoint of size (1, 2) saved!
checkpoint of size (2, 2) saved!
checkpoint of size (3, 2) saved!
checkpoint of size (4, 2) saved!


In [39]:
test_result.head()

Unnamed: 0,text,GptResponse
0,все футболки в идеальном состоянии все ориги...,"{\n ""place"": ""Tbilisi"",\n ""bundles"": [\n ..."
1,книги на английском blue moon 4€ the good fig...,"{\n ""place"": ""самовывоз пафос, юниверсал"",\n ..."
2,Распродажа Цена: 350р Размер:104-110- 116-122-...,"{\n ""place"": ""Арт: 1-4-10 корпус А"",\n ""bund..."
3,все футболки в идеальном состоянии все ориги...,"{\n ""place"": null,\n ""bundles"": [\n {\n ..."


## Финальная разметка

In [46]:
markupper = OpenAIMarkup(client, model, prompt, checkpoint_frequency=200)
labeled_data = markupper.markup_dataset(data_20k, text_column="Text", limit=10000, save_path="<path_for_checkpoints>")

0it [00:00, ?it/s]

checkpoint of size (1, 5) saved!
checkpoint of size (201, 5) saved!
checkpoint of size (401, 5) saved!
checkpoint of size (601, 5) saved!
checkpoint of size (801, 5) saved!
checkpoint of size (1001, 5) saved!
checkpoint of size (1201, 5) saved!
checkpoint of size (1401, 5) saved!
checkpoint of size (1601, 5) saved!
checkpoint of size (1801, 5) saved!
checkpoint of size (2001, 5) saved!
checkpoint of size (2201, 5) saved!
checkpoint of size (2401, 5) saved!
checkpoint of size (2601, 5) saved!
checkpoint of size (2801, 5) saved!
checkpoint of size (3001, 5) saved!
checkpoint of size (3201, 5) saved!
checkpoint of size (3401, 5) saved!
checkpoint of size (3601, 5) saved!
checkpoint of size (3801, 5) saved!
checkpoint of size (4001, 5) saved!
checkpoint of size (4201, 5) saved!
checkpoint of size (4401, 5) saved!
checkpoint of size (4601, 5) saved!
checkpoint of size (4801, 5) saved!
checkpoint of size (5001, 5) saved!


KeyboardInterrupt: 

In [52]:
labeled_data = pd.read_csv("<path_for_checkpoints>")
labeled_data.shape

(5001, 5)

In [54]:
labeled_data.head()

Unnamed: 0,Text,WallId,PostId,Source,GptResponse
0,б/у:\n∙ сумка дорожная на колесах (~высота 70с...,1332882905,570967,tg,"{\n ""place"": null,\n ""bundles"": [\n {\n ..."
1,davines - мусс-объем \nthis is a volume boosti...,1332882905,618057,tg,"{\n ""place"": ""вера"",\n ""bundles"": [\n {\n..."
2,👌Новая поступления 🥳🥳🥳\n. tencel \n Ц...,-192083416,13589,vk,"{\n ""place"": ""https://vk.com/wall488363770_21..."
3,"клининг/фея чистоты/ уборка 🧚\nофис, квартира,...",1121942433,400700,tg,"{\n ""place"": ""по маршруту автобусов 711, 424,..."
4,Распродажа \nЦена: 550р шт \nРазмер: 41.42.43....,-206817857,82815,vk,"{\n ""place"": null,\n ""bundles"": []\n}"


In [56]:
from json.decoder import JSONDecodeError
parsed_gpt_labels = {
    "Text": [],
    "WallId": [],
    "PostId": [],
    "Source": [],
    "Title": [],
    "Price": [],
    "Currency": [],
    "Count": [],
}

failed_inds = []
empty_inds = []

for ind in tqdm(labeled_data.index):
    try:
        res = json.loads(labeled_data.loc[ind, "GptResponse"])
    except JSONDecodeError as e:
        failed_inds.append(ind)
        continue

    if "bundles" not in res:
        empty_inds.append(ind)

    for prod in res.get("bundles", []):
        for col in ("Text", "WallId", "PostId", "Source"):
            parsed_gpt_labels[col].append(labeled_data.loc[ind, col])
        parsed_gpt_labels["Title"].append(prod.get("name"))
        parsed_gpt_labels["Price"].append(prod.get("price"))
        parsed_gpt_labels["Currency"].append(prod.get("currency", "RUB"))
        parsed_gpt_labels["Count"].append(prod.get("quantity", 1))

gpt_labels_df = pd.DataFrame(parsed_gpt_labels)
failed_to_parse_df = labeled_data.loc[failed_inds, :].copy()
no_bundles_df = labeled_data.loc[empty_inds, :].copy()

gpt_labels_df.head()

  0%|          | 0/5001 [00:00<?, ?it/s]

Unnamed: 0,Text,WallId,PostId,Source,Title,Price,Currency,Count
0,б/у:\n∙ сумка дорожная на колесах (~высота 70с...,1332882905,570967,tg,сумка дорожная на колесах,60,GEL,1
1,б/у:\n∙ сумка дорожная на колесах (~высота 70с...,1332882905,570967,tg,"рюкзак kanken classic, цвет fog, оригинал",60,GEL,1
2,б/у:\n∙ сумка дорожная на колесах (~высота 70с...,1332882905,570967,tg,"женская куртка оверсайз incity, размер m",50,GEL,1
3,б/у:\n∙ сумка дорожная на колесах (~высота 70с...,1332882905,570967,tg,батарея (аккумулятор) пустышка для камер sony,50,GEL,1
4,б/у:\n∙ сумка дорожная на колесах (~высота 70с...,1332882905,570967,tg,комплект штативов с креплением к столу vijim,60,GEL,1


In [57]:
!mkdir data

mkdir: cannot create directory ‘data’: File exists


In [58]:
print(gpt_labels_df.shape)
gpt_labels_df.to_csv("<path_to_parsed_labeled_data>", index=False)

(6984, 8)


In [60]:
print(no_bundles_df.shape)
gpt_labels_df.to_csv("<path_to_labeled_data_with_no_bundles>", index=False)

(244, 5)


In [61]:
gpt_labels_df.shape

(6984, 8)

In [62]:
gpt_labels_df.isna().sum()

Text         0
WallId       0
PostId       0
Source       0
Title        0
Price       79
Currency     0
Count        0
dtype: int64