In [None]:
import os
import random
import json
import pandas as pd
import time
import ast
import google.generativeai as genai
from datasets import load_dataset
from tqdm import tqdm
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")

# Обращаемся к gemini через api_key
genai.configure(api_key=api_key)

# Загрузка исходного датасета
df = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset", split="train").to_pandas()

In [None]:
# Определяем целевую схему данных (категории и интенты)
TARGET_SCHEMA = {
    "ACCOUNT": ["create_account", "delete_account", "edit_account", 
                "switch_account", "registration_problems", "recover_password"],
    "CANCELLATION_FEE": ["check_cancellation_fee"],
    "CONTACT": ["contact_customer_service", "contact_human_agent"],
    "DELIVERY": ["delivery_options", "delivery_period"],
    "FEEDBACK": ["complaint", "review"],
    "INVOICE": ["check_invoice", "get_invoice"],
    "SUBSCRIPTION": ["newsletter_subscription"],
    "ORDER": ["track_order", "cancel_order", "change_order", "place_order"],
    "PAYMENT": ["check_payment_methods", "payment_issue"],
    "REFUND": ["check_refund_policy", "get_refund", "track_refund"],
    "SHIPPING_ADDRESS": ["change_shipping_address", "set_up_shipping_address"]
}

# Подправляем датасет, чтобы категории соответствовали интентам
df['category'] = df['category'].str.strip().str.upper()
df['category'] = df['category'].replace('CANCELLATION', 'CANCELLATION_FEE')
df['category'] = df['category'].replace('SHIPPING', 'SHIPPING_ADDRESS')

# Создаем список интентов из целевой схемы для генерации
target_intents_list = []
for category, intents in TARGET_SCHEMA.items():
    for intent in intents:
        target_intents_list.append({'category': category, 'intent': intent})

print(f"Целевая схема определена. {len(target_intents_list)} уникальных пар категория/интент.")

# Параметры генерации
NUM_ROWS_TO_GENERATE = 13500
EXAMPLES_PER_REQUEST = 50
MODEL_NAME = "gemini-2.5-flash-lite"
MULTILABEL_CHANCE = 0.33
API_CALL_DELAY = 1

generation_config = {
  "temperature": 0.9, "top_p": 0.95, "top_k": 0, # creativitymaxxing
  "max_output_tokens": 8192, "response_mime_type": "application/json", # ответ в JSON-формате
}
model = genai.GenerativeModel(model_name=MODEL_NAME, generation_config=generation_config)

Целевая схема определена. 27 уникальных пар категория/интент.
        flags                                        instruction category  \
26842     BIL  what do I have to do to see my compensation cu...   REFUND   
26843      BL   I need assistance to check my restitution status   REFUND   
26844     BLQ     i expect a rebate of {{Refund Amount}} dollars   REFUND   
26845     BKL                 show current status of restitution   REFUND   
26846     BLZ  I expect an restitution of {{Currency Symbol}}...   REFUND   
26847    BLQZ  i expect an rebate of {{Currency Symbol}}{{Ref...   REFUND   
26848  BCILQZ  need to check the rebate curent status how to ...   REFUND   
26849     BLQ  i expect a restitution of {{Currency Symbol}}{...   REFUND   
26850      BZ  I expect a reufnd of {{Currency Symbol}}{{Refu...   REFUND   
26851    BLQZ  i expect a reimbursxement of {{Currency Symbol...   REFUND   
26852     BLM  help me seeing if there are any updates on the...   REFUND   
26853     BLM 

In [None]:
# Функция для генерации промпта
def create_prompt(intent_info_list, few_shot_examples):
    schema_str = ""
    for category, intents in TARGET_SCHEMA.items():
        schema_str += f"  - {category}: {', '.join(intents)}\n"

    if len(intent_info_list) > 1:
        target_intent_str = "\n".join([f"   - Category: {info['category']}, Intent: {info['intent']}" for info in intent_info_list])
        instruction_focus = "Generate a multilabel example that combines these intents."
    else:
        info = intent_info_list[0]
        target_intent_str = f"   - Category: \"{info['category']}\"\n   - Intent: \"{info['intent']}\""
        instruction_focus = "Generate examples for this specific intent."

    examples_str = "\n".join([f"- {row.to_dict()}" for _, row in few_shot_examples.iterrows()])

    return f"""
You are an expert data generation assistant. Your task is to create high-quality, diverse, and realistic synthetic data for training a multilabel customer support chatbot classifier.

**TASK:**
Generate {EXAMPLES_PER_REQUEST} new examples of customer support queries based on the provided schema.

**DATASET SCHEMA:**
Your generated "category" and "intent" values MUST strictly come from this schema. Do not invent new ones.
{schema_str}

**FLAG DEFINITIONS:**
You MUST use a combination of the following flags to describe the generated text. Be creative and mix them.
M - Morphological, L - Semantic (synonyms), B - Basic syntax, I - Interrogative, C - Coordinated, N - Negation, P - Politeness, Q - Colloquial, W - Offensive, K - Keyword mode, E - Abbreviations, Z - Errors/Typos.

**MULTILABEL INSTRUCTIONS:**
- For user queries addressing multiple topics, "category" and "intent" must be JSON lists of strings.
- Example: "I want to cancel my last order and get a refund." -> "category": ["ORDER", "REFUND"], "intent": ["cancel_order", "track_refund"]

**TARGET INTENT(S) FOR THIS TASK:**
{target_intent_str}
{instruction_focus}

**OUTPUT FORMAT:**
- Strictly return a valid JSON list of {EXAMPLES_PER_REQUEST} objects.
- Each object in the list MUST contain these five keys: "flags", "instruction", "category", "intent", and "response".
- The "instruction" key should contain a realistic user query.
- The "response" key MUST contain a helpful and relevant answer from a support agent that directly addresses the user's query in "instruction".
- Do NOT include any text, explanations, or markdown. Your output must be ONLY the JSON list.

**EXAMPLES OF GENERAL QUERY STRUCTURE (for style reference only):**
{examples_str}

Now, generate {EXAMPLES_PER_REQUEST} new and unique examples adhering to all instructions.
"""

# Улучшение промпта на будущее:  
# В секции OUTPUT FORMAT добавить:
# - The "instruction" key should contain a realistic, non-empty user query.

In [None]:
# Цикл генерации
generated_data = []
pbar = tqdm(total=NUM_ROWS_TO_GENERATE, desc="Генерация данных с Gemini") # progressbar

while len(generated_data) < NUM_ROWS_TO_GENERATE:
    try:
        intents_for_prompt = []
        if random.random() < MULTILABEL_CHANCE and len(target_intents_list) > 1:
            intents_for_prompt = sorted(random.sample(target_intents_list, 2))
        else:
            intents_for_prompt.append(random.choice(target_intents_list))

        # Для few-shot примеров используем оригинальный датасет (берем три рандомных строки)
        few_shot_examples = df.sample(n=3)

        prompt = create_prompt(intents_for_prompt, few_shot_examples)
        
        response = model.generate_content(prompt)
        new_examples = json.loads(response.text)

        if not isinstance(new_examples, list):
             print(f"\nWarning: получен не список. Пропускаем. Тип: {type(new_examples)}")
             continue

        for example in new_examples:
            if len(generated_data) < NUM_ROWS_TO_GENERATE:
                 generated_data.append(example)
                 pbar.update(1)

        time.sleep(API_CALL_DELAY)

    except json.JSONDecodeError as e:
        print(f"\nОшибка парсинга JSON: {e}. Ответ модели: {response.text[:200]}...")
        time.sleep(5)
    except Exception as e:
        print(f"\nПроизошла ошибка: {e}. Повторная попытка через 10 секунд...")
        time.sleep(10)

pbar.close()

# Сохраняем результат
print(f"\nГенерация завершена. Сгенерировано {len(generated_data)} новых строк.")

new_df = pd.DataFrame(generated_data)

def parse_list_string(s): # по сути оказалось лишним, т.к. потребовалось сохранить еще в Parquet
    if isinstance(s, str) and s.startswith('[') and s.endswith(']'):
        try: return ast.literal_eval(s)
        except (ValueError, SyntaxError): return s
    return s

for col in ['category', 'intent']:
    if col in new_df.columns:
        new_df[col] = new_df[col].apply(parse_list_string)

# Объединяем исходный датасет с новым сгенерированным
combined_df = pd.concat([df, new_df], ignore_index=True)

# Преобразуем все значения в этих столбцах в их строковое представление.
# Списки будут сохранены как строки, например, "['ORDER', 'REFUND']".
for col in ['category', 'intent']:
    if col in combined_df.columns:
        # astype(str) - самый надежный способ
        combined_df[col] = combined_df[col].astype(str)


output_filename_parquet = "augmented_customer_support_dataset.parquet"
combined_df.to_parquet(output_filename_parquet, index=False)
print(f"Файл сохранен (Parquet): {output_filename_parquet}")

output_filename_csv = "augmented_customer_support_dataset.csv"
combined_df.to_csv(output_filename_csv, index=False)
print(f"Файл сохранен (CSV): {output_filename_csv}")

print(f"Размер исходного датасета: {len(df)} строк.")
print(f"Количество сгенерированных строк: {len(new_df)} строк.")
print(f"Итоговый размер объединенного датасета: {len(combined_df)} строк.")

Генерация данных с Gemini:   2%|▏         | 250/13500 [01:14<1:05:32,  3.37it/s]
Генерация данных с Gemini:   8%|▊         | 1145/13500 [05:21<56:16,  3.66it/s] 


Ошибка парсинга JSON: Expecting value: line 1 column 1 (char 0). Ответ модели: ```json
[
  {
    "flags": "B",
    "instruction": "I need to see my last invoice.",
    "category": "INVOICE",
    "intent": "check_invoice",
    "response": "I can help with that! Please provide you...


Генерация данных с Gemini:  47%|████▋     | 6328/13500 [27:14<30:58,  3.86it/s]  


Ошибка парсинга JSON: Expecting value: line 1 column 1 (char 0). Ответ модели: ```json
[
  {
    "flags": "BKL",
    "instruction": "Could you please show me my latest invoice?",
    "category": "INVOICE",
    "intent": "check_invoice",
    "response": "Certainly! I can help you...

Ошибка парсинга JSON: Invalid \escape: line 14 column 449 (char 1485). Ответ модели: [
  {
    "flags": "BKL",
    "instruction": "opening {{Account Type}} account for wife",
    "category": "ACCOUNT",
    "intent": "create_account",
    "response": "Thank you for considering opening ...


Генерация данных с Gemini: 100%|██████████| 13500/13500 [57:29<00:00,  3.91it/s]



Генерация завершена. Сгенерировано 13500 новых строк.
Расширенный датасет сохранен в файл: augmented_customer_support_dataset.parquet
Версия в CSV также сохранена: augmented_customer_support_dataset.csv
Размер отфильтрованного исходного датасета: 26872 строк.
Количество сгенерированных строк: 13500 строк.
Итоговый размер объединенного датасета: 40372 строк.
      flags                                        instruction category  \
40342   BPQ     I want to buy some new shoes. Can you help me?    ORDER   
40343    BL      Could you assist me with placing a new order?    ORDER   
40344    BI                How do I go about ordering an item?    ORDER   
40345    BP       I'd like to place an order for a new laptop.    ORDER   
40346     Q                                 wanna order a gift    ORDER   
40347     L                 I need to purchase some groceries.    ORDER   
40348     B                             Place an order for me.    ORDER   
40349     P             Could you pleas

In [23]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40372 entries, 0 to 40371
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   flags        40372 non-null  object
 1   instruction  40366 non-null  object
 2   category     40372 non-null  object
 3   intent       40372 non-null  object
 4   response     40372 non-null  object
dtypes: object(5)
memory usage: 1.5+ MB


In [None]:
df = pd.read_parquet("augmented_customer_support_dataset.parquet") 

# Строки с пропусками в 'instruction'
missing_instruction_df = df[df['instruction'].isna()]

# Выведем их на экран
print(f"Найдено {len(missing_instruction_df)} строк с пропущенным 'instruction'.")
display(missing_instruction_df.head())

# Дропаем
df.dropna(subset=['instruction'], inplace=True)

Найдено 6 строк с пропущенным 'instruction'.


Unnamed: 0,flags,instruction,category,intent,response
31362,BLC,,REFUND,track_refund,I can help with that. Please provide your orde...
39551,BILC,,ORDER,place_order,I can help with both! Once we've sorted out yo...
39560,BIC,,ORDER,place_order,"Yes, I can assist with both. Once we've locate..."
39571,BIC,,ORDER,place_order,I can certainly help with both. After we've ad...
39584,BIZC,,ORDER,cancel_order,I can help you with your order cancellation. P...
