In [1]:
import time         # timing utilities → sleep() for rate limit handling
import textwrap     # clean multi-line strings → format the extraction prompt
import unicodedata  # unicode normalisation → fix NBSP, quotes, special chars
from pathlib import Path  # path handling → cross-platform file locations
import pandas as pd       # data frames → read/write CSV
import langextract as lx  # core library → span-based extraction with LLMs

In [None]:
import getpass
import os

# Never store a real key in the notebook. Reuse LANGEXTRACT_API_KEY when the
# environment already provides it; otherwise ask for it interactively without
# echoing it to the screen (and without clobbering an existing value).
if not os.environ.get("LANGEXTRACT_API_KEY"):
    os.environ["LANGEXTRACT_API_KEY"] = getpass.getpass("LangExtract API key: ")

In [None]:
# File locations, relative to the notebook's working directory.
INPUT = "crm_complaints_sample.csv"        # complaints CSV that gets read
OUT_CSV = "complaints_enriched.csv"        # flat table of extracted fields
OUT_JSONL = "complaints_annotated.jsonl"   # annotated documents
OUT_HTML = "complaints_annotated.html"     # visualization page

# Extraction instructions sent with every request. The field list mirrors the
# columns of the per-ticket row built in the extraction loop, including
# exchange_intent, so that column is actually requested from the model.
# NOTE(review): refund_intent/sentiment ask for categorical values while the
# rules demand exact spans; the recorded output mixes both styles. Confirm
# which one is intended before relying on these columns.
prompt = textwrap.dedent("""\
From customer complaint descriptions, extract the following fields:
- order_id
- product_name (or sku if product name absent)
- issue_type (one of: damaged, wrong item, late delivery, refund request, double charge, missing parts, wrong size, other)
- quantity_affected
- incident_date (keep as in text)
- customer_email
- refund_intent (Yes/No)
- exchange_intent (Yes/No)
- sentiment (Positive/Neutral/Negative/Frustrated; leave empty if unclear)

Rules:
- Use exact spans from the text (no paraphrase).
- If a field is missing or uncertain, leave it empty.
- Map variants (e.g., sipariş/sip., Bestellung/Order/OrderID) to order_id value.
- Prefer the most specific issue_type present; if none matches, use "other".
""")

# Few-shot examples for the extractor. Every extraction_text is an exact span
# of its example text, listed in the order the spans occur, and carries no
# trailing sentence punctuation that is not part of the value itself.
examples = [
    # English complaint: damaged item with a refund request.
    lx.data.ExampleData(
        text=(
            "Subject: Complaint – Broken mouse\n"
            "Order: #662918  Product: Gaming Mouse X200\n"
            "Problem: arrived damaged, box crushed, left click stuck.\n"
            "Qty: 1   Incident date: 21.09.2025\n"
            "Customer: sofia.romero@gmail.com\n"
            "Very disappointed. Please refund."
        ),
        extractions=[
            lx.data.Extraction(extraction_class="order_id", extraction_text="662918"),
            lx.data.Extraction(extraction_class="product_name", extraction_text="Gaming Mouse X200"),
            lx.data.Extraction(extraction_class="issue_type", extraction_text="damaged"),
            lx.data.Extraction(extraction_class="quantity_affected", extraction_text="1"),
            lx.data.Extraction(extraction_class="incident_date", extraction_text="21.09.2025"),
            lx.data.Extraction(extraction_class="customer_email", extraction_text="sofia.romero@gmail.com"),
            lx.data.Extraction(extraction_class="refund_intent", extraction_text="Please refund."),
            lx.data.Extraction(extraction_class="sentiment", extraction_text="Very disappointed."),
        ],
    ),
    # German complaint: wrong size with an exchange request.
    lx.data.ExampleData(
        text=(
            "Betreff: Falsche Größe geliefert\n"
            "Bestellung #772199, Produkt: Sneaker RunFast 42. Falsche Größe: 41.\n"
            "Menge: 2 Paar. Bitte Austausch. Kunde: markus.schneider@mail.de"
        ),
        extractions=[
            lx.data.Extraction(extraction_class="order_id", extraction_text="772199"),
            # The span stops before the sentence-ending period so the
            # punctuation does not end up in the product_name column.
            lx.data.Extraction(extraction_class="product_name", extraction_text="Sneaker RunFast 42"),
            lx.data.Extraction(extraction_class="issue_type", extraction_text="Falsche Größe"),
            lx.data.Extraction(extraction_class="quantity_affected", extraction_text="2"),
            # Demonstrates the exchange_intent class that the output rows carry.
            lx.data.Extraction(extraction_class="exchange_intent", extraction_text="Bitte Austausch."),
            lx.data.Extraction(extraction_class="customer_email", extraction_text="markus.schneider@mail.de"),
        ],
    ),
]

# Seconds to wait between two consecutive model calls (rate-limit handling).
PAUSE_BETWEEN_CALLS_S = 35

# Extraction classes copied into the CSV. Each column starts empty and keeps
# the first non-empty span the model returns for that class.
EXTRACTED_FIELDS = (
    "order_id",
    "product_name",
    "issue_type",
    "quantity_affected",
    "incident_date",
    "customer_email",
    "refund_intent",
    "exchange_intent",
    "sentiment",
)

df = pd.read_csv(INPUT)

docs = []     # AnnotatedDocument list
rows = []     # for CSV
calls_made = 0

for _, r in df.iterrows():
    # Ticket metadata is carried over unchanged; extraction columns start empty.
    out = {
        "ticket_id": r["ticket_id"],
        "customer_id": r["customer_id"],
        "channel": r["channel"],
        "created_at": r["created_at"],
        **dict.fromkeys(EXTRACTED_FIELDS, ""),
    }

    description = r["description"]
    if pd.isna(description) or not str(description).strip():
        # A missing description would otherwise be sent to the model as the
        # literal text "nan"; keep the row with empty fields and skip the call.
        rows.append(out)
        continue

    # Pause only between calls, never after the final one.
    if calls_made:
        time.sleep(PAUSE_BETWEEN_CALLS_S)

    doc = lx.extract(
        text_or_documents=str(description),
        prompt_description=prompt,
        examples=examples,
        model_id="gemini-2.5-flash",
    )
    calls_made += 1
    docs.append(doc)

    # `extractions` can be None when nothing was found. Only extraction
    # columns are filled, so a stray class name can never overwrite metadata.
    for ext in doc.extractions or []:
        cls = ext.extraction_class
        if cls in EXTRACTED_FIELDS and not out[cls]:
            out[cls] = ext.extraction_text

    rows.append(out)



In [10]:
# 1) Save annotated JSONL
lx.io.save_annotated_documents(docs, output_name=OUT_JSONL, output_dir=".")

# 2) Save visualization HTML
html_obj = lx.visualize(OUT_JSONL)
# Depending on the environment, visualize() hands back either an object that
# exposes the markup as `.data` (Jupyter/Colab) or the markup string itself;
# accept both so the cell also works when run outside a notebook.
html_text = html_obj.data if hasattr(html_obj, "data") else html_obj
with open(OUT_HTML, "w", encoding="utf-8") as f:
    f.write(html_text)

# 3) Save enriched CSV
pd.DataFrame(rows).to_csv(OUT_CSV, index=False)

print(f"Saved -> {OUT_CSV}, {OUT_JSONL}, {OUT_HTML}")

[94m[1mLangExtract[0m: Saving to [92mcomplaints_annotated.jsonl[0m: 12 docs [00:00, 2242.94 docs/s]

[92m✓[0m Saved [1m12[0m documents to [92mcomplaints_annotated.jsonl[0m



[94m[1mLangExtract[0m: Loading [92mcomplaints_annotated.jsonl[0m: 100%|█| 21.2k/21.2k [00:00<00:0[0m

[92m✓[0m Loaded [1m12[0m documents from [92mcomplaints_annotated.jsonl[0m
Saved -> complaints_enriched.csv, complaints_annotated.jsonl, complaints_annotated.html





In [11]:
# Read the enriched CSV back from disk (pandas is already imported as `pd`
# in the first cell; OUT_CSV is the path the save step wrote to).
df_out = pd.read_csv(OUT_CSV)

# Show the first 10 rows
df_out.head(10)

Unnamed: 0,ticket_id,customer_id,channel,created_at,order_id,product_name,issue_type,quantity_affected,incident_date,customer_email,refund_intent,exchange_intent,sentiment
0,10001,C-87421,email,2025-09-18 10:12:00,928331,BaristaPro Coffee Machine,damaged,1,18.09.2025,laura.mueller@example.com,İade/Değişim istiyorum.,İade/Değişim istiyorum.,Frustrated
1,10002,C-44110,email,2025-09-15 08:57:00,772199,Sneaker RunFast 42.,wrong size,2,,markus.schneider@mail.de,,Bitte Austausch.,
2,10003,C-19907,chat,2025-09-20 19:03:00,472233,Sneaker X200,wrong size,One pair,20.09.2025,john.doe77@gmail.com,,Need exchange,Neutral
3,10004,C-55220,whatsapp,2025-09-21 09:40:00,883421,Yoga Mat Pro,late delivery,,19.09.2025,d.jones@hotmail.com,para iadesi düşünüyorum.,,memnun değilim
4,10005,C-33005,email,2025-09-21 16:22:00,662918,Gaming Mouse X200,damaged,One,21.09.2025,sofia.romero@gmail.com,Please refund or replace.,Please refund or replace.,
5,10006,C-28771,email,2025-09-19 11:11:00,771010,Blender Turbo 900,late delivery,,18/09/2025,pierre.martin@example.fr,Yes,,Negative
6,10007,C-94410,chat,2025-09-22 13:55:00,901122,CHAIR-AX5,missing parts,4,,nina.kovacs@shop.hu,,,
7,10008,C-50001,email,2025-09-17 07:20:00,770555,Wasserkocher K-600,damaged,,,melis.yilmaz@outlook.com,No,,
8,10009,C-81117,phone_note,2025-09-18 17:09:00,550010,T-Shirt Basic M,wrong item,1,,tom.smith@gmail.com,,Wants exchange,
9,10010,C-72229,instagram_dm,2025-09-19 22:41:00,440044,Ceramic Bowl Set,damaged,1,,ayse.deniz@icloud.com,,,Negative


In [None]:
# Write the collected `rows` to the enriched CSV again, without the index
# column (same target file and options as step 3 of the save cell).
OUT_CSV = "complaints_enriched.csv"
pd.DataFrame(data=rows).to_csv(path_or_buf=OUT_CSV, index=False)