Use the OpenAI API (gpt-4.1-nano) to perform sentiment analysis on return comments

In [1]:
import pandas as pd
import duckdb
from typing import Optional, Tuple
import openpyxl

Fetch data, load into duckdb table

For now, from a local file (CSV, Parquet, or Excel)

In [3]:
def import_data(
        fname: str,
        clear=True,
        db_path='temp_db',
        tname='staging_table',
        ftype=None,
        ):
    """
    Load a CSV, Parquet, or Excel file into a DuckDB table.

    Parameters
    ----------
    fname : str
        Path of the file to load.
    clear : bool
        If True, replace an existing table of the same name
        (CREATE OR REPLACE TABLE); otherwise create the table only when it
        does not exist yet (CREATE TABLE IF NOT EXISTS).
    db_path : str
        Database path handed to duckdb.connect.
    tname : str
        Destination table name. It is interpolated into the SQL verbatim, so
        it must be a trusted identifier.
    ftype : str or None
        One of 'csv', 'parquet', 'excel'. When None, it is inferred from the
        extension of `fname`.

    Returns
    -------
    None
        A confirmation message is printed on success.

    Raises
    ------
    ValueError
        If the extension cannot be mapped to a file type, or `ftype` is not
        one of the supported values.
    """
    import os

    # Validate the file type BEFORE connecting so a bad argument never opens
    # (or creates) a database file.
    if ftype is None:
        ext = os.path.splitext(fname)[1].lower()
        ftype = {
            '.csv': 'csv',
            '.parquet': 'parquet',
            '.xlsx': 'excel',
            '.xls': 'excel',
        }.get(ext)
        if ftype is None:
            raise ValueError("Unsupported file extension.")
    elif ftype not in ('csv', 'parquet', 'excel'):
        raise ValueError("Unsupported file type.")

    mode = 'OR REPLACE TABLE' if clear else 'TABLE IF NOT EXISTS'
    # Double any single quote so the path is a valid SQL string literal.
    fname_sql = str(fname).replace("'", "''")

    con = duckdb.connect(db_path)
    try:
        if ftype == 'csv':
            con.execute(f"""
                CREATE {mode} {tname} AS
                SELECT * FROM read_csv_auto('{fname_sql}', escape='\\', encoding='utf-8', header=True)
            """)
        elif ftype == 'parquet':
            con.execute(f"""
                CREATE {mode} {tname} AS
                SELECT * FROM read_parquet('{fname_sql}')
            """)
        else:  # excel: read with pandas, then copy the frame into DuckDB
            df = pd.read_excel(fname)
            con.register('temp_excel_df', df)
            try:
                # Same create mode as the other formats, so clear=False no
                # longer fails when the table already exists.
                con.execute(f"CREATE {mode} {tname} AS SELECT * FROM temp_excel_df")
            finally:
                con.unregister('temp_excel_df')
    finally:
        # Always release the connection, on success as well as on failure.
        con.close()

    print(f"Import completed: {fname} into {tname} at {db_path}")


In [4]:
# Define import parameters
# Forward slashes work on Windows, macOS and Linux alike; a backslash is only
# a path separator on Windows.
file_path = 'data/RETURN_COMMENTS_GROUP.xlsx'
tname = 'staging_table'
db_path = 'return_comment_group'


In [None]:
# Load the source file into the staging table, replacing any previous copy.
import_data(
    fname=file_path,
    clear=True,
    db_path=db_path,
    tname=tname,
)


In [9]:
# Open a connection to the database file and preview the first five rows of
# the staging table; fetchall() returns them as a list of tuples.
con = duckdb.connect(db_path)
describe = con.execute(f"SELECT * FROM {tname} LIMIT 5").fetchall()
print(describe)

[('FREE PEOPLE (NA)', 'FN13796599', 'N402015792', 'way too big', datetime.datetime(2025, 4, 1, 0, 0), '8', 'Fit'), ('FREE PEOPLE (NA)', 'FP20959356', 'N400832467', 'I like the black version of this better', datetime.datetime(2024, 12, 22, 0, 0), '35', 'Ordered Multiple'), ('FREE PEOPLE (NA)', 'FP20959356', 'N400832467', 'Potential outfit for New Years eve. I ended up going with something else', datetime.datetime(2024, 12, 22, 0, 0), '28', 'Changed Mind'), ('FREE PEOPLE (NA)', 'FP20959356', 'N400832467', 'This was so itchy to wear. Did not like it especially for the price', datetime.datetime(2024, 12, 22, 0, 0), '30', 'Not As Expected'), ('FREE PEOPLE (NA)', 'FN12479870', 'N221117524', 'one sleeve was more fitted than the other, looked \nodd', datetime.datetime(2024, 9, 2, 0, 0), '30', 'Not As Expected')]


In [10]:
# Pull a single row as a DataFrame just to list the table's column names.
sample_df = con.execute(f"SELECT * FROM {tname} LIMIT 1").df()
print(sample_df.columns.tolist())

['BRAND', 'ORIG ORDER #', 'RETURN #', 'RETURN COMMENT', 'RETURN DATE', 'STERLING RETURN REASON', 'RETURN REASON GROUP']


Fetch sample or full data into DataFrame

In [14]:

def fetch_return_comments(con, tname, is_sample=True, comment_only=True, sample_size=100)->pd.DataFrame:
    """
    Fetch return comments from the DuckDB table as a DataFrame.

    con: object whose execute(query) result exposes .df() (a DuckDB connection).
    tname: table to read from. Interpolated into the SQL verbatim, so it must
        be a trusted identifier.
    is_sample: if True, return `sample_size` rows in random order
        (ORDER BY RANDOM() LIMIT n); if False, return every row.
    comment_only: if True, select only the "RETURN COMMENT" column;
        if False, select all columns.
    sample_size: number of rows to sample when is_sample is True (default 100).

    Raises ValueError if is_sample is True and sample_size is less than 1.
    """
    columns = '"RETURN COMMENT"' if comment_only else '*'
    query = f"SELECT {columns}\nFROM {tname}"

    if is_sample:
        limit = int(sample_size)
        if limit < 1:
            raise ValueError("sample_size must be a positive integer.")
        query += f"\nORDER BY RANDOM() LIMIT {limit}"

    return con.execute(query).df()


In [15]:
# Connect to the same database the data was imported into. The previous
# literal ('return_coomment_group') was a typo that opened a different file.
con = duckdb.connect(db_path)
# tname = ''
# is_sample = ''
# comment_only = ''

In [52]:
# Fetch every row (is_sample=False), comment column only
df = fetch_return_comments(con, tname, is_sample=False, comment_only=True)
df


Unnamed: 0,RETURN COMMENT
0,way too big
1,I like the black version of this better
2,Potential outfit for New Years eve. I ended up...
3,This was so itchy to wear. Did not like it esp...
4,"one sleeve was more fitted than the other, loo..."
...,...
1048560,This item was missing from the package. Instea...
1048561,Sizing
1048562,says that shirt arrived with one sleeve longer...
1048563,It looked like a poncho on me.


In [19]:
# Summary statistics for the fetched comments.
# NOTE(review): the saved output reports count 100, so it appears to come from
# a sampled run rather than a full-table fetch - re-run to confirm.
df.describe()


Unnamed: 0,RETURN COMMENT
count,100
unique,99
top,Too short
freq,2


In [18]:
# Peek at the first five comments in the current frame.
df.head()


Unnamed: 0,RETURN COMMENT
0,Different color as order
1,Style too baggy and long
2,Flawed - zipper doesnt stay up
3,It would hold me up better if there were adjus...
4,ORDERED SIZE 8 RECEIVED SIZE 8.5. PACKAGE IS O...


Fetch the A.Rosenwinkle DS OpenAI API key from GCP Secret Manager

In [33]:
from google.cloud import secretmanager
from google.cloud import storage

def access_secret(secret_path):
    """Read a secret value from GCP Secret Manager.

    GCP authentication must already be configured, e.g. in bash:
    gcloud auth application-default login

    secret_path: name of the secret version to access.
    Returns the secret payload decoded as UTF-8 text.
    """
    sm_client = secretmanager.SecretManagerServiceClient()
    raw_payload = sm_client.access_secret_version(name=secret_path).payload.data
    return raw_payload.decode("UTF-8")


In [43]:
import json
import logging
import openai
import re
from dotenv import load_dotenv
import os


def ai_analyze_comments(
    client,
    prompt: str,
    df: pd.DataFrame,
    debug: bool = True,
    model: str = "gpt-4.1-nano",
    temperature: float = 0.1,
) -> str:
    """
    Send `prompt` plus the JSON-records version of `df` to the chat
    completions API and return the model's reply with surrounding whitespace
    stripped.

    client: object exposing chat.completions.create(...) (an OpenAI client).
    prompt: task instructions, sent as the first user message.
    df: rows to analyse; the whole frame is serialised with
        to_json(orient="records") and sent as a second user message.
    debug: if True, print the prompt and the raw reply.
    model: model name passed to the API (default "gpt-4.1-nano").
    temperature: sampling temperature passed to the API (default 0.1).

    Raises ValueError if the reply has no choices or no text content.
    """
    df_json = df.to_json(orient="records")
    if debug:
        print("Prompt sent to model:\n", prompt)

    system_prompt = (
        "You are an expert linguistic analyst specializing in extracting and scoring themes from customer return comments. "
        "You always return your output as a single JSON array of objects, one per input record, using exactly the keys and structure specified in the user's instructions. "
        "Do not include any explanations, extra text, or formatting outside the required JSON array. "
        "Be precise, consistent, and strictly follow the output schema and scoring rules provided."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
        {"role": "user", "content": df_json},
    ]

    resp = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )

    if not resp.choices:
        raise ValueError("Empty response from OpenAI")

    # message.content can be None; normalise it before stripping so the
    # empty-response check below is what fires, not an AttributeError.
    content = (resp.choices[0].message.content or "").strip()

    if debug:
        print("Raw response from OpenAI:\n", content)
    if not content:
        raise ValueError("Empty response from OpenAI")

    return content


Run test data through `ai_analyze_comments`

In [37]:

import openai
# Read the API key from GCP Secret Manager (latest version of the secret)
# and build the OpenAI client with it.
secret = access_secret("projects/572292574132/secrets/openai_monday_status_alerts/versions/latest")
client = openai.OpenAI(api_key=secret)

# Load the customer-sentiment prompt text used for the test call.
with open('prompts/customer_sentiment_prompt.txt', 'r', encoding='utf-8') as f:
    prompt = f.read()

In [47]:
# Single, un-batched call: ai_analyze_comments serialises the whole of `df`
# into one request, so `df` should be a small sample here.
response_str = ai_analyze_comments(
    client,
    prompt,
    df,
    debug=True
)

Prompt sent to model:
 """
You are a linguistic expert in customer satisfaction evaluation. You will receive a JSON array of return comments, each record with:

  • RETURN_NO  
  • RETURN_COMMENT  

Your job is to extract all themes and process each record into one output row. Do not consider information from prior records for future records.
Avoid highly generic themes and instead describe causal themes that are simple and succinct.
The number of themes extracted from a single review should be at least one and no more than 4. Each theme should be at least 2 words, but no more than 6.
Themes should not be excessively repetitive.
Return your answer as a single JSON array of objects, one object per input record, with exactly these keys:

  IDENTIFIER       – unique ID you generate (e.g. “C1”, “C2”, …)  
  RETURN_NO        – copied from the input  
  RETURN_COMMENT   – copied from the input  
  Theme 1          – satisfaction theme #1 (1–5 words)  
  Sentiment 1      – score for theme #1 

In [48]:
# Export to CSV, or to text if the response cannot be turned into a table
def export_response(
    response_str: str,
    csv_path: str = "response_output.csv",
    txt_path: str = "response_output.txt",
) -> str:
    """
    Save a model response to disk and return the path that was written.

    The response is parsed as JSON (a single object is wrapped in a list) and
    written as CSV to `csv_path`. If it cannot be parsed or tabulated, or the
    CSV cannot be written, the raw text is saved to `txt_path` instead and the
    reason is printed.

    response_str: raw response text.
    csv_path: destination for the CSV output (default "response_output.csv").
    txt_path: destination for the fallback text output
        (default "response_output.txt").
    """
    try:
        data = json.loads(response_str)
        if isinstance(data, dict):
            data = [data]
        df_out = pd.DataFrame(data)
    except Exception as e:
        # Not valid JSON, or JSON that cannot be laid out as a table.
        failure = f"Could not parse as JSON, saved as TXT: {txt_path}\nError: {e}"
    else:
        try:
            df_out.to_csv(csv_path, index=False)
        except Exception as e:
            # Parsing worked; report the write problem as what it is instead
            # of blaming the JSON.
            failure = f"Could not write CSV, saved as TXT: {txt_path}\nError: {e}"
        else:
            print(f"Saved as CSV: {csv_path}")
            return csv_path

    # Best-effort fallback: keep the raw response so nothing is lost.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(response_str)
    print(failure)
    return txt_path

Saved as CSV: response_output.csv


Functions to run both product and customer sentiment analysis.

In [49]:
def _run_feedback_batches(
    df,
    client,
    prompt,
    batch_size,
    debug,
    excel_export,
    csv_export,
    output_stem,
):
    """
    Shared implementation behind ai_format_customer_feedback and
    ai_format_product_feedback: send `df` to ai_analyze_comments in batches,
    collect the parsed JSON records, optionally export them, and return them
    as a DataFrame (or None when there is nothing to return).

    output_stem: file name without extension used for the optional exports
    ("<output_stem>.xlsx" / "<output_stem>.csv").
    """
    import json
    import os

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    # Load prompt if it's a file path; otherwise treat it as the prompt text.
    if os.path.isfile(prompt):
        with open(prompt, 'r', encoding='utf-8') as f:
            prompt_str = f.read()
    else:
        if '\n' not in prompt and prompt.strip().lower().endswith('.txt'):
            # A one-line "*.txt" value is almost certainly a missing file;
            # say so instead of silently sending the path as the prompt.
            print(f"Warning: prompt file not found, using the text itself as the prompt: {prompt}")
        prompt_str = prompt

    total = len(df)
    if total == 0:
        print("Input DataFrame is empty.")
        return None

    single_batch = total <= batch_size
    results = []
    failed = 0
    batch_no = 0

    for batch_no, start in enumerate(range(0, total, batch_size), start=1):
        batch = df.iloc[start:start + batch_size]
        label = "single batch" if single_batch else f"batch {batch_no}"
        if debug:
            if single_batch:
                print(f"Processing all {total} rows in a single batch.")
            else:
                print(f"Processing batch {batch_no} ({start} to {min(start + batch_size, total) - 1})")
        try:
            response_str = ai_analyze_comments(client, prompt_str, batch, debug=debug)
            batch_results = json.loads(response_str)
            if isinstance(batch_results, dict):
                batch_results = [batch_results]
            if not isinstance(batch_results, list):
                # Extending with a str/number would add garbage rows.
                raise ValueError(
                    f"expected a JSON array or object, got {type(batch_results).__name__}"
                )
            results.extend(batch_results)
        except Exception as e:
            # Best effort: report the batch and keep going with the rest.
            print(f"Error processing {label}: {e}")
            failed += 1

    if failed:
        print(f"{failed} of {batch_no} batch(es) failed; their rows are missing from the output.")

    if not results:
        print("No results to export.")
        return None

    df_out = pd.DataFrame(results)

    # Export to Excel
    if excel_export:
        out_xlsx = f"{output_stem}.xlsx"
        try:
            df_out.to_excel(out_xlsx, index=False)
            if debug:
                print(f"Exported results to {out_xlsx}")
        except Exception as e:
            print(f"Failed to export to Excel: {e}")

    # Export to CSV
    if csv_export:
        out_csv = f"{output_stem}.csv"
        try:
            df_out.to_csv(out_csv, index=False)
            if debug:
                print(f"Exported results to {out_csv}")
        except Exception as e:
            print(f"Failed to export to CSV: {e}")

    return df_out


def ai_format_customer_feedback(
    df,
    client,
    prompt='prompts/customer_sentiment_prompt.txt',
    batch_size=100,
    debug=True,
    excel_export=False,
    csv_export=True
):
    """
    Run ai_analyze_comments on the input DataFrame in batches using the
    customer-sentiment prompt.

    df: DataFrame of return comments; each batch is sent to the model as JSON
        records by ai_analyze_comments
    client: API client passed through to ai_analyze_comments
    prompt: path to the prompt file or prompt string
    batch_size: number of rows to process in each batch (must be >= 1)
    debug: if True, prints debug information
    excel_export: if True, exports the result to
        analyzed_customer_feedback_output.xlsx
    csv_export: if True, exports the result to
        analyzed_customer_feedback_output.csv

    Returns a DataFrame of the parsed model records, or None if the input is
    empty or no batch produced results. Raises ValueError if batch_size < 1.
    """
    return _run_feedback_batches(
        df,
        client,
        prompt,
        batch_size,
        debug,
        excel_export,
        csv_export,
        output_stem="analyzed_customer_feedback_output",
    )


def ai_format_product_feedback(
    df,
    client,
    prompt='prompts/product_prompt.txt',
    batch_size=100,
    debug=True,
    excel_export=False,
    csv_export=True
):
    """
    Run ai_analyze_comments on the input DataFrame in batches using the
    product prompt.

    df: DataFrame of return comments; each batch is sent to the model as JSON
        records by ai_analyze_comments
    client: API client passed through to ai_analyze_comments
    prompt: path to the prompt file or prompt string
    batch_size: number of rows to process in each batch (must be >= 1)
    debug: if True, prints debug information
    excel_export: if True, exports the result to
        analyzed_product_feedback_output.xlsx
    csv_export: if True, exports the result to
        analyzed_product_feedback_output.csv

    Returns a DataFrame of the parsed model records, or None if the input is
    empty or no batch produced results. Raises ValueError if batch_size < 1.
    """
    return _run_feedback_batches(
        df,
        client,
        prompt,
        batch_size,
        debug,
        excel_export,
        csv_export,
        output_stem="analyzed_product_feedback_output",
    )


Function to run both Product and Customer analysis on same data set, combine into one .xlsx with two tabs.

In [50]:
def handle_sentiment_analysis(
    df: pd.DataFrame,
    client,
    product_prompt: str = 'prompts/product_prompt.txt',
    customer_prompt: str = 'prompts/customer_sentiment_prompt.txt',
    batch_size: int = 100,
    debug: bool = True,
    excel_export: bool = False,
    csv_export: bool = False,
    combined_excel_export: bool = True,
    combined_path: str = "combined_feedback_analysis.xlsx",
) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame]]:
    """
    Run both Product and Customer analysis on the same data set and,
    optionally, combine the results into one .xlsx with two tabs.

    df: rows to analyse; passed unchanged to both analyses.
    client: API client passed through to both analyses.
    product_prompt / customer_prompt: prompt file path or prompt text for
        each analysis.
    batch_size, debug, excel_export, csv_export: forwarded to both analyses.
    combined_excel_export: if True, write the available results to
        `combined_path` ('Product Feedback' and 'Customer Feedback' sheets).
    combined_path: destination of the combined workbook.

    Returns (product_df, customer_df). Either element is None when that
    analysis produced no results.
    """
    # Analyze product feedback
    product_df = ai_format_product_feedback(
        df, client, prompt=product_prompt, batch_size=batch_size, debug=debug,
        excel_export=excel_export, csv_export=csv_export
    )

    # Analyze customer feedback
    customer_df = ai_format_customer_feedback(
        df, client, prompt=customer_prompt, batch_size=batch_size, debug=debug,
        excel_export=excel_export, csv_export=csv_export
    )

    if combined_excel_export:
        # Only frames that exist can be written; an analysis returns None
        # when it has no results.
        sheets = {
            'Product Feedback': product_df,
            'Customer Feedback': customer_df,
        }
        available = {name: frame for name, frame in sheets.items() if frame is not None}
        skipped = [name for name in sheets if name not in available]

        if not available:
            print("No results from either analysis; combined Excel file not written.")
        else:
            try:
                with pd.ExcelWriter(combined_path, engine='openpyxl') as writer:
                    for name, frame in available.items():
                        frame.to_excel(writer, sheet_name=name, index=False)
                if skipped:
                    print(f"No results for: {', '.join(skipped)}; sheet(s) omitted from {combined_path}")
                if debug:
                    print(f"Exported combined results to {combined_path}")
            except Exception as e:
                print(f"Failed to export combined Excel file: {e}")

    return product_df, customer_df

Run both Customer and Product Analysis

In [53]:
# Mid-sized sample: keep only the first 1000 rows.
# NOTE: this rebinds `df`, so the full frame is no longer reachable under
# that name after this cell runs.

df = df.iloc[:1000]

In [54]:
# Run both analyses in batches of 100 rows. With excel_export=True each
# analysis also writes its own workbook, in addition to the combined one.
prod_df, customer_df = handle_sentiment_analysis(
    df,
    client,
    product_prompt='prompts/product_prompt.txt',
    customer_prompt='prompts/customer_sentiment_prompt.txt',
    batch_size=100,
    debug=True,
    excel_export=True,
    csv_export=False,
    combined_excel_export=True,
)

Processing batch 1 (0 to 99)
Prompt sent to model:
 """
You are a linguistic expert specialized in detailed product sentiment evaluation. You will receive a batch of return comments as a JSON array, each record containing these fields:

  • RETURN_NO  
  • RETURN_COMMENT  

You must analyze each comment independently, extracting precise, non-generic causal themes reflecting clear evaluative sentiments.

Theme Extraction Guidelines:
- Extract between 1 and 4 themes per comment.
- Each theme should contain between 2 and 6 words.
- Avoid redundant themes within the same comment.

Sentiment Interpretation Guidelines:
- Intensively interpret subtle evaluative language. Explicitly recognize nuanced positive expressions (e.g., "cute," "adorable," "chic," "cozy," "fun," "trendy") and subtle negative expressions (e.g., "cheap feel," "awkward fit," "dull," "unexpected", "matronly").
- Descriptive size/fit statements without explicit positive judgment ("runs small," "too long") must still be scor

In [55]:
# Release the DuckDB connection now that the analysis is done.
con.close()