In [2]:
import pandas as pd
import json
from openai import OpenAI

# Load the CSV file
batches_to_keys = pd.read_csv("batches_OpenAI.csv")

# strip all whitespace from the values
batches_to_keys = batches_to_keys.applymap(lambda x: x.strip() if isinstance(x, str) else x)

test = batches_to_keys[:1]

  batches_to_keys = batches_to_keys.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [4]:
# Iterate through each row in the CSV file
all_reviews = pd.DataFrame(columns=["food_sentences", "service_sentences", "atmosphere_sentences", "price_sentences"]) # Initialize an empty DataFrame to store the results

for _, row in test.iterrows():
    batch_job_id = row['batch_job_id']
    api_key = row['api_key']
    
    # Create the OpenAI client
    client = OpenAI(api_key=api_key)
    
    try:
        # Get the results file ID for the batch job (assuming this comes from an API or prior step)
        result_file_id = client.batches.retrieve(batch_job_id).output_file_id
        
        # Download the results file
        result_content = client.files.content(result_file_id).content
        result_file_name = "batch_results.jsonl"
        with open(result_file_name, 'wb') as file:
            file.write(result_content)  # Save the content to a file

        # Parse the results
        results = []
        with open(result_file_name, 'r') as file:
            for line in file:
                results.append(json.loads(line.strip()))
        results = pd.DataFrame(results)

        # Extract the sentences for each topic
        def extract_sentences(response):
            try:
                # Parse the assistant's message content
                content = json.loads(response['body']['choices'][0]['message']['content'])
                return {
                    "food_sentences": " ".join(content.get("food_sentences", [])),
                    "service_sentences": " ".join(content.get("service_sentences", [])),
                    "atmosphere_sentences": " ".join(content.get("atmosphere_sentences", [])),
                    "price_sentences": " ".join(content.get("price_sentences", [])),
                }
            except Exception as e:
                print(f"Error parsing response: {e}")
                return {
                    "food_sentences": None,
                    "service_sentences": None,
                    "atmosphere_sentences": None,
                    "price_sentences": None,
                }

        # Apply the extraction to the 'response' column
        category_data = results['response'].apply(extract_sentences)

        # Create a new DataFrame with the extracted category sentences
        category_df = pd.DataFrame(category_data.tolist())

        # Combine 'custom_id' and extracted category sentences
        category_df['custom_id'] = results['custom_id']
        # Rename custom_id to review_id
        category_df.rename(columns={"custom_id": "review_id"}, inplace=True)

        # Append the results for this batch to the collection
        all_reviews = pd.concat([all_reviews, category_df], ignore_index=True)

    except Exception as e:
        print(f"Error processing batch_job_id {batch_job_id}: {e}")

Error parsing response: Unterminated string starting at: line 6 column 9 (char 215)


In [10]:
print(min(category_df["review_id"].astype(int)))
print(max(category_df["review_id"].astype(int)))

4471
12774


In [44]:
# Save the final DataFrame to a CSV file
all_reviews.to_csv("extracted_category_subset.csv", index=False)

## Sentiment Analysis

### Food Results

In [None]:
# Load the CSV file
batches_to_keys = pd.read_csv("batches_sentiment_OpenAI.csv")

# strip all whitespace from the values
batches_to_keys = batches_to_keys.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [None]:
# Iterate through each row in the CSV file
food_ratings = pd.DataFrame(columns=["food_rating"])  # Initialize an empty DataFrame to store the results

for _, row in batches_to_keys.iterrows():
    batch_job_id = row['batch_job_id']
    api_key = row['api_key']
    batch_file_name = row.get('batch_file_name', '')  # Retrieve the batch_file_name column

    # Process only if the batch_file_name contains the word "food"
    if "food" not in batch_file_name.lower():
        continue  # Skip this row if "food" is not in the batch_file_name

    # Create the OpenAI client
    client = OpenAI(api_key=api_key)

    try:
        # Get the results file ID for the batch job (assuming this comes from an API or prior step)
        result_file_id = client.batches.retrieve(batch_job_id).output_file_id

        # Download the results file
        result_content = client.files.content(result_file_id).content
        result_file_name = "batch_results.jsonl"
        with open(result_file_name, 'wb') as file:
            file.write(result_content)  # Save the content to a file

        # Parse the results
        results = []
        with open(result_file_name, 'r') as file:
            for line in file:
                results.append(json.loads(line.strip()))
        results = pd.DataFrame(results)

        # Extract 'rating' from the 'response' column
        def extract_rating(response):
            try:
                # Parse the assistant's JSON content
                content = json.loads(response['body']['choices'][0]['message']['content'])
                return content.get('rating', None)
            except Exception as e:
                print(f"Error extracting rating: {e}")
                return None

        # Apply the function to extract the rating
        results['food_rating'] = results['response'].apply(extract_rating)

        # Keep only 'custom_id' and 'food_rating'
        results = results[['custom_id', 'food_rating']]

        # Rename custom_id to review_id
        results.rename(columns={"custom_id": "review_id"}, inplace=True)

        # Change data type of review_id to int64
        results['review_id'] = results['review_id'].astype('int64')

        # Combine 'custom_id' and extracted category sentences
        category_df['custom_id'] = results['custom_id']
        # Rename custom_id to review_id
        category_df.rename(columns={"custom_id": "review_id"}, inplace=True)

        # Append the results for this batch to the collection
        food_ratings = pd.concat([food_ratings, category_df], ignore_index=True)

    except Exception as e:
        print(f"Error processing batch_job_id {batch_job_id}: {e}")


## Service Results

In [None]:
# Iterate through each row in the CSV file
service_ratings = pd.DataFrame(columns=["service_rating"]) # Initialize an empty DataFrame to store the results

for _, row in batches_to_keys.iterrows():
    batch_job_id = row['batch_job_id']
    api_key = row['api_key']
    batch_file_name = row.get('batch_file_name', '')  # Retrieve the batch_file_name column
    
    # Process only if the batch_file_name contains the word "service"
    if "service" not in batch_file_name.lower():
        continue  # Skip this row if "service" is not in the batch_file_name
    
    # Create the OpenAI client
    client = OpenAI(api_key=api_key)

    try:
        # Get the results file ID for the batch job (assuming this comes from an API or prior step)
        result_file_id = client.batches.retrieve(batch_job_id).output_file_id
        
        # Download the results file
        result_content = client.files.content(result_file_id).content
        result_file_name = "batch_results.jsonl"
        with open(result_file_name, 'wb') as file:
            file.write(result_content)  # Save the content to a file

        # Parse the results
        results = []
        with open(result_file_name, 'r') as file:
            for line in file:
                results.append(json.loads(line.strip()))
        results = pd.DataFrame(results)

        # Extract 'rating' from the 'response' column
        def extract_rating(response):
            try:
                # Parse the assistant's JSON content
                content = json.loads(response['body']['choices'][0]['message']['content'])
                return content.get('rating', None)
            except Exception as e:
                print(f"Error extracting rating: {e}")
                return None

        # Apply the function to extract the rating
        results['service_rating'] = results['response'].apply(extract_rating)

        # Keep only 'custom_id' and 'service_rating'
        results = results[['custom_id', 'service_rating']]

        # rename custom_id to review_id
        results.rename(columns={"custom_id": "review_id"}, inplace=True)

        # change data type of review_id to int64
        results['review_id'] = results['review_id'].astype('int64')

        # Combine 'custom_id' and extracted category sentences
        category_df['custom_id'] = results['custom_id']
        # Rename custom_id to review_id
        category_df.rename(columns={"custom_id": "review_id"}, inplace=True)

        # Append the results for this batch to the collection
        service_ratings = pd.concat([service_ratings, category_df], ignore_index=True)

    except Exception as e:
        print(f"Error processing batch_job_id {batch_job_id}: {e}")

### Atmosphere Results

In [None]:
# Iterate through each row in the CSV file
atmosphere_ratings = pd.DataFrame(columns=["atmosphere_rating"]) # Initialize an empty DataFrame to store the results

for _, row in batches_to_keys.iterrows():
    batch_job_id = row['batch_job_id']
    api_key = row['api_key']
    batch_file_name = row.get('batch_file_name', '')  # Retrieve the batch_file_name column
    
    # Process only if the batch_file_name contains the word "atmosphere"
    if "atmosphere" not in batch_file_name.lower():
        continue  # Skip this row if "atmosphere" is not in the batch_file_name
    
    # Create the OpenAI client
    client = OpenAI(api_key=api_key)

    try:
        # Get the results file ID for the batch job (assuming this comes from an API or prior step)
        result_file_id = client.batches.retrieve(batch_job_id).output_file_id
        
        # Download the results file
        result_content = client.files.content(result_file_id).content
        result_file_name = "batch_results.jsonl"
        with open(result_file_name, 'wb') as file:
            file.write(result_content)  # Save the content to a file

        # Parse the results
        results = []
        with open(result_file_name, 'r') as file:
            for line in file:
                results.append(json.loads(line.strip()))
        results = pd.DataFrame(results)

        # Extract 'rating' from the 'response' column
        def extract_rating(response):
            try:
                # Parse the assistant's JSON content
                content = json.loads(response['body']['choices'][0]['message']['content'])
                return content.get('rating', None)
            except Exception as e:
                print(f"Error extracting rating: {e}")
                return None

        # Apply the function to extract the rating
        results['atmosphere_rating'] = results['response'].apply(extract_rating)

        # Keep only 'custom_id' and 'atmosphere_rating'
        results = results[['custom_id', 'atmosphere_rating']]

        # rename custom_id to review_id
        results.rename(columns={"custom_id": "review_id"}, inplace=True)

        # change data type of review_id to int64
        results['review_id'] = results['review_id'].astype('int64')

        # Combine 'custom_id' and extracted category sentences
        category_df['custom_id'] = results['custom_id']
        # Rename custom_id to review_id
        category_df.rename(columns={"custom_id": "review_id"}, inplace=True)

        # Append the results for this batch to the collection
        atmosphere_ratings = pd.concat([atmosphere_ratings, category_df], ignore_index=True)

    except Exception as e:
        print(f"Error processing batch_job_id {batch_job_id}: {e}")