In [None]:
!pip install openai

In [None]:
import pandas as pd
import time
import os
from openai import OpenAI

# --- Setup ---
# It is best practice to supply the API key through an environment variable
# rather than hard-coding it, for example:
#     export OPENAI_API_KEY='your_key_here'
# The client picks it up automatically.
client = OpenAI()

# Load the input dataset; nothing else in the script can run without it.
try:
    df = pd.read_csv('dataset.csv')
except FileNotFoundError:
    print("Error: 'dataset.csv' not found. Please ensure the input file is in the correct directory.")
    # exit() is a helper installed by the `site` module for interactive use
    # and terminates with status 0. Raise SystemExit directly so the failure
    # is reported with a non-zero exit status.
    raise SystemExit(1)

# --- Functions for OpenAI API Calls ---

def try_request(func, retries=5, delay=5, **kwargs):
    """Call ``func(**kwargs)``, retrying when it raises.

    Args:
        func: Callable to invoke; it receives only the keyword arguments.
        retries: Maximum number of attempts.
        delay: Seconds to sleep between a failed attempt and the next one.
        **kwargs: Keyword arguments forwarded to ``func`` on every attempt.

    Returns:
        Whatever ``func`` returns on its first successful attempt, or ``None``
        when every attempt raised (or when ``retries`` allows no attempt).
    """
    for attempt in range(1, retries + 1):
        try:
            outcome = func(**kwargs)
        except Exception as e:
            print(f"An error occurred: {e}. Attempt {attempt} of {retries}.")
            if attempt >= retries:
                # Out of attempts: report it and let the caller see None.
                print("Max retries reached. Skipping this request.")
                return None
            print(f"Retrying in {delay} seconds...")
            time.sleep(delay)
        else:
            return outcome
    return None

def classify_issue_presence(review_text):
    """
    Classify whether a software review reports an issue.

    The review is embedded in a prompt that defines the two labels and is sent
    to the chat-completions endpoint (model "gpt-4-turbo", temperature 0.0,
    at most 10 output tokens) through ``try_request``.

    Args:
        review_text: The review to classify; interpolated into the prompt as is.

    Returns:
        "Issue" or "No Issue" when the model answers with one of the two
        labels (letter case, wrapping quotes and a trailing period are
        ignored). Any other answer is returned as the stripped raw text.
        "Error" is returned when the request failed after all retries or the
        response carried no text.
    """
    prompt_content = f"""
    Please classify the following software review into one of two categories: "Issue" or "No Issue".
    Use only these two labels for the classification.

    Definitions:
    - No Issue: The user reports a positive experience, smooth performance, intuitive design, reliable functionality, or no significant problems.
    - Issue: The user reports any problem or limitation that negatively impacts the user experience, such as technical malfunctions, crashes, bugs, usability challenges, poor design, or confusing navigation.

    Review to classify:
    "{review_text}"

    Category:
    """

    messages = [{"role": "system", "content": "You are a precise classifier. Your only valid outputs are 'Issue' or 'No Issue'."},
                {"role": "user", "content": prompt_content}]

    response = try_request(
        client.chat.completions.create,
        model="gpt-4-turbo",
        messages=messages,
        max_tokens=10,
        temperature=0.0
    )

    if not response or not response.choices:
        return "Error"

    # The message content may be None; calling .strip() on it would raise and
    # abort the whole run.
    content = response.choices[0].message.content
    if content is None:
        return "Error"

    answer = content.strip()
    # Callers compare the result against the exact label, so map harmless
    # variations such as 'issue' or '"Issue".' onto the canonical spelling.
    canonical_labels = {"issue": "Issue", "no issue": "No Issue"}
    key = answer.strip("\"'.").strip().casefold()
    return canonical_labels.get(key, answer)


def get_issue_type(review_text):
    """
    Classify a review into one of 13 fine-grained issue types.

    The definitions in the prompt are taken directly from the
    "Grounded_theory_for issues detection.pdf" document. The request is sent
    to the chat-completions endpoint (model "gpt-4-turbo", temperature 0.0,
    at most 20 output tokens) through ``try_request``.

    Args:
        review_text: The review to classify; interpolated into the prompt as is.

    Returns:
        The model's answer with surrounding whitespace, wrapping quotes and a
        trailing period removed. "Error" is returned when the request failed
        after all retries or the response carried no text.
    """
    prompt_content = f"""
    Please identify the dominant software issue in the following review based ONLY on these 13 definitions.
    Respond with only the single, most relevant issue category title (e.g., "Functionality Issues", "Performance Issues").

    Definitions:
    -Functionality Issues: Problems where the app's core functions do not operate as expected. These issues typically involve broken functionalities, errors, or glitches that prevent the app from performing essential tasks.
    -Performance Issues: Problems that affect the app's speed, responsiveness, and overall efficiency. These issues include slow loading times, unresponsive interactions, and delays that make the app feel sluggish or inefficient.
    -Stability Issues: Problems that affect the app's reliability, causing it to crash, freeze, or become unresponsive unexpectedly. These issues lead to interruptions in the user experience, making the app unreliable and frustrating to use.
    -UI Issues: Problems with the design, layout, and visual elements of the app. These issues include cluttered interfaces, poorly arranged elements, and visual inconsistencies.
    -UX Issues: Pertain to the overall user satisfaction with the app, including how users interact with the app and how smooth and efficient those interactions are. These issues include unintuitive navigation, poor responsiveness, and a lack of user-centered design.
    -Compatibility Issues: Occur when the app fails to function properly across different operating systems, platforms, or versions. These problems arise due to insufficient optimization or support for certain OS updates, versions, or configurations.
    -Device-Specific Issues: Arise when the app does not function well or crashes on particular hardware configurations. These problems may include poor optimization for certain devices, resulting in slower performance or crashes.
    -Bug: Encompass a wide range of errors, glitches, and malfunctions within the app that prevent it from operating as intended. These issues can range from minor visual anomalies to critical errors that cause the app to crash or behave unpredictably.
    -Customer Support Issues: Problems related to the assistance provided by the app's support team. These issues include unhelpful, slow, or inadequate responses that fail to address users' concerns.
    -Security Issues: Problems related to the app's ability to safeguard user data and protect against unauthorized access. These concerns may involve data breaches, vulnerabilities, and unauthorized data transmission.
    -Privacy Concerns: Focus on issues related to the app's handling of user data and its privacy policies. These concerns include excessive permission requests, unclear or vague privacy policies, and the lack of transparency regarding data usage.
    -Network Issues: Pertain to problems with the app's connectivity, such as difficulties loading content, syncing data, or maintaining a stable connection.
    -Installation Issues: Encompass issues encountered during the download, installation, or initial launch of the app. These issues may include download errors, installation failures, or crashes upon launch.

    Review to classify:
    "{review_text}"

    Issue Category:
    """

    messages = [{"role": "system", "content": "You are a precise classifier. Your only valid outputs are one of the 13 issue category titles provided."},
                {"role": "user", "content": prompt_content}]

    response = try_request(
        client.chat.completions.create,
        model="gpt-4-turbo",
        messages=messages,
        max_tokens=20,
        temperature=0.0
    )

    if not response or not response.choices:
        return "Error"

    # The message content may be None; calling .strip() on it would raise and
    # abort the whole run.
    content = response.choices[0].message.content
    if content is None:
        return "Error"

    # Drop decoration the model sometimes adds around the bare title so the
    # stored category values stay uniform.
    return content.strip().strip("\"'.").strip()


# --- Main Processing Loop ---

# Results are appended to this CSV one row at a time, so an interrupted run
# can be resumed without repeating the API calls already made.
output_filename = 'classified_issues.csv'
if os.path.exists(output_filename):
    # Rows are written in dataset order, so the number of rows already in the
    # output file is the position of the next row to process. Counting rows
    # (instead of reading the last index label) also works when the file
    # holds only the header line, e.g. after an interrupt during the very
    # first request.
    processed_df = pd.read_csv(output_filename)
    start_index = len(processed_df)
    print(f"Resuming from index {start_index}...")
else:
    # If the file doesn't exist, start from the beginning and create it
    # with the header row only.
    start_index = 0
    pd.DataFrame(columns=list(df.columns) + ['classification', 'issue_type']).to_csv(output_filename, index=False)


# Walk the remaining rows by position; labels are looked up separately so
# positional (iloc) and label-based (loc) access are never mixed up.
for position in range(start_index, len(df)):
    index = df.index[position]
    review = df['review'].iloc[position]
    print(f"Processing review {position + 1}/{len(df)}...")

    # Step 1: Classify if the review contains an issue
    issue_classification = classify_issue_presence(review)
    df.loc[index, 'classification'] = issue_classification

    # Add a delay to respect API rate limits
    time.sleep(1)

    # Step 2: If it's an issue, classify its fine-grained type
    if issue_classification == 'Issue':
        issue_type = get_issue_type(review)
        df.loc[index, 'issue_type'] = issue_type
        time.sleep(1)  # Additional delay
    else:
        df.loc[index, 'issue_type'] = 'N/A'

    # Append just this processed row to the CSV file
    df.iloc[position:position + 1].to_csv(output_filename, mode='a', header=False, index=False)

print("Processing complete. All results saved to classified_issues.csv")