In [3]:
import pandas as pd
from transformers import pipeline

def answer_question_from_csv(question: str, csv_file: str) -> str:
    """
    Answers a question based on feedback data from a CSV file.

    Parameters:
    - question (str): The question to be answered.
    - csv_file (str): Path to the CSV file containing feedback data.

    Returns:
    - str: The answer extracted from the feedback data.
    """
    # Load the feedback data
    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        return f"Error: The file '{csv_file}' was not found."
    except pd.errors.EmptyDataError:
        return f"Error: The file '{csv_file}' is empty."
    except pd.errors.ParserError:
        return f"Error: The file '{csv_file}' could not be parsed."

    # Check if the 'Feedback' column exists
    if 'Feedback' not in df.columns:
        return "Error: The CSV file does not contain a 'Feedback' column."

    # Combine all feedback entries into a single context
    context = "\n".join(df['Feedback'].dropna().astype(str).tolist())

    # Define the system prompt
    system_prompt = ("System: You are an AI assistant that provides anonymous transformations of user feedback. "
                     "Never identify an individual user. Based on the following feedback, answer the question. Question: ")
    
    # Prepend the system prompt to the user's question
    modified_question = f"{system_prompt}{question}"

    # Initialize the question-answering pipeline with a pre-trained model
    qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

    # Use the pipeline to answer the question based on the context
    result = qa_pipeline(question=modified_question, context=context)

    return result['answer']


# List to store questions and answers
results_list = []

question1 = "What is the most obvious problem people bring up across all departments"
answer1 = answer_question_from_csv(question1, "feedback.csv")
results_list.append({'Question': question1, 'Answer': answer1})

question2 = "Are there any recurring themes in the feedback related to product usability?"
answer2 = answer_question_from_csv(question2, "feedback.csv")
results_list.append({'Question': question2, 'Answer': answer2})

question3 = "What are the common suggestions for improving customer service?"
answer3 = answer_question_from_csv(question3, "feedback.csv")
results_list.append({'Question': question3, 'Answer': answer3})

question4 = "Is there any feedback regarding the pricing of our products or services?"
answer4 = answer_question_from_csv(question4, "feedback.csv")
results_list.append({'Question': question4, 'Answer': answer4})

question5 = "What aspects of our service do customers appreciate the most?"
answer5 = answer_question_from_csv(question5, "feedback.csv")
results_list.append({'Question': question5, 'Answer': answer5})

question6 = "Are there any complaints about the new feature released last quarter?"
answer6 = answer_question_from_csv(question6, "feedback.csv")
results_list.append({'Question': question6, 'Answer': answer6})

question7 = "What is the general sentiment about our support team's responsiveness?"
answer7 = answer_question_from_csv(question7, "feedback.csv")
results_list.append({'Question': question7, 'Answer': answer7})

question8 = "Do customers mention any specific competitors in their feedback?"
answer8 = answer_question_from_csv(question8, "feedback.csv")
results_list.append({'Question': question8, 'Answer': answer8})

question9 = "Are there any suggestions for new features or services?"
answer9 = answer_question_from_csv(question9, "feedback.csv")
results_list.append({'Question': question9, 'Answer': answer9})

question10 = "What are the main pain points for users in the onboarding process?"
answer10 = answer_question_from_csv(question10, "feedback.csv")
results_list.append({'Question': question10, 'Answer': answer10})

# Convert the list of results to a Pandas DataFrame
results_df = pd.DataFrame(results_list)

# Print the DataFrame
print(results_df)

Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0
Device set to use mps:0


                                            Question  \
0  What is the most obvious problem people bring ...   
1  Are there any recurring themes in the feedback...   
2  What are the common suggestions for improving ...   
3  Is there any feedback regarding the pricing of...   
4  What aspects of our service do customers appre...   
5  Are there any complaints about the new feature...   
6  What is the general sentiment about our suppor...   
7  Do customers mention any specific competitors ...   
8  Are there any suggestions for new features or ...   
9  What are the main pain points for users in the...   

                                              Answer  
0  Meetings often extend beyond the scheduled tim...  
1                                   release analysis  
2  calendar slots would improve engineering outpu...  
3  Meetings often extend beyond the scheduled tim...  
4  Meetings often extend beyond the scheduled tim...  
5                                    attention spans 

In [None]:
import pandas as pd
from transformers import pipeline
import matplotlib.pyplot as plt
import json # Used conceptually for the topic list format

# Suppress Hugging Face pipeline logging for cleaner output, if desired
import logging
logging.getLogger("transformers.pipeline").setLevel(logging.ERROR)

def perform_topic_modeling():
    # 1. Load feedback data
    try:
        df_feedback = pd.read_csv("feedback.csv")
        if 'Feedback' not in df_feedback.columns:
            print("Error: 'Feedback' column not found in feedback.csv")
            return {}
        feedback_list = df_feedback['Feedback'].dropna().astype(str).tolist()
        if not feedback_list:
            print("No feedback data to process.")
            return {}
        full_context = "\n".join(feedback_list)
    except FileNotFoundError:
        print("Error: feedback.csv not found.")
        return {}
    except Exception as e:
        print(f"Error loading feedback data: {e}")
        return {}

    # 2. Topic Extraction by LLM
    print("Extracting topics using LLM...")
    topic_counts = {} # Initialize topic_counts here to ensure it's always defined
    try:
        # Using a text2text-generation model to extract topics
        # Using a smaller model for speed. Larger models (e.g., flan-t5-base) might yield better topics.
        topic_extractor = pipeline("text2text-generation", model="google/flan-t5-small")
        
        # Truncate context if too long for the model (Flan-T5 typical limit is 512 tokens)
        # This is a rough character-based truncation; token-based would be more precise.
        max_input_chars = 2000 # Approx. 500 tokens
        truncated_context = full_context[:max_input_chars]
        
        prompt = f"Based on the following user feedback, identify and list exactly 7 main topics. Ensure diversity in the topics. Output the topics as a comma-separated list. Feedback: {truncated_context}"
        # Increase max_length if topics are long or many
        extracted_topics_text = topic_extractor(prompt, max_length=150, num_beams=3)[0]['generated_text']
        
        topics = [topic.strip() for topic in extracted_topics_text.split(',') if topic.strip()]
        
        if not topics:
            print("LLM could not extract any topics. Topic modeling cannot proceed without topics.")
            return {} # Exit if no topics are extracted
        else:
            print(f"Extracted Topics: {topics}")
        # The 'topics' list is conceptually what might be saved to a JSON file.
        # print(f"Topics as JSON: {json.dumps(topics)}") 
    except Exception as e:
        print(f"Error during topic extraction: {e}. Topic modeling cannot proceed.")
        return {} # Exit if there's an error during extraction

    # 3. Count feedback on each topic
    print("\nCounting feedback per topic...")
    try:
        # Using zero-shot-classification for assigning feedback to topics
        classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
        topic_counts = {topic: 0 for topic in topics} # Re-initialize based on extracted topics
        
        for i, feedback_item in enumerate(feedback_list):
            if not feedback_item.strip():
                continue
            # Basic progress indicator
            if (i + 1) % 10 == 0 or i == len(feedback_list) -1:
                 print(f"  Processing feedback item {i+1}/{len(feedback_list)}")
            try:
                # Models have token limits, truncate if necessary
                # BART's typical limit is 1024 tokens, but pipeline might handle it. Be safe.
                max_feedback_chars = 1500 # Approx 300-400 tokens
                truncated_feedback_item = feedback_item[:max_feedback_chars]
                
                # multi_label=False assumes one primary topic per feedback item
                classification_result = classifier(truncated_feedback_item, candidate_labels=topics, multi_label=False)
                
                if classification_result['scores'] and classification_result['labels']:
                    best_topic = classification_result['labels'][0] # Topic with the highest score
                    topic_counts[best_topic] += 1
            except Exception as item_e:
                print(f"    Skipping a feedback item due to error: {item_e}")
                continue # Skip to next feedback item if one fails
        
        print(f"\nTopic Counts: {json.dumps(topic_counts, indent=2)}")
    except Exception as e:
        print(f"Error during feedback classification: {e}")
        return topic_counts # Return current counts even if classification fails mid-way

    # 4. Plot histogram
    if any(topic_counts.values()):
        print("\nPlotting topic distribution...")
        topic_names = list(topic_counts.keys())
        counts = list(topic_counts.values())

        plt.figure(figsize=(12, 7))
        bars = plt.bar(topic_names, counts, color='skyblue')
        plt.xlabel("Topics", fontsize=12)
        plt.ylabel("Number of Feedback Items", fontsize=12)
        plt.title("Feedback Distribution by Extracted Topic", fontsize=14)
        plt.xticks(rotation=45, ha="right", fontsize=10)
        plt.yticks(fontsize=10)
        plt.grid(axis='y', linestyle='--')
        plt.tight_layout() # Adjust layout to prevent labels from overlapping
        # Add counts on top of bars
        for bar in bars:
            yval = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2.0, yval + 0.05 * max(counts), int(yval), ha='center', va='bottom', fontsize=9)
        plt.show()
    else:
        print("No topic counts to plot (all counts might be zero or topics list was empty).")
    
    return topic_counts

# Run the topic modeling process
histogram_values = perform_topic_modeling()
print("\nReturned Histogram Values (Topic Counts):")
print(json.dumps(histogram_values, indent=2))

Extracting topics using LLM...


Device set to use mps:0
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
