In [11]:
from datasets import load_dataset
from openai import OpenAI

import csv
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt


In [12]:
client = OpenAI(
    api_key="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
)

def generate_cluster(selected_texts, expert_profile, best_prompt):
  response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
      {"role": "system", "content": expert_profile + "\n" + best_prompt + "\n"},
      {"role": "user", "content": f"{list(selected_texts)}"}
    ]
  )

  result = response.choices[0].message.content
  return result

In [13]:
def extract_final_answer(answer: str):       
    if not answer:
        return "<INVALID>"

    model_pred = answer.lower()
    preds = model_pred.split("<ans_start>")

    pred = preds[-1].split("<ans_end>")[0].strip()

    if len(pred) == 0:
        return "<INVALID>"

    return pred

In [14]:
dataset = load_dataset("clinc_oos", "small")

test_split = dataset["test"]
texts = test_split["text"]
intents = test_split["intent"]

# Filter out intent 42
filtered_pairs = [(t, i) for (t, i) in zip(texts, intents) if i != 42]
filtered_texts, filtered_intents = zip(*filtered_pairs)

In [15]:
selected_pairs = [(text, intent) for text, intent in filtered_pairs if 0 <= intent <= 3]
selected_texts, selected_intents = zip(*selected_pairs)

In [16]:
len((" ").join(list(selected_texts)))

5220

In [17]:
len(expert_profile + best_prompt)

4338

In [None]:
# Initialize an empty list to store the results
for k in tqdm([2,4,6,8,10,12,14,16,18,20]):
    results = []
    selected_pairs = [(text, intent) for text, intent in filtered_pairs if 0 <= intent <= k-1]
    selected_texts, selected_intents = zip(*selected_pairs)

    n = len(selected_texts)
    k = len(set(selected_intents))
    
    print(f"running prompt_wizard_{n}_{k}.csv")

    expert_profile = "Expert profile: You are a data scientist with expertise in natural language processing and machine learning. You possess a deep understanding of clustering algorithms and their application to textual data. With your extensive experience, you can efficiently analyze and process datasets to uncover patterns and group sentences based on semantic similarities. You are skilled in using state-of-the-art NLP techniques and tools, such as word embeddings, vectorization, and dimensionality reduction, to convert textual data into meaningful numerical representations. Your proficiency in utilizing algorithms like k-means, hierarchical clustering, or advanced models like BERT-based clustering enables you to discern subtle differences and similarities among the sentences in the dataset. Leveraging your strong analytical and problem-solving skills, you can accurately assign each of the given {n} sentences to one of the {k} clusters, providing precise and insightful clustering labels that reflect the underlying structure of the data. Your work aids in understanding and organizing large-scale textual information efficiently."
    best_prompt = f"\nYou are given a dataset of {n} sentences that needs to be clustered into {k} clusters. For each sentence, assign a cluster label such that the total number of labels matches exactly {n}. \n\nInstructions:\n1. For each sentence, assign it to a cluster by providing a specific cluster label.\n2. Ensure that the number of labels output is exactly the same as the number of input sentences, {n}.\n3. Format your output as a sequential list where each label corresponds to its sentence in the same order.\n4. Verify that every sentence has a cluster label assigned without any omissions.\n5. Double-check your final output to confirm it contains exactly {n} cluster labels.\n  \nExample:\nInput Sentences: [\'sentence1\', \'sentence2\', \'sentence3\']\nOutput Labels: [1, 0, 2]\n  \nMake sure each output list of labels directly matches the number of sentences given.\n\n\n[Question] [\'create a playlist of my favorite songs\', \'find the nearest Thai restaurant\', \'which team won the world series in 2020\', \'how many calories are burnt in a 30-minute run\', \'can you show me my calendar for today\', \'what is the current exchange rate for usd to eur\', \'write a report on the latest tech trends\', \'locate a public library near me\', \'turn on the AC to 70 degrees\', \'what ingredients are needed for chocolate cake\', \'display my latest photos\', \'how to perform CPR\', \'suggest a workout plan for beginners\', \'play the next episode of my favorite series\', \'book an appointment for a haircut\']\n[Answer] <ANS_START>[55, 72, 148, 43, 10, 136, 121, 37, 80, 43, 19, 58, 95, 76, 134]<ANS_END>\n\n[Question] [\'change my phone wallpaper to the latest downloaded image\', \'calculate the tip for a $50 meal\', \'find my current GPS location\', \'translate the word friendship to Spanish\', \'who is the president of France\', \'wake me up at 6:30 am tomorrow\', \'delete all messages from my inbox\', \'what time does the sun set today\', \'recommend a sci-fi book\', \'search for vegan recipes on the internet\', \'schedule a dentist appointment for next week\', \'show me directions to Central Park\', \'is there a basketball game today\', \'add $20 to my Starbucks card\', \'when does the next train to Boston depart\']\n[Answer] <ANS_START>[21, 48, 112, 45, 132, 101, 60, 84, 91, 134, 109, 24, 105, 125, 146]<ANS_END>\n\n[Question] [\'order a coffee from Starbucks\', \'delete my last note\', \'who invented the telephone\', \'paint my living room green\', \'what is the square root of 144\', \'find a recipe for spaghetti carbonara\', \'convert 100 degrees Fahrenheit to Celsius\', \'is my flight to LA on time\', \'give me a summary of the book 1984\', \'what are the symptoms of the flu\', \'set a reminder to pick up groceries at 5 pm\', \'where is the closest gas station\', \'what is the speed of light\', \'block spam emails in my account\', \'buy tickets for the concert next Saturday\']\n[Answer] <ANS_START>[88, 144, 39, 93, 42, 24, 77, 145, 74, 112, 95, 69, 133, 60, 18]<ANS_END>\n\n\nFor each sentence, assignment it to one of the {k} cluster label and output the cluster number. Your output should ONLY contain a list of {n} integers in the format <ANS_START>[cluster asignments]<ANS_END>. Do not include any other texts.\nKeywords: Data clustering, Sentence labeling, Cluster assignment, Verification, Accuracy"

    # Save the processed result and the count in a CSV file
    with open(f'prompt_wizard_{n}_{k}.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # writer.writerow(['Iteration', 'Label Count', 'Processed Result'])
        for i in tqdm(range(0, 50)):
            try:
                result = generate_cluster(selected_texts, expert_profile, best_prompt)
                # print(result)
            except:
                print("GPT Error")
                continue
            try:
                processed_result = extract_final_answer(result)         # Extract the final answer from the result
            except:
                print("INVALID OUTPUT")
                print(result)
                break
            label_count = len(processed_result[1:-1].split(", "))         # Count the number of labels in the processed result
            writer.writerow([i, label_count, processed_result])
            results.append({'Iteration': i, 'Label Count': label_count, 'Processed Result': processed_result})
            
    # Convert the results to a DataFrame
    df_results = pd.DataFrame(results)
    df_results.to_csv(f"clustering_result/n_k/prompt_wizard_{n}_{k}_add.csv", index=False)
    

  0%|          | 0/1 [00:00<?, ?it/s]

running prompt_wizard_180_6.csv






In [19]:
# Convert the results to a DataFrame
df_label_counts = pd.read_csv('clustering_tasks_label_counts.csv')

for col in df_label_counts.columns:
    print(col)
    df_results = pd.read_csv(f"clustering_result/n_k/prompt_wizard_{col}.csv", header=None)
    df_results.columns = ['Iteration', 'Label Count', 'Processed Result']

    df_label_counts[f"{col}"] = df_results["Label Count"]

    df_label_counts.to_csv('clustering_tasks_label_counts.csv', index=False)
    
df_label_counts = df_label_counts.reindex(sorted(df_label_counts.columns), axis=1)
df_label_counts.to_csv('clustering_tasks_label_counts.csv', index=False)

60_2
90_3
120_4
150_5
240_8
300_10
600_20
180_6
450_15
360_12
420_14
480_16


In [None]:
def plot_violin(df, points=False):
    # Set the aesthetic style of the plots
    sns.set_theme(style="darkgrid")

    # Define the columns to plot
    columns = df.columns

    # Define a color palette suitable for nature publications
    palette = sns.color_palette("pastel", len(columns))  # Use a colorblind-friendly palette

    # Melt the DataFrame to long format for seaborn
    melted_df = df[columns].melt(var_name='Label', value_name='Count')

    # Create a single violin plot for all columns
    plt.figure(figsize=(8, 5))  # Make the figure less wide and more compact
    sns.violinplot(x='Label', y='Count', data=melted_df, scale='area', palette=palette, inner="quartile", linewidth=1.2)

    # Set title and labels with enhanced font size
    # plt.title('Violin Plot of Label Counts', fontsize=16, weight='bold')
    plt.xlabel('Configuration', fontsize=14)
    plt.ylabel('Number of Labels', fontsize=14)

    # Customize the tick parameters
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    if points:
        # Draw dots to indicate specific label values
        for index, value in enumerate([col.split('_')[0] for col in df.columns]):
            plt.plot(index, int(value), 'ro', markersize=8)  # Plot a red dot at the specified y-value
            # plt.text(index, value + 5, f'{value}', color='red', fontsize=10, ha='center', va='bottom')  # Adjust annotation position

    # Adjust the layout for better spacing
    plt.tight_layout()

    # Show the plot
    plt.show()
    
df = pd.read_csv('clustering_tasks_label_counts.csv')

df = df.drop(columns=["60_2", "90_3", "150_5", "450_15"])

df = df.reindex(sorted(df.columns), axis=1)

plot_violin(df, points=True)