In [2]:
%load_ext autoreload
%autoreload 2
!source /home/murilo/RelNetCare/.env

In [3]:
from src.paths import LOCAL_PROCESSED_DATA_PATH

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px
import numpy as np
import pandas as pd

data_path = LOCAL_PROCESSED_DATA_PATH / 'dialog-re-ddrel'
df = pd.read_pickle(data_path / "df_embeddings_original.pkl")
# Assume df['embedding'] contains your embeddings
embeddings = np.array(df['embedding'].tolist())

# KMeans Clustering
kmeans = KMeans(n_clusters=5)  # You can change the number of clusters
kmeans.fit(embeddings)
labels = kmeans.labels_

# Add cluster labels to DataFrame
df['cluster'] = labels

# PCA for dimensionality reduction
pca = PCA(n_components=3)  # 3 components for 3D
reduced_embeddings = pca.fit_transform(embeddings)

# Convert to DataFrame for easier plotting
reduced_df = pd.DataFrame(reduced_embeddings, columns=['PC1', 'PC2', 'PC3'])
reduced_df['cluster'] = labels


  super()._check_params_vs_input(X, default_n_init=10)


In [9]:

# Interactive 3D Scatter Plot with Plotly
fig = px.scatter_3d(reduced_df, x='PC1', y='PC2', z='PC3', color='cluster', opacity=0.3)
fig.show()


import plotly.graph_objects as go

fig = go.Figure(data=[go.Scatter3d(x=reduced_df['PC1'], y=reduced_df['PC2'], z=reduced_df['PC3'],
                                   mode='markers', marker=dict(size=5, color=reduced_df['cluster'], opacity=0.3))])

fig.show()



In [4]:
df['turn_count'] = df.dialogue.apply(lambda x: len(x.split('\n')))

Work with me in an ongoing, iterative cycle to label and refine clusters, generated from 10,000 dialogue samples via BERT and K-means. These dialogues are from movies and episodes of Friends. Keep the following steps in mind throughout the whole process:

1. Name each cluster and assign a confidence level. Explain your reasoning.
   
2. Assess the cluster's alignment with the goals of downstream personal relation extraction (e.g. visited_place, spouse, friends) within a chatbot's history.
  
3. Utilize 'Sample Deviation from Cluster Avg' to determine if the sample subset accurately represents its cluster. Suggest if more samples are needed.

4. Flag any dialogues that are 'hard-to-follow' or unclear from the text alone (some friends dialogue can seem nonsensical for a reader without more context).
  
5. Persistently revise labels and confidence levels with each new sample input. We're iterating until we lock in stable, trustworthy labels.
  
6. Confirm, eventually, if the labels truly represent the full spectrum of each cluster or tell me if that's not possible.

7. Keep an eye on 'Sample Deviation from Cluster Avg' for ongoing cluster validity.

You are expected to reapply these steps every time I provide new samples for the clusters. State clearly in future steps, if a cluster name was changed and keep track of rename count. Continue this process until we reach reliable and accurate cluster labels.


In [6]:
print(df)

                                               dialogue  \
0     Speaker 1: It's been an hour and not one of my...   
1     Speaker 1: So, eh... it's probably gonna be ha...   
2     Speaker 1: Hi!\nSpeaker 2: Hi!\nSpeaker 1: So ...   
3     Speaker 1: Hi.\nSpeaker 2: Hi.\nSpeaker 1: I j...   
4     Speaker 1: 'Okay. Okay, daddy we'll see you to...   
...                                                 ...   
8083  Speaker 1: I feel like you're turning me into ...   
8084  Speaker 1: You have to go.  I mean it.\nSpeake...   
8085  Speaker 1: You're crazier than I thought, Lenn...   
8086  Speaker 1: What's going on?\nSpeaker 2: Faith,...   
8087  Speaker 1: So finally he gives Iris some cash ...   

                                               relation  \
0     [\n{\n"x": "Speaker 2",\n"y": "Chandler Bing",...   
1     [\n{\n"x": "Speaker 2",\n"y": "Boston",\n"rid"...   
2     [\n{\n"x": "Speaker 2",\n"y": "goodie-goodie",...   
3     [\n{\n"x": "Phoebe",\n"y": "Mike",\n"rid": [\n...

In [27]:
def compute_centroid_shift(cluster_distance, sample_distance):
    return ((sample_distance - cluster_distance) / cluster_distance) * 100

max_turn_count = 7
max_dialogue_count = 5
import random
import numpy as np

random_seed = 7   # Set your random seed here

def compute_mean_distance(embeddings):
    array_data = np.array(embeddings)
    centroid = np.mean(array_data, axis=0)
    mean_distance = np.mean(np.linalg.norm(array_data - centroid, axis=1))
    return mean_distance

print('Continuing the iterative cluster labeling task, here are the new samples (Remember to reapply all steps and state clearly if a cluster was renamed else write the previous cluster name, finally keep track of a renaming count. Also state clearly the new "Alignment with Personal Relation Extraction", and if it changed, explain why and keep track of the count here):')
print('Remember, the goal is to tell if each cluster is useful for my downstream task of personal relation extraction from a chatbot history:')
for c in sorted(df.cluster.unique()):
    mask = (df.cluster == c) & (df['turn_count'] < max_turn_count)
    
    # Whole cluster
    cluster_embeddings = df[df['cluster'] == c]['embedding'].tolist()
    cluster_distance = compute_mean_distance(cluster_embeddings)

    # Sample dialogues
    sampled_dialogues = df[mask].sample(n=max_dialogue_count, random_state=random_seed)
    
    # For the sub-sample
    sample_embeddings = sampled_dialogues['embedding'].tolist()
    sample_distance = compute_mean_distance(sample_embeddings)

    # Compute centroid shift
    centroid_shift = compute_centroid_shift(cluster_distance, sample_distance)
    
    print(20 * '=')
    print(f'CLUSTER {c}')
    print(f'> Sample Deviation from Cluster Avg: {centroid_shift:.2f}% (lower is better)')
    
    for idx, row in sampled_dialogues.iterrows():
        print(f'- DIALOGUE ID: {idx}')
        print(row['dialogue'])


Continuing the iterative cluster labeling task, here are the new samples (Remember to reapply all steps and state clearly if a cluster was renamed else write the previous cluster name, finally keep track of a renaming count. Also state clearly the new "Alignment with Personal Relation Extraction", and if it changed, explain why and keep track of the count here):
Remember, the goal is to tell if each cluster is useful for my downstream task of personal relation extraction from a chatbot history:
CLUSTER 0
> Sample Deviation from Cluster Avg: -14.49% (lower is better)
- DIALOGUE ID: 4762
Speaker 1: I want a 24 hour protection of Margie's house. Otherwise I'm not saying nothing. You give me that, or read me my rights, and talk to a lawyer.
Speaker 2: Alright, you got it. And I'll be putting an Observer in with you.
Speaker 1: I want Serato ..
Speaker 2: Coz I'm short of men & Ang-
- DIALOGUE ID: 6937
Speaker 2: Every day of my life feels like a mile on the Bataan Death March.
Speaker 1: I

In [38]:
df.cluster.value_counts()

cluster
2    2468
4    1845
3    1404
0    1228
1    1143
Name: count, dtype: int64

In [42]:
# df.relation = df.relation.apply(eval)

In [47]:
df['relation_classes'] = df.relation.apply(lambda r: [x['r'][0] for x in r])

In [30]:
def assign_cluster_name(cluster):
    if cluster == 0:
        return "Formal or Official Conversations (Not Useful)" # "Criminal & Law Enforcement Context"
    elif cluster == 1:
        return "Ambiguous Conversations (Not Useful)" #"Celebrity & Entertainment"
    elif cluster == 2: 
        return "Informal Chit-Chat (Moderately Useful)" #"Personal & Intimate (Useful)"
    elif cluster == 3:
        return "Work-Related Conversations (Moderately Useful)" #Whimsical & Bizarre
    elif cluster == 4: 
        return "Knowledge Sharing (Not Useful)" #Casual Farewells & Romantic Undertones (Useful)  
    # Add more conditions for other clusters
    else:
        return "Other"

# Apply the function to create a new 'cluster_name' column
df['cluster_name'] = df['cluster'].apply(assign_cluster_name)



In [36]:
df['relation_classes'] = df.relation.apply(eval).apply(lambda rels: [r['r'][0] for r in rels])

In [38]:
counts = df.explode('relation_classes').groupby(['cluster','cluster_name', 'relation_classes']).size().reset_index(name='counts')
mask = counts.cluster == 3
counts[mask].sort_values('counts', ascending=False)

Unnamed: 0,cluster,cluster_name,relation_classes,counts
139,3,Work-Related Conversations (Moderately Useful),per:Lovers,1254
138,3,Work-Related Conversations (Moderately Useful),per:Friends,718
136,3,Work-Related Conversations (Moderately Useful),per:Colleague/Partners,534
168,3,Work-Related Conversations (Moderately Useful),per:spouse,450
142,3,Work-Related Conversations (Moderately Useful),per:Professional Contact,282
147,3,Work-Related Conversations (Moderately Useful),per:alternate_names,253
144,3,Work-Related Conversations (Moderately Useful),per:Workplace Superior - Subordinate,206
161,3,Work-Related Conversations (Moderately Useful),per:parents,197
150,3,Work-Related Conversations (Moderately Useful),per:children,196
141,3,Work-Related Conversations (Moderately Useful),per:Opponents,168


In [65]:
mask = df.cluster == 2
tmp = df[mask].dialogue.iloc[6]
print(tmp)

Speaker 1: Hey, we're having some fun now, huh, Ross? Wanna do another one, huh Russ? OK... eleven letters, atomic element number 101... ends in ium.
Speaker 2: Dysprosium.
Speaker 3: Dysprosium? Try mendelevium.
Speaker 1: And weenie number two has it. Unless, of course, nine-down, Knights in White Satin was sung by the Doody Blues.


In [79]:

import pandas as pd
import json

clusters = (2,)
for s in ['train', 'test', 'dev']:
    mask = (df.cluster.isin(clusters)) & (df.dataset == s)
    tmp = df[mask].dialogue.iloc[6]

    tmp = pd.DataFrame()

    # Assuming you have your dataset in a DataFrame called 'df'
    # First, split the 'dialogue' column into a list of strings
    tmp['dialogue'] = df[mask]['dialogue'].apply(lambda x: x.split('\n'))

    # Convert the 'relation' column to a list of dictionaries
    tmp['relation'] = df[mask]['relation'].apply(eval)

    # Then, create a new list to store the transformed data
    json_list = []

    # Iterate through the DataFrame and create JSON objects
    for _, row in tmp.iterrows():
        dialogue_list = row['dialogue']
        relation_list = row['relation']
        
        json_object = [dialogue_list, relation_list]
        
        json_list.append(json_object)

    # Finally, convert the list to a JSON string
    json_string = json.dumps(json_list, indent=2, ensure_ascii=False)


    # Create the directory if it doesn't exist
    output_dir = LOCAL_PROCESSED_DATA_PATH / f'dialog-re-ddrel-cluster{"&".join(map(str, clusters))}'
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create the JSON file path
    json_file_path = output_dir / f'{s}.json'


    # Open the file in write mode ('w') and write the JSON string
    with open(json_file_path, 'w', encoding='utf8') as fp:
        fp.write(json_string)