In [1]:
import pandas as pd
df = pd.read_csv('ai_metaphors_1010_checked_for_ai.csv')
df=df.loc[df.ai_generated==False]
df = df.reset_index()

In [2]:
df = df.loc[~df.ai_metaphor.str.contains(" NA ")]

In [3]:
def remove_words_with_substring(s, substring):
    return ' '.join([word for word in s.split() if substring not in word])

import re
# for w in ['evaluation', 'metric', 'measure']:
df['clean_metaphor'] = df['ai_metaphor'].apply(remove_words_with_substring, args=('AI',))#.replace(nam,'_')
df['clean_metaphor'] = df['clean_metaphor'].apply(remove_words_with_substring, args=('metaphor',))#.replace(nam,'_')
df['clean_metaphor'] = df['clean_metaphor'].str.replace(r'\bai\b', '', regex=True,flags=re.IGNORECASE)
df['clean_metaphor'] = df['clean_metaphor'].str.replace(r'artificial intelligence', '', regex=True,flags=re.IGNORECASE)
df['clean_metaphor'] = df['clean_metaphor'].str.replace(r'like', '', regex=True,flags=re.IGNORECASE)

# Automatically cluster

In [4]:
# check the topic model
import umap
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN

from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer("all-mpnet-base-v2")


In [6]:
import numpy as np
normalized_embeddings = np.load('normalized_embeddings.npy')

In [24]:
import spacy
nlp = spacy.load('en_core_web_sm')
vectorizer_model = CountVectorizer(ngram_range=(1,2),
                                   stop_words=list(nlp.Defaults.stop_words))


# cluster_model = KMeans(n_clusters=20, random_state=42)

topic_model = BERTopic(embedding_model=sentence_model, language='English',
                       verbose=True,calculate_probabilities=False,
                       vectorizer_model = vectorizer_model,
                       n_gram_range=(1,2), min_topic_size = 50)#hdbscan_model = cluster_model)

topic, probs = topic_model.fit_transform(list(df.clean_metaphor), normalized_embeddings[df.index])

topic_model.get_topic_info()

2024-10-10 22:05:14,719 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-10 22:05:19,522 - BERTopic - Dimensionality - Completed ✓
2024-10-10 22:05:19,523 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-10 22:05:19,930 - BERTopic - Cluster - Completed ✓
2024-10-10 22:05:19,934 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-10 22:05:20,500 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1852,-1_information_human_machine_works,"[information, human, machine, works, data, bas...","[Machine learning, machine learning, Machine L..."
1,0,1044,0_brain_human_human brain_mind,"[brain, human, human brain, mind, digital, dig...","[a brain, Brain, is a brain]"
2,1,637,1_robot_human_robots_human robot,"[robot, human, robots, human robot, think, rob...","[is a robot, It’s a robot, is a robot becau..."
3,2,591,2_assistant_personal_personal assistant_tasks,"[assistant, personal, personal assistant, task...","[A personal assistant, is a personal assistant..."
4,3,557,3_future_people_technology_sword,"[future, people, technology, sword, double, do...","[Double edged sword., can be a double edged sw..."
5,4,425,4_tool_use_help_helps,"[tool, use, help, helps, work, people, time, t...",[is another tool in the tool box for people to...
6,5,367,5_data_works_information_based,"[data, works, information, based, uses, given,...","[It is a set of networks that, through massive..."
7,6,319,6_genie_wishes_wish_bottle,"[genie, wishes, wish, bottle, genie bottle, ma...","[A genie, is a ""genie"", is a genie]"
8,7,296,7_computer_machine_human_humans,"[computer, machine, human, humans, machine lea...","[One can easily think of examples , machine vi..."
9,8,283,8_library_book_knowledge_books,"[library, book, knowledge, books, information,...","[Imagine as a vast library of magical books, e..."


In [26]:
topic_model.save("50_clusters", serialization="pickle")




In [35]:
df['topic_50'] = topic

# Get dominant metaphors determined by manual refinement


In [2]:
import csv

def read_csv_as_dict(file_path):
    data_dict = {}
    with open(file_path, mode='r') as file:
        csv_reader = csv.reader(file)
        for row in csv_reader:
            if len(row[0])>1:
                key = row[0]  # First column as the key
                data_dict[key] = [int(x.split('_')[0]) for x in row[1:] if len(x)>2]  # Remaining columns as a list
    return data_dict

# Usage
file_path = '../../Desktop/topic_groups.csv'
topic_groups = read_csv_as_dict(file_path)

def invert_dict(original_dict):
    inverted_dict = {}
    for key, values in original_dict.items():
        for value in values:
            inverted_dict[value] = key
    return inverted_dict

topic_key = invert_dict(topic_groups)

In [75]:
df['big_topic'] = df.topic_50.apply(lambda x: topic_key[x])

# Reassign outliers

In [107]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
cluster_centroids = np.array(topic_embeddings)

# new_topic = 5
new_topics = []
for i,x in df.iterrows():
    if x['big_topic'] == 'misc':
#         print(x.clean_metaphor)
        similarities = cosine_similarity(normalized_embeddings[1].reshape(1,-1),normalized_embeddings[i].reshape(1,-1))
#         print(similarities)
        # Assign each outlier to the nearest cluster (highest similarity score)
        ind = np.argmax(similarities)
        if max(similarities) > 0.995:
#         print(outlier_assignments)
            new_topic = df.big_topic.unique()[ind]
            new_topics.append(new_topic)
        else:
            print('no')
            new_topics.append(x.big_topic)
    else:
        new_topics.append(x.big_topic)