This file contains starter code for writing a function that generates clues while considering the entire board (opponents' words + the assassin word + "bystander" words).

In [None]:
# Import packages
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import gensim
import re
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import nltk
#nltk.download('punkt')

from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans


In [None]:
# Read in the cleaned dictionary (expects a 'definition' column of text)
df = pd.read_csv('cleaned_dict.csv')

# Replace commas with spaces so that words separated only by a comma do not merge
df['definition'] = df['definition'].str.replace(',', ' ', regex=False)
# Remove any character that is neither a letter nor whitespace.
# regex=True is required: since pandas 2.0 the default is a literal match, which
# would make this line a no-op. Whitespace must be kept, otherwise every
# definition collapses into a single token and the model learns nothing useful.
df['definition'] = df['definition'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

# Tokenize the definitions into lists of words
df['definition'] = df['definition'].apply(word_tokenize)
# Remove quotes from the words (safety net; the regex above already strips them)
df['definition'] = df['definition'].apply(lambda x: [word.replace("'", "") for word in x])

# Train the word2vec model (CBOW because sg=0) on the tokenized definitions
model = Word2Vec(df['definition'], min_count=5, window=5, sg=0)

# Save the model so it can be reloaded without retraining
model.save('codenames.model')

Here's a function that generates clues ONLY for our words

In [None]:
def generate_word_clusters_clues(words_input, model, df, n_clustersIn):
    """Cluster our team's words with k-means and propose one clue per cluster.

    Args:
        words_input: list of our team's words; each one is looked up in
            ``model.wv``, so it must be in the model's vocabulary.
        model: trained Word2Vec model, accessed through ``model.wv``.
        df: not used by this function; kept so existing calls keep working.
        n_clustersIn: number of clusters passed to ``KMeans``.

    Returns:
        A DataFrame with one row per cluster and the columns ``cluster``,
        ``words`` (list of words in the cluster), ``maxSimilarity`` and
        ``mostSimilarWord`` (the proposed clue). A cluster for which no
        candidate survives filtering keeps ``0.0`` and ``''``.
    """
    # Extract word vectors and normalize them
    vectors = [model.wv[word] for word in words_input]
    normalized_vectors = normalize(vectors)

    # Fit k-means on the normalized vectors and get each word's cluster label
    kmeans = KMeans(n_clusters=n_clustersIn, random_state=100)
    cluster_assignments = kmeans.fit_predict(normalized_vectors)

    # Build a dataframe with one row per cluster, holding its words as a list
    clues_df = pd.DataFrame({'words': words_input, 'cluster': cluster_assignments})
    clues_df = clues_df.groupby('cluster')['words'].apply(list).reset_index()

    # Columns for the clue (most similar word) and its similarity score
    clues_df['maxSimilarity'] = 0.0
    clues_df['mostSimilarWord'] = ''

    # A clue may not be ANY of our words, not just those in the current cluster
    excluded_words = set(words_input)

    # Iterate through every cluster row and pick its clue
    for index, row in clues_df.iterrows():
        # Get the cluster words and calculate the cluster center
        cluster_words = row['words']
        cluster_center = np.mean([model.wv[word] for word in cluster_words], axis=0)

        # Ask for twice as many neighbours as there are input words so that
        # some candidates remain after the input words are filtered out
        most_similar_words = model.wv.most_similar([cluster_center], topn=len(words_input) * 2)
        most_similar_words = [(word, score) for word, score in most_similar_words
                              if word not in excluded_words]

        # Nothing left (e.g. a very small vocabulary): leave the defaults in place
        if not most_similar_words:
            continue

        # Results are ordered by similarity, so the first entry is the best clue
        most_similar_word, similarity_score = most_similar_words[0]
        clues_df.at[index, 'maxSimilarity'] = similarity_score
        clues_df.at[index, 'mostSimilarWord'] = most_similar_word

    # Return the dataframe with the clusters and their clues
    return clues_df


In [None]:
# The starting team has 9 words on the board; assume that is us
word_list = [
    'battery', 'beach', 'church',
    'ham', 'lawyer', 'marble',
    'night', 'michigan', 'horse',
]

# Group our words into 5 clusters and compute one clue per cluster
clues_dataframe = generate_word_clusters_clues(
    words_input=word_list,
    model=model,
    df=df,
    n_clustersIn=5,
)

# Show each cluster with its clue (mostSimilarWord) and similarity score
print(clues_dataframe)


Here we give out the clue and associated number of words it corresponds to (like we would when playing the game)

In [None]:
# Locate the cluster whose clue has the highest similarity score
best_index = clues_dataframe['maxSimilarity'].idxmax()
max_similarity_row = clues_dataframe.loc[best_index]
# Announce the clue the way it is given in the game: "<clue word> <number of words>"
print(f"{max_similarity_row['mostSimilarWord']} {len(max_similarity_row['words'])}")

Now, we should modify our function to consider the entire board.
NOTE: this is just one way of implementing this. Feel free to experiment with other methods. We use similarity score to check how "good" the clues are

In [None]:
def generate_word_clusters_clues(words_input, model, df, opponents_words, assassin_word, n_clustersIn, threshold_similarity):
    """Cluster our words and propose one clue per cluster, avoiding risky clues.

    A candidate clue is rejected when it is a word on the board, or when its
    similarity to any opponent word or to the assassin word is greater than
    or equal to ``threshold_similarity``.

    Args:
        words_input: list of our team's words; each one is looked up in
            ``model.wv``, so it must be in the model's vocabulary.
        model: trained Word2Vec model, accessed through ``model.wv``.
        df: not used by this function; kept so existing calls keep working.
        opponents_words: list of the opposing team's words.
        assassin_word: the assassin word.
        n_clustersIn: number of clusters passed to ``KMeans``.
        threshold_similarity: similarity at or above which a candidate is
            considered too close to an opponent/assassin word. Higher values
            filter less.

    Returns:
        A DataFrame with one row per cluster and the columns ``cluster``,
        ``words``, ``maxSimilarity`` and ``mostSimilarWord``. A cluster for
        which every candidate was rejected keeps ``0.0`` and ``''``.
    """
    import warnings

    # Extract word vectors and normalize them
    vectors = [model.wv[word] for word in words_input]
    normalized_vectors = normalize(vectors)

    # Fit k-means on the normalized vectors and get each word's cluster label
    kmeans = KMeans(n_clusters=n_clustersIn, random_state=10)
    cluster_assignments = kmeans.fit_predict(normalized_vectors)

    # Build a dataframe with one row per cluster, holding its words as a list
    clues_df = pd.DataFrame({'words': words_input, 'cluster': cluster_assignments})
    clues_df = clues_df.groupby('cluster')['words'].apply(list).reset_index()

    # Columns for the clue (most similar word) and its similarity score
    clues_df['maxSimilarity'] = 0.0
    clues_df['mostSimilarWord'] = ''

    # Words the clue must stay away from. A word the model has no vector for
    # cannot be compared, so it is skipped with a warning instead of crashing.
    words_to_avoid = []
    for risky_word in [*opponents_words, assassin_word]:
        if risky_word in model.wv:
            words_to_avoid.append(risky_word)
        else:
            warnings.warn(f"'{risky_word}' is not in the model vocabulary; it cannot be checked against clues")

    # A clue may never be a word from the board itself
    board_words = set(words_input) | set(opponents_words) | {assassin_word}

    # Iterate through every cluster row and pick its clue
    for index, row in clues_df.iterrows():
        # Get the cluster words and calculate the cluster center
        cluster_words = row['words']
        cluster_center = np.mean([model.wv[word] for word in cluster_words], axis=0)

        # Ask for twice as many neighbours as there are board words so that
        # candidates remain even if every board word shows up in the results
        most_similar_words = model.wv.most_similar([cluster_center], topn=len(board_words) * 2)

        # Keep candidates that are not on the board and are not too similar
        # to any opponent word or the assassin word
        filtered_similar_words = []
        for word, score in most_similar_words:
            if word in board_words:
                continue
            too_risky = any(
                model.wv.similarity(word, risky_word) >= threshold_similarity
                for risky_word in words_to_avoid
            )
            if not too_risky:
                filtered_similar_words.append((word, score))

        # Check if there are still potential clues after filtering
        if filtered_similar_words:
            # Results stay ordered by similarity, so the first entry is the best clue
            most_similar_word, similarity_score = filtered_similar_words[0]
            clues_df.at[index, 'maxSimilarity'] = similarity_score
            clues_df.at[index, 'mostSimilarWord'] = most_similar_word

    return clues_df

def get_best_clues(clues_dataframe):
    """Pick the clue with the highest similarity score.

    Args:
        clues_dataframe: DataFrame with the columns ``maxSimilarity``,
            ``mostSimilarWord`` and ``words``.

    Returns:
        A tuple ``(clue word, number of words in its cluster, cluster words)``
        taken from the row whose ``maxSimilarity`` is largest.
    """
    # Label of the row holding the largest similarity score
    top_label = clues_dataframe['maxSimilarity'].idxmax()
    top_row = clues_dataframe.loc[top_label]

    targets = top_row['words']
    return top_row['mostSimilarWord'], len(targets), targets

This is an example of how we can use the function. Experiment with different words and a different assassin word.

In [None]:
# Set up the scenario
word_list = ['battery', 'beach', 'church', 'ham', 'lawyer', 'marble', 'night', 'michigan', 'horse']
opponents_words = ['angel', 'bottle', 'diamond', 'glove', 'needle', 'temple', 'pound', 'stream']  # Add your list of bad words
assassin_word = 'assassin'
threshold_similarity = 0.8  # Adjust the threshold based on how "aggressive" you want the clues to be
n_clusters = 5  # Adjust the number of clusters if needed

# Generate clues (the opponents' list defined above is passed as the words to avoid)
clues_dataframe = generate_word_clusters_clues(word_list, model, df, opponents_words, assassin_word, n_clusters, threshold_similarity)

# Get the best clue
best_clue = get_best_clues(clues_dataframe)

best_clue_word, num_words, associated_words = best_clue
if best_clue_word:
    print(f"Best Clue: {best_clue_word} {num_words}")
    print("Associated Words:")
    for word in associated_words:
        similarity_score = model.wv.similarity(best_clue_word, word)
        print(f"{word}: {similarity_score}")
else:
    # An empty clue means no candidate was stored for the chosen cluster;
    # looking up '' in the model would fail, so report it instead
    print("No clue found. Try adjusting threshold_similarity or n_clusters.")
