In [3]:
from gensim.models import KeyedVectors
# Load the Word2Vec model (this may take a few minutes)
model_path = 'GoogleNews-vectors-negative300.bin'
word_vectors = KeyedVectors.load_word2vec_format(model_path, binary=True)

result = word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
most_similar_key, similarity = result[0]  # look at the first match
print(f"{most_similar_key}: {similarity:.4f}")


queen: 0.9314


In [8]:
result = word_vectors.most_similar_cosmul(positive=['plant', 'desert'], negative=['water'])
for val in result[0:3]:
    most_similar_key, similarity = val
    print(f"{most_similar_key}: {similarity:.4f}")

Desert_Proving_Ground: 0.8663
factory: 0.8367
Ramos_Arizpe: 0.8350


In [40]:
import gensim.downloader
from gensim.models import KeyedVectors
import nltk
from nltk.corpus import words
# Load the Word2Vec model (this may take a few minutes)
# glove_tw25_vectors = gensim.downloader.load('glove-twitter-25')
# glove_wiki50_vectors = gensim.downloader.load('glove-wiki-gigaword-50')
glove_wiki300_vectors = gensim.downloader.load('glove-wiki-gigaword-300')




2024-02-12 10:31:46,836 : INFO : glove-wiki-gigaword-300 downloaded
2024-02-12 10:31:46,840 : INFO : loading projection weights from /Users/Alex/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz
2024-02-12 10:32:42,474 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /Users/Alex/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2024-02-12T10:32:42.474579', 'gensim': '4.3.2', 'python': '3.11.6 (main, Oct  2 2023, 13:45:54) [Clang 15.0.0 (clang-1500.0.40.1)]', 'platform': 'macOS-14.3.1-x86_64-i386-64bit', 'event': 'load_word2vec_format'}


In [94]:
import nltk
from nltk.corpus import words, brown
from nltk import FreqDist
from gensim.models import KeyedVectors

model = glove_wiki300_vectors

# Download and prepare the list of English words and Brown frequency list
nltk.download('words')
nltk.download('brown')
english_words = set(words.words())
frequency_list = FreqDist(i.lower() for i in brown.words())

# Filter the model's vocabulary
filtered_vocab = {
    word: {"vector": model[word], "frequency": frequency_list[word]} for word in model.key_to_index
    if word in english_words and word in frequency_list
}
new_kv = KeyedVectors(vector_size=model.vector_size)

# Prepare lists of keys (words) and their vectors
keys = list(filtered_vocab.keys())
vectors = [filtered_vocab[word]["vector"] for word in keys]
frequencies = [filtered_vocab[word]["frequency"] for word in keys]

# Add all vectors in one batch
new_kv.add_vectors(keys, vectors)
# new_kv.save_word2vec_format('onlyWordsToVec.bin', binary=True)
new_kv.most_similar('queen')


[nltk_data] Downloading package words to /Users/Alex/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package brown to /Users/Alex/nltk_data...
[nltk_data]   Package brown is already up-to-date!


[('princess', 0.635676383972168),
 ('king', 0.6336469650268555),
 ('monarch', 0.5814188122749329),
 ('royal', 0.543052613735199),
 ('majesty', 0.5350356698036194),
 ('throne', 0.5097099542617798),
 ('lady', 0.5045416355133057),
 ('crown', 0.49980056285858154),
 ('consort', 0.4955049455165863),
 ('mary', 0.4903523623943329)]

In [125]:
import numpy as np
wordSpread = ['undertaker', 'heresy', 'kidney', 'dividend', 'moccasin', 'dramatics', 'commander', 'connect']
def forward_selection(words, targetLength, model):
    while len(words) < targetLength:
        words.append(new_kv.most_similar(negative=words, topn=1)[0][0])
    print(new_kv.rank_by_centrality(words))
    return words

def backwards_selection(words, targetLength, model):
    while len(words) > targetLength:
        words.remove(new_kv.rank_by_centrality(words)[0][1])
    print(new_kv.rank_by_centrality(words))
    return words

def cosine_similarity_matrix(word_list, model):
    """
    Generate a 2D array of cosine similarities between all elements in a list of words.

    :param word_list: List of words to compute the cosine similarity matrix for.
    :param model: Pre-loaded Gensim KeyedVectors model containing the word vectors.
    :return: 2D NumPy array of cosine similarities.
    """
    # Filter the list to keep only words present in the model
    valid_words = [word for word in word_list if word in model.key_to_index]
    
    # Retrieve the vectors for these words
    vectors = np.array([model[word] for word in valid_words])
    
    # Normalize the vectors to unit length
    vectors_norm = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
    
    # Compute the cosine similarity matrix
    similarity_matrix = np.dot(vectors_norm, vectors_norm.T)
    
    return similarity_matrix

def back_select(word_list, model):
    """
    Remove the word with the highest average cosine similarity to the other elements in the list,
    using the cosine_similarity_matrix function for computation.

    :param word_list: List of words.
    :param model: Pre-loaded Gensim KeyedVectors model containing the word vectors.
    :return: Modified list with the word removed.
    """
    # Generate cosine similarity matrix for the word list
    sim_matrix = cosine_similarity_matrix(word_list, model)
    
    # Compute the average cosine similarity per word (excluding self-similarity)
    np.fill_diagonal(sim_matrix, 0)  # Ensure self-similarity is not considered
    avg_sim = np.mean(sim_matrix, axis=1)
    
    # Find the index of the word with the highest average similarity
    max_sim_index = np.argmax(avg_sim)
    
    # Identify the word to remove
    valid_words = [word for word in word_list if word in model.key_to_index]
    word_to_remove = valid_words[max_sim_index]
    
    print(f"Drop '{word_to_remove}'.")
    
    # Remove the identified word from the original list and return the modified list
    modified_list = [word for word in word_list if word != word_to_remove]
    return modified_list

def bidirectional_selection(words, targetLength, iterations, model):
    for i in range(iterations):
        newWord = new_kv.most_similar(negative=words, topn=1)[0][0]
        words.append(newWord)
        if(len(words) > targetLength):
            words = back_select(words, new_kv)
    return words

wordSpread = ["apple"]
wordSpread10 = forward_selection(wordSpread, 10, new_kv)

print(bidirectional_selection(["apple", "elephant", "vacancy"], 10, 50, new_kv))



[(0.29509893, 'aw'), (0.2633828, 'distance'), (0.24080755, 'blues'), (0.23872589, 'arabesque'), (0.2085289, 'chlorine'), (0.14262965, 'elect'), (0.13942298, 'shipwreck'), (0.12546945, 'apple'), (0.123517394, 'rationalistic'), (0.10731378, 'undersecretary')]
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
Drop 'ne'.
['apple', 'elephant', 'vacancy', 'psychical', 'nymphomaniac', 'republic', 'encouragingly', 'slater', 'bank', 'thudding']


In [126]:
dir(new_kv)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_load_specials',
 '_log_evaluate_word_analogies',
 '_save_specials',
 '_smart_save',
 '_upconvert_old_d2vkv',
 '_upconvert_old_vocab',
 'add_lifecycle_event',
 'add_vector',
 'add_vectors',
 'allocate_vecattrs',
 'closer_than',
 'cosine_similarities',
 'distance',
 'distances',
 'doesnt_match',
 'evaluate_word_analogies',
 'evaluate_word_pairs',
 'expandos',
 'fill_norms',
 'get_index',
 'get_mean_vector',
 'get_normed_vectors',
 'get_vecattr',
 'get_vector',
 'has_index_for',
 'index2entity',
 'index2word',
 'index_to_key'

In [81]:
import numpy as np
model = new_kv
def calculate_mean_distance(word, word_list, model):
    """Calculate the mean distance of a word from a list of words."""
    if not word_list:  # Avoid division by zero
        return float('inf')  
    distances = model.distances(word, word_list)
    return np.mean(distances)

def forward_selection(model, initial_list, n_words, vocab_limit=10000):
    """
    Perform forward selection to maximize mean distance between words in the list.
    
    :param model: KeyedVectors model
    :param initial_list: list of initial words (can be empty)
    :param n_words: target number of words in the list
    :param vocab_limit: number of top frequent words to consider from the vocabulary
    """
    selected_words = initial_list[:]
    vocabulary = [word for word, _ in model.most_similar(negative=selected_words, topn=vocab_limit)]

    while len(selected_words) < n_words and vocabulary:
        max_distance = 0
        best_candidate = None
        for candidate in vocabulary:
            mean_distance = calculate_mean_distance(candidate, selected_words, model)
            if mean_distance > max_distance:
                max_distance = mean_distance
                best_candidate = candidate
                
        if best_candidate:
            selected_words.append(best_candidate)
            vocabulary.remove(best_candidate)
            print(f"Added: {best_candidate}, Mean Distance: {max_distance}")
        else:
            break  # Stop if no suitable candidate is found

    return selected_words

def backward_selection(selected_words, model):
    """
    Perform backward selection by removing the least distinctive word from the list
    in a more elegant way.
    
    :param selected_words: The initial list of words.
    :param model: The KeyedVectors model containing the word vectors.
    :return: The modified list with one word removed.
    """
    if len(selected_words) <= 1:
        return selected_words  # Cannot remove words from a list with 1 or 0 elements.

    # Calculate the overall mean distance for the list without each word.
    mean_distances_without_word = [
        (word, calculate_mean_distance(word, selected_words[:i] + selected_words[i+1:], model))
        for i, word in enumerate(selected_words)
    ]

    # Find the word whose removal results in the highest mean distance.
    word_to_remove, _ = min(mean_distances_without_word, key=lambda x: x[1])

    # Remove the least distinctive word from the list.
    print(f"Removing: {word_to_remove}")
    selected_words.remove(word_to_remove)

    return selected_words

def calculate_mean_distances(word_list, model):
    """
    Calculate the mean distance of each word from all other words in the list.
    
    :param word_list: A list of words to calculate mean distances for.
    :param model: The KeyedVectors model containing the word vectors.
    :return: A dictionary with words as keys and their mean distances as values.
    """
    mean_distances = {}
    
    for i, word in enumerate(word_list):
        if word not in model.key_to_index:
            continue  # Skip words not in the model
        
        # Collect all other words except the current one
        other_words = word_list[:i] + word_list[i+1:]
        
        # Filter out words not in the model
        other_words = [w for w in other_words if w in model.key_to_index]
        
        if not other_words:  # If no other valid words are left, skip
            continue
        
        # Calculate distances from the current word to all other words
        distances = model.distances(word, other_words)
        
        # Compute the mean distance and store it
        mean_distances[word] = np.mean(distances)
    
    return mean_distances

# Example usage
wordSpread = ['undertaker', 'heresy', 'kidney', 'dividend', 'moccasin', 'whetstone', 'dramatics']
wordSpread = ['apple']

selected_words = forward_selection(model, wordSpread, 10)  # Target 10 words in the list
print("Final selection:", selected_words)

Added: levator, Mean Distance: 1.295461893081665
Added: woodrow, Mean Distance: 1.1436173915863037
Added: accommodation, Mean Distance: 1.102936029434204
Added: manto, Mean Distance: 1.0776101350784302
Added: mobster, Mean Distance: 1.0628013610839844
Added: gainfully, Mean Distance: 1.0435519218444824
Added: nineteen, Mean Distance: 1.0391043424606323
Added: bolivar, Mean Distance: 1.0277533531188965
Added: photochemical, Mean Distance: 1.0326476097106934
Final selection: ['apple', 'levator', 'woodrow', 'accommodation', 'manto', 'mobster', 'gainfully', 'nineteen', 'bolivar', 'photochemical']


In [None]:
model.distances