In [1]:
import gensim
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Location of the pre-trained GoogleNews Word2Vec binary.
# A raw string keeps the Windows-style backslash literal; without the r-prefix
# the sequence "\G" is an invalid escape that triggers a warning on recent
# Python versions. Replace the placeholder with the real directory.
model_path = r'<path_to__directory>\GoogleNews-vectors-negative300.bin'

# Load the vectors (word2vec binary format) into a KeyedVectors instance.
model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

In [3]:
def get_average_word2vec(description, model):
    """Embed a description as the mean of its in-vocabulary word vectors.

    The description is split on whitespace. Tokens for which ``word in model``
    is false are ignored; the remaining tokens are looked up with
    ``model[word]`` and averaged element-wise.

    Returns:
        The element-wise mean of the found vectors, or a zero vector of
        length ``model.vector_size`` when no token is in the vocabulary.
    """
    known_vectors = [
        model[token] for token in description.split() if token in model
    ]

    # Fallback: nothing in the description is known to the model.
    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)

def compute_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors.

    Each embedding is wrapped in a list so it reaches scikit-learn's
    ``cosine_similarity`` as a single-row matrix; the first entry of the
    resulting similarity matrix is returned.
    """
    first_row = [embedding1]
    second_row = [embedding2]
    pairwise = cosine_similarity(first_row, second_row)
    return pairwise[0][0]

In [4]:
# Text files holding the descriptions to embed (paths are relative to the
# notebook's working directory).
heartwood_file = 'Heartwood_Corpus.txt'  # heartwood description
sapwood_file = 'Sapwood_Corpus.txt'      # sapwood description

# Step 1: Generate embeddings by averaging Word2Vec vectors
def generate_embedding_from_file(file_path, model, encoding='utf-8'):
    """Read a description file and return its averaged Word2Vec embedding.

    Args:
        file_path: Path to the text file containing the description.
        model: Word-vector model forwarded to ``get_average_word2vec``.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding
            (which ``open`` would otherwise fall back to).

    Returns:
        Whatever ``get_average_word2vec`` returns for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        # Leading/trailing whitespace is dropped before tokenisation.
        description = file.read().strip()
    return get_average_word2vec(description, model)

# Embed each corpus file with the loaded Word2Vec model.
heartwood_embedding = generate_embedding_from_file(
    file_path=heartwood_file,
    model=model,
)
sapwood_embedding = generate_embedding_from_file(
    file_path=sapwood_file,
    model=model,
)

In [5]:
# Step 2: Compare the two averaged description embeddings
similarity = compute_cosine_similarity(
    embedding1=heartwood_embedding,
    embedding2=sapwood_embedding,
)

# Step 3: Report the score; the trailing expression shows the embedding's
# shape as the cell output.
print(f"Cosine similarity between heartwood and sapwood: {similarity}")
heartwood_embedding.shape

Cosine similarity between heartwood and sapwood: 0.9913541078567505


(300,)

In [6]:
# Bundle both embeddings under descriptive keys
embeddings_dict = dict(
    heartwood=heartwood_embedding,
    sapwood=sapwood_embedding,
)

# Write the bundle to a single .npy file.
# NOTE: np.save stores a dict as a pickled object array, so reading it back
# requires np.load('embeddings_dict.npy', allow_pickle=True).item().
np.save(file='embeddings_dict.npy', arr=embeddings_dict)

In [6]:
# Also persist each embedding as its own plain-array .npy file.
np.save(file='heartwood_embedding_word2vec.npy', arr=heartwood_embedding)
np.save(file='sapwood_embedding_word2vec.npy', arr=sapwood_embedding)