In [6]:
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import csv
!pip install python-docx
# Check if python-docx is installed.
# The PyPI package "python-docx" is imported under the module name `docx`;
# it is used later in this script to build the Word report.
try:
    import docx
except ModuleNotFoundError:
    # Tell the user how to fix the missing dependency before stopping.
    print("Error: The 'python-docx' library is not installed.")
    print("Please install it using: pip install python-docx")
    # NOTE(review): `exit` is the interactive helper rather than sys.exit;
    # its exact behavior inside a notebook kernel should be confirmed.
    exit(1)  # Exit the program if the module is not installed

# Fetch the pretrained 100-dimensional GloVe embeddings via gensim's downloader.
print("Loading GloVe model...")
glove_model = api.load("glove-wiki-gigaword-100")
print("GloVe model loaded successfully!")

# Seed words whose embeddings anchor the rest of the analysis.
root_words = ["entrepreneurial", "creative", "innovative", "trailblazing"]

# One embedding per root word, in the same order as root_words.
root_vectors = [glove_model[root] for root in root_words]

# Cosine similarity for every unordered pair of distinct root words, keyed
# as "first-second" following root_words order. Starting the inner
# enumeration just past the outer index visits each pair exactly once.
similarities = {
    f"{first}-{second}": cosine_similarity([root_vectors[i]], [root_vectors[j]])[0][0]
    for i, first in enumerate(root_words)
    for j, second in enumerate(root_words[i + 1:], start=i + 1)
}

# Start the Word report and record every pairwise root-word similarity.
# The document is only built in memory here; nothing is written to disk
# in this section.
doc = docx.Document()
doc.add_paragraph("Cosine Similarities between Root Words:")
for pair, sim in similarities.items():
    doc.add_paragraph(f"{pair}: {sim:.4f}")


def _pair_includes(pair, word):
    """Return True if *word* is one side of a "first-second" similarity key."""
    # Match a whole side of the key. A plain substring test (`word in pair`)
    # would also count pairs whose words merely contain `word`, e.g. "art"
    # would match the key "smart-bold".
    return pair.startswith(f"{word}-") or pair.endswith(f"-{word}")


# Mean similarity of each root word to the other root words.
mean_pair_similarity = {
    word: np.mean([sim for pair, sim in similarities.items() if _pair_includes(pair, word)])
    for word in root_words
}

# Drop the root word that is, on average, least similar to the others.
word_to_drop = min(mean_pair_similarity, key=mean_pair_similarity.get)
remaining_words = [word for word in root_words if word != word_to_drop]
remaining_vectors = [glove_model[word] for word in remaining_words]

# Element-wise mean of the remaining embeddings (one vector, same dimension).
avg_vector = np.mean(remaining_vectors, axis=0)

# Record the first five dimensions of the average vector.
doc.add_paragraph("\nFirst five dimensions of the average vector:")
doc.add_paragraph(str(avg_vector[:5]))

# Cosine similarity between the average vector and each remaining root word.
avg_similarities = {word: cosine_similarity([glove_model[word]], [avg_vector])[0][0] for word in remaining_words}
doc.add_paragraph("\nCosine Similarities between Average Vector and Root Words:")
for word, sim in avg_similarities.items():
    doc.add_paragraph(f"{word}: {sim:.4f}")

import string

# Deductive candidates: the 50 vocabulary entries closest to the average
# vector, as (word, similarity) pairs.
deductive_words = glove_model.similar_by_vector(avg_vector, topn=50)

# The article is read from Google Drive, so mount the drive first.
from google.colab import drive
drive.mount('/content/drive')

# Read the article as whitespace-separated raw tokens. The encoding is
# stated explicitly so the result does not depend on the platform locale.
with open('/content/drive/My Drive/test1/article_preprint.txt', 'r', encoding='utf-8') as file:
    article_text = file.read().split()

# Normalize tokens before the vocabulary lookup: the GloVe vocabulary is
# lowercase, so raw tokens such as "Creative" or "innovative," would
# otherwise be discarded even though the word itself is in the model.
_strip_chars = string.punctuation + "\u201c\u201d\u2018\u2019\u2013\u2014\u2026"
normalized_tokens = (token.strip(_strip_chars).lower() for token in article_text)

# Keep only non-empty tokens that are in the GloVe model (duplicates retained).
article_words = [word for word in normalized_tokens if word and word in glove_model]

# Score each distinct word once, in first-occurrence order, with a single
# batched cosine-similarity call instead of one call per occurrence.
unique_words = list(dict.fromkeys(article_words))
if unique_words:
    word_matrix = np.array([glove_model[word] for word in unique_words])
    scores = cosine_similarity(word_matrix, [avg_vector])[:, 0]
    article_similarities = dict(zip(unique_words, scores))
else:
    # No article word is in the vocabulary; avoid calling sklearn on an empty array.
    article_similarities = {}

# Inductive candidates: the 50 article words most similar to the average vector.
inductive_words = sorted(article_similarities.items(), key=lambda x: x[1], reverse=True)[:50]

# Merge both candidate lists into one word -> score mapping. A word present
# in both lists appears once, keeping its position from the deductive list
# and taking the score from the inductive list.
combined_words = dict(deductive_words)
combined_words.update(inductive_words)

# Write the evaluation sheet. The encoding is explicit because vocabulary
# entries may be non-ASCII and the platform default may not be able to
# encode them. The 'eval' column is written empty.
with open('word_list_for_evaluation.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['word', 'score', 'eval'])
    writer.writerows([word, sim, ''] for word, sim in combined_words.items())

# Save the Word document
doc.save('word_embedding_results.docx')

Loading GloVe model...
GloVe model loaded successfully!
Mounted at /content/drive
