# Elamite Word2Vec Model
## Adapted for local use from Basic_Word2Vec notebook

This notebook builds a Word2Vec model from the Elamite texts and computes cosine similarity scores.

See documentation: https://radimrehurek.com/gensim/models/word2vec.html

## 1. Install dependencies (run once)

In [None]:
# Uncomment and run this cell if you need to install gensim
# !pip install gensim

## 2. Import libraries and load texts

In [None]:
from gensim.models import Word2Vec
import os
import csv
import numpy as np

# Load every .txt file in the texts/ directory; each file becomes one document.
texts_dir = 'texts'
txts = []

# os.listdir() returns entries in arbitrary order, so sort them to load the
# documents in the same order on every run and every machine.
for file in sorted(os.listdir(texts_dir)):
    path = os.path.join(texts_dir, file)
    # Skip anything that is not a regular .txt file (e.g. a directory whose
    # name happens to end in ".txt", which open() could not read).
    if not (file.endswith('.txt') and os.path.isfile(path)):
        continue
    with open(path, 'r', encoding='utf-8') as f:
        txts.append(f.read())

print(f"Loaded {len(txts)} documents")

## 3. Tokenize texts

For Elamite texts, we use simple whitespace tokenization to preserve the transliteration format (including hyphens, parentheses, etc.).

In [None]:
def tokenize_elamite(texts):
    """Split each text into whitespace-separated tokens.

    Tokens are kept exactly as written (hyphens, parentheses, letter case),
    so the transliteration is not altered. Texts that contain no tokens at
    all (empty or whitespace-only) are left out of the result.

    Args:
        texts: Iterable of document strings.

    Returns:
        A list with one token list per non-empty document, in input order.
    """
    # str.split() with no arguments splits on any run of whitespace and
    # ignores leading/trailing whitespace.
    token_lists = (document.strip().split() for document in texts)
    return [tokens for tokens in token_lists if tokens]

words = tokenize_elamite(txts)
print(f"Tokenized {len(words)} documents")

# Fail early with a clear message: with no documents, words[0] below would
# raise a bare IndexError, and there would be nothing to train a model on.
if not words:
    raise ValueError("No non-empty documents to tokenize; check the input texts.")
print(f"Sample tokens from first document: {words[0][:10]}")

## 4. Build Word2Vec model

In [None]:
# Train the Word2Vec model on the tokenized documents.
# Hyperparameters are collected in one place so they are easy to scan and tweak.
w2v_params = {
    'vector_size': 100,  # dimensionality of the word vectors
    'window': 5,         # context window size
    'min_count': 1,      # keep every word (important for a small corpus)
    'workers': 4,        # worker threads used for training
}

w2v = Word2Vec(sentences=words, **w2v_params)

print(f"Vocabulary size: {len(w2v.wv)}")

## 5. Explore the model

In [None]:
# Look up the ten nearest neighbours of a sample word.
test_word = 'su-un-ki-ik'  # Change this to test different words

# Handle the out-of-vocabulary case first; otherwise list each neighbour
# with its similarity score.
if test_word not in w2v.wv:
    print(f"'{test_word}' not in vocabulary")
else:
    print(f"Most similar to '{test_word}':")
    neighbors = w2v.wv.most_similar(test_word, topn=10)
    for neighbor, score in neighbors:
        print(f"  {neighbor}: {score:.4f}")

In [None]:
# Cosine similarity between two specific words.
# The two sample words differ only in letter case.
word1 = 'dingir-gal'
word2 = 'DINGIR-GAL'

# Both words must be in the vocabulary before their vectors can be compared.
if not all(candidate in w2v.wv for candidate in (word1, word2)):
    print("One or both words not in vocabulary")
else:
    similarity = w2v.wv.similarity(word1, word2)
    print(f"Similarity between '{word1}' and '{word2}': {similarity:.4f}")

## 6. Compute cosine similarity for each word

For each word, we compute the average cosine similarity to its top-n most similar neighbors (n = 5 by default). This gives a measure of how "typical" or "central" each word is in the semantic space. Words that are not in the model's vocabulary get no score.

In [None]:
def compute_avg_similarity(word, model, topn=5):
    """Return the mean cosine similarity between a word and its nearest neighbors.

    Args:
        word: Token to look up in the model's vocabulary.
        model: Trained model exposing ``wv.most_similar(word, topn=...)``,
            which returns ``(neighbor, score)`` pairs.
        topn: Number of nearest neighbors to average over.

    Returns:
        The average score as a plain ``float`` rounded to 4 decimal places,
        or ``None`` when the word is not in the vocabulary or has no
        neighbors to average over.
    """
    # Keep the try body minimal: only the vocabulary lookup can raise KeyError.
    try:
        similar = model.wv.most_similar(word, topn=topn)
    except KeyError:
        return None

    # np.mean([]) would emit a RuntimeWarning and return nan; report "no score"
    # the same way as for an unknown word instead.
    if not similar:
        return None

    avg_sim = np.mean([score for _, score in similar])
    # Convert to a built-in float before rounding so callers get a plain
    # Python number rather than a NumPy scalar.
    return round(float(avg_sim), 4)

# Quick sanity check of the helper on a single sample word
test_word = 'su-un-ki-ik'
print(
    f"Average similarity for '{test_word}': "
    f"{compute_avg_similarity(test_word, w2v)}"
)

## 7. Update CSV with cosine similarity scores

In [None]:
# Read the original word-level CSV, add a cosine-similarity column, and write
# the result to a separate file (the input file is left untouched).
input_csv = 'UntN-Nasu texts Word-level.csv'
output_csv = 'UntN-Nasu texts Word-level with similarity.csv'
similarity_column = 'Cosine_similarity'

rows = []
# newline='' lets the csv module handle line endings itself, so quoted fields
# with embedded newlines are parsed correctly.
with open(input_csv, 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    # fieldnames is None for a completely empty file; fall back to no columns.
    fieldnames = list(reader.fieldnames or [])
    # Do not emit a duplicate header if the input already has the column
    # (e.g. when re-running on a previously generated file).
    if similarity_column not in fieldnames:
        fieldnames.append(similarity_column)

    for row in reader:
        word = row['Text']
        similarity = compute_avg_similarity(word, w2v)
        # Compare against None explicitly: a genuine score of 0.0 is falsy
        # and must not be written as an empty cell.
        row[similarity_column] = '' if similarity is None else similarity
        rows.append(row)

# Write the updated CSV
with open(output_csv, 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

print(f"Updated CSV saved to: {output_csv}")
print(f"Total rows: {len(rows)}")

## 8. Save the model for future use

In [None]:
# Write the trained model to a file in the current working directory
model_path = 'elamite_word2vec.model'
w2v.save(model_path)
print(f"Model saved to: {model_path}")

# To load the saved model again in a later session:
# loaded_model = Word2Vec.load('elamite_word2vec.model')

## 9. Visualization (Optional)

In [None]:
# Uncomment to install visualization dependencies
# !pip install matplotlib scikit-learn

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Collect every vocabulary word together with its embedding vector
vocab_words = list(w2v.wv.index_to_key)
vectors = np.array([w2v.wv[token] for token in vocab_words])

# Project the embeddings onto two principal components
pca = PCA(n_components=2)
vectors_2d = pca.fit_transform(vectors)

# Scatter plot of all words in the 2D projection
# (the transposed array unpacks into the x and y coordinate rows)
plt.figure(figsize=(15, 10))
plt.scatter(*vectors_2d.T, alpha=0.5)

# Label a random sample of at most 50 words to avoid overcrowding the plot
n_labels = min(50, len(vocab_words))
sample_indices = np.random.choice(len(vocab_words), n_labels, replace=False)
for idx in sample_indices:
    x, y = vectors_2d[idx]
    plt.annotate(vocab_words[idx], (x, y), fontsize=8)

plt.title('Elamite Word Embeddings (PCA)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.tight_layout()

# Save the figure to disk before displaying it
plt.savefig('word_embeddings_pca.png', dpi=150)
plt.show()

print("Plot saved to: word_embeddings_pca.png")