In [2]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import gensim.downloader as api
!pip install python-docx
# Guard the python-docx import: the package is installed under the name
# "python-docx" but imported as "docx". If the import fails, tell the user
# how to install it and stop, since the rest of this file imports from docx.
try:
    import docx
except ModuleNotFoundError:
    print("Error: The 'python-docx' library is not installed.")
    print("Please install it using: pip install python-docx")
    exit(1)  # Stop with a non-zero status when the module is missing
from docx import Document
from google.colab import drive
import os

def verify_file_exists(file_path, file_type):
    """Report whether ``file_path`` exists and, if so, how large it is.

    Prints a success line plus the file size in KB when the path exists,
    or an error line when it does not. ``file_type`` is a human-readable
    label (e.g. "Plot") used only in the printed messages.

    Returns:
        True if the path exists, False otherwise.
    """
    # Guard clause: handle the missing-file case first and bail out.
    if not os.path.exists(file_path):
        print(f"✗ Error: {file_type} not found at: {file_path}")
        return False

    size_in_kb = os.path.getsize(file_path) / 1024
    print(f"✓ {file_type} saved successfully at: {file_path}")
    print(f"  File size: {size_in_kb:.2f} KB")
    return True

def create_directory_if_not_exists(path):
    """Create the directory component of ``path`` if it is missing.

    The directory is taken with ``os.path.dirname``, so ``path`` may be a
    file path ("a/b/file.csv" -> "a/b") or a directory path with a trailing
    separator ("a/b/" -> "a/b"). Intermediate directories are created as
    needed, and a message is printed only when something was created.

    A path with no directory component (a bare filename) refers to the
    current working directory, so there is nothing to create.
    """
    directory = os.path.dirname(path)
    if not directory:
        # dirname('file.txt') == ''; os.makedirs('') would raise
        # FileNotFoundError, so treat this as "nothing to do".
        return
    if os.path.exists(directory):
        return
    # exist_ok=True tolerates another process creating the directory
    # between the existence check above and this call.
    os.makedirs(directory, exist_ok=True)
    print(f"Created directory: {directory}")

def _filter_by_score(df, threshold):
    """Return the rows of ``df`` with score >= ``threshold``, best first."""
    return df[df['score'] >= threshold].sort_values('score', ascending=False)


def _tsne_perplexity(n_samples, preferred=5):
    """Return a t-SNE perplexity that is strictly below ``n_samples``."""
    # scikit-learn rejects perplexity >= n_samples, so cap at n_samples - 1.
    return min(preferred, n_samples - 1)


def main():
    """Run the word-embedding analysis end to end.

    Steps:
        1. Mount Google Drive and make sure the working folder exists.
        2. Load ``word_list_for_evaluation.csv`` (needs ``word`` and
           ``score`` columns).
        3. Keep words with score >= 0.8, falling back to 0.7 when fewer
           than two words qualify.
        4. Load the ``glove-wiki-gigaword-100`` vectors through gensim.
        5. Look up a vector for each kept word, skipping unknown words.
        6. Project the vectors to 2D with t-SNE and save a scatter plot.
        7. Write a Word report containing the score table and the plot.

    Every failure is reported with a printed message and ends the run
    early; the function always returns None.
    """
    # 1. Connect to Google Drive
    print("\n=== Step 1: Mounting Google Drive ===")
    drive.mount('/content/drive')

    base_path = '/content/drive/My Drive/test1/'
    create_directory_if_not_exists(base_path)

    # 2. Read the CSV file
    print("\n=== Step 2: Reading CSV file ===")
    csv_path = os.path.join(base_path, 'word_list_for_evaluation.csv')
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"✗ Error reading CSV: {str(e)}")
        return

    print(f"✓ CSV file loaded successfully")
    print(f"Total rows: {len(df)}")
    print("\nFirst few rows:")
    print(df.head())

    # Fail with a clear message up front instead of an uncaught KeyError
    # further down when a required column is absent.
    missing_columns = [col for col in ('word', 'score') if col not in df.columns]
    if missing_columns:
        print(f"✗ Error reading CSV: missing required column(s): {missing_columns}")
        return

    print("\nScore statistics:")
    print(df['score'].describe())

    # 3. Filter words by score (the 'eval' column is not used)
    print("\n=== Step 3: Filtering words based on score ===")
    # Start with words scoring at least 0.8
    threshold = 0.8
    df_filtered = _filter_by_score(df, threshold)
    included_words = df_filtered['word'].tolist()

    print(f"Words with score >= {threshold}: {len(included_words)}")
    if len(included_words) < 2:
        print("✗ Error: Need at least 2 words with high scores")
        # Not enough words: relax the threshold once and try again
        threshold = 0.7
        df_filtered = _filter_by_score(df, threshold)
        included_words = df_filtered['word'].tolist()
        print(f"Trying with lower threshold {threshold}: {len(included_words)} words")

    if len(included_words) < 2:
        print("✗ Error: Still not enough words")
        return

    print("Selected words and scores:")
    for word, score in zip(df_filtered['word'], df_filtered['score']):
        print(f"  {word}: {score:.3f}")

    # 4. Load the GloVe model
    print("\n=== Step 4: Loading GloVe model ===")
    try:
        glove_model = api.load("glove-wiki-gigaword-100")
        print("✓ GloVe model loaded successfully")
    except Exception as e:
        print(f"✗ Error loading GloVe model: {str(e)}")
        return

    # 5. Convert words to vectors, skipping out-of-vocabulary words
    print("\n=== Step 5: Converting words to vectors ===")
    word_vectors = []
    valid_words = []
    valid_scores = []
    for word, score in zip(df_filtered['word'], df_filtered['score']):
        try:
            vector = glove_model[word]
        except KeyError:
            print(f"✗ Warning: '{word}' not in vocabulary")
            continue
        word_vectors.append(vector)
        valid_words.append(word)
        valid_scores.append(score)
        print(f"✓ Vector created for: {word} (score: {score:.3f})")

    if len(valid_words) < 2:
        print("✗ Error: Not enough valid words")
        return

    # 6. Run t-SNE and draw the plot
    print("\n=== Step 6: Creating visualization ===")
    plot_path = os.path.join(base_path, 'dictionary_embedding_plot.jpg')
    try:
        word_vectors_array = np.array(word_vectors)
        # With exactly two words the old max(2, ...) floor produced a
        # perplexity equal to n_samples, which TSNE rejects.
        perplexity_value = _tsne_perplexity(len(valid_words))
        tsne = TSNE(n_components=2, perplexity=perplexity_value, random_state=42)
        word_vectors_2d = tsne.fit_transform(word_vectors_array)

        fig = plt.figure(figsize=(12, 8))
        try:
            # Marker area is proportional to the score
            sizes = np.array(valid_scores) * 100

            scatter = plt.scatter(word_vectors_2d[:, 0], word_vectors_2d[:, 1],
                                  s=sizes, alpha=0.6, c=valid_scores, cmap='viridis')

            # Label every point with its word and score
            for i, word in enumerate(valid_words):
                plt.annotate(f"{word}\n({valid_scores[i]:.3f})",
                             (word_vectors_2d[i, 0], word_vectors_2d[i, 1]),
                             xytext=(5, 5), textcoords='offset points',
                             fontsize=8, alpha=0.8)

            plt.colorbar(scatter, label='Score')
            plt.title('2D Visualization of Dictionary Words\nColored by Score')
            plt.xlabel('Dimension 1')
            plt.ylabel('Dimension 2')

            # Save the plot
            plt.savefig(plot_path, dpi=300, bbox_inches='tight')
        finally:
            # Release the figure even when plotting or saving fails
            plt.close(fig)
        verify_file_exists(plot_path, "Plot")

    except Exception as e:
        print(f"✗ Error creating visualization: {str(e)}")
        return

    # 7. Build and save the Word document
    print("\n=== Step 7: Creating Word document ===")
    try:
        doc = Document()

        doc.add_heading('Word Embedding Analysis Report', 0)

        from datetime import datetime
        doc.add_paragraph(f'Generated on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

        doc.add_heading('Analysis Parameters', level=1)
        doc.add_paragraph(f'Score threshold: {threshold}')
        doc.add_paragraph(f'Total words analyzed: {len(valid_words)}')

        doc.add_heading('Word Scores', level=1)
        table = doc.add_table(rows=1, cols=2)
        table.style = 'Table Grid'
        table.rows[0].cells[0].text = 'Word'
        table.rows[0].cells[1].text = 'Score'

        for word, score in zip(valid_words, valid_scores):
            row = table.add_row()
            row.cells[0].text = word
            row.cells[1].text = f'{score:.3f}'

        doc.add_heading('Visualization', level=1)
        # The plot uses the 'viridis' colormap, where higher values are
        # brighter, so the legend text says "brighter", not "darker".
        doc.add_paragraph("""
        The visualization shows the semantic relationships between words in a two-dimensional space.
        - Larger circles indicate higher scores
        - Colors indicate the score (brighter = higher score)
        - Words that appear closer together share similar semantic meanings
        - Distance between words represents semantic similarity
        """)

        doc.add_picture(plot_path, width=docx.shared.Inches(6))

        word_doc_path = os.path.join(base_path, 'word_embedding_results.docx')
        doc.save(word_doc_path)
        verify_file_exists(word_doc_path, "Word document")

    except Exception as e:
        print(f"✗ Error creating Word document: {str(e)}")
        return

    print("\n=== Analysis completed successfully! ===")
    print(f"Files saved in: {base_path}")
    print("1. Plot: dictionary_embedding_plot.jpg")
    print("2. Report: word_embedding_results.docx")

if __name__ == "__main__":
    main()

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2

=== Step 1: Mounting Google Drive ===
Mounted at /content/drive

=== Step 2: Reading CSV file ===
✓ CSV file loaded successfully
Total rows: 84

First few rows:
              word     score  eval
0         creative  0.907521   NaN
1       innovative  0.895459   NaN
2  entrepreneurial  0.856225   NaN
3       innovation  0.782994   NaN
4       creativity  0.737144   NaN

Score statistics:
count    84.000000
mean      0.600273
std       0.079888
min       0.509404
25%       0.542408
50%       0.587261
75%       0.623841
max   