<a href="https://colab.research.google.com/github/pasknk/contextotr/blob/main/turkish_contexto_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [19]:
from gensim.models import KeyedVectors
import random
import heapq
import re # Harf kontrolü için eklendi
import os

# --- ---
# Configuration
# --- ---
# This file must be in the same directory as this Python script,
# or you must provide the full path here.
MODEL_FILE = '/content/fastText/cc.tr.filtered.vec' # Updated to use the filtered file


# --- ---\
# Oyun Motoru (ContextoEngine)
# --- ---
class ContextoEngine:
    def __init__(self, model_path=MODEL_FILE):
        if not os.path.exists(model_path):
            print(f"Error: Model file '{model_path}' not found.")
            print("Please ensure the filtered model file exists.")
            self.model = None
            return

        print(f"Loading Turkish model from '{model_path}'...")
        print("--- Model yükleniyor ---") # Updated message
        try:
            self.model = KeyedVectors.load_word2vec_format(model_path)
            print("Model loaded successfully.")

            self.vocabulary = list(self.model.index_to_key)

            # Türkçe karakterleri de içeren bir regex filtresi
            # Sadece harf içeren ve 3 harften uzun kelimeleri al
            turkish_letters = re.compile(r'^[a-zA-ZçÇğĞıİöÖşŞüÜ]+$')

            self.filtered_vocabulary = [
                word for word in self.vocabulary
                if len(word) > 3 and turkish_letters.match(word)
            ]
            # Re-filter based on the loaded model's vocabulary
            if len(self.filtered_vocabulary) != len(self.vocabulary):
                 print(f"Warning: Additional filtering applied after loading. Original loaded vocab size: {len(self.vocabulary)}, Filtered usable words: {len(self.filtered_vocabulary)}")
            else:
                 print(f"Model vocabulary loaded ({len(self.filtered_vocabulary)} usable Turkish words).")


        except Exception as e:
            print(f"Error loading model: {e}")
            self.model = None
            self.filtered_vocabulary = []

    def set_secret_word(self, word=None):
        if not self.model or not self.filtered_vocabulary:
            print("Model not loaded or vocabulary empty, cannot set secret word.")
            return None, []


        if word is None:
            self.secret_word = random.choice(self.filtered_vocabulary)
        else:
            self.secret_word = word.lower()

        if self.secret_word not in self.model:
            print(f"Error: Secret word '{self.secret_word}' not in model vocabulary.")
            # Başka bir kelime seçmeyi dene
            if self.filtered_vocabulary:
                self.secret_word = random.choice(self.filtered_vocabulary)
                print(f"Trying a different word from filtered vocabulary: '{self.secret_word}'")
            else:
                print("Filtered vocabulary is empty. Cannot select a secret word.")
                return None, []


        print(f"New game started. The secret word has been chosen.")
        # İpucu:
        print(f"(Hint: The secret word is '{self.secret_word}')")

        print("Calculating similarity rankings... (This can take a minute)")

        all_word_distances = []
        for w in self.filtered_vocabulary: # Use filtered_vocabulary for ranking
            if w == self.secret_word:
                continue
            try:
                similarity = self.model.similarity(self.secret_word, w)
                distance = 1.0 - similarity
                all_word_distances.append((distance, w))
            except KeyError:
                # This should ideally not happen if filtered_vocabulary is a subset of model.index_to_key
                print(f"Warning: Word '{w}' not found in model during ranking calculation.")
                continue

        all_word_distances.sort()

        self.ranked_list = {self.secret_word: 1}
        self.rank_to_word = {1: self.secret_word}

        rank = 2
        for distance, word in all_word_distances:
            if word not in self.ranked_list:
                self.ranked_list[word] = rank
                self.rank_to_word[rank] = word
                rank += 1

        print("Rankings calculated.")
        return self.secret_word, self.ranked_list

    def get_rank(self, word):
        word = word.lower()
        if not self.model:
             return None, "Oyun motoru başlatılmamış."

        if word not in self.model:
            return None, "Kelime modelin sözlüğünde yok."

        if word not in self.ranked_list:
            # If a valid word is not in the ranked list (because it was filtered out initially
            # but is still in the model's full vocabulary), calculate its rank.
            # This makes the game more robust if the filtering criteria for the file
            # is different from the filtering criteria for the game's usable vocabulary.
            try:
                similarity = self.model.similarity(self.secret_word, word)
                # Estimate a rank based on similarity - this is not perfect but better than a fixed large number
                # A higher similarity means a lower rank number (closer to 1).
                # We can't get the exact rank without the full list, so we'll give a general message.
                return 999998, f"Kelime gizli kelimeye uzak. Benzerlik skoru: {similarity:.4f}"
            except KeyError:
                 return None, "Kelime modelin sözlüğünde yok (ikinci kontrol)."


        return self.ranked_list[word], None

    def get_hint(self):
        """Returns a word similar to the secret word as a hint."""
        if not self.model or not self.secret_word:
            return "Hint not available."

        # Get the most similar words, excluding the secret word itself
        try:
            # Use the filtered_vocabulary to find similar words that are also game-valid
            similar_words = self.model.most_similar(positive=[self.model[self.secret_word]], topn=50) # Get top N to choose from

            # Find the first similar word that is in the filtered_vocabulary and not the secret word
            for word, similarity in similar_words:
                if word != self.secret_word and word in self.filtered_vocabulary:
                    return f"Hint: A similar word is '{word}' (Similarity: {similarity:.4f})"

            return "Could not find a suitable hint word."

        except KeyError:
            return "Error getting hint for the secret word."


# --- ---\
# Benzerlik Test Fonksiyonu (Türkçe kelimelerle güncellendi)
# --- ---
def test_similarity(engine):
    if not engine.model:
        print("Model not loaded, can't test.")
        return

    print("\n--- Türkçe Benzerlik Testleri ---")
    model = engine.model

    # Türkçe test çiftleri
    test_pairs = [
        ('kral', 'kraliçe'),   # İlgili
        ('çay', 'bardak'),     # Bağlamsal olarak ilgili
        ('sıcak', 'soğuk'),    # Zıt anlamlı (genellikle yakındır)
        ('kral', 'spatula')    # İlgisiz
    ]

    for w1, w2 in test_pairs:
        try:
            score = model.similarity(w1, w2)
            print(f"Benzerlik '{w1}' vs '{w2}': {score:.4f}")
        except KeyError:
            print(f"Test edilemiyor '{w1}' vs '{w2}': kelimelerden biri sözlükte yok.")

    print("------------------------\n")


# --- ---
# Ana Oyun Döngüsü
# --- ---
def play_game():
    print("Before running this, make sure you have installed 'gensim':")
    print("pip install gensim")
    print(f"And that '{MODEL_FILE}' exists.\n") # Updated message

    # Initialize the engine once outside the loop
    engine = ContextoEngine()

    if not engine.model:
        print("Oyun motoru başlatılamadı. Çıkılıyor.")
        return

    test_similarity(engine)

    while True:
        secret_word, rankings = engine.set_secret_word()
        if secret_word is None:
            print("Yeni oyun başlatılamadı. Çıkılıyor.")
            return

        guesses = {} # {word: rank}
        guess_count = 0
        is_game_over = False

        print("\n--- Yeni Oyun Başladı! (Türkçe) ---")
        print("Tahmin için bir kelime yazın. 'quit' yazarak çıkın, 'new' yazarak yeni oyuna başlayın.")
        print("Ipucu almak için 'hint' yazın.") # Added hint instruction

        while not is_game_over:
            try:
                user_input = input("\nTahmin: ").strip().lower()

                if user_input == 'quit':
                    print("Oynadığınız için teşekkürler!")
                    return
                if user_input == 'new':
                    print("Yeni oyun başlatılıyor...")
                    break
                if user_input == 'hint': # Added hint command
                    print(engine.get_hint())
                    continue


                # Sadece harf kontrolü (Türkçe karakterler dahil)
                turkish_letters = re.compile(r'^[a-zA-ZçÇğĞıİöÖşŞüÜ]+$')
                if not turkish_letters.match(user_input):
                    print("Lütfen sadece harf içeren geçerli bir kelime girin.")
                    continue

                if user_input in guesses:
                    print(f"'{user_input}' kelimesini zaten tahmin ettiniz (Sıra: {guesses[user_input]}).")
                    continue

                rank, error = engine.get_rank(user_input)
                guess_count += 1

                if error:
                    print(error)
                    continue

                guesses[user_input] = rank

                print("\n--- Tahminleriniz (En yakından uzağa) ---")
                sorted_guesses = sorted(guesses.items(), key=lambda item: item[1])

                for word, r in sorted_guesses:
                    prefix = "🎉" if r == 1 else "  "
                    print(f"{prefix} Sıra {r}: {word}")

                if rank == 1:
                    print(f"\n--- Tebrikler! ---")
                    print(f"Gizli kelimeyi buldunuz: '{secret_word.upper()}' ({guess_count} tahminde!)")
                    is_game_over = True

                    play_again = input("Tekrar oyna? (e/h): ").strip().lower()
                    if play_again != 'e':
                        print("Oynadığınız için teşekkürler!")
                        return
                    else:
                        break

            except EOFError:
                print("\nOyun kapatılıyor...")
                return
            except KeyboardInterrupt:
                print("\nOyun kapatılıyor...")
                return

# --- ---
# Script'i çalıştır
# --- ---
if __name__ == "__main__":
    play_game()

Before running this, make sure you have installed 'gensim':
pip install gensim
And that '/content/fastText/cc.tr.filtered.vec' exists.

Loading Turkish model from '/content/fastText/cc.tr.filtered.vec'...
--- Model yükleniyor ---
Model loaded successfully.

--- Türkçe Benzerlik Testleri ---
Benzerlik 'kral' vs 'kraliçe': 0.6781
Benzerlik 'çay' vs 'bardak': 0.6022
Benzerlik 'sıcak' vs 'soğuk': 0.8104
Test edilemiyor 'kral' vs 'spatula': kelimelerden biri sözlükte yok.
------------------------

New game started. The secret word has been chosen.
(Hint: The secret word is 'tapulu')
Calculating similarity rankings... (This can take a minute)


KeyboardInterrupt: 

In [20]:
from gensim.models import KeyedVectors
import random
import heapq
import re
import os

# --- ---
# Configuration
# --- ---
MODEL_FILE = '/content/fastText/cc.tr.filtered.vec'

# --- ---\
# Oyun Motoru (ContextoEngine)
# --- ---
class ContextoEngine:
    def __init__(self, model_path=MODEL_FILE):
        if not os.path.exists(model_path):
            print(f"Error: Model file '{model_path}' not found.")
            print("Please ensure the filtered model file exists.")
            self.model = None
            return

        print(f"Loading Turkish model from '{model_path}'...")
        print("--- Model yükleniyor ---")
        try:
            self.model = KeyedVectors.load_word2vec_format(model_path)
            print("Model loaded successfully.")

            self.vocabulary = list(self.model.index_to_key)

            turkish_letters = re.compile(r'^[a-zA-ZçÇğĞıİöÖşŞüÜ]+$')

            self.filtered_vocabulary = [
                word for word in self.vocabulary
                if len(word) > 3 and turkish_letters.match(word)
            ]
            if len(self.filtered_vocabulary) != len(self.vocabulary):
                 print(f"Warning: Additional filtering applied after loading. Original loaded vocab size: {len(self.vocabulary)}, Filtered usable words: {len(self.filtered_vocabulary)}")
            else:
                 print(f"Model vocabulary loaded ({len(self.filtered_vocabulary)} usable Turkish words).")


        except Exception as e:
            print(f"Error loading model: {e}")
            self.model = None
            self.filtered_vocabulary = []

    def set_secret_word(self, word=None):
        if not self.model or not self.filtered_vocabulary:
            print("Model not loaded or vocabulary empty, cannot set secret word.")
            return None, []


        if word is None:
            self.secret_word = random.choice(self.filtered_vocabulary)
        else:
            self.secret_word = word.lower()

        if self.secret_word not in self.model:
            print(f"Error: Secret word '{self.secret_word}' not in model vocabulary.")
            if self.filtered_vocabulary:
                self.secret_word = random.choice(self.filtered_vocabulary)
                print(f"Trying a different word from filtered vocabulary: '{self.secret_word}'")
            else:
                print("Filtered vocabulary is empty. Cannot select a secret word.")
                return None, []


        print(f"New game started. The secret word has been chosen.")
        print("Calculating similarity rankings... (This can take a minute)")

        all_word_distances = []
        for w in self.filtered_vocabulary:
            if w == self.secret_word:
                continue
            try:
                similarity = self.model.similarity(self.secret_word, w)
                distance = 1.0 - similarity
                all_word_distances.append((distance, w))
            except KeyError:
                print(f"Warning: Word '{w}' not found in model during ranking calculation.")
                continue

        all_word_distances.sort()

        self.ranked_list = {self.secret_word: 1}
        self.rank_to_word = {1: self.secret_word}

        rank = 2
        for distance, word in all_word_distances:
            if word not in self.ranked_list:
                self.ranked_list[word] = rank
                self.rank_to_word[rank] = word
                rank += 1

        print("Rankings calculated.")
        return self.secret_word, self.ranked_list

    def get_rank(self, word):
        word = word.lower()
        if not self.model:
             return None, "Oyun motoru başlatılmamış."

        if word not in self.model:
            return None, "Kelime modelin sözlüğünde yok."

        if word not in self.ranked_list:
            try:
                similarity = self.model.similarity(self.secret_word, word)
                return 999998, f"Kelime gizli kelimeye uzak. Benzerlik skoru: {similarity:.4f}"
            except KeyError:
                 return None, "Kelime modelin sözlüğünde yok (ikinci kontrol)."


        return self.ranked_list[word], None

    def get_hint(self):
        """Returns a word similar to the secret word as a hint."""
        if not self.model or not self.secret_word:
            return "Hint not available."

        try:
            similar_words = self.model.most_similar(positive=[self.model[self.secret_word]], topn=50)

            for word, similarity in similar_words:
                if word != self.secret_word and word in self.filtered_vocabulary:
                    return f"Hint: A similar word is '{word}' (Similarity: {similarity:.4f})"

            return "Could not find a suitable hint word."

        except KeyError:
            return "Error getting hint for the secret word."

# Initialize the engine once
print("Initializing Contexto Engine...")
engine = ContextoEngine()
if not engine.model:
    print("Failed to initialize engine. Game will not run.")

Initializing Contexto Engine...
Loading Turkish model from '/content/fastText/cc.tr.filtered.vec'...
--- Model yükleniyor ---
Model loaded successfully.


In [26]:
# --- ---\
# Benzerlik Test Fonksiyonu (Türkçe kelimelerle güncellendi)
# --- ---
def test_similarity(engine):
    if not engine.model:
        print("Model not loaded, can't test.")
        return

    print("\n--- Türkçe Benzerlik Testleri ---")
    model = engine.model

    # Türkçe test çiftleri
    test_pairs = [
        ('kral', 'kraliçe'),   # İlgili
        ('çay', 'bardak'),     # Bağlamsal olarak ilgili
        ('sıcak', 'soğuk'),    # Zıt anlamlı (genellikle yakındır)
        ('kral', 'spatula')    # İlgisiz
    ]

    for w1, w2 in test_pairs:
        try:
            score = model.similarity(w1, w2)
            print(f"Benzerlik '{w1}' vs '{w2}': {score:.4f}")
        except KeyError:
            print(f"Test edilemiyor '{w1}' vs '{w2}': kelimelerden biri sözlükte yok.")

    print("------------------------\n")


# --- ---
# Ana Oyun Döngüsü
# --- ---
def play_game():
    print("Before running this, make sure you have installed 'gensim':")
    print("pip install gensim")
    print(f"And that '{MODEL_FILE}' exists.\n")

    # The engine is now initialized in the previous cell

    if not engine.model:
        print("Oyun motoru başlatılamadı. Çıkılıyor.")
        return

    test_similarity(engine)

    while True:
        secret_word, rankings = engine.set_secret_word()
        if secret_word is None:
            print("Yeni oyun başlatılamadı. Çıkılıyor.")
            return

        guesses = {} # {word: rank}
        guess_count = 0
        is_game_over = False
        best_rank = float('inf') # Track the rank of the best guess

        print("\n--- Yeni Oyun Başladı! (Türkçe) ---")
        print("Tahmin için bir kelime yazın. 'quit' yazarak çıkın, 'new' yazarak yeni oyuna başlayın.")
        print("'hint' yazarak ipucu alın, 'giveup' yazarak oyunu bitirin.") # Added hint and giveup instruction

        while not is_game_over:
            try:
                user_input = input("\nTahmin: ").strip().lower()

                if user_input == 'quit':
                    print("Oynadığınız için teşekkürler!")
                    return
                if user_input == 'new':
                    print("Yeni oyun başlatılıyor...")
                    break
                if user_input == 'giveup': # Added giveup command
                    print(f"\n--- Oyunu bıraktınız ---")
                    print(f"Gizli kelime şuydu: '{secret_word.upper()}'")
                    is_game_over = True
                    play_again = input("Tekrar oyna? (e/h): ").strip().lower()
                    if play_again != 'e':
                        print("Oynadığınız için teşekkürler!")
                        return
                    else:
                        break
                if user_input == 'hint': # Added hint command
                    # Calculate the median rank between the secret word (rank 1) and the best guess
                    # Ensure best_rank is not infinity if no guesses have been made yet
                    current_best_rank = best_rank if best_rank != float('inf') else len(engine.ranked_list) # Use max rank if no guesses

                    median_rank = (1 + current_best_rank) // 2

                    # Define the hint interval around the median
                    interval_size = 30
                    half_interval = interval_size // 2

                    hint_range_start = max(2, median_rank - half_interval) # Start from rank 2, not below 1
                    hint_range_end = min(len(engine.ranked_list) + 1, median_rank + half_interval + 1) # End within bounds, +1 for exclusive range end

                    # Ensure the range is valid
                    if hint_range_end <= hint_range_start:
                         print("Henüz yeterince yakın bir kelime bulamadınız, veya en yakın kelime çok yakın. Başka ipucu bulunamadı.")
                    else:
                        # Find a random rank within the calculated interval
                        hint_rank = random.randint(hint_range_start, hint_range_end - 1)

                        # Get the word for that rank
                        hint_word = engine.rank_to_word.get(hint_rank)

                        if hint_word:
                            # Add the hinted word to guesses with its actual rank
                            if hint_word not in guesses:
                                 guesses[hint_word] = hint_rank
                                 print(f"Ipucu (Sıra {hint_rank}): '{hint_word}'")
                                 # Re-sort and display guesses after adding the hint
                                 print("\n--- Tahminleriniz (En yakından uzağa) ---")
                                 sorted_guesses = sorted(guesses.items(), key=lambda item: item[1])
                                 for word, r in sorted_guesses:
                                      prefix = "🎉" if r == 1 else "  "
                                      print(f"{prefix} Sıra {r}: {word}")
                            else:
                                 print(f"Ipucu: '{hint_word}' kelimesini zaten tahmin ettiniz (Sıra: {guesses[hint_word]}).")
                        else:
                             print("Ipucu bulunamadı.")
                    continue


                # Sadece harf kontrolü (Türkçe karakterler dahil)
                turkish_letters = re.compile(r'^[a-zA-ZçÇğĞıİöÖşŞüÜ]+$')
                if not turkish_letters.match(user_input):
                    print("Lütfen sadece harf içeren geçerli bir kelime girin.")
                    continue

                if user_input in guesses:
                    print(f"'{user_input}' kelimesini zaten tahmin ettiniz (Sıra: {guesses[user_input]}).")
                    continue

                rank, error = engine.get_rank(user_input)
                guess_count += 1

                if error:
                    print(error)
                    continue

                guesses[user_input] = rank

                # Update the best rank if the current guess is better
                if rank < best_rank:
                    best_rank = rank

                print("\n--- Tahminleriniz (En yakından uzağa) ---")
                sorted_guesses = sorted(guesses.items(), key=lambda item: item[1])

                for word, r in sorted_guesses:
                    prefix = "🎉" if r == 1 else "  "
                    print(f"{prefix} Sıra {r}: {word}")

                if rank == 1:
                    print(f"\n--- Tebrikler! ---")
                    print(f"Gizli kelimeyi buldunuz: '{secret_word.upper()}' ({guess_count} tahminde!)")
                    is_game_over = True

                    play_again = input("Tekrar oyna? (e/h): ").strip().lower()
                    if play_again != 'e':
                        print("Oynadığınız için teşekkürler!")
                        return
                    else:
                        break

            except EOFError:
                print("\nOyun kapatılıyor...")
                return
            except KeyboardInterrupt:
                print("\nOyun kapatılıyor...")
                return

# --- ---
# Script'i çalıştır
# --- ---
if __name__ == "__main__":
    play_game()

Before running this, make sure you have installed 'gensim':
pip install gensim
And that '/content/fastText/cc.tr.filtered.vec' exists.


--- Türkçe Benzerlik Testleri ---
Benzerlik 'kral' vs 'kraliçe': 0.6781
Benzerlik 'çay' vs 'bardak': 0.6022
Benzerlik 'sıcak' vs 'soğuk': 0.8104
Test edilemiyor 'kral' vs 'spatula': kelimelerden biri sözlükte yok.
------------------------

New game started. The secret word has been chosen.
Calculating similarity rankings... (This can take a minute)
Rankings calculated.

--- Yeni Oyun Başladı! (Türkçe) ---
Tahmin için bir kelime yazın. 'quit' yazarak çıkın, 'new' yazarak yeni oyuna başlayın.
'hint' yazarak ipucu alın, 'giveup' yazarak oyunu bitirin.

Tahmin: kral

--- Tahminleriniz (En yakından uzağa) ---
   Sıra 22360: kral

Tahmin: kitap

--- Tahminleriniz (En yakından uzağa) ---
   Sıra 18065: kitap
   Sıra 22360: kral

Tahmin: hint
Ipucu (Sıra 9024): 'Dayanılmaz'

--- Tahminleriniz (En yakından uzağa) ---
   Sıra 9024: Dayanılmaz
   Sıra 18065: kit

In [8]:
import os

MODEL_FILE = '/content/fastText/cc.tr.300.vec'

if not os.path.exists(MODEL_FILE):
    print(f"Error: Model file '{MODEL_FILE}' not found.")
else:
    with open(MODEL_FILE, 'r', encoding='utf-8') as f:
        first_line = f.readline().strip()
        parts = first_line.split()
        if len(parts) == 2:
            vocab_size = parts[0]
            vector_size = parts[1]
            print(f"The file '{MODEL_FILE}' contains approximately {vocab_size} words and each vector has a dimension of {vector_size}.")
        else:
            print(f"Could not parse the header line of the file: {first_line}")

The file '/content/fastText/cc.tr.300.vec' contains approximately 2000000 words and each vector has a dimension of 300.


In [13]:
import re
import os

# --- ---
# Configuration
# --- ---
MODEL_FILE = '/content/fastText/cc.tr.300.vec'
FILTERED_MODEL_FILE = '/content/fastText/cc.tr.filtered.vec' # New file name
DICTIONARY_FILE = '/content/words.txt' # Path to the uploaded dictionary file

# --- ---
# Load the Turkish Dictionary
# --- ---
turkish_words = set()
try:
    with open(DICTIONARY_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip().lower()
            if word: # Add non-empty lines
                turkish_words.add(word)
    print(f"Loaded {len(turkish_words)} words from '{DICTIONARY_FILE}'.")
except FileNotFoundError:
    print(f"Error: Dictionary file '{DICTIONARY_FILE}' not found.")
    # Exit or handle error appropriately if dictionary is essential
    turkish_words = None # Set to None to indicate failure


# --- ---
# Filtering Logic
# --- ---
def filter_word_with_dict(word, dictionary):
    """Filters words based on dictionary presence and length."""
    if dictionary is None: # If dictionary loading failed
        return False
    # Check if the word is in the dictionary AND has length 2 or more
    return word.lower() in dictionary and len(word) >= 2

# --- ---
# Process the file
# --- ---
if turkish_words is not None: # Only proceed if dictionary was loaded
    filtered_lines = []
    vector_size = None
    original_vocab_size = 0
    filtered_vocab_size = 0

    print(f"Reading from '{MODEL_FILE}' and filtering...")

    try:
        with open(MODEL_FILE, 'r', encoding='utf-8') as f:
            # Read the header line
            header = f.readline().strip()
            parts = header.split()
            if len(parts) == 2:
                original_vocab_size = int(parts[0])
                vector_size = int(parts[1])
                print(f"Original vocabulary size: {original_vocab_size}, Vector size: {vector_size}")
            else:
                print(f"Warning: Could not parse header line: {header}")
                # Try to infer vector size from the first word line if header is malformed
                try:
                    first_line = f.readline().strip()
                    parts = first_line.split()
                    if len(parts) > 1:
                        vector_size = len(parts) - 1
                        print(f"Inferred vector size: {vector_size}")
                        # Reset file pointer to the beginning to read from the start
                        f.seek(len(header.encode('utf-8')) + 1) # +1 for the newline character
                    else:
                         raise ValueError("Could not infer vector size from the first line.")
                except Exception as e:
                    print(f"Error reading first line after malformed header: {e}")
                    vector_size = 300 # Default to 300 if inference fails


            for line in f:
                parts = line.strip().split()
                if len(parts) > 1:
                    word = parts[0]
                    # Apply the new filtering logic
                    if filter_word_with_dict(word, turkish_words):
                        filtered_lines.append(line)
                        filtered_vocab_size += 1

        print(f"Filtering complete. {filtered_vocab_size} words kept.") # No longer reporting out of original_vocab_size accurately

        if vector_size is None:
             print("Error: Could not determine vector size. Cannot create filtered file.")
        elif filtered_vocab_size == 0:
             print("No words passed the filter. The filtered file will be empty.")
        else:
            print(f"Writing filtered data to '{FILTERED_MODEL_FILE}'...")
            with open(FILTERED_MODEL_FILE, 'w', encoding='utf-8') as f_out:
                # Write the new header
                f_out.write(f"{filtered_vocab_size} {vector_size}\n")
                # Write the filtered words and vectors
                for line in filtered_lines:
                    f_out.write(line)
            print("Filtered file created successfully.")

    except FileNotFoundError:
        print(f"Error: Model file '{MODEL_FILE}' not found.")
    except Exception as e:
        print(f"An error occurred during filtering: {e}")
else:
    print("Skipping filtering due to dictionary loading error.")

Loaded 63508 words from '/content/words.txt'.
Reading from '/content/fastText/cc.tr.300.vec' and filtering...
Original vocabulary size: 2000000, Vector size: 300
Filtering complete. 70142 words kept.
Writing filtered data to '/content/fastText/cc.tr.filtered.vec'...
Filtered file created successfully.
