<a href="https://colab.research.google.com/github/pasknk/contextotr/blob/main/contexto_temiz_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Filtering


In [None]:
!pip install gensim



FİLTERİNG

In [2]:
import re
import os

# --- ---
# Configuration
# --- ---
MODEL_FILE = '/content/fastText/cc.tr.300.vec'
FILTERED_MODEL_FILE = '/content/fastText/cc.tr.filtered.vec' # New file name
DICTIONARY_FILE = '/content/words.txt' # Path to the uploaded dictionary file

# --- ---
# Load the Turkish Dictionary
# --- ---
turkish_words = set()
try:
    with open(DICTIONARY_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip().lower()
            if word: # Add non-empty lines
                turkish_words.add(word)
    print(f"Loaded {len(turkish_words)} words from '{DICTIONARY_FILE}'.")
except FileNotFoundError:
    print(f"Error: Dictionary file '{DICTIONARY_FILE}' not found.")
    # Exit or handle error appropriately if dictionary is essential
    turkish_words = None # Set to None to indicate failure


# --- ---
# Filtering Logic
# --- ---
def filter_word_with_dict(word, dictionary):
    """Filters words based on dictionary presence and length."""
    if dictionary is None: # If dictionary loading failed
        return False
    # Check if the word is in the dictionary AND has length 2 or more
    return word.lower() in dictionary and len(word) >= 2

# --- ---
# Process the file
# --- ---
if turkish_words is not None: # Only proceed if dictionary was loaded
    filtered_lines = []
    vector_size = None
    original_vocab_size = 0
    filtered_vocab_size = 0

    print(f"Reading from '{MODEL_FILE}' and filtering...")

    try:
        with open(MODEL_FILE, 'r', encoding='utf-8') as f:
            # Read the header line
            header = f.readline().strip()
            parts = header.split()
            if len(parts) == 2:
                original_vocab_size = int(parts[0])
                vector_size = int(parts[1])
                print(f"Original vocabulary size: {original_vocab_size}, Vector size: {vector_size}")
            else:
                print(f"Warning: Could not parse header line: {header}")
                # Try to infer vector size from the first word line if header is malformed
                try:
                    first_line = f.readline().strip()
                    parts = first_line.split()
                    if len(parts) > 1:
                        vector_size = len(parts) - 1
                        print(f"Inferred vector size: {vector_size}")
                        # Reset file pointer to the beginning to read from the start
                        f.seek(len(header.encode('utf-8')) + 1) # +1 for the newline character
                    else:
                         raise ValueError("Could not infer vector size from the first line.")
                except Exception as e:
                    print(f"Error reading first line after malformed header: {e}")
                    vector_size = 300 # Default to 300 if inference fails


            for line in f:
                parts = line.strip().split()
                if len(parts) > 1:
                    word = parts[0]
                    # Apply the new filtering logic
                    if filter_word_with_dict(word, turkish_words):
                        filtered_lines.append(line)
                        filtered_vocab_size += 1

        print(f"Filtering complete. {filtered_vocab_size} words kept.") # No longer reporting out of original_vocab_size accurately

        if vector_size is None:
             print("Error: Could not determine vector size. Cannot create filtered file.")
        elif filtered_vocab_size == 0:
             print("No words passed the filter. The filtered file will be empty.")
        else:
            print(f"Writing filtered data to '{FILTERED_MODEL_FILE}'...")
            with open(FILTERED_MODEL_FILE, 'w', encoding='utf-8') as f_out:
                # Write the new header
                f_out.write(f"{filtered_vocab_size} {vector_size}\n")
                # Write the filtered words and vectors
                for line in filtered_lines:
                    f_out.write(line)
            print("Filtered file created successfully.")

    except FileNotFoundError:
        print(f"Error: Model file '{MODEL_FILE}' not found.")
    except Exception as e:
        print(f"An error occurred during filtering: {e}")
else:
    print("Skipping filtering due to dictionary loading error.")

Loaded 63508 words from '/content/words.txt'.
Reading from '/content/fastText/cc.tr.300.vec' and filtering...
Original vocabulary size: 2000000, Vector size: 300
Filtering complete. 70142 words kept.
Writing filtered data to '/content/fastText/cc.tr.filtered.vec'...
Filtered file created successfully.


ÇALIŞIYOR

In [9]:

# --- ---
# Ana Oyun Döngüsü
# --- ---
def play_game():
    print("Before running this, make sure you have installed 'gensim':")
    print("pip install gensim")
    print(f"And that '{MODEL_FILE}' exists.\n")

    # The engine is now initialized in the previous cell

    if not engine.model:
        print("Oyun motoru başlatılamadı. Çıkılıyor.")
        return

    test_similarity(engine)

    while True:
        secret_word, rankings = engine.set_secret_word()
        if secret_word is None:
            print("Yeni oyun başlatılamadı. Çıkılıyor.")
            return

        guesses = {} # {word: rank}
        guess_count = 0
        is_game_over = False
        best_rank = float('inf') # Track the rank of the best guess

        print("\n--- Yeni Oyun Başladı! (Türkçe) ---")
        print("Tahmin için bir kelime yazın. 'quit' yazarak çıkın, 'new' yazarak yeni oyuna başlayın.")
        print("'hint' yazarak ipucu alın, 'giveup' yazarak oyunu bitirin.") # Added hint and giveup instruction

        while not is_game_over:
            try:
                user_input = input("\nTahmin: ").strip().lower()

                if user_input == 'quit':
                    print("Oynadığınız için teşekkürler!")
                    return
                if user_input == 'new':
                    print("Yeni oyun başlatılıyor...")
                    break
                if user_input == 'giveup': # Added giveup command
                    print(f"\n--- Oyunu bıraktınız ---")
                    print(f"Gizli kelime şuydu: '{secret_word.upper()}'")
                    is_game_over = True
                    play_again = input("Tekrar oyna? (e/h): ").strip().lower()
                    if play_again != 'e':
                        print("Oynadığınız için teşekkürler!")
                        return
                    else:
                        break
                if user_input == 'hint': # Added hint command
                    # Calculate the median rank between the secret word (rank 1) and the best guess
                    # Ensure best_rank is not infinity if no guesses have been made yet
                    current_best_rank = best_rank if best_rank != float('inf') else len(engine.ranked_list) # Use max rank if no guesses

                    median_rank = (1 + current_best_rank) // 2

                    # Define the hint interval around the median
                    interval_size = 30
                    half_interval = interval_size // 2

                    hint_range_start = max(2, median_rank - half_interval) # Start from rank 2, not below 1
                    hint_range_end = min(len(engine.ranked_list) + 1, median_rank + half_interval + 1) # End within bounds, +1 for exclusive range end

                    # Ensure the range is valid
                    if hint_range_end <= hint_range_start:
                         print("Henüz yeterince yakın bir kelime bulamadınız, veya en yakın kelime çok yakın. Başka ipucu bulunamadı.")
                    else:
                        # Find a random rank within the calculated interval
                        hint_rank = random.randint(hint_range_start, hint_range_end - 1)

                        # Get the word for that rank
                        hint_word = engine.rank_to_word.get(hint_rank)

                        if hint_word:
                            # Add the hinted word to guesses with its actual rank
                            if hint_word not in guesses:
                                 guesses[hint_word] = hint_rank
                                 print(f"Ipucu (Sıra {hint_rank}): '{hint_word}'")

                                 # --- FIX: BEST_RANK GÜNCELLENDİ ---
                                 if hint_rank < best_rank:
                                     best_rank = hint_rank
                                 # --- BİTİŞ ---

                                 # Re-sort and display guesses after adding the hint
                                 print("\n--- Tahminleriniz (En yakından uzağa) ---")
                                 sorted_guesses = sorted(guesses.items(), key=lambda item: item[1])
                                 for word, r in sorted_guesses:
                                      prefix = "🎉" if r == 1 else "  "
                                      print(f"{prefix} Sıra {r}: {word}")
                            else:
                                 print(f"Ipucu: '{hint_word}' kelimesini zaten tahmin ettiniz (Sıra: {guesses[hint_word]}).")
                        else:
                             print("Ipucu bulunamadı.")
                    continue


                # Sadece harf kontrolü (Türkçe karakterler dahil)
                turkish_letters = re.compile(r'^[a-zA-ZçÇğĞıİöÖşŞüÜ]+$')
                if not turkish_letters.match(user_input):
                    print("Lütfen sadece harf içeren geçerli bir kelime girin.")
                    continue

                if user_input in guesses:
                    print(f"'{user_input}' kelimesini zaten tahmin ettiniz (Sıra: {guesses[user_input]}).")
                    continue

                rank, error = engine.get_rank(user_input)
                guess_count += 1

                if error:
                    print(error)
                    continue

                guesses[user_input] = rank

                # Update the best rank if the current guess is better
                if rank < best_rank:
                    best_rank = rank

                print("\n--- Tahminleriniz (En yakından uzağa) ---")
                sorted_guesses = sorted(guesses.items(), key=lambda item: item[1])

                for word, r in sorted_guesses:
                    prefix = "🎉" if r == 1 else "  "
                    print(f"{prefix} Sıra {r}: {word}")

                if rank == 1:
                    print(f"\n--- Tebrikler! ---")
                    print(f"Gizli kelimeyi buldunuz: '{secret_word.upper()}' ({guess_count} tahminde!)\")")
                    is_game_over = True

                    play_again = input("Tekrar oyna? (e/h): ").strip().lower()
                    if play_again != 'e':
                        print("Oynadığınız için teşekkürler!")
                        return
                    else:
                        break

            except EOFError:
                print("\nOyun kapatılıyor...")
                return
            except KeyboardInterrupt:
                print("\nOyun kapatılıyor...")
                return

# --- ---
# Script'i çalıştır
# --- ---
if __name__ == "__main__":
    play_game()


Before running this, make sure you have installed 'gensim':
pip install gensim
And that '/content/fastText/cc.tr.filtered.vec' exists.


--- Benzerlik Testi ---
'kedi' ve 'köpek' arasındaki benzerlik: 0.7857
'kedi' ve 'ev' arasındaki benzerlik: 0.2089
'köpek' ve 'kedi' arasındaki benzerlik: 0.7857
'köpek' ve 'ev' arasındaki benzerlik: 0.1508
'ev' ve 'kedi' arasındaki benzerlik: 0.2089
'ev' ve 'köpek' arasındaki benzerlik: 0.1508
--- Test Sonu ---


--- Yeni Oyun Başladı! (Türkçe) ---
Tahmin için bir kelime yazın. 'quit' yazarak çıkın, 'new' yazarak yeni oyuna başlayın.
'hint' yazarak ipucu alın, 'giveup' yazarak oyunu bitirin.

Oyun kapatılıyor...


In [10]:
import gensim.models

class WordVectorEngine:
    def __init__(self, model_path, dictionary_path):
        self.model = None
        self.dictionary = set()
        self.ranked_list = []
        self.rank_to_word = {}
        self.word_to_rank = {}

        self._load_dictionary(dictionary_path)
        if self.dictionary:
            self._load_model(model_path)

    def _load_dictionary(self, dictionary_path):
        try:
            with open(dictionary_path, 'r', encoding='utf-8') as f:
                for line in f:
                    word = line.strip().lower()
                    if word:
                        self.dictionary.add(word)
            print(f"Loaded {len(self.dictionary)} words from '{dictionary_path}'.")
        except FileNotFoundError:
            print(f"Error: Dictionary file '{dictionary_path}' not found.")
            self.dictionary = None # Indicate dictionary loading failed

    def _load_model(self, model_path):
        if self.dictionary is None:
            print("Skipping model loading due to dictionary error.")
            return

        try:
            # Load the model using gensim's KeyedVectors
            # Assumes the model file is in word2vec format
            self.model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=False, limit=100000) # Limiting for faster testing

            print(f"Model loaded successfully from '{model_path}'.")

            # Build the ranked list and word-to-rank mapping based on dictionary and model intersection
            print("Building ranked list and word-to-rank mapping...")
            self.ranked_list = []
            self.rank_to_word = {}
            self.word_to_rank = {}
            rank = 1
            for word in self.model.index_to_key: # Iterate through words in the loaded model
                 if word.lower() in self.dictionary: # Check if the word is in the dictionary
                    self.ranked_list.append(word)
                    self.rank_to_word[rank] = word
                    self.word_to_rank[word.lower()] = rank
                    rank += 1
            print(f"Ranked list built with {len(self.ranked_list)} words.")

        except FileNotFoundError:
            print(f"Error: Model file '{model_path}' not found.")
            self.model = None
        except Exception as e:
            print(f"An error occurred while loading the model: {e}")
            self.model = None

    def set_secret_word(self):
        if not self.model or not self.ranked_list:
            print("Model or ranked list not available to set a secret word.")
            return None, None

        # Select a random word from the ranked list (which is already filtered by dictionary)
        secret_word = random.choice(self.ranked_list)

        # Get the most similar words and their similarities
        try:
            similar_words = self.model.most_similar(secret_word, topn=len(self.ranked_list)) # Get similarities to all words in the ranked list
        except KeyError:
             print(f"Secret word '{secret_word}' not found in model vocabulary.")
             return None, None # Should not happen if secret_word is from self.ranked_list

        # Create a list of (word, rank) tuples for the secret word's similarity
        # The secret word itself will have rank 1
        rankings = [(secret_word, 1)]
        current_rank = 2
        for word, similarity in similar_words:
             if word.lower() in self.word_to_rank and word.lower() != secret_word.lower(): # Ensure word is in our filtered list and not the secret word itself
                 rankings.append((word, current_rank))
                 current_rank += 1
        return secret_word, rankings


    def get_rank(self, word):
        if not self.model or not self.ranked_list:
            return None, "Oyun motoru başlatılmamış."

        word_lower = word.lower()

        if word_lower not in self.word_to_rank:
            return None, f"'{word}' kelimesi sözlükte veya modelde bulunamadı."

        # Get the pre-calculated rank from the word_to_rank mapping
        rank = self.word_to_rank.get(word_lower)

        if rank is not None:
             return rank, None
        else:
             return None, f"'{word}' kelimesi için sıra bulunamadı." # Should not happen if word is in word_to_rank




# --- ---
# Initialize the game engine
# --- ---
# Use the filtered model file
MODEL_FILE = '/content/fastText/cc.tr.filtered.vec'
DICTIONARY_FILE = '/content/words.txt' # Path to the uploaded dictionary file

print("Initializing game engine...")
engine = WordVectorEngine(MODEL_FILE, DICTIONARY_FILE)
print("Engine initialization complete.")

Initializing game engine...
Loaded 63508 words from '/content/words.txt'.
Model loaded successfully from '/content/fastText/cc.tr.filtered.vec'.
Building ranked list and word-to-rank mapping...
Ranked list built with 70142 words.
Engine initialization complete.


In [11]:
import random
import re # Import re for the turkish_letters regex


# --- ---
# Helper function to test similarity
# --- ---
def test_similarity(engine):
     if not engine.model:
         print("Model not loaded for similarity test.")
         return

     test_words = ["kedi", "köpek", "ev"] # Example Turkish words
     print("\n--- Benzerlik Testi ---")
     for word1 in test_words:
         for word2 in test_words:
             if word1 != word2:
                 try:
                     similarity = engine.model.similarity(word1, word2)
                     print(f"'{word1}' ve '{word2}' arasındaki benzerlik: {similarity:.4f}")
                 except KeyError:
                     print(f"'{word1}' veya '{word2}' modelde bulunamadı.")
     print("--- Test Sonu ---\n")


# --- ---
# Ana Oyun Döngüsü
# --- ---
def play_game():
    print("Before running this, make sure you have installed 'gensim':")
    print("pip install gensim")
    # MODEL_FILE is defined in the previous cell where the engine is initialized.

    if not engine.model:
        print("Oyun motoru başlatılamadı. Çıkılıyor.")
        return

    test_similarity(engine)

    while True:
        secret_word, rankings = engine.set_secret_word()
        if secret_word is None:
            print("Yeni oyun başlatılamadı. Çıkılıyor.")
            return

        guesses = {} # {word: rank}
        guess_count = 0
        is_game_over = False
        best_rank = float('inf') # Track the rank of the best guess

        print("\n--- Yeni Oyun Başladı! (Türkçe) ---")
        print("Tahmin için bir kelime yazın. 'quit' yazarak çıkın, 'new' yazarak yeni oyuna başlayın.")
        print("'hint' yazarak ipucu alın, 'giveup' yazarak oyunu bitirin.") # Added hint and giveup instruction

        while not is_game_over:
            try:
                user_input = input("\nTahmin: ").strip().lower()

                if user_input == 'quit':
                    print("Oynadığınız için teşekkürler!")
                    return
                if user_input == 'new':
                    print("Yeni oyun başlatılıyor...")
                    break
                if user_input == 'giveup': # Added giveup command
                    print(f"\n--- Oyunu bıraktınız ---")
                    print(f"Gizli kelime şuydu: '{secret_word.upper()}'")
                    is_game_over = True
                    play_again = input("Tekrar oyna? (e/h): ").strip().lower()
                    if play_again != 'e':
                        print("Oynadığınız için teşekkürler!")
                        return
                    else:
                        break
                if user_input == 'hint': # Added hint command
                    # Calculate the median rank between the secret word (rank 1) and the best guess
                    # Ensure best_rank is not infinity if no guesses have been made yet
                    current_best_rank = best_rank if best_rank != float('inf') else len(engine.ranked_list) # Use max rank if no guesses

                    median_rank = (1 + current_best_rank) // 2

                    # Define the hint interval around the median
                    interval_size = 30
                    half_interval = interval_size // 2

                    hint_range_start = max(2, median_rank - half_interval) # Start from rank 2, not below 1
                    hint_range_end = min(len(engine.ranked_list) + 1, median_rank + half_interval + 1) # End within bounds, +1 for exclusive range end

                    # Ensure the range is valid
                    if hint_range_end <= hint_range_start:
                         print("Henüz yeterince yakın bir kelime bulamadınız, veya en yakın kelime çok yakın. Başka ipucu bulunamadı.")
                    else:
                        # Find a random rank within the calculated interval
                        hint_rank = random.randint(hint_range_start, hint_range_end - 1)

                        # Get the word for that rank
                        hint_word = engine.rank_to_word.get(hint_rank)

                        if hint_word:
                            # Add the hinted word to guesses with its actual rank
                            if hint_word not in guesses:
                                 guesses[hint_word] = hint_rank
                                 print(f"Ipucu (Sıra {hint_rank}): '{hint_word}'")

                                 # --- FIX: BEST_RANK GÜNCELLENDİ ---
                                 if hint_rank < best_rank:
                                     best_rank = hint_rank
                                 # --- BİTİŞ ---

                                 # Re-sort and display guesses after adding the hint
                                 print("\n--- Tahminleriniz (En yakından uzağa) ---")
                                 sorted_guesses = sorted(guesses.items(), key=lambda item: item[1])
                                 for word, r in sorted_guesses:
                                      prefix = "🎉" if r == 1 else "  "
                                      print(f"{prefix} Sıra {r}: {word}")
                            else:
                                 print(f"Ipucu: '{hint_word}' kelimesini zaten tahmin ettiniz (Sıra: {guesses[hint_word]}).")
                        else:
                             print("Ipucu bulunamadı.")
                    continue


                # Sadece harf kontrolü (Türkçe karakterler dahil)
                turkish_letters = re.compile(r'^[a-zA-ZçÇğĞıİöÖşŞüÜ]+$')
                if not turkish_letters.match(user_input):
                    print("Lütfen sadece harf içeren geçerli bir kelime girin.")
                    continue

                if user_input in guesses:
                    print(f"'{user_input}' kelimesini zaten tahmin ettiniz (Sıra: {guesses[user_input]}).")
                    continue

                rank, error = engine.get_rank(user_input)
                guess_count += 1

                if error:
                    print(error)
                    continue

                guesses[user_input] = rank

                # Update the best rank if the current guess is better
                if rank < best_rank:
                    best_rank = rank

                print("\n--- Tahminleriniz (En yakından uzağa) ---")
                sorted_guesses = sorted(guesses.items(), key=lambda item: item[1])

                for word, r in sorted_guesses:
                    prefix = "🎉" if r == 1 else "  "
                    print(f"{prefix} Sıra {r}: {word}")

                if rank == 1:
                    print(f"\n--- Tebrikler! ---")
                    print(f"Gizli kelimeyi buldunuz: '{secret_word.upper()}' ({guess_count} tahminde!)\")")
                    is_game_over = True

                    play_again = input("Tekrar oyna? (e/h): ").strip().lower()
                    if play_again != 'e':
                        print("Oynadığınız için teşekkürler!")
                        return
                    else:
                        break

            except EOFError:
                print("\nOyun kapatılıyor...")
                return
            except KeyboardInterrupt:
                print("\nOyun kapatılıyor...")
                return

# --- ---
# Script'i çalıştır
# --- ---
if __name__ == "__main__":
    play_game()

Before running this, make sure you have installed 'gensim':
pip install gensim

--- Benzerlik Testi ---
'kedi' ve 'köpek' arasındaki benzerlik: 0.7857
'kedi' ve 'ev' arasındaki benzerlik: 0.2089
'köpek' ve 'kedi' arasındaki benzerlik: 0.7857
'köpek' ve 'ev' arasındaki benzerlik: 0.1508
'ev' ve 'kedi' arasındaki benzerlik: 0.2089
'ev' ve 'köpek' arasındaki benzerlik: 0.1508
--- Test Sonu ---


--- Yeni Oyun Başladı! (Türkçe) ---
Tahmin için bir kelime yazın. 'quit' yazarak çıkın, 'new' yazarak yeni oyuna başlayın.
'hint' yazarak ipucu alın, 'giveup' yazarak oyunu bitirin.

Oyun kapatılıyor...
