In [1]:
import gensim
from gensim.models import Word2Vec
import pandas as pd
import re

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [3]:
parameters = [
    {'model_type': 'cbow', 'window': 2, 'vector_size': 100},
    {'model_type': 'skipgram', 'window': 2, 'vector_size': 100},
    {'model_type': 'cbow', 'window': 4, 'vector_size': 100},
    {'model_type': 'skipgram', 'window': 4, 'vector_size': 100},
    {'model_type': 'cbow', 'window': 2, 'vector_size': 300},
    {'model_type': 'skipgram', 'window': 2, 'vector_size': 300},
    {'model_type': 'cbow', 'window': 4, 'vector_size': 300},
    {'model_type': 'skipgram', 'window': 4, 'vector_size': 300}
]

In [11]:
df1 = pd.read_csv("lemmatized_sentences.csv")
df2 = pd.read_csv("stemmed_sentences.csv")

In [13]:
df1.columns = ["0"]

# NaN değerleri ve boş stringleri temizle
df1 = df1.dropna()
df1 = df1[df1["0"].str.strip() != ""]

df2.columns = ["0"]

# NaN değerleri ve boş stringleri temizle
df2 = df2.dropna()
df2 = df2[df2["0"].str.strip() != ""]

In [15]:
# Doğru tokenizasyon fonksiyonu
def proper_tokenize(text):
    # Özel karakterleri kaldır ve küçük harfe çevir
    text = re.sub(r'[^a-zA-ZğüşıöçĞÜŞİÖÇ\s]', '', text.lower())
    # NLTK ile tokenize et
    tokens = word_tokenize(text)
    # Stopwords'leri ve tek karakterli kelimeleri kaldır
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words and len(word) > 1]

In [17]:
# Doğru tokenizasyon uygula
df1['tokens'] = df1['0'].apply(proper_tokenize)
df2['tokens'] = df2['0'].apply(proper_tokenize)

In [19]:
# Token listelerini oluştur
tokenized_corpus_lemmatized = df1['tokens'].tolist()
tokenized_corpus_stemmed = df2['tokens'].tolist()

In [21]:
def train_and_save_model(corpus, param, model_prefix):
    model_type = param['model_type']
    vector_size = param['vector_size']
    window = param['window']
    
    # CBOW (sg=0) veya Skip-gram (sg=1)
    sg = 0 if model_type == 'cbow' else 1

    model = Word2Vec(
        sentences=corpus,
        vector_size=vector_size,
        window=window,
        min_count=1,
        workers=4,
        sg=sg
    )

    model_filename = f"{model_prefix}_{model_type}_vs{vector_size}_w{window}.model"
    model.save(model_filename)
    print(f"Model saved as {model_filename}")

In [23]:
# Lemmatize edilmiş corpus ile modelleri eğitme ve kaydetme
for param in parameters:
    train_and_save_model(tokenized_corpus_lemmatized, param, "lemmatized_model")

# Stemlenmiş corpus ile modelleri eğitme ve kaydetme
for param in parameters:
    train_and_save_model(tokenized_corpus_stemmed, param, "stemmed_model")

Model saved as lemmatized_model_cbow_vs100_w2.model
Model saved as lemmatized_model_skipgram_vs100_w2.model
Model saved as lemmatized_model_cbow_vs100_w4.model
Model saved as lemmatized_model_skipgram_vs100_w4.model
Model saved as lemmatized_model_cbow_vs300_w2.model
Model saved as lemmatized_model_skipgram_vs300_w2.model
Model saved as lemmatized_model_cbow_vs300_w4.model
Model saved as lemmatized_model_skipgram_vs300_w4.model
Model saved as stemmed_model_cbow_vs100_w2.model
Model saved as stemmed_model_skipgram_vs100_w2.model
Model saved as stemmed_model_cbow_vs100_w4.model
Model saved as stemmed_model_skipgram_vs100_w4.model
Model saved as stemmed_model_cbow_vs300_w2.model
Model saved as stemmed_model_skipgram_vs300_w2.model
Model saved as stemmed_model_cbow_vs300_w4.model
Model saved as stemmed_model_skipgram_vs300_w4.model


In [25]:
# Model dosyalarını yüklemek
model_1 = Word2Vec.load("lemmatized_model_cbow_vs100_w2.model")
model_2 = Word2Vec.load("lemmatized_model_cbow_vs100_w4.model")
model_3 = Word2Vec.load("lemmatized_model_cbow_vs300_w2.model")
model_4 = Word2Vec.load("lemmatized_model_cbow_vs300_w4.model")
model_5 = Word2Vec.load("lemmatized_model_skipgram_vs100_w2.model")
model_6 = Word2Vec.load("lemmatized_model_skipgram_vs100_w4.model")
model_7 = Word2Vec.load("lemmatized_model_skipgram_vs300_w2.model")
model_8 = Word2Vec.load("lemmatized_model_skipgram_vs300_w4.model")
model_9  = Word2Vec.load("stemmed_model_cbow_vs100_w2.model")
model_10 = Word2Vec.load("stemmed_model_cbow_vs100_w4.model")
model_11 = Word2Vec.load("stemmed_model_cbow_vs300_w2.model")
model_12 = Word2Vec.load("stemmed_model_cbow_vs300_w4.model")
model_13 = Word2Vec.load("stemmed_model_skipgram_vs100_w2.model")
model_14 = Word2Vec.load("stemmed_model_skipgram_vs100_w4.model")
model_15 = Word2Vec.load("stemmed_model_skipgram_vs300_w2.model")
model_16 = Word2Vec.load("stemmed_model_skipgram_vs300_w4.model")

In [27]:
# 'heart' kelimesi ile en benzer 3 kelimeyi ve skorlarını yazdırmak
def print_similar_words(model, model_name):
    similarity = model.wv.most_similar("heart", topn=3)
    print(f"\n{model_name} Modeli - 'heart' ile En Benzer 3 Kelime:")
    for word, score in similarity:
        print(f"Kelime: {word}, Benzerlik Skoru: {score}")

In [29]:
# 16 model için benzer kelimeleri yazdır
print_similar_words(model_1, "Lemmatized CBOW Window 2 Dim 100")
print_similar_words(model_2, "Stemmed Skipgram Window 4 Dim 100")
print_similar_words(model_3, "Lemmatized Skipgram Window 2 Dim 300")
print_similar_words(model_4, "lemmatized skipgram window 4 dim 100")
print_similar_words(model_5, "lemmatized cbow window 2 dim 300")
print_similar_words(model_6, "lemmatizedskipgramwindow 2 dim300")
print_similar_words(model_7, "lemmatized_cbow_window 4_dim300")
print_similar_words(model_8, "lemmatized_skipgram_window4_dim300.model")
print_similar_words(model_9, "stemmed_cbow_window2_dim100")
print_similar_words(model_10, "stemmed_skipgram_window2_dim100")
print_similar_words(model_11, "stemmed_cbow_window4_dim100")
print_similar_words(model_12, "stemmed_skipgram_window4_dim100")
print_similar_words(model_13, "stemmed_cbow_window2_dim300")
print_similar_words(model_14, "stemmed_skipgram_window2_dim300")
print_similar_words(model_15, "stemmed_cbow_window4_dim300")
print_similar_words(model_16, "stemmed_skipgram_window4_dim300")


Lemmatized CBOW Window 2 Dim 100 Modeli - 'heart' ile En Benzer 3 Kelime:
Kelime: toxic, Benzerlik Skoru: 0.2569843530654907
Kelime: cc, Benzerlik Skoru: 0.22399276494979858
Kelime: renal, Benzerlik Skoru: 0.18181410431861877

Stemmed Skipgram Window 4 Dim 100 Modeli - 'heart' ile En Benzer 3 Kelime:
Kelime: toxic, Benzerlik Skoru: 0.2554498612880707
Kelime: cc, Benzerlik Skoru: 0.22555696964263916
Kelime: renal, Benzerlik Skoru: 0.1849018782377243

Lemmatized Skipgram Window 2 Dim 300 Modeli - 'heart' ile En Benzer 3 Kelime:
Kelime: kidney, Benzerlik Skoru: 0.13194966316223145
Kelime: pleural, Benzerlik Skoru: 0.12958508729934692
Kelime: principal, Benzerlik Skoru: 0.11510880291461945

lemmatized skipgram window 4 dim 100 Modeli - 'heart' ile En Benzer 3 Kelime:
Kelime: kidney, Benzerlik Skoru: 0.13661669194698334
Kelime: pleural, Benzerlik Skoru: 0.12958277761936188
Kelime: principal, Benzerlik Skoru: 0.11802059412002563

lemmatized cbow window 2 dim 300 Modeli - 'heart' ile En Benz