In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [3]:
# Load the product dump from Google Drive and replace every missing value
# with an empty string in the same expression.
df = pd.read_json("/content/drive/MyDrive/mava/trendyol_limited.json").fillna("")

In [None]:
# Top-level classifier: predict `ana_kategori` from the product title.
# 20% of the rows are held out for evaluation; the fixed random_state keeps
# the split identical between runs. The split is not stratified.
X_train, X_test, y_train, y_test = train_test_split(
    df["title"], df["ana_kategori"], test_size=0.2, random_state=42
)

# TF-IDF over 1- and 2-grams (vocabulary capped at 20000 features) feeding a
# logistic regression limited to 200 solver iterations.
model_ana = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=20000)),
    ("clf", LogisticRegression(max_iter=200))
])

model_ana.fit(X_train, y_train)
y_pred = model_ana.predict(X_test)
print("ANA_KATEGORI Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Persist the fitted pipeline (relative path, i.e. the current working
# directory). `model_ana` also stays in the notebook namespace, where
# predict_hierarchical() reads it.
joblib.dump(model_ana, "model_ana.pkl")

ANA_KATEGORI Accuracy: 0.9142916310734902
                          precision    recall  f1-score   support

            Anne & Çocuk       0.85      0.80      0.82      8651
        Ayakkabı & Çanta       0.94      0.96      0.95      6273
              Elektronik       0.92      0.91      0.92     11572
                   Erkek       0.95      0.93      0.94     12875
            Ev & Mobilya       0.93      0.94      0.93     24360
                   Kadın       0.93      0.93      0.93     10394
Kitap & Kırtasiye & Hobi       0.88      0.92      0.90     22672
          Spor & Outdoor       0.93      0.87      0.90      5954
             Süpermarket       0.91      0.91      0.91     13142

                accuracy                           0.91    115893
               macro avg       0.92      0.91      0.91    115893
            weighted avg       0.91      0.91      0.91    115893



['model_ana.pkl']

In [None]:
# Second level: one `alt_kategori_1` classifier per `ana_kategori` value.
# Maps ana_kategori -> fitted pipeline.
models_alt1 = {}

for ana in df["ana_kategori"].unique():
    # Train only on the rows that belong to this top-level category.
    subset = df[df["ana_kategori"] == ana]
    # NOTE: this rebinds the notebook-global X_train/X_test/y_train/y_test
    # on every iteration; after the loop they describe the last category only.
    X_train, X_test, y_train, y_test = train_test_split(
        subset["title"], subset["alt_kategori_1"], test_size=0.2, random_state=42
    )

    model = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=20000)),
        ("clf", LogisticRegression(max_iter=200))
    ])

    model.fit(X_train, y_train)
    models_alt1[ana] = model

    print(f"[{ana}] ALT_KATEGORI_1 Accuracy:", accuracy_score(y_test, model.predict(X_test)))

# The whole dict of pipelines is stored in a single file.
joblib.dump(models_alt1, "models_alt1.pkl")

[Anne & Çocuk] ALT_KATEGORI_1 Accuracy: 0.9855089265012752
[Süpermarket] ALT_KATEGORI_1 Accuracy: 0.9200949973186241
[Ayakkabı & Çanta] ALT_KATEGORI_1 Accuracy: 0.9746855596242636
[Elektronik] ALT_KATEGORI_1 Accuracy: 0.9562874251497006
[Erkek] ALT_KATEGORI_1 Accuracy: 0.8992235903066426
[Kadın] ALT_KATEGORI_1 Accuracy: 0.9794612151318304
[Ev & Mobilya] ALT_KATEGORI_1 Accuracy: 0.9666228214403156
[Kitap & Kırtasiye & Hobi] ALT_KATEGORI_1 Accuracy: 0.8247445384073291
[Spor & Outdoor] ALT_KATEGORI_1 Accuracy: 0.8011344678011345


['models_alt1.pkl']

In [None]:
# Third level: one `alt_kategori_2` classifier per `alt_kategori_1` value.
# Maps alt_kategori_1 -> fitted pipeline, or -> the label itself when the
# group contains a single alt_kategori_2 value.
models_alt2 = {}

# NOTE(review): rows are selected by alt_kategori_1 across the whole frame,
# without regard to ana_kategori. If the same alt_kategori_1 label exists
# under several top-level categories their rows are pooled - confirm that
# this is intended.
for alt1 in df["alt_kategori_1"].unique():
    subset = df[df["alt_kategori_1"] == alt1]

    # A classifier needs at least two classes; with one class the label is
    # stored directly and predict_hierarchical() returns it as-is.
    if subset["alt_kategori_2"].nunique() < 2:
        tek_sinif = subset["alt_kategori_2"].iloc[0]
        print(f"[{alt1}] Tek sınıf bulundu: '{tek_sinif}' - Model eğitilmeyecek.")
        models_alt2[alt1] = tek_sinif
        continue

    # Rebinds the notebook-global split variables on every iteration.
    X_train, X_test, y_train, y_test = train_test_split(
        subset["title"], subset["alt_kategori_2"], test_size=0.2, random_state=42
    )

    model = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=20000)),
        ("clf", LogisticRegression(max_iter=200))
    ])

    model.fit(X_train, y_train)
    models_alt2[alt1] = model

    print(f"[{alt1}] ALT_KATEGORI_2 Accuracy:", accuracy_score(y_test, model.predict(X_test)))

# Values in the saved dict are a mix of pipelines and plain labels.
joblib.dump(models_alt2, "models_alt2.pkl")


[Bebek] ALT_KATEGORI_2 Accuracy: 0.9662113587347232
[Bebek Bakım] ALT_KATEGORI_2 Accuracy: 0.9666059502125076
[Oyuncak] ALT_KATEGORI_2 Accuracy: 0.7258498821945473
[Erkek Ayakkabı] ALT_KATEGORI_2 Accuracy: 0.965
[Kadın Ayakkabı] ALT_KATEGORI_2 Accuracy: 0.9604166666666667
[Çocuk Ayakkabı] ALT_KATEGORI_2 Accuracy: 0.9854368932038835
[Kadın Çanta] ALT_KATEGORI_2 Accuracy: 0.8705521472392638
[Çocuk Çanta] ALT_KATEGORI_2 Accuracy: 0.9821958456973294
[Beyaz Eşya] ALT_KATEGORI_2 Accuracy: 0.9353287012292891
[Dijital Kod & Ürünler] ALT_KATEGORI_2 Accuracy: 0.9597014925373134
[Foto & Kamera] ALT_KATEGORI_2 Accuracy: 0.8648208469055375
[Giyilebilir Teknoloji] ALT_KATEGORI_2 Accuracy: 0.8223938223938224
[Kişisel Bakım Aletleri] ALT_KATEGORI_2 Accuracy: 0.9230118443316413
[Küçük Ev Aletleri] ALT_KATEGORI_2 Accuracy: 0.8938112952316537
[Telefon] ALT_KATEGORI_2 Accuracy: 0.9819102749638206
[TV & Görüntü & Ses] ALT_KATEGORI_2 Accuracy: 0.7703081232492998
[Aksesuar] ALT_KATEGORI_2 Accuracy: 0.9831181

['models_alt2.pkl']

In [None]:
def predict_hierarchical(title):
    """Classify one product title through the three category levels.

    Uses the notebook-global ``model_ana``, ``models_alt1`` and
    ``models_alt2`` objects.

    Args:
        title: Product title to classify.

    Returns:
        tuple: ``(ana_kategori, alt_kategori_1, alt_kategori_2)`` predictions.

    Raises:
        KeyError: If a predicted label has no entry in ``models_alt1`` or
            ``models_alt2``.
    """
    batch = [title]  # the pipelines expect an iterable of documents

    ana_pred = model_ana.predict(batch)[0]
    alt1_pred = models_alt1[ana_pred].predict(batch)[0]

    # An entry is either a fitted pipeline or, for single-class groups,
    # the label itself stored as a plain string.
    alt2_entry = models_alt2[alt1_pred]
    alt2_pred = alt2_entry if isinstance(alt2_entry, str) else alt2_entry.predict(batch)[0]

    return ana_pred, alt1_pred, alt2_pred


In [None]:
# Spot check: run a handful of hand-written Turkish product titles through
# the three-level predictor and print each (ana, alt1, alt2) tuple.
l = ["mitoloji kitapları", "bebek bezi sleepy","kadın abiye elbise","tahıl cipsi","protein tozu çilekli","traş makinesi","çoklu bebek bezi"]
for i in l:
    print(predict_hierarchical(i))

('Kitap & Kırtasiye & Hobi', 'Din Ve Mitoloji', 'Mitoloji')
('Anne & Çocuk', 'Bebek Bakım', 'Bebek Bakım ve Kozmetik')
('Kadın', 'Giyim', 'Mezuniyet Elbiseleri')
('Süpermarket', 'Atıştırmalık', 'Cips')
('Spor & Outdoor', 'Sporcu Besinleri', 'Protein Tozu')
('Elektronik', 'Kişisel Bakım Aletleri', 'Tıraş Makinesi')
('Anne & Çocuk', 'Bebek Bakım', 'Bebek Bakım ve Kozmetik')


# FAZLA VERİ İLE - logreg

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib
import os

def retrain_models(model_dir="/content/drive/MyDrive/mava/models"):
    """Train and save the three-level TF-IDF + logistic-regression models.

    Works on the notebook-global DataFrame ``df`` and reads its ``title``,
    ``ana_kategori``, ``alt_kategori_1`` and ``alt_kategori_2`` columns.
    Three joblib files are written to ``model_dir``:

    * ``ana_kategori.pkl``  - pipeline predicting ``ana_kategori``.
    * ``alt1_kategori.pkl`` - dict ``ana_kategori -> pipeline`` predicting
      ``alt_kategori_1``.
    * ``alt2_kategori.pkl`` - dict ``alt_kategori_1 -> pipeline`` predicting
      ``alt_kategori_2``; the value is a plain ``str`` when the group holds
      a single label.

    Hold-out accuracy of every model and one end-to-end sample prediction
    are printed along the way.

    Args:
        model_dir: Directory that receives the ``.pkl`` files; created when
            missing. The default is the path this cell originally hard-coded.

    Returns:
        bool: ``True`` when the run reached the end, ``False`` when ``df`` is
        not defined or an exception was caught (its traceback is printed).
    """
    if 'df' not in globals():
        # Previously a silent False; say why nothing was trained.
        print("Genel hata: 'df' tanımlı değil; önce veri setini yükleyin.")
        return False

    os.makedirs(model_dir, exist_ok=True)

    try:
        # fillna must run before astype(str): casting first turns NaN into
        # the string "nan", after which fillna has nothing left to replace.
        X = df["title"].fillna("").astype(str).tolist()
        y = df["ana_kategori"].fillna("Genel").astype(str).tolist()

        print(f"Benzersiz ana kategori sayısı: {len(set(y))}")

        X_train, X_test, y_train, y_test = _stratified_split(X, y)

        model_ana = _tfidf_logreg_pipeline(max_features=15000, min_df=2)
        model_ana.fit(X_train, y_train)

        accuracy = accuracy_score(y_test, model_ana.predict(X_test))
        print(f"Ana model accuracy: {accuracy:.4f}")

        test_pred = model_ana.predict(["iPhone 14 Pro Max Akıllı Telefon"])[0]
        print(f"Test tahmini: {test_pred}")

        joblib.dump(model_ana, os.path.join(model_dir, "ana_kategori.pkl"))

        # ---- level 2: one alt_kategori_1 model per ana_kategori ----------
        models_alt1 = {}

        for ana_kat in df["ana_kategori"].unique():
            if pd.isna(ana_kat) or ana_kat == "":
                continue

            subset = df[df["ana_kategori"] == ana_kat]

            if len(subset) < 20:
                print(f"[{ana_kat}] Yetersiz veri: {len(subset)} satır")
                continue

            if subset["alt_kategori_1"].nunique() < 2:
                # Opening bracket was missing in the original message.
                print(f"[{ana_kat}] Tek sınıf var")
                continue

            try:
                X_sub = subset["title"].fillna("").astype(str).tolist()
                y_sub = subset["alt_kategori_1"].fillna("Genel").astype(str).tolist()

                X_train_sub, X_test_sub, y_train_sub, y_test_sub = _stratified_split(X_sub, y_sub)

                model_sub = _tfidf_logreg_pipeline(max_features=8000, min_df=1)
                model_sub.fit(X_train_sub, y_train_sub)
                models_alt1[ana_kat] = model_sub

                acc = accuracy_score(y_test_sub, model_sub.predict(X_test_sub))
                print(f"[{ana_kat}] Alt1 accuracy: {acc:.4f}")

            except Exception as e:
                # Best effort: a failing category (e.g. a class too small to
                # stratify) is reported and skipped, the rest still train.
                print(f"[{ana_kat}] Alt1 hata: {e}")

        joblib.dump(models_alt1, os.path.join(model_dir, "alt1_kategori.pkl"))

        # ---- level 3: one alt_kategori_2 model per alt_kategori_1 --------
        # NOTE(review): groups are keyed by alt_kategori_1 alone, so equal
        # labels under different ana_kategori values are pooled - confirm
        # that this is intended.
        models_alt2 = {}

        for alt1_kat in df["alt_kategori_1"].unique():
            if pd.isna(alt1_kat) or alt1_kat == "":
                continue

            subset = df[df["alt_kategori_1"] == alt1_kat]

            if subset["alt_kategori_2"].nunique() < 2:
                # Single label: store it directly instead of a classifier.
                tek_sinif = subset["alt_kategori_2"].iloc[0]
                if pd.notna(tek_sinif) and tek_sinif != "":
                    models_alt2[alt1_kat] = str(tek_sinif)
                    print(f"[{alt1_kat}] -> '{tek_sinif}' (string)")
                continue

            if len(subset) < 15:
                print(f"[{alt1_kat}] Yetersiz veri: {len(subset)} satır")
                continue

            try:
                X_sub = subset["title"].fillna("").astype(str).tolist()
                y_sub = subset["alt_kategori_2"].fillna("Genel").astype(str).tolist()

                X_train_sub, X_test_sub, y_train_sub, y_test_sub = _stratified_split(X_sub, y_sub)

                model_sub = _tfidf_logreg_pipeline(max_features=5000, min_df=1)
                model_sub.fit(X_train_sub, y_train_sub)
                models_alt2[alt1_kat] = model_sub

                acc = accuracy_score(y_test_sub, model_sub.predict(X_test_sub))
                print(f"[{alt1_kat}] Alt2 accuracy: {acc:.4f}")

            except Exception as e:
                print(f"[{alt1_kat}] Alt2 hata: {e}")

        joblib.dump(models_alt2, os.path.join(model_dir, "alt2_kategori.pkl"))

        _report_smoke_test(model_ana, models_alt1, models_alt2, "Apple iPhone 14 Pro Max 256GB")

        return True

    except Exception as e:
        print(f"Genel hata: {e}")
        import traceback
        print(traceback.format_exc())
        return False


def _stratified_split(X, y):
    """Return an 80/20 train/test split stratified on ``y`` with seed 42."""
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


def _tfidf_logreg_pipeline(max_features, min_df):
    """Build the unfitted TF-IDF (1-2 grams) + balanced logistic-regression pipeline."""
    return Pipeline([
        ("tfidf", TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=max_features,
            min_df=min_df,
            max_df=0.95,
            strip_accents='unicode',
            lowercase=True
        )),
        ("clf", LogisticRegression(
            max_iter=1000,
            class_weight='balanced',
            random_state=42,
            solver='lbfgs'
        ))
    ])


def _report_smoke_test(model_ana, models_alt1, models_alt2, test_title):
    """Print the prediction chain for ``test_title`` through all three levels."""
    ana_pred = model_ana.predict([test_title])[0]
    print(f"Ana: {ana_pred}")

    if ana_pred not in models_alt1:
        print(f"Alt1 anahtarı bulunamadı: {ana_pred}")
        return

    alt1_pred = models_alt1[ana_pred].predict([test_title])[0]
    print(f"Alt1: {alt1_pred}")

    if alt1_pred not in models_alt2:
        print(f"Alt2 anahtarı bulunamadı: {alt1_pred}")
        return

    alt2_entry = models_alt2[alt1_pred]
    if isinstance(alt2_entry, str):
        alt2_pred = alt2_entry
        print(f"Alt2: {alt2_pred} (string)")
    else:
        alt2_pred = alt2_entry.predict([test_title])[0]
        print(f"Alt2: {alt2_pred} (model)")

    print(f"BAŞARILI: [{ana_pred}, {alt1_pred}, {alt2_pred}]")

# Run the retraining when this cell executes. In a notebook kernel the module
# name is "__main__", so the guard is satisfied; it only prevents the call
# if this code is imported from a .py module.
if __name__ == "__main__":
    retrain_models()

Benzersiz ana kategori sayısı: 9
Ana model accuracy: 0.9086
Test tahmini: Elektronik
[Anne & Çocuk] Alt1 accuracy: 0.9882
[Süpermarket] Alt1 accuracy: 0.9185
[Ayakkabı & Çanta] Alt1 accuracy: 0.9698
[Elektronik] Alt1 accuracy: 0.9542
[Erkek] Alt1 accuracy: 0.9133
[Kadın] Alt1 accuracy: 0.9761
[Ev & Mobilya] Alt1 accuracy: 0.9587
[Kitap & Kırtasiye & Hobi] Alt1 accuracy: 0.7825
[Spor & Outdoor] Alt1 accuracy: 0.7598
[Bebek] Alt2 accuracy: 0.9684
[Bebek Bakım] Alt2 accuracy: 0.9611
[Oyuncak] Alt2 accuracy: 0.7102
[Erkek Ayakkabı] Alt2 accuracy: 0.9645
[Kadın Ayakkabı] Alt2 accuracy: 0.9833
[Çocuk Ayakkabı] Alt2 accuracy: 1.0000
[Kadın Çanta] Alt2 accuracy: 0.8641
[Çocuk Çanta] Alt2 accuracy: 0.9733
[Beyaz Eşya] Alt2 accuracy: 0.9289
[Dijital Kod & Ürünler] Alt2 accuracy: 0.9582
[Foto & Kamera] Alt2 accuracy: 0.8909
[Giyilebilir Teknoloji] Alt2 accuracy: 0.7780
[Kişisel Bakım Aletleri] Alt2 accuracy: 0.9179
[Küçük Ev Aletleri] Alt2 accuracy: 0.8938
[Telefon] Alt2 accuracy: 0.9877
[TV & Gö

# AZ VERİ İLE - logreg

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib
import os

def retrain_models(model_dir="/content/drive/MyDrive/mava/models2"):
    """Train and save the three-level TF-IDF + logistic-regression models.

    Works on the notebook-global DataFrame ``df`` and reads its ``title``,
    ``ana_kategori``, ``alt_kategori_1`` and ``alt_kategori_2`` columns.
    Three joblib files are written to ``model_dir``:

    * ``ana_kategori.pkl``  - pipeline predicting ``ana_kategori``.
    * ``alt1_kategori.pkl`` - dict ``ana_kategori -> pipeline`` predicting
      ``alt_kategori_1``.
    * ``alt2_kategori.pkl`` - dict ``alt_kategori_1 -> pipeline`` predicting
      ``alt_kategori_2``; the value is a plain ``str`` when the group holds
      a single label.

    Hold-out accuracy of every model and one end-to-end sample prediction
    are printed along the way.

    Args:
        model_dir: Directory that receives the ``.pkl`` files; created when
            missing. The default is the path this cell originally hard-coded.

    Returns:
        bool: ``True`` when the run reached the end, ``False`` when ``df`` is
        not defined or an exception was caught (its traceback is printed).
    """
    if 'df' not in globals():
        # Previously a silent False; say why nothing was trained.
        print("Genel hata: 'df' tanımlı değil; önce veri setini yükleyin.")
        return False

    os.makedirs(model_dir, exist_ok=True)

    try:
        # fillna must run before astype(str): casting first turns NaN into
        # the string "nan", after which fillna has nothing left to replace.
        X = df["title"].fillna("").astype(str).tolist()
        y = df["ana_kategori"].fillna("Genel").astype(str).tolist()

        print(f"Benzersiz ana kategori sayısı: {len(set(y))}")

        X_train, X_test, y_train, y_test = _stratified_split(X, y)

        model_ana = _tfidf_logreg_pipeline(max_features=15000, min_df=2)
        model_ana.fit(X_train, y_train)

        accuracy = accuracy_score(y_test, model_ana.predict(X_test))
        print(f"Ana model accuracy: {accuracy:.4f}")

        test_pred = model_ana.predict(["iPhone 14 Pro Max Akıllı Telefon"])[0]
        print(f"Test tahmini: {test_pred}")

        joblib.dump(model_ana, os.path.join(model_dir, "ana_kategori.pkl"))

        # ---- level 2: one alt_kategori_1 model per ana_kategori ----------
        models_alt1 = {}

        for ana_kat in df["ana_kategori"].unique():
            if pd.isna(ana_kat) or ana_kat == "":
                continue

            subset = df[df["ana_kategori"] == ana_kat]

            if len(subset) < 20:
                print(f"[{ana_kat}] Yetersiz veri: {len(subset)} satır")
                continue

            if subset["alt_kategori_1"].nunique() < 2:
                # Opening bracket was missing in the original message.
                print(f"[{ana_kat}] Tek sınıf var")
                continue

            try:
                X_sub = subset["title"].fillna("").astype(str).tolist()
                y_sub = subset["alt_kategori_1"].fillna("Genel").astype(str).tolist()

                X_train_sub, X_test_sub, y_train_sub, y_test_sub = _stratified_split(X_sub, y_sub)

                model_sub = _tfidf_logreg_pipeline(max_features=8000, min_df=1)
                model_sub.fit(X_train_sub, y_train_sub)
                models_alt1[ana_kat] = model_sub

                acc = accuracy_score(y_test_sub, model_sub.predict(X_test_sub))
                print(f"[{ana_kat}] Alt1 accuracy: {acc:.4f}")

            except Exception as e:
                # Best effort: a failing category (e.g. a class too small to
                # stratify) is reported and skipped, the rest still train.
                print(f"[{ana_kat}] Alt1 hata: {e}")

        joblib.dump(models_alt1, os.path.join(model_dir, "alt1_kategori.pkl"))

        # ---- level 3: one alt_kategori_2 model per alt_kategori_1 --------
        # NOTE(review): groups are keyed by alt_kategori_1 alone, so equal
        # labels under different ana_kategori values are pooled - confirm
        # that this is intended.
        models_alt2 = {}

        for alt1_kat in df["alt_kategori_1"].unique():
            if pd.isna(alt1_kat) or alt1_kat == "":
                continue

            subset = df[df["alt_kategori_1"] == alt1_kat]

            if subset["alt_kategori_2"].nunique() < 2:
                # Single label: store it directly instead of a classifier.
                tek_sinif = subset["alt_kategori_2"].iloc[0]
                if pd.notna(tek_sinif) and tek_sinif != "":
                    models_alt2[alt1_kat] = str(tek_sinif)
                    print(f"[{alt1_kat}] -> '{tek_sinif}' (string)")
                continue

            if len(subset) < 15:
                print(f"[{alt1_kat}] Yetersiz veri: {len(subset)} satır")
                continue

            try:
                X_sub = subset["title"].fillna("").astype(str).tolist()
                y_sub = subset["alt_kategori_2"].fillna("Genel").astype(str).tolist()

                X_train_sub, X_test_sub, y_train_sub, y_test_sub = _stratified_split(X_sub, y_sub)

                model_sub = _tfidf_logreg_pipeline(max_features=5000, min_df=1)
                model_sub.fit(X_train_sub, y_train_sub)
                models_alt2[alt1_kat] = model_sub

                acc = accuracy_score(y_test_sub, model_sub.predict(X_test_sub))
                print(f"[{alt1_kat}] Alt2 accuracy: {acc:.4f}")

            except Exception as e:
                print(f"[{alt1_kat}] Alt2 hata: {e}")

        joblib.dump(models_alt2, os.path.join(model_dir, "alt2_kategori.pkl"))

        _report_smoke_test(model_ana, models_alt1, models_alt2, "Apple iPhone 14 Pro Max 256GB")

        return True

    except Exception as e:
        print(f"Genel hata: {e}")
        import traceback
        print(traceback.format_exc())
        return False


def _stratified_split(X, y):
    """Return an 80/20 train/test split stratified on ``y`` with seed 42."""
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


def _tfidf_logreg_pipeline(max_features, min_df):
    """Build the unfitted TF-IDF (1-2 grams) + balanced logistic-regression pipeline."""
    return Pipeline([
        ("tfidf", TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=max_features,
            min_df=min_df,
            max_df=0.95,
            strip_accents='unicode',
            lowercase=True
        )),
        ("clf", LogisticRegression(
            max_iter=1000,
            class_weight='balanced',
            random_state=42,
            solver='lbfgs'
        ))
    ])


def _report_smoke_test(model_ana, models_alt1, models_alt2, test_title):
    """Print the prediction chain for ``test_title`` through all three levels."""
    ana_pred = model_ana.predict([test_title])[0]
    print(f"Ana: {ana_pred}")

    if ana_pred not in models_alt1:
        print(f"Alt1 anahtarı bulunamadı: {ana_pred}")
        return

    alt1_pred = models_alt1[ana_pred].predict([test_title])[0]
    print(f"Alt1: {alt1_pred}")

    if alt1_pred not in models_alt2:
        print(f"Alt2 anahtarı bulunamadı: {alt1_pred}")
        return

    alt2_entry = models_alt2[alt1_pred]
    if isinstance(alt2_entry, str):
        alt2_pred = alt2_entry
        print(f"Alt2: {alt2_pred} (string)")
    else:
        alt2_pred = alt2_entry.predict([test_title])[0]
        print(f"Alt2: {alt2_pred} (model)")

    print(f"BAŞARILI: [{ana_pred}, {alt1_pred}, {alt2_pred}]")

# Run the retraining when this cell executes. In a notebook kernel the module
# name is "__main__", so the guard is satisfied; it only prevents the call
# if this code is imported from a .py module.
if __name__ == "__main__":
    retrain_models()

Benzersiz ana kategori sayısı: 9
Ana model accuracy: 0.9121
Test tahmini: Elektronik
[Anne & Çocuk] Alt1 accuracy: 0.9888
[Süpermarket] Alt1 accuracy: 0.9170
[Ayakkabı & Çanta] Alt1 accuracy: 0.9721
[Elektronik] Alt1 accuracy: 0.9477
[Erkek] Alt1 accuracy: 0.9108
[Kadın] Alt1 accuracy: 0.9798
[Ev & Mobilya] Alt1 accuracy: 0.9628
[Kitap & Kırtasiye & Hobi] Alt1 accuracy: 0.7930
[Spor & Outdoor] Alt1 accuracy: 0.8142
[Bebek] Alt2 accuracy: 0.9874
[Bebek Bakım] Alt2 accuracy: 0.9587
[Oyuncak] Alt2 accuracy: 0.7195
[Erkek Ayakkabı] Alt2 accuracy: 0.9650
[Kadın Ayakkabı] Alt2 accuracy: 0.9714
[Çocuk Ayakkabı] Alt2 accuracy: 1.0000
[Kadın Çanta] Alt2 accuracy: 0.8859
[Çocuk Çanta] -> 'Beslenme Çantası' (string)
[Beyaz Eşya] Alt2 accuracy: 0.9567
[Dijital Kod & Ürünler] Alt2 accuracy: 0.9528
[Foto & Kamera] Alt2 accuracy: 0.8816
[Giyilebilir Teknoloji] Alt2 accuracy: 0.8428
[Kişisel Bakım Aletleri] Alt2 accuracy: 0.9202
[Küçük Ev Aletleri] Alt2 accuracy: 0.8727
[Telefon] Alt2 accuracy: 0.9885

In [8]:
!pip install gensim nltk



In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib
import os
from gensim.utils import simple_preprocess
from nltk.tokenize import word_tokenize
import nltk
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec
import re

class TurkishWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that embeds a text as its mean Word2Vec vector.

    ``fit`` trains a gensim ``Word2Vec`` model on the tokenised input texts.
    ``transform`` maps every text to the average of the vectors of its
    in-vocabulary tokens, or to an all-zero vector when none is known.

    Args:
        vector_size: Dimensionality of the word vectors (and of the output).
        window: Context window passed to ``Word2Vec``.
        min_count: Minimum token frequency passed to ``Word2Vec``.
        workers: Number of training threads passed to ``Word2Vec``.

    Attributes:
        model: The fitted ``Word2Vec`` instance; ``None`` until ``fit`` runs.
    """

    def __init__(self, vector_size=100, window=5, min_count=2, workers=4):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        # Set by fit(); kept under this name so existing pickles stay loadable.
        self.model = None

    def turkish_preprocess(self, text):
        """Tokenise ``text``: lower-case, drop punctuation and digits, split.

        Returns:
            list[str]: Whitespace-separated tokens; may be empty.
        """
        # NOTE(review): str.lower() follows default Unicode casing, not
        # Turkish rules ("I" becomes "i", not "ı") - confirm this is
        # acceptable for the product titles.
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        return text.split()

    def fit(self, X, y=None):
        """Train the Word2Vec model on the tokenised texts in ``X``.

        Args:
            X: Iterable of strings.
            y: Ignored; present for scikit-learn pipeline compatibility.

        Returns:
            TurkishWord2VecVectorizer: ``self``.
        """
        sentences = [self.turkish_preprocess(text) for text in X]
        self.model = Word2Vec(
            sentences=sentences,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers
        )
        return self

    def transform(self, X):
        """Embed every text in ``X``.

        Returns:
            numpy.ndarray: Array of shape ``(len(X), vector_size)``.

        Raises:
            sklearn.exceptions.NotFittedError: If ``fit`` has not been called.
        """
        self._require_fitted()
        rows = [
            self.get_sentence_vector(self.turkish_preprocess(text))
            for text in X
        ]
        if not rows:
            # np.array([]) would be 1-D with shape (0,); keep the output 2-D.
            return np.empty((0, self.vector_size))
        return np.array(rows)

    def get_sentence_vector(self, words):
        """Average the vectors of the known tokens in ``words``.

        Args:
            words: List of tokens as produced by ``turkish_preprocess``.

        Returns:
            numpy.ndarray: Mean vector, or zeros of length ``vector_size``
            when no token is in the trained vocabulary.

        Raises:
            sklearn.exceptions.NotFittedError: If ``fit`` has not been called.
        """
        self._require_fitted()
        keyed_vectors = self.model.wv
        vectors = [keyed_vectors[word] for word in words if word in keyed_vectors]
        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(self.vector_size)

    def _require_fitted(self):
        """Raise a clear error instead of ``'NoneType' has no attribute 'wv'``."""
        if self.model is None:
            # NotFittedError subclasses AttributeError, so code that caught
            # the old failure keeps working.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "TurkishWord2VecVectorizer is not fitted yet; call fit() first."
            )

def retrain_models_with_word2vec(model_dir="/content/drive/MyDrive/mava/models3_word2vec"):
    """Train and save the three-level Word2Vec + logistic-regression models.

    Works on the notebook-global DataFrame ``df`` and reads its ``title``,
    ``ana_kategori``, ``alt_kategori_1`` and ``alt_kategori_2`` columns.
    Every model is a pipeline of ``TurkishWord2VecVectorizer`` followed by a
    class-balanced logistic regression. Three joblib files are written to
    ``model_dir``:

    * ``ana_kategori.pkl``  - pipeline predicting ``ana_kategori``
      (100-dimensional vectors).
    * ``alt1_kategori.pkl`` - dict ``ana_kategori -> pipeline`` predicting
      ``alt_kategori_1`` (80-dimensional vectors).
    * ``alt2_kategori.pkl`` - dict ``alt_kategori_1 -> pipeline`` predicting
      ``alt_kategori_2`` (60-dimensional vectors); the value is a plain
      ``str`` when the group holds a single label.

    Hold-out accuracy of every model and one end-to-end sample prediction
    are printed along the way.

    Args:
        model_dir: Directory that receives the ``.pkl`` files; created when
            missing. The default is the path this cell originally hard-coded.

    Returns:
        bool: ``True`` when the run reached the end, ``False`` when ``df`` is
        not defined or an exception was caught (its traceback is printed).
    """
    if 'df' not in globals():
        # Previously a silent False; say why nothing was trained.
        print("Genel hata: 'df' tanımlı değil; önce veri setini yükleyin.")
        return False

    os.makedirs(model_dir, exist_ok=True)

    try:
        # fillna must run before astype(str): casting first turns NaN into
        # the string "nan", after which fillna has nothing left to replace.
        X = df["title"].fillna("").astype(str).tolist()
        y = df["ana_kategori"].fillna("Genel").astype(str).tolist()

        print(f"Benzersiz ana kategori sayısı: {len(set(y))}")

        X_train, X_test, y_train, y_test = _stratified_split_w2v(X, y)

        model_ana = _w2v_logreg_pipeline(vector_size=100)
        model_ana.fit(X_train, y_train)

        accuracy = accuracy_score(y_test, model_ana.predict(X_test))
        print(f"Ana model accuracy: {accuracy:.4f}")

        test_pred = model_ana.predict(["iPhone 14 Pro Max Akıllı Telefon"])[0]
        print(f"Test tahmini: {test_pred}")

        joblib.dump(model_ana, os.path.join(model_dir, "ana_kategori.pkl"))

        # ---- level 2: one alt_kategori_1 model per ana_kategori ----------
        models_alt1 = {}

        for ana_kat in df["ana_kategori"].unique():
            if pd.isna(ana_kat) or ana_kat == "":
                continue

            subset = df[df["ana_kategori"] == ana_kat]

            if len(subset) < 20:
                print(f"[{ana_kat}] Yetersiz veri: {len(subset)} satır")
                continue

            if subset["alt_kategori_1"].nunique() < 2:
                # Opening bracket was missing in the original message.
                print(f"[{ana_kat}] Tek sınıf var")
                continue

            try:
                X_sub = subset["title"].fillna("").astype(str).tolist()
                y_sub = subset["alt_kategori_1"].fillna("Genel").astype(str).tolist()

                X_train_sub, X_test_sub, y_train_sub, y_test_sub = _stratified_split_w2v(X_sub, y_sub)

                model_sub = _w2v_logreg_pipeline(vector_size=80)
                model_sub.fit(X_train_sub, y_train_sub)
                models_alt1[ana_kat] = model_sub

                acc = accuracy_score(y_test_sub, model_sub.predict(X_test_sub))
                print(f"[{ana_kat}] Alt1 accuracy: {acc:.4f}")

            except Exception as e:
                # Best effort: a failing category is reported and skipped,
                # the remaining categories still train.
                print(f"[{ana_kat}] Alt1 hata: {e}")

        joblib.dump(models_alt1, os.path.join(model_dir, "alt1_kategori.pkl"))

        # ---- level 3: one alt_kategori_2 model per alt_kategori_1 --------
        # NOTE(review): groups are keyed by alt_kategori_1 alone, so equal
        # labels under different ana_kategori values are pooled - confirm
        # that this is intended.
        models_alt2 = {}

        for alt1_kat in df["alt_kategori_1"].unique():
            if pd.isna(alt1_kat) or alt1_kat == "":
                continue

            subset = df[df["alt_kategori_1"] == alt1_kat]

            if subset["alt_kategori_2"].nunique() < 2:
                # Single label: store it directly instead of a classifier.
                tek_sinif = subset["alt_kategori_2"].iloc[0]
                if pd.notna(tek_sinif) and tek_sinif != "":
                    models_alt2[alt1_kat] = str(tek_sinif)
                    print(f"[{alt1_kat}] -> '{tek_sinif}' (string)")
                continue

            if len(subset) < 15:
                print(f"[{alt1_kat}] Yetersiz veri: {len(subset)} satır")
                continue

            try:
                X_sub = subset["title"].fillna("").astype(str).tolist()
                y_sub = subset["alt_kategori_2"].fillna("Genel").astype(str).tolist()

                X_train_sub, X_test_sub, y_train_sub, y_test_sub = _stratified_split_w2v(X_sub, y_sub)

                model_sub = _w2v_logreg_pipeline(vector_size=60)
                model_sub.fit(X_train_sub, y_train_sub)
                models_alt2[alt1_kat] = model_sub

                acc = accuracy_score(y_test_sub, model_sub.predict(X_test_sub))
                print(f"[{alt1_kat}] Alt2 accuracy: {acc:.4f}")

            except Exception as e:
                print(f"[{alt1_kat}] Alt2 hata: {e}")

        joblib.dump(models_alt2, os.path.join(model_dir, "alt2_kategori.pkl"))

        _report_smoke_test_w2v(model_ana, models_alt1, models_alt2, "Apple iPhone 14 Pro Max 256GB")

        return True

    except Exception as e:
        print(f"Genel hata: {e}")
        import traceback
        print(traceback.format_exc())
        return False


def _stratified_split_w2v(X, y):
    """Return an 80/20 train/test split stratified on ``y`` with seed 42."""
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


def _w2v_logreg_pipeline(vector_size):
    """Build the unfitted Word2Vec-embedding + balanced logistic-regression pipeline."""
    return Pipeline([
        ("word2vec", TurkishWord2VecVectorizer(vector_size=vector_size)),
        ("clf", LogisticRegression(
            max_iter=1000,
            class_weight='balanced',
            random_state=42,
            solver='lbfgs'
        ))
    ])


def _report_smoke_test_w2v(model_ana, models_alt1, models_alt2, test_title):
    """Print the prediction chain for ``test_title`` through all three levels."""
    ana_pred = model_ana.predict([test_title])[0]
    print(f"Ana: {ana_pred}")

    if ana_pred not in models_alt1:
        print(f"Alt1 anahtarı bulunamadı: {ana_pred}")
        return

    alt1_pred = models_alt1[ana_pred].predict([test_title])[0]
    print(f"Alt1: {alt1_pred}")

    if alt1_pred not in models_alt2:
        print(f"Alt2 anahtarı bulunamadı: {alt1_pred}")
        return

    alt2_entry = models_alt2[alt1_pred]
    if isinstance(alt2_entry, str):
        alt2_pred = alt2_entry
        print(f"Alt2: {alt2_pred} (string)")
    else:
        alt2_pred = alt2_entry.predict([test_title])[0]
        print(f"Alt2: {alt2_pred} (model)")

    print(f"BAŞARILI: [{ana_pred}, {alt1_pred}, {alt2_pred}]")

# Run the Word2Vec retraining when this cell executes. In a notebook kernel
# the module name is "__main__", so the guard is satisfied; it only prevents
# the call if this code is imported from a .py module.
if __name__ == "__main__":
    retrain_models_with_word2vec()

Benzersiz ana kategori sayısı: 9
Ana model accuracy: 0.7686
Test tahmini: Elektronik
[Anne & Çocuk] Alt1 accuracy: 0.9630
[Süpermarket] Alt1 accuracy: 0.8104
[Ayakkabı & Çanta] Alt1 accuracy: 0.9264
[Elektronik] Alt1 accuracy: 0.8574
[Erkek] Alt1 accuracy: 0.8426
[Kadın] Alt1 accuracy: 0.9101
[Ev & Mobilya] Alt1 accuracy: 0.8788
[Kitap & Kırtasiye & Hobi] Alt1 accuracy: 0.6558
[Spor & Outdoor] Alt1 accuracy: 0.6556
[Bebek] Alt2 accuracy: 0.9532
[Bebek Bakım] Alt2 accuracy: 0.8234
[Oyuncak] Alt2 accuracy: 0.5038
[Erkek Ayakkabı] Alt2 accuracy: 0.7380
[Kadın Ayakkabı] Alt2 accuracy: 0.8607
[Çocuk Ayakkabı] Alt2 accuracy: 0.8397
[Kadın Çanta] Alt2 accuracy: 0.6611
[Çocuk Çanta] -> 'Beslenme Çantası' (string)
[Beyaz Eşya] Alt2 accuracy: 0.7275
[Dijital Kod & Ürünler] Alt2 accuracy: 0.6742
[Foto & Kamera] Alt2 accuracy: 0.5483
[Giyilebilir Teknoloji] Alt2 accuracy: 0.6635
[Kişisel Bakım Aletleri] Alt2 accuracy: 0.7627
[Küçük Ev Aletleri] Alt2 accuracy: 0.6481
[Telefon] Alt2 accuracy: 0.8811