# In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np
import re

# Preprocessing Function
def preprocess_text(text: str) -> str:
    """Normalize a raw caption into lowercase, whitespace-separated tokens.

    Steps, in order:
      1. Turkish-aware lowercasing.
      2. Removal of URLs (anything starting with ``http`` or ``www`` up to the
         next whitespace).
      3. Removal of every character other than ``a-z``, the Turkish letters
         ``ç ğ ı ö ş ü``, digits, whitespace, ``#`` and ``@``.
      4. Removal of digit runs.
      5. Collapsing of whitespace runs (including newlines) to one space and
         trimming of both ends.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing survives the filtering.
    """
    # str.casefold() is locale-independent: it folds "I" to "i", whereas
    # Turkish pairs "I" with dotless "ı" and "İ" with dotted "i". Map both
    # capital forms explicitly before folding the remaining characters.
    text = text.replace("İ", "i").replace("I", "ı").casefold()
    # "http\S+" already covers "https...", so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]', '', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces

# Train Word2Vec model
def train_word2vec(corpus, vector_size=100, window=5, min_count=1):
    """Train a Word2Vec model on whitespace-tokenized documents.

    Args:
        corpus: Iterable of document strings; each document is split on
            whitespace and passed to Word2Vec as one sentence.
        vector_size: Forwarded to ``Word2Vec`` as ``vector_size``.
        window: Forwarded to ``Word2Vec`` as ``window``.
        min_count: Forwarded to ``Word2Vec`` as ``min_count``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # One token list per document.
    tokenized_docs = [document.split() for document in corpus]
    return Word2Vec(
        tokenized_docs,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
    )

# Generate document vectors using TF-IDF weights and Word2Vec embeddings
def get_document_vectors(corpus, tfidf_vectorizer, word2vec_model):
    """Build one TF-IDF-weighted average Word2Vec vector per document.

    A document vector is the sum of ``tfidf_weight * embedding`` over the
    TF-IDF features that also exist in the Word2Vec vocabulary, divided by the
    sum of those weights. A document whose total weight is not positive (for
    example, one with no feature known to Word2Vec) keeps an all-zero vector.

    Args:
        corpus: Preprocessed document strings, one per document.
        tfidf_vectorizer: Fitted vectorizer providing ``transform`` (returning
            a SciPy sparse matrix) and ``get_feature_names_out``.
        word2vec_model: Model exposing ``vector_size`` and a ``wv`` object that
            supports ``in`` and item lookup by token.

    Returns:
        ``np.ndarray`` holding one row per document, each of length
        ``word2vec_model.vector_size``.
    """
    tfidf_matrix = tfidf_vectorizer.transform(corpus).tocsr()
    keyed_vectors = word2vec_model.wv

    # Loop-invariant: resolve every TF-IDF feature to its embedding once,
    # instead of re-checking the whole vocabulary for each document.
    feature_vectors = {
        idx: keyed_vectors[word]
        for idx, word in enumerate(tfidf_vectorizer.get_feature_names_out())
        if word in keyed_vectors
    }

    indptr = tfidf_matrix.indptr
    indices = tfidf_matrix.indices
    data = tfidf_matrix.data
    document_vectors = []

    for doc_idx in range(tfidf_matrix.shape[0]):
        doc_vector = np.zeros(word2vec_model.vector_size)
        total_weight = 0

        # Only stored entries of the sparse row can contribute; features with
        # zero weight add nothing to either the sum or the total weight.
        start, stop = indptr[doc_idx], indptr[doc_idx + 1]
        for idx, weight in zip(indices[start:stop].tolist(), data[start:stop]):
            word_vector = feature_vectors.get(idx)
            if word_vector is None:  # Feature is not in the Word2Vec vocabulary
                continue
            doc_vector += weight * word_vector
            total_weight += weight

        if total_weight > 0:
            doc_vector /= total_weight  # Normalize by the total weight
        document_vectors.append(doc_vector)

    return np.array(document_vectors)

def _build_user_corpus(username2posts):
    """Turn a ``{username: [post, ...]}`` mapping into aligned lists.

    For every user, each post's ``"caption"`` value is read; ``None`` captions
    are skipped, the rest are cleaned with ``preprocess_text``, and captions
    that come out empty are dropped. The surviving captions are joined with
    newlines into a single document per user.

    Args:
        username2posts: Mapping from username to an iterable of post dicts.

    Returns:
        A ``(usernames, documents)`` tuple of equal-length lists in the
        mapping's iteration order. A user with no usable caption still gets an
        entry (an empty string) so the two lists stay aligned.
    """
    usernames = []
    documents = []

    for username, posts in username2posts.items():
        raw_captions = (post.get("caption", "") for post in posts)
        cleaned = (
            preprocess_text(caption)
            for caption in raw_captions
            if caption is not None
        )
        usernames.append(username)
        documents.append("\n".join(text for text in cleaned if text != ""))

    return usernames, documents


# Load and preprocess training data
train_usernames, corpus = _build_user_corpus(username2posts_train)

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words=turkish_stopwords, max_features=15000, min_df=10, ngram_range=(1, 3))
vectorizer.fit(corpus)

# Train Word2Vec model
# NOTE(review): the vectorizer emits n-grams of up to three words, while
# Word2Vec is trained on whitespace-split single tokens, and
# get_document_vectors only uses features found verbatim in the Word2Vec
# vocabulary. Multi-word n-gram features therefore presumably never
# contribute to the document vectors -- confirm this is intended.
word2vec_model = train_word2vec(corpus)

# Generate document vectors for training data
x_post_train = get_document_vectors(corpus, vectorizer, word2vec_model)
# Users without a known category are labelled "NA".
y_train = [username2_category.get(uname, "NA") for uname in train_usernames]

# Preprocess test data (same cleaning as the training data)
test_usernames, test_corpus = _build_user_corpus(username2posts_test)

# Generate document vectors for test data, reusing the vectorizer and
# Word2Vec model fitted on the training corpus
x_post_test = get_document_vectors(test_corpus, vectorizer, word2vec_model)

# Now, x_post_train and x_post_test can be used for training and testing your model
print("Training data vectors shape:", x_post_train.shape)
print("Test data vectors shape:", x_post_test.shape)
