# Set Up

In [17]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from pymorphy2 import MorphAnalyzer
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import ParameterSampler
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re
from nltk.corpus import stopwords
from string import punctuation


import torch
import transformers as ppb
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegressionCV
import pickle

import matplotlib
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mchomak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mchomak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
df = pd.read_csv("data/df_1250.csv")
df.shape

(11250, 5)

# Predprocessing

In [19]:
df['combined_text'] = df[['headers', 'sub_headers', 'text']].fillna('').apply(lambda x: ' '.join(x[x != '']), axis=1)

In [20]:
df = df.drop(columns= ["url", 'headers', 'sub_headers', 'text'])

In [21]:
X = df[["combined_text"]]
y = df[["ID"]]
print(X.shape, y.shape)

(11250, 1) (11250, 1)


In [22]:
def save_model(model, name):
    with open(f'{name}.pkl','wb') as f:
        pickle.dump(model, f)

In [23]:
def clean_text(text):
    # Лемматизация считается 10000 лет и не высчитывается не знаю в чем проблема( помогите
    # morph = MorphAnalyzer()
    stop_words = set(stopwords.words('russian'))

    # Очистка текста от всей пунткуации
    text = text.translate(str.maketrans('', '', punctuation))

    # Очистка текста от всех символов, кроме букв
    text =  re.sub(r'[^а-яёА-ЯЁ]', ' ', text)

    # нижний регистр
    text = text.lower()

    # токинизируем
    text = word_tokenize(text)
    
    # убираем стоп слова
    filtered_words = []
    for word in text:
        if word not in stop_words:
            filtered_words.append(word)
            # filtered_words.append(morph.parse(word)[0].normal_form)

    text = ' '.join(filtered_words)
    return text

In [24]:
X["combined_text"] = X["combined_text"].apply(clean_text).dropna()

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X[["combined_text"]], y["ID"], test_size=0.2, random_state=42)

In [26]:
def get_mean_w2v_vector(sentence, model, HIDDEN):
    Sum = np.zeros(HIDDEN)
    Count = 0

    try:
        words = sentence.split()
    except TypeError:
        words = []

    for w in words:
        if w in model.wv:
            Sum += model.wv[w]
            Count += 1

    if Count == 0:
        return Sum  # Возвращаем нулевой вектор, если нет слов в модели

    return Sum / Count

In [27]:
def w2v_processing(X, model, HIDDEN, y = None):
    NewCols = ['col' + str(i) for i in range(HIDDEN)]
    X['vectors'] = X["combined_text"].map(lambda text: get_mean_w2v_vector(text, model=model, HIDDEN=HIDDEN))

    Idx = []

    for ix, row in X.iterrows():
        if not isinstance(row['vectors'], np.ndarray):
            Idx.append(ix)
    
    X.drop(index=Idx, inplace=True)

    if y is not None:
        y = y.drop(index=Idx)

    X[NewCols] = pd.DataFrame(X['vectors'].tolist(), index=X.index)

    X.drop(["combined_text",'vectors'], axis=1, inplace=True)

    return X, y

In [28]:
sent = [row.split() for row in X_train["combined_text"]]

In [29]:
param_distributions = {
    'vector_size': [50, 100, 200, 300],
    'window': [2, 5, 10],
    'min_count': [1, 2, 5, 10],
    'sample': np.linspace(0.0001, 0.001, num=5),
    'alpha': np.linspace(0.01, 0.05, num=5),
    'min_alpha': np.linspace(0.0001, 0.001, num=5),
    'negative': [5, 10, 20],
    'workers': [4],  # Количество рабочих потоков для обучения модели
}
n_iter = 10  # Количество комбинаций для тестирования

parameter_sampler = ParameterSampler(param_distributions, n_iter)

In [30]:
def evaluate_model(model, HIDDEN):
    X_train, y_train = w2v_processing(X_train, model= model, HIDDEN= HIDDEN)
    X_test, y_test = w2v_processing(X_test, y_test)

    lg = LogisticRegression()
    lg.fit(X_train, y_train)

    lg_train_pred = lg.predict(X_train)
    lg_test_pred = lg.predict(X_test)
    
    return accuracy_score(y_test, lg_test_pred)

In [31]:
best_score = None
best_model = None

for params in parameter_sampler:
    model = Word2Vec(sentences=sent, epochs=30, **params)
    model.train(sentences=sent, total_examples=model.corpus_count, epochs=model.epochs)
    
    # Оцените модель
    score = evaluate_model(model, HIDDEN = model.vector_size)  # X и y должны быть определены пользователем как данные для задачи классификации

    if best_score is None or score > best_score:
        best_score = score
        best_model = model
        best_params = params

print(f"Лучший результат: {best_score}, Лучшие параметры: {best_params}")

TypeError: Either one of corpus_file or corpus_iterable value must be provided