# Set Up

In [235]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from pymorphy2 import MorphAnalyzer
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import ParameterSampler
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re
from nltk.corpus import stopwords
from string import punctuation


import torch
import transformers as ppb
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegressionCV
import pickle

import matplotlib
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mchomak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mchomak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [190]:
# Load the labelled articles (later cells use the columns url, headers, sub_headers, text and ID).
df = pd.read_csv("data/df_1250.csv")
# Quick sanity check of the number of rows and columns.
df.shape

(11250, 5)

# Preprocessing

In [191]:
# Merge the three text fields of every article into one string, skipping the
# fields that are missing (NaN is first turned into '' and then filtered out).
df['combined_text'] = (
    df[['headers', 'sub_headers', 'text']]
    .fillna('')
    .apply(lambda parts: ' '.join(parts[parts != '']), axis=1)
)

In [192]:
# The raw text fields are now folded into `combined_text`, and the URL is not
# used as a feature, so remove them from the working frame.
df = df.drop(["url", 'headers', 'sub_headers', 'text'], axis=1)

In [193]:
# Feature frame (merged text only) and label frame (the ID column); both are
# selected with a list so they stay two-dimensional DataFrames.
X, y = df[["combined_text"]], df[["ID"]]
print(X.shape, y.shape)

(11250, 1) (11250, 1)


In [195]:
def save_model(model, name):
    """Serialise ``model`` with pickle into the file ``<name>.pkl``.

    Parameters
    ----------
    model : object
        Any picklable object (here: a fitted estimator).
    name : str
        Target path without extension; ``.pkl`` is appended.
    """
    target_path = f'{name}.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(model, sink)

In [196]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)

# Stop-word sets keyed by language, filled lazily so the NLTK corpus is read
# once rather than for every document.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='russian'):
    """Return the NLTK stop-word set for ``language``, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def clean_text(text):
    """Normalise raw Russian text into a space-separated string of tokens.

    Steps, in order: delete ASCII punctuation, replace every remaining
    non-Cyrillic character with a space, lower-case, tokenise with NLTK and
    drop Russian stop words.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or ``NaN``) is treated
        as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        return ''

    # NOTE: lemmatisation with pymorphy2 is intentionally disabled because it
    # was far too slow. The disabled code built a MorphAnalyzer inside this
    # function (i.e. once per document), which is likely the cause; if it is
    # re-enabled, create the analyzer once outside the function and append
    # morph.parse(word)[0].normal_form instead of the raw word.
    stop_words = _get_stop_words('russian')

    # Punctuation is deleted (not replaced by a space), so "a.b" becomes "ab".
    text = text.translate(_PUNCTUATION_TABLE)

    # Everything that is not a Cyrillic letter becomes a space.
    text = re.sub(r'[^а-яёА-ЯЁ]', ' ', text)

    text = text.lower()

    tokens = word_tokenize(text)

    # Drop stop words, keep the original token order.
    kept_words = [word for word in tokens if word not in stop_words]

    return ' '.join(kept_words)

In [197]:
# Clean every document. `.assign` builds a new frame instead of writing into a
# frame that was sliced out of `df` (the pattern pandas reports as
# SettingWithCopyWarning). No `.dropna()` is needed: clean_text always returns
# a string, so there is nothing to drop.
X = X.assign(combined_text=X["combined_text"].apply(clean_text))

In [198]:
# Hold out a test split with a fixed seed so the partition is repeatable.
# The features stay a one-column DataFrame; the labels are passed as the "ID" Series.
X_train, X_test, y_train, y_test = train_test_split(X[["combined_text"]], y["ID"], random_state=42)

In [199]:
# Inspect the training split (only the cleaned text column at this point).
X_train

Unnamed: 0,combined_text
6004,диас масвидаль проведут поединок правилам бокс...
7620,донстрой выполнил монолитных работ образовател...
10039,наука близка значительным успехам лечении рака...
8399,определено техрешение оптимизации стоимости ле...
5652,переход тикнизяна зенит сорвался изза вопросов...
...,...
5734,сын тихонова перейдет енисей сын тихонова пере...
5191,вяльбе похвалила клебо феноменальную разделку ...
5390,мишина галлямов лидируют короткой программы эт...
860,москвичам рассказали погоде вторник тишковец м...


In [200]:
# Repeated display of X_train (same content as the previous cell).
X_train

Unnamed: 0,combined_text
6004,диас масвидаль проведут поединок правилам бокс...
7620,донстрой выполнил монолитных работ образовател...
10039,наука близка значительным успехам лечении рака...
8399,определено техрешение оптимизации стоимости ле...
5652,переход тикнизяна зенит сорвался изза вопросов...
...,...
5734,сын тихонова перейдет енисей сын тихонова пере...
5191,вяльбе похвалила клебо феноменальную разделку ...
5390,мишина галлямов лидируют короткой программы эт...
860,москвичам рассказали погоде вторник тишковец м...


In [201]:
# Whitespace-tokenise the cleaned training documents; only the training split
# is used to fit the embedding.
sent = [row.split() for row in X_train["combined_text"]]

# Length of each word vector; also read by get_mean_w2v_vector and w2v_processing.
HIDDEN = 100

# Configure the model first, then build the vocabulary and train explicitly.
# NOTE(review): with workers=2 the result is presumably not bit-for-bit
# reproducible between runs (gensim documents workers=1 as a requirement for
# fully deterministic training) — confirm if exact repeatability matters.
model = Word2Vec(min_count=20,
                  window=2,
                  vector_size=HIDDEN,
                  sample=6e-5,
                  alpha=0.03,
                  min_alpha=0.0007,
                  negative=20,
                  workers=2)
model.build_vocab(sent, progress_per=10000)
# The value returned by train() is what the cell displays.
model.train(sent, total_examples=model.corpus_count, epochs=30, report_delay=1)

(22275291, 41090040)

In [222]:
def get_mean_w2v_vector(sentence, wv=None, size=None):
    """Average the word vectors of the in-vocabulary words of ``sentence``.

    Parameters
    ----------
    sentence : str
        Whitespace-separated tokens. Any non-string value (e.g. ``None`` or
        ``NaN``) is treated as a sentence without words.
    wv : mapping-like, optional
        Object supporting ``word in wv`` and ``wv[word]``. Defaults to the
        notebook-level ``model.wv``.
    size : int, optional
        Length of the returned vector. Defaults to the notebook-level
        ``HIDDEN``.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the known words, or an all-zero vector of
        length ``size`` when no word of the sentence is in the vocabulary.
    """
    if wv is None:
        wv = model.wv
    if size is None:
        size = HIDDEN

    # Calling .split() on a non-string raises AttributeError (not TypeError),
    # so the type is checked explicitly instead of catching an exception.
    words = sentence.split() if isinstance(sentence, str) else []

    total = np.zeros(size)
    known_words = 0
    for word in words:
        if word in wv:
            total += wv[word]
            known_words += 1

    if known_words == 0:
        # No known word: return the zero vector rather than dividing by zero.
        return total

    return total / known_words

In [223]:
def w2v_processing(X, y=None):
    """Turn the ``combined_text`` column of ``X`` into averaged Word2Vec features.

    The input frame is left untouched; a new frame is returned. This keeps the
    function safe to call on frames that were sliced out of another frame and
    safe to call repeatedly on the same input.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``combined_text`` column of cleaned documents.
    y : pandas.Series or pandas.DataFrame, optional
        Labels sharing the index of ``X``.

    Returns
    -------
    (pandas.DataFrame, same type as ``y`` or None)
        The frame without ``combined_text`` (and without any ``vectors``
        column) plus the columns ``col0`` .. ``col{HIDDEN-1}``, and ``y``
        restricted to the rows that were kept.
    """
    feature_cols = ['col' + str(i) for i in range(HIDDEN)]
    vectors = X["combined_text"].map(get_mean_w2v_vector)

    # Keep only the rows for which a real vector was produced.
    is_vector = np.array([isinstance(v, np.ndarray) for v in vectors], dtype=bool)
    dropped_index = X.index[~is_vector]

    features = pd.DataFrame(
        list(vectors[is_vector]),
        index=X.index[is_vector],
        columns=feature_cols,
    )

    # Any other columns of X are carried over in front of the new features.
    other_cols = (
        X.loc[is_vector]
        .drop(columns=["combined_text"])
        .drop(columns=["vectors"], errors="ignore")
    )
    result = pd.concat([other_cols, features], axis=1)

    if y is not None:
        y = y.drop(index=dropped_index)

    return result, y

In [206]:
# Replace the text column of both splits with averaged Word2Vec features.
# NOTE(review): the returned frames no longer contain "combined_text", so
# running this cell a second time on the already transformed frames fails.
X_train, y_train = w2v_processing(X_train, y_train)
X_test, y_test = w2v_processing(X_test, y_test)

In [207]:
# Inspect the training split after vectorisation (columns col0 .. col99).
X_train

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,...,col90,col91,col92,col93,col94,col95,col96,col97,col98,col99
6004,-0.131073,0.269280,0.920324,0.179975,-0.097885,-0.457157,0.215255,0.313429,-0.384135,0.115947,...,0.317792,0.075763,0.587511,-0.179349,0.250120,-0.390737,0.031795,-0.187795,0.030913,0.454198
7620,-0.180560,0.650230,0.550954,-0.260067,0.524423,-0.260696,0.110234,0.025104,0.079398,0.032359,...,0.106018,0.159904,0.001088,0.176684,0.017323,-0.288850,0.067044,-0.056375,-0.059306,-0.137998
10039,-0.149871,0.026913,-0.114367,0.131186,0.165109,-0.172803,-0.116495,0.445070,-0.311511,0.167410,...,0.197470,0.117493,-0.168813,-0.377488,0.085581,0.091185,0.182234,-0.485311,0.040401,-0.175425
8399,0.140008,0.376308,0.177216,0.069007,0.344654,-0.445870,-0.099103,0.235592,-0.000284,-0.145447,...,0.516927,-0.053089,-0.344913,0.008824,0.036404,0.218453,0.122188,-0.108021,0.128908,0.027127
5652,0.166061,0.178930,0.637712,0.021134,0.094804,-0.451073,0.078919,0.131754,-0.371247,-0.018009,...,0.257828,-0.168240,0.139962,0.180786,0.408216,0.064149,-0.150265,-0.300923,-0.113087,0.341556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0.050953,0.189848,0.525397,0.110373,0.266401,-0.506104,0.445564,0.128665,-0.241012,0.165371,...,0.198572,-0.230530,0.128304,0.006540,0.375182,-0.138050,-0.113100,-0.485668,-0.333654,0.591043
5191,0.013950,0.059312,0.307494,0.067910,0.300975,-0.385790,0.435438,0.383348,-1.054843,0.581307,...,0.363379,-0.689159,0.298918,-0.189828,0.256581,-0.569366,-0.220929,-0.347891,-0.109501,0.284591
5390,-0.312654,0.117613,0.627009,0.338947,0.412120,-0.243837,-0.155320,-0.146757,-0.531193,0.036206,...,0.390864,-0.327408,0.199008,0.187205,0.250709,-0.209011,0.141522,-0.431505,-0.095398,0.095299
860,-0.090113,0.354084,0.109905,0.040800,0.255334,0.015545,-0.280851,0.243885,-0.017260,0.160016,...,0.298281,0.116908,0.246539,0.088547,0.148385,0.097623,-0.200409,-0.281371,-0.062842,-0.149323


# Logistic Regression

In [211]:
# Fit a logistic regression with default settings on the averaged Word2Vec
# features (fit() returns the estimator itself).
lg = LogisticRegression().fit(X_train, y_train)

# Predictions for both splits, used to compare train and test accuracy.
lg_train_pred, lg_test_pred = lg.predict(X_train), lg.predict(X_test)

In [212]:
# Accuracy as a (train, test) pair.
# Reference values, presumably from an earlier run: (0.885741377266801, 0.8706007820831853)
accuracy_score(y_train, lg_train_pred), accuracy_score(y_test, lg_test_pred)

(0.8855043261822922, 0.8702452897262709)

In [214]:
# Persist the fitted classifier to LG_Word2Vec.pkl in the working directory.
save_model(lg, "LG_Word2Vec")

# Predict on Test

In [215]:
# Load the unlabelled test articles. Rows with missing values are dropped and
# the original row labels are kept, so the index may contain gaps.
df_test = pd.read_csv("data/test_news.csv").dropna()
df_test.head(5)

Unnamed: 0,content
0,Фото: «Фонтанка.ру»ПоделитьсяЭкс-министру обор...
1,В начале февраля 2023 года в Пушкинском районе...
2,Фото: Andy Bao / Getty Images Анастасия Борисо...
3,"Если вы хотели, но так и не съездили на море л..."
4,Сергей Пиняев Фото: Алексей Филиппов / РИА Нов...


In [216]:
# Number of test rows and columns left after dropping incomplete rows.
df_test.shape

(26275, 1)

In [217]:
# Apply the same cleaning function that was used for the training texts.
df_test["combined_text"] = df_test["content"].apply(clean_text)

In [218]:
# Keep only the cleaned text, as a one-column DataFrame like X_train had.
df_test2 = df_test[["combined_text"]]

In [219]:
# Inspect the cleaned test texts.
df_test2

Unnamed: 0,combined_text
0,фото фонтанкару поделитьсяэксминистру обороны ...
1,начале февраля года пушкинском районе санктпет...
2,фото анастасия борисова международная федераци...
3,хотели съездили море летом читайте дальше это ...
4,сергей пиняев фото алексей филиппов риа новост...
...,...
26270,фото риа новости алевтина запольская главное у...
26271,вадим гутцайт фото алевтина запольская министр...
26272,фото олег харсеев коммерсантъ александр курбат...
26273,владимир зеленский фото варвара кошечкина през...


In [224]:
# Vectorise the test texts. No labels are passed, so the second return value
# is None; bind it to a throwaway name instead of `y`, which would overwrite
# the label frame defined earlier in the notebook.
df_test3, _unused_labels = w2v_processing(df_test2)

In [225]:
# Expect one row per test article and HIDDEN feature columns.
df_test3.shape

(26275, 100)

In [226]:
# Predict a label for every test article with the fitted logistic regression.
df_test_pred = lg.predict(df_test3)

In [227]:
# One prediction per test row.
df_test_pred.shape

(26275,)

In [228]:
# Pair every prediction with the row label of the article it belongs to.
# df_test was loaded with .dropna(), so its index can have gaps; taking the
# labels from df_test3 keeps 'index' aligned with the original test rows.
# When no row was dropped this equals the plain positions 0..n-1.
predictions_df = pd.DataFrame({'topic': df_test_pred, 'index': df_test3.index})

In [229]:
# Expect two columns: 'topic' and 'index'.
predictions_df.shape

(26275, 2)

In [230]:
# Inspect the predictions before writing them to disk.
predictions_df

Unnamed: 0,topic,index
0,0,0
1,6,1
2,4,2
3,7,3
4,4,4
...,...,...
26270,3,26270
26271,3,26271
26272,3,26272
26273,3,26273


In [233]:
# Write the predictions; the DataFrame's own index is omitted because the row
# ids are already stored in the explicit 'index' column.
# NOTE(review): the directory name "precits" looks like a typo for "predicts" —
# confirm the folder exists under that name before running.
predictions_df.to_csv("data/precits/predict_2.csv", index=False)