In [1]:
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz

--2025-03-13 20:32:40--  https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/87156914/0b363e00-0126-11e9-9e3c-e8c235463bd6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250313%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250313T173241Z&X-Amz-Expires=300&X-Amz-Signature=3877956e192c1006f6b92263eec95dfe484041d0bfb25160e70fa8dcc915e60c&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dlenta-ru-news.csv.gz&response-content-type=application%2Foctet-stream [following]
--2025-03-13 20:32:41--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/87156914/0b363e00-0126-11e9-9e3c-e8c235463bd6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-A

In [9]:
!pip install pandas nltk pymorphy3 tqdm



In [10]:
import re

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

from corus import load_lenta

import warnings
warnings.filterwarnings('ignore')

# --- Configuration ---------------------------------------------------------
DATA_PATH = 'lenta-ru-news.csv.gz'  # archive handed to load_lenta

SIZE_LIMIT = 150_000                # number of rows kept after sampling
TOPIC_SIZE_THRESHOLD = 10           # topics with fewer rows are merged into 'Other' further down

# --- Load the corpus -------------------------------------------------------
# Each record becomes one row: title and body joined into a single text field,
# plus the record's topic label.
CORPUS = []
for row in load_lenta(DATA_PATH):
    CORPUS.append({
        'text': '. '.join((row.title, row.text)),
        'topic': row.topic,
    })

# Draw a fixed-seed random sample of SIZE_LIMIT rows and renumber them from 0.
df = (
    pd.DataFrame(CORPUS)
    .sample(n=SIZE_LIMIT, random_state=123)
    .reset_index(drop=True)
)

assert len(df) == SIZE_LIMIT

# --- NLTK resources --------------------------------------------------------
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()
STOP_WORDS = set(stopwords.words('russian'))

[nltk_data] Downloading package stopwords to /Users/lulchak-
[nltk_data]     pavel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/lulchak-
[nltk_data]     pavel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/lulchak-
[nltk_data]     pavel/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
import pymorphy3
from tqdm import tqdm

# Matches any single non-word character; preprocess_text replaces each match
# with a space.
pattern = re.compile(r'\W')

# Memo of token -> normal form, filled lazily by preprocess_text so each
# distinct token is analysed only once.
lemma_cache = dict()

# pymorphy3 analyzer that supplies the normal form of each token.
morph = pymorphy3.MorphAnalyzer()

def preprocess_text(text: str):
    """Normalise a raw document into a space-separated string of lemmas.

    Steps, in order:
      1. lower-case the text;
      2. replace every character matched by the module-level ``pattern``
         with a space and split on whitespace;
      3. drop tokens that are present in ``STOP_WORDS``;
      4. map each remaining token to ``morph.parse(token)[0].normal_form``,
         memoising the result in the module-level ``lemma_cache``.

    Args:
        text: Raw document text.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        nothing survives).
    """
    tokens = pattern.sub(' ', text.lower()).split()

    lemmas = []
    for token in tokens:
        if token in STOP_WORDS:
            continue
        try:
            lemma = lemma_cache[token]
        except KeyError:
            # First time this token is seen: analyse it once and remember it.
            lemma = morph.parse(token)[0].normal_form
            lemma_cache[token] = lemma
        lemmas.append(lemma)

    return ' '.join(lemmas)

# Register `progress_apply` on pandas objects with a compact, 80-column bar.
progress_options = dict(
    desc='Preprocessing',
    bar_format='{l_bar}{bar:15}{r_bar}',
    ncols=80,
)
tqdm.pandas(**progress_options)

# Replace every raw document with its lemmatised, stop-word-free form.
df['text'] = df['text'].progress_apply(preprocess_text)

# Topics with fewer than TOPIC_SIZE_THRESHOLD rows are collapsed into one bucket.
topic_counts = df['topic'].value_counts()
other_topics = topic_counts.loc[topic_counts.lt(TOPIC_SIZE_THRESHOLD)].index

is_rare = df['topic'].isin(other_topics)
# Relabel rare topics as 'Other', then encode each label as its integer category code.
df['topic'] = (
    df['topic']
    .mask(is_rare, 'Other')
    .astype('category')
    .cat.codes
)

df.head(3)

Preprocessing: 100%|███████████████| 150000/150000 [00:33<00:00, 4468.08it/s]


Unnamed: 0,text,topic
0,туляк дать полтора год тюрьма экстремизм вконт...,7
1,microsoft google готовый уладить дело кайф аме...,7
2,киев митинговать ворваться здание минюст понед...,4


In [12]:
# Features are the text column; targets are the topic column.
X, y = df['text'], df['topic']

# Stage 1: keep 60% for training and hold out 40%, stratified by topic.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)

# Stage 2: cut the hold-out in half to get validation and test sets,
# again stratified by topic.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

In [13]:
import gensim
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# One token list per training document (whitespace split of each text).
tokenized_sentences = list(map(str.split, X_train))

# Hyper-parameters forwarded unchanged to gensim's Word2Vec.
# NOTE(review): gensim's documentation suggests `seed` alone does not make
# training fully deterministic when workers > 1 — confirm whether exact
# reproducibility matters here.
w2v_params = dict(
    vector_size=300,
    window=10,
    min_count=5,
    workers=4,
    sg=1,
    negative=15,
    epochs=10,
    seed=42,
)

# Embeddings are fitted on the training split only.
w2v_model = Word2Vec(sentences=tokenized_sentences, **w2v_params)

# Persist the trained model to the working directory.
w2v_model.save("word2vec_lenta.model")

ModuleNotFoundError: No module named 'gensim'