In [None]:
# Environment setup: upgrade spaCy, then download the "ru_core_news_sm"
# pipeline that this notebook later loads with spacy.load().
!pip install -U spacy
!python -m spacy download ru_core_news_sm

Collecting ru-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.8.0/ru_core_news_sm-3.8.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymorphy3>=1.0.0 (from ru-core-news-sm==3.8.0)
  Downloading pymorphy3-2.0.6-py3-none-any.whl.metadata (2.4 kB)
Collecting dawg2-python>=0.8.0 (from pymorphy3>=1.0.0->ru-core-news-sm==3.8.0)
  Downloading dawg2_python-0.9.0-py3-none-any.whl.metadata (7.5 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-sm==3.8.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.6-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.9/53.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dawg2_python-0.9.0-py3-none-any.whl (9.3 kB)
Downloading pymorphy3

In [None]:
import pandas as pd
import spacy
import os
import numpy as np

In [None]:
# Only token lemmas and the is_alpha flag are read from the parsed documents,
# so the dependency parser and named-entity recognizer are switched off.
disabled_components = ["parser", "ner"]
nlp = spacy.load("ru_core_news_sm", disable=disabled_components)

In [None]:
# Input CSV locations: the table holding the 'text' column and the table of
# previously computed features.
data_path, features_path = 'training_data.csv', 'training_features.csv'

In [None]:
# Read both CSV tables (data first, then features) and report their shapes.
df_data, df_features = (pd.read_csv(path) for path in (data_path, features_path))

print(f"Loaded Raw Data: {df_data.shape}")
print(f"Loaded Features: {df_features.shape}")

Loaded Raw Data: (1138, 2)
Loaded Features: (1138, 20)


In [None]:
# Levels are processed in list order; each word is credited only to the first
# level whose file contains it, so the resulting per-level sets are disjoint.
vocab_levels = ['a1', 'a2', 'b1', 'b2', 'c1']
vocab_sets = {}
seen_words = set()

print("\nLoading Vocabulary Lists (Disjoint Mode)...")
for level in vocab_levels:
    filename = f"new_vocab_{level}.txt"
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            # One entry per non-blank line, trimmed and lower-cased.
            stripped_lines = (line.strip() for line in handle)
            raw_words = {word.lower() for word in stripped_lines if word}
    except FileNotFoundError:
        # A missing list is reported and leaves the level with an empty set.
        print(f"  - ERROR: Could not find {filename}")
        vocab_sets[level] = set()
    else:
        # Keep only the words no earlier level has claimed, then mark every
        # word from this file as seen for the levels that follow.
        vocab_sets[level] = raw_words - seen_words
        seen_words |= raw_words
        print(f"  - {level.upper()}: {len(vocab_sets[level])} unique words (Raw: {len(raw_words)})")




Loading Vocabulary Lists (Disjoint Mode)...
  - A1: 1006 unique words (Raw: 1006)
  - A2: 574 unique words (Raw: 1560)
  - B1: 928 unique words (Raw: 2482)
  - B2: 3208 unique words (Raw: 5500)
  - C1: 6467 unique words (Raw: 11956)


In [None]:
def get_lemmas(text):
    """Return the lower-cased lemmas of the alphabetic tokens in ``text``.

    Args:
        text: The text to process. Any value that is not a ``str`` (for
            example ``None`` or a float NaN) yields an empty list.

    Returns:
        A list of lower-cased lemma strings, one per token whose
        ``is_alpha`` flag is set, in document order.
    """
    if isinstance(text, str):
        # Tokens that are not purely alphabetic are skipped.
        return [tok.lemma_.lower() for tok in nlp(text) if tok.is_alpha]
    return []

In [None]:
def get_vocab_ratios(lemma_list, levels=None, vocab=None):
    """Compute, per vocabulary level, the share of lemmas found in that level's word set.

    Args:
        lemma_list: List of lemma strings for one text. Repeated lemmas are
            counted each time they occur.
        levels: Level names to report, in output order. Defaults to the
            module-level ``vocab_levels``.
        vocab: Mapping from level name to the set of words for that level.
            Defaults to the module-level ``vocab_sets``.

    Returns:
        A float ``pd.Series`` indexed by ``ratio_<level>`` (one entry per
        level, in the order of ``levels``). Each value is the number of
        lemmas present in that level's set divided by ``len(lemma_list)``.
        An empty ``lemma_list`` yields 0.0 for every level.

    Raises:
        KeyError: If a level has no entry in ``vocab`` and ``lemma_list`` is
            non-empty.
    """
    if levels is None:
        levels = vocab_levels
    if vocab is None:
        vocab = vocab_sets

    ratio_names = [f'ratio_{level}' for level in levels]

    if not lemma_list:
        # Broadcast a scalar over the index so the result always has one zero
        # per level, rather than assuming exactly five levels.
        return pd.Series(0.0, index=ratio_names, dtype=float)

    total_tokens = len(lemma_list)
    ratios = []
    for level in levels:
        level_words = vocab[level]
        count = sum(1 for lemma in lemma_list if lemma in level_words)
        ratios.append(count / total_tokens)

    return pd.Series(ratios, index=ratio_names, dtype=float)

In [None]:
# Lemmatize every entry of the 'text' column and keep the resulting lemma
# lists alongside the text in a new 'lemmas' column.
text_column = df_data['text']
df_data['lemmas'] = text_column.apply(get_lemmas)

In [None]:
# get_vocab_ratios returns a Series per row, so apply() expands the results
# into a DataFrame with one ratio_<level> column per vocabulary level.
lemma_column = df_data['lemmas']
vocab_features = lemma_column.apply(get_vocab_ratios)

In [None]:
# Place the vocabulary ratio columns to the right of the existing features;
# rows are matched on index.
df_extended = pd.concat((df_features, vocab_features), axis='columns')

In [None]:
# Reorder the columns so that 'label' comes last and every other column keeps
# its relative position.
cols = []
for column_name in df_extended.columns:
    if column_name != 'label':
        cols.append(column_name)
cols.append('label')
df_extended = df_extended.loc[:, cols]

In [None]:
# Write the extended feature table to CSV, leaving out the row index.
output_filename = 'training_features_extended.csv'
df_extended.to_csv(path_or_buf=output_filename, index=False)

In [None]:
# Preview the first five rows of the per-level vocabulary ratios.
print(vocab_features.head(n=5))

   ratio_a1  ratio_a2  ratio_b1  ratio_b2  ratio_c1
0  0.575000  0.125000  0.033333  0.025000  0.033333
1  1.000000  0.000000  0.000000  0.000000  0.000000
2  0.600000  0.000000  0.000000  0.000000  0.000000
3  0.670213  0.053191  0.021277  0.010638  0.021277
4  0.636872  0.083799  0.044693  0.011173  0.022346


In [None]:
# Preview the lemma lists produced for the first few texts.
lemma_preview = df_data['lemmas'].head()
print(lemma_preview)

0    [сообщение, об, это, слово, мы, думать, каждый...
1                                       [я, не, знать]
2                          [у, меня, не, быть, словай]
3    [следуюший, пожалуйста, если, мне, не, нравить...
4    [мне, казаться, что, личный, телефон, и, компю...
Name: lemmas, dtype: object
