# Lab5 - FS

## Набор данных

In [40]:
import pandas as pd

def load_dataset(path: str):
    if path.endswith('tsv'):
        return pd.read_csv(path, sep='\t')
    else:
        return pd.read_csv(path)

Удаление пунктуации и чисел

In [41]:
import re

def clean_text(text: str):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    return text

Приведение слов к начальной форме

In [35]:
from natasha import MorphVocab, Doc, Segmenter, NewsEmbedding, NewsMorphTagger

morph_vocab = MorphVocab()
segmenter = Segmenter()
emd = NewsEmbedding()
morph_tager = NewsMorphTagger(emd)

def lemmatize_text(text: str):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tager)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    lemmas = [token.lemma for token in doc.tokens]
    return ' '.join(lemmas)

Разбиение на выборки

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

def split_dataset(dataset, target_feature):
    y = pd.DataFrame(label_encoder.fit_transform(dataset[target_feature]), columns=[target_feature])
    X = dataset.drop(columns=[target_feature])
    return train_test_split(X, y, test_size=0.2, random_state=9)

In [46]:
def preprocess_dataset(path, target_feature, text_feature):
    df = load_dataset(path)
    df[text_feature] = df[text_feature].apply(clean_text)
    df[text_feature] = df[text_feature].apply(lemmatize_text)
    df.to_csv('data/preprocessed.csv', index=False)
    return split_dataset(df, target_feature)


In [47]:
X_train, X_test, y_train, y_test = preprocess_dataset('data/castle-or-lock.tsv', 'class', 'text')

In [50]:
preprocessed = pd.read_csv('data/preprocessed.csv')
X_train, X_test, y_train, y_test = split_dataset(preprocessed, 'class')

In [51]:
X_train.head()

Unnamed: 0,text
50,засов дверной задвижка шпингалет больший задви...
72,замок с магнитный ключ или замок с магнитный к...
39,крепость пфальцграфенштайна он burg pfalzgrafe...
25,замок гапсалить он bischofsburg hapsal ныне им...
61,электромеханический замок разновидность электр...


## Алгоритмы

### Встроенный

In [20]:
from sklearn.svm import LinearSVC
import numpy as np
import pandas as pd

class CustomEmbedded:
    def __init__(self, n_top=30):
        self.n_top=n_top
    
    def select(self, X, y):
        y = y.ravel()
        model = LinearSVC()
        model.fit(X, y)
        feature_weights = np.abs(model.coef_[0])

        return np.argsort(feature_weights)[-self.n_top:][::-1]

### Фильтрующий

$\chi^2=\sum\limits_{i=1}^n{\frac{(n_{эмпир} - n_{теор})^2}{n_{теор}}}$

In [31]:
import numpy as np

class CustomFilter:
    def __init__(self, n_top=30):
        self.n_top=n_top

    # m[i, j]: i - признак присутствует, j - класс равен cls
    def _build_empirical_matrix(self, feature, cls):
        m = np.zeros((2, 2))
        m[0][0] = np.sum(np.logical_and(feature == 0, self.y != cls))
        m[0][1] = np.sum(np.logical_and(feature == 0, self.y == cls))
        m[1][0] = np.sum(np.logical_and(feature > 0, self.y != cls))
        m[1][1] = np.sum(np.logical_and(feature > 0, self.y == cls))
        return m
    
    def _build_expected_matrix(self, empirical_matrix):
        total_with_feature = empirical_matrix[1][1] + empirical_matrix[1][0]
        total_without_feature = empirical_matrix[0][0] + empirical_matrix[0][1]
        total_of_cls = empirical_matrix[0][1] + empirical_matrix[1][1]
        total_of_not_cls = empirical_matrix[0][0] + empirical_matrix[1][0]
        total = total_with_feature + total_without_feature

        m = np.zeros((2, 2))
        m[0][0] = (total_without_feature * total_of_not_cls) / total
        m[0][1] = (total_without_feature * total_of_cls) / total
        m[1][0] = (total_with_feature * total_of_not_cls) / total
        m[1][1] = (total_with_feature * total_of_cls) / total
        
        return m
    
    def _compute_chi2(self, empirical_matrix, expected_matrix):
        chi2 = 0
        for i in range(2):
            for j in range(2):
                if (expected_matrix[i, j]) > 0:
                    chi2 += ((empirical_matrix[i, j] - expected_matrix[i, j]) ** 2) / expected_matrix[i, j]
        return chi2

    def select(self, X, y):
        self.y = y
        n_samples, n_features = X.shape
        chi2_scores = []

        for i in range(n_features):
            feature = X[:, i].flatten()
            chi2 = 0

            for cls in np.unique(y):
                empirical_matrix = self._build_empirical_matrix(feature, cls)
                expected_matrix = self._build_expected_matrix(empirical_matrix)
                print(empirical_matrix == expected_matrix)
                chi2 += self._compute_chi2(empirical_matrix, expected_matrix)
            
            chi2_scores.append(chi2)
        
        return np.argsort(chi2_scores)[-self.n_top:][::-1]


### Обертка

In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

class CustomWrapper:
    def __init__(self, n_top=30, model=LogisticRegression(), metric=accuracy_score):
        self.n_top=n_top
        self.model=model
        self.metric=metric
    
    def select(self, X, y):
        n_samples, n_features = X.shape
        y = y.ravel()
        features = list(range(n_features))
        current_features = []
        for _ in tqdm(range(self.n_top)):
            best_score = -np.inf
            best_feature = None

            for feature in features:
                if (feature in current_features):
                    continue

                temp_features = current_features + [feature]

                X_train, X_test, y_train, y_test = train_test_split(X[:, temp_features], y, test_size=0.3, random_state=7)

                self.model.fit(X_train, y_train)
                y_pred = self.model.predict(X_test)

                score = self.metric(y_pred, y_test)
                if score > best_score:
                    best_score = score
                    best_feature = feature
            current_features.append(best_feature)
        self.best_features = current_features

        return self.best_features
            

## Задание

### Векторизация

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
russian_stopwords = list(stopwords.words('russian'))

vectorizer = CountVectorizer(max_features=5000, stop_words=russian_stopwords)
X_train_vectorized = vectorizer.fit_transform(X_train['text'])
X_test_vectorized = vectorizer.transform(X_test['text'])

feature_names = vectorizer.get_feature_names_out()

X_train_vectorized = pd.DataFrame(X_train_vectorized.toarray(), columns=feature_names)
X_test_vectorized = pd.DataFrame(X_test_vectorized.toarray(), columns=feature_names)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ladyp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Выбор признаков

In [22]:
def names_from_indeces(indeces):
    return feature_names[indeces]

In [27]:
X_train_vectorized.values

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [32]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

chi2_selector = SelectKBest(chi2, k=30)  # Выбираем 30 признаков
X_chi2 = chi2_selector.fit_transform(X_train_vectorized.values, y_train.values)

selected_words_chi2 = [vectorizer.get_feature_names_out()[i] for i in chi2_selector.get_support(indices=True)]
print("Фильтрующий метод (χ²):", selected_words_chi2)

Фильтрующий метод (χ²): ['ii', 'башня', 'безопасность', 'бруэма', 'век', 'война', 'время', 'год', 'город', 'дверь', 'де', 'зал', 'защелка', 'использоваться', 'клиффорд', 'ключ', 'колесо', 'конструкция', 'король', 'крепость', 'механизм', 'мочь', 'обычно', 'построить', 'резиденция', 'стена', 'устройство', 'цилиндр', 'штифт', 'электронный']


In [29]:
f = CustomFilter()
top_indeces = f.select(X_train_vectorized.values, y_train.values)

Average sparsity: 0.946
Min sparsity: 0.000
Max sparsity: 0.985
Class 0: 32 samples (47.8%)
Class 1: 35 samples (52.2%)

Feature 0, Class 0
Empirical matrix:
[[2205. 2016.]
 [ 140.  128.]]
Expected matrix:
[[2205. 2016.]
 [ 140.  128.]]

Feature 0, Class 1
Empirical matrix:
[[2016. 2205.]
 [ 128.  140.]]
Expected matrix:
[[2016. 2205.]
 [ 128.  140.]]

Feature 1, Class 0
Empirical matrix:
[[2275. 2080.]
 [  70.   64.]]
Expected matrix:
[[2275. 2080.]
 [  70.   64.]]

Feature 1, Class 1
Empirical matrix:
[[2080. 2275.]
 [  64.   70.]]
Expected matrix:
[[2080. 2275.]
 [  64.   70.]]

Feature 2, Class 0
Empirical matrix:
[[2310. 2112.]
 [  35.   32.]]
Expected matrix:
[[2310. 2112.]
 [  35.   32.]]

Feature 2, Class 1
Empirical matrix:
[[2112. 2310.]
 [  32.   35.]]
Expected matrix:
[[2112. 2310.]
 [  32.   35.]]


In [30]:
names_from_indeces(top_indeces)

array(['abloy', 'ящик', 'яхта', 'ясный', 'ярус', 'яркоголубой', 'ярко',
       'янош', 'an', 'ancestors', 'and', 'ansi', 'assa', 'at', 'august',
       'berg', 'best', 'bhma', 'ble', 'bluetooth', 'bramah', 'breeden',
       'briggs', 'brougham', 'эрик', 'эркер', 'эстенс', 'эстония',
       'эстонский', 'этаж'], dtype=object)