In [1]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Load the data (texts and labels) for the four selected newsgroups.
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

# Training split, requested with headers, footers and quotes removed.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
# Test split, requested with the same categories and the same removals.
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Documents and their labels for each split.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Build the pipeline: TF-IDF features followed by logistic regression.
pipeline_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.7)),
    ('clf', LogisticRegression(max_iter=1000))
])

# Train the model on the raw training texts.
pipeline_tfidf.fit(X_train, y_train)

# Predict on the test split and print the per-class report.
y_pred_tfidf = pipeline_tfidf.predict(X_test)
print("Отчет по классификации (TF-IDF + LogisticRegression):")
# Row labels come from the dataset's own target_names, which are indexed by the
# integer labels in y_test. The `categories` list is only the request passed to
# the loader; it lines up with the labels only while its order matches the
# dataset's, and would silently mislabel the rows otherwise.
print(classification_report(y_test, y_pred_tfidf, target_names=newsgroups_test.target_names))

Отчет по классификации (TF-IDF + LogisticRegression):
                        precision    recall  f1-score   support

           alt.atheism       0.80      0.56      0.66       319
         comp.graphics       0.90      0.93      0.92       389
               sci.med       0.80      0.91      0.85       396
soc.religion.christian       0.79      0.84      0.81       398

              accuracy                           0.82      1502
             macro avg       0.82      0.81      0.81      1502
          weighted avg       0.82      0.82      0.82      1502



In [3]:
from gensim.models import KeyedVectors

import os

# Path to the pre-trained fastText vectors in word2vec text format (.vec).
# Set the FASTTEXT_VEC_PATH environment variable to point at your own copy;
# the original machine-specific location is kept only as a fallback so the
# notebook keeps working where it was written.
model_path = os.environ.get(
    "FASTTEXT_VEC_PATH",
    r"C:\Users\artem\OneDrive\Рабочий стол\Бомонка\Мага\2 семестр\MMO\Lab6\wiki-news-300d-1M.vec",
)

# Fail early with an actionable message instead of a bare loader error.
if not os.path.isfile(model_path):
    raise FileNotFoundError(
        f"fastText vectors not found at {model_path!r}; "
        "set FASTTEXT_VEC_PATH to the location of wiki-news-300d-1M.vec"
    )

# Load the vectors; binary=False matches the plain-text .vec format.
# NOTE: fastText's native .bin files are not word2vec binaries, so passing
# binary=True is not enough for them; gensim provides
# gensim.models.fasttext.load_facebook_vectors for that format.
fasttext_model = KeyedVectors.load_word2vec_format(model_path, binary=False)

In [6]:
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator

class FastTextVectorizer(BaseEstimator, TransformerMixin):
    """Embed documents as the mean of their word vectors.

    Parameters
    ----------
    model : mapping-like word-vector model
        Must support ``word in model``, ``model[word]`` (returning a 1-D
        vector) and expose ``vector_size``.

    Notes
    -----
    Documents are lower-cased and split on whitespace only, so punctuation
    stays attached to tokens; a token is used only if the model contains it
    verbatim, and is skipped otherwise.
    """

    def __init__(self, model):
        # Only store the constructor argument: anything derived from it is
        # computed on demand so that set_params(model=...) cannot leave a
        # stale value behind.
        self.model = model

    @property
    def dim(self):
        """Embedding dimensionality, read from the current ``model``."""
        return self.model.vector_size

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def transform(self, X):
        """Return one averaged word vector per document.

        Parameters
        ----------
        X : iterable of str
            Documents to embed.

        Returns
        -------
        numpy.ndarray of shape (n_documents, dim), dtype float64
            Row ``i`` is the mean of the vectors of the known tokens of
            document ``i``, or all zeros when no token is known. An empty
            input yields an array of shape ``(0, dim)``.
        """
        # Materialise once so generators are accepted and the row count is known.
        docs = list(X)
        # Preallocating fixes both the shape for empty input and the dtype,
        # which would otherwise depend on whether any zero rows are present.
        doc_vectors = np.zeros((len(docs), self.dim), dtype=np.float64)
        for row, doc in enumerate(docs):
            word_vecs = [
                self.model[word]
                for word in doc.lower().split()
                if word in self.model
            ]
            if word_vecs:
                doc_vectors[row] = np.mean(word_vecs, axis=0)
            # Otherwise the row keeps its zero initialisation.
        return doc_vectors

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Convert the texts to averaged fastText vectors, using one vectorizer
# instance for both splits instead of a second, never-fitted one for the test set.
fasttext_vectorizer = FastTextVectorizer(fasttext_model)
X_train_vec = fasttext_vectorizer.fit_transform(X_train)
X_test_vec = fasttext_vectorizer.transform(X_test)

# Train the classifier on the document vectors.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)

# Predict on the test split and print the per-class report.
y_pred = clf.predict(X_test_vec)
print("Отчет по классификации (fastText + LogisticRegression):")
# Row labels come from the dataset's own target_names, which are indexed by the
# integer labels in y_test. The `categories` request list lines up with them
# only while its order matches the dataset's.
print(classification_report(y_test, y_pred, target_names=newsgroups_test.target_names))


Отчет по классификации (fastText + LogisticRegression):
                        precision    recall  f1-score   support

           alt.atheism       0.66      0.37      0.47       319
         comp.graphics       0.85      0.90      0.88       389
               sci.med       0.82      0.85      0.84       396
soc.religion.christian       0.67      0.85      0.75       398

              accuracy                           0.76      1502
             macro avg       0.75      0.74      0.73      1502
          weighted avg       0.76      0.76      0.75      1502

