Для произвольного набора данных, предназначенного для классификации текстов, решите задачу классификации текста двумя способами:
1.   Способ 1. На основе CountVectorizer или TfidfVectorizer.
2.   Способ 2. На основе моделей word2vec, GloVe или fastText.
3.   Сравните качество полученных моделей.

In [None]:
!pip install gensim



In [None]:
import gensim
from gensim.models import Word2Vec

In [None]:
gensim.__path__

['/usr/local/lib/python3.10/dist-packages/gensim']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import pandas as pd
import re
import numpy as np
from typing import Dict, Tuple
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Описание набора данных**

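Набор данных `df_file.csv` содержит 2225 новостных текстов на английском языке (столбец `Text`) и целочисленную метку класса (столбец `Label`, значения 0–4). Загрузим его для дальнейшей предобработки.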

In [None]:
data = pd.read_csv('df_file.csv')

In [None]:
data.head()

Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,0
1,Army chiefs in regiments decision\n \n Militar...,0
2,Howard denies split over ID cards\n \n Michael...,0
3,Observers to monitor UK election\n \n Minister...,0
4,Kilroy names election seat target\n \n Ex-chat...,0


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    2225 non-null   object
 1   Label   2225 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 34.9+ KB


In [None]:
# Features are the raw texts, targets are the integer class labels.
# 20% of the rows are held out for testing; the seed is fixed so the split
# is reproducible between runs.
X = data['Text']
Y = data['Label']
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

## Способ 1

In [None]:
# Collect every text of the dataset (training and test rows alike) into one
# list; it is used below to build a shared vocabulary for the models.
vocab_list = data['Text'].to_list()

In [None]:
# Fit a bag-of-words vectorizer on all texts; its token -> column-index
# mapping is reused later as a fixed vocabulary.
vocabVect = CountVectorizer().fit(vocab_list)
corpusVocab = vocabVect.vocabulary_
n_features = len(corpusVocab)
print('Количество сформированных признаков - {}'.format(n_features))

Количество сформированных признаков - 29421


In [None]:
# Learn the TF-IDF vocabulary and weights on all texts, then encode the same
# texts as a sparse document-term matrix.
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(vocab_list)
tfidf_features = tfidf_vect.transform(vocab_list)
# Bare expression so the notebook displays the matrix summary.
tfidf_features

<2225x29421 sparse matrix of type '<class 'numpy.float64'>'
	with 449254 stored elements in Compressed Sparse Row format>

In [None]:
#for i in list(corpusVocab)[1:10]:
#    print('{}={}'.format(i, corpusVocab[i]))

In [None]:
#test_features = vocabVect.transform(vocab_list)
#test_features

## Способ 2

In [None]:
# Build the tokenised corpus for Word2Vec: every document becomes a list of
# lowercase alphabetic tokens with English stop words removed.
corpus = []
stop_words = stopwords.words('english')
# Membership tests against a set are O(1); the list above would be scanned
# for every single token. The list itself is kept under its original name.
stop_words_set = set(stop_words)
tok = WordPunctTokenizer()
# Compiled once, outside the loop. Anything that is not an ASCII letter
# (digits, punctuation, newlines) is turned into a space.
non_letters = re.compile(r"[^a-zA-Z]")
for line in data['Text'].values:
    line1 = non_letters.sub(" ", line.strip().lower())
    text_tok = tok.tokenize(line1)
    text_tok1 = [w for w in text_tok if w not in stop_words_set]
    corpus.append(text_tok1)

In [None]:
corpus[:5]

[['budget',
  'set',
  'scene',
  'election',
  'gordon',
  'brown',
  'seek',
  'put',
  'economy',
  'centre',
  'labour',
  'bid',
  'third',
  'term',
  'power',
  'delivers',
  'ninth',
  'budget',
  'gmt',
  'expected',
  'stress',
  'importance',
  'continued',
  'economic',
  'stability',
  'low',
  'unemployment',
  'interest',
  'rates',
  'chancellor',
  'expected',
  'freeze',
  'petrol',
  'duty',
  'raise',
  'stamp',
  'duty',
  'threshold',
  'conservatives',
  'lib',
  'dems',
  'insist',
  'voters',
  'face',
  'higher',
  'taxes',
  'means',
  'testing',
  'labour',
  'treasury',
  'officials',
  'said',
  'pre',
  'election',
  'giveaway',
  'mr',
  'brown',
  'thought',
  'bn',
  'spare',
  'increase',
  'stamp',
  'duty',
  'threshold',
  'freeze',
  'petrol',
  'duty',
  'extension',
  'tax',
  'credit',
  'scheme',
  'poorer',
  'families',
  'possible',
  'help',
  'pensioners',
  'stamp',
  'duty',
  'threshold',
  'rise',
  'intended',
  'help',
  'first',
  

In [None]:
# количество текстов в корпусе не изменилось и соответствует целевому признаку
assert data.shape[0]==len(corpus)

In [None]:
%time model= Word2Vec(corpus, workers=4, min_count=10, window=10, sample=1e-3)

CPU times: user 7.02 s, sys: 31.5 ms, total: 7.05 s
Wall time: 3.87 s


In [None]:
# Проверим, что модель обучилась
print(model.wv.most_similar(positive=['sport'], topn=3))

[('programme', 0.9887061715126038), ('radio', 0.973021924495697), ('correspondent', 0.9706646800041199)]


In [None]:
# Training and test subsets of the tokenised corpus.
#
# X_train / X_test come from a shuffled train_test_split, so simply cutting
# `corpus` at len(X_train) would produce a different (sequential) split whose
# rows do not match X_train / y_train. Instead, the rows are selected by the
# positions the existing split assigned to each subset.
boundary = len(X_train)
# `corpus` is a positional list built in the row order of `data`, so index
# labels are translated to row positions first.
train_pos = data.index.get_indexer(X_train.index)
test_pos = data.index.get_indexer(X_test.index)
X_train_1 = [corpus[i] for i in train_pos]
X_test_1 = [corpus[i] for i in test_pos]
y_train_1 = data.Label.values[train_pos]
y_test_1 = data.Label.values[test_pos]

# Сравнение способов

In [None]:
def accuracy_score_for_classes(
    y_true: np.ndarray,
    y_pred: np.ndarray) -> Dict[int, float]:
    """
    Compute accuracy separately for every class present in ``y_true``.

    For each class label only the samples whose true label equals that
    class are considered; the reported value is the share of those samples
    that were predicted correctly.

    The two inputs are compared position by position. Array-likes such as
    pandas Series are converted to NumPy arrays first, so their index labels
    are ignored and never used for alignment.

    Args:
        y_true: true class labels.
        y_pred: predicted class labels, same shape as ``y_true``.

    Returns:
        Dictionary mapping each class label found in ``y_true`` to the
        accuracy for that class. Empty when ``y_true`` is empty.

    Raises:
        ValueError: if ``y_true`` and ``y_pred`` have different shapes.
    """
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            'y_true and y_pred must have the same shape, got {} and {}'.format(
                true_arr.shape, pred_arr.shape))
    # Element-wise correctness mask, computed once for all classes
    hits = true_arr == pred_arr
    res = dict()
    for c in np.unique(true_arr):
        # Mean of the mask over the samples of class c is its accuracy
        res[c] = float(np.mean(hits[true_arr == c]))
    return res

def print_accuracy_score_for_classes(
    y_true: np.ndarray,
    y_pred: np.ndarray):
    """
    Print a two-column table of per-class accuracy.

    The values come from accuracy_score_for_classes(y_true, y_pred).
    Nothing is printed when that result is empty.
    """
    per_class = accuracy_score_for_classes(y_true, y_pred)
    if not per_class:
        return
    print('Метка \t Accuracy')
    for label, acc in per_class.items():
        print('{} \t {}'.format(label, acc))

In [None]:
def VectorizeAndClassify(vectorizers_list, classifiers_list):
    """
    Cross-validate every vectorizer/classifier combination and print results.

    Each pair is wrapped in a two-step Pipeline and scored with 3-fold
    cross-validation (accuracy) on the module-level ``data`` frame, using
    its 'Text' column as input and 'Label' column as target. The mean
    accuracy over the folds is printed for each pair.
    """
    for vec in vectorizers_list:
        for clf in classifiers_list:
            steps = [("vectorizer", vec), ("classifier", clf)]
            fold_scores = cross_val_score(
                Pipeline(steps),
                data['Text'],
                data['Label'],
                scoring='accuracy',
                cv=3)
            report_lines = (
                'Векторизация - {}'.format(vec),
                'Модель для классификации - {}'.format(clf),
                'Accuracy = {}'.format(fold_scores.mean()),
                '===========================',
            )
            for report_line in report_lines:
                print(report_line)

In [None]:
class EmbeddingVectorizer(object):
    '''
    Represent each document as the mean of the embedding vectors of its words.

    A document may be given either as an iterable of tokens or as a raw
    string. Raw strings are tokenised first; without that step, iterating a
    string would yield single characters and the vectorizer would average
    character lookups instead of word lookups.

    Parameters:
        model: object with a ``vector_size`` attribute that supports
            ``token in model`` and ``model[token]``.
        tokenizer: optional callable turning a string into a list of tokens.
            When omitted, strings are lowercased and split into runs of
            ASCII letters.
    '''
    def __init__(self, model, tokenizer=None):
        self.model = model
        self.size = model.vector_size
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        # Nothing to learn: the embedding model is already trained.
        return self

    def _tokens(self, doc):
        # Return the tokens of a document, tokenising it when it is a string.
        if not isinstance(doc, str):
            return doc
        if self.tokenizer is not None:
            return self.tokenizer(doc)
        return re.findall(r"[a-z]+", doc.lower())

    def _embed(self, doc):
        # Average the vectors of the known tokens of one document.
        vectors = [self.model[w] for w in self._tokens(doc) if w in self.model]
        # A document without any known token maps to the zero vector.
        return np.mean(vectors or [np.zeros(self.size)], axis=0)

    def transform(self, X):
        '''
        Return an array with one averaged embedding row per document in X.
        '''
        return np.array([self._embed(doc) for doc in X])

In [None]:
# Compare two text representations (TF-IDF over the corpus vocabulary and
# averaged Word2Vec embeddings) with three classifiers via cross-validation.
vectorizers_list = [TfidfVectorizer(vocabulary=corpusVocab), EmbeddingVectorizer(model.wv)]
# max_iter is raised above the default because the solver previously stopped
# at the iteration limit before converging (see the warnings in the output).
classifiers_list = [LogisticRegression(C=3.0, max_iter=1000), LinearSVC(), KNeighborsClassifier()]
VectorizeAndClassify(vectorizers_list, classifiers_list)

Векторизация - TfidfVectorizer(vocabulary={'00': 0, '000': 1, '0001': 2, '000bn': 3, '000m': 4,
                            '000s': 5, '000th': 6, '001': 7, '001and': 8,
                            '001st': 9, '004': 10, '0051': 11, '007': 12,
                            '01': 13, '0100': 14, '011': 15, '0130': 16,
                            '02': 17, '0200': 18, '0227': 19, '028': 20,
                            '03': 21, '0300': 22, '033': 23, '037': 24,
                            '03bn': 25, '04': 26, '0400': 27, '041': 28,
                            '04bn': 29, ...})
Модель для классификации - LogisticRegression(C=3.0)
Accuracy = 0.9743856496587381
Векторизация - TfidfVectorizer(vocabulary={'00': 0, '000': 1, '0001': 2, '000bn': 3, '000m': 4,
                            '000s': 5, '000th': 6, '001': 7, '001and': 8,
                            '001st': 9, '004': 10, '0051': 11, '007': 12,
                            '01': 13, '0100': 14, '011': 15, '0130': 16,
                   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Векторизация - <__main__.EmbeddingVectorizer object at 0x7ef0d1277b80>
Модель для классификации - LogisticRegression(C=3.0)
Accuracy = 0.3447188362779227
Векторизация - <__main__.EmbeddingVectorizer object at 0x7ef0d1277b80>
Модель для классификации - LinearSVC()
Accuracy = 0.37078181666066473
Векторизация - <__main__.EmbeddingVectorizer object at 0x7ef0d1277b80>
Модель для классификации - KNeighborsClassifier()
Accuracy = 0.3393249694143438


In [None]:
def sentiment(v, c):
    """
    Fit a vectorizer + classifier pipeline and print per-class accuracy.

    The pipeline is trained on the module-level X_train / y_train and then
    evaluated on X_test / y_test; results are printed by
    print_accuracy_score_for_classes.

    v - vectorizer used as the first pipeline step
    c - classifier used as the final pipeline step
    """
    steps = [("vectorizer", v), ("classifier", c)]
    fitted = Pipeline(steps).fit(X_train, y_train)
    print_accuracy_score_for_classes(y_test, fitted.predict(X_test))

In [None]:
sentiment(TfidfVectorizer(), LogisticRegression(C=3.0))

Метка 	 Accuracy
0 	 0.9347826086956522
1 	 1.0
2 	 0.961038961038961
3 	 0.9736842105263158
4 	 0.9803921568627451
