Для произвольного набора данных, предназначенного для классификации текстов, решите задачу классификации текста двумя способами:

    Способ 1. На основе CountVectorizer или TfidfVectorizer.
    Способ 2. На основе моделей word2vec или Glove или fastText.
    Сравните качество полученных моделей.


In [22]:
# 20 Newsgroups - набор состоящий из 20 тясяч постов по 20 различным темам.
from sklearn.datasets import fetch_20newsgroups

import numpy as np
import pandas as pd
from typing import Dict, Tuple
from scipy import stats
from IPython.display import Image
from sklearn.datasets import load_iris, load_boston
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC, NuSVC, LinearSVC, OneClassSVM, SVR, NuSVR, LinearSVR
import seaborn as sns
import matplotlib.pyplot as plt

import gensim
from gensim.models import word2vec
import re
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

%matplotlib inline 
sns.set(style="ticks")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arthur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray) -> Dict[int, float]:
    """
    Вычисление метрики accuracy для каждого класса
    y_true - истинные значения классов
    y_pred - предсказанные значения классов
    Возвращает словарь: ключ - метка класса, 
    значение - Accuracy для данного класса
    """
    # Для удобства фильтрации сформируем Pandas DataFrame 
    d = {'t': y_true, 'p': y_pred}
    df = pd.DataFrame(data=d)
    # Метки классов
    classes = np.unique(y_true)
    # Результирующий словарь
    res = dict()
    # Перебор меток классов
    for c in classes:
        # отфильтруем данные, которые соответствуют 
        # текущей метке класса в истинных значениях
        temp_data_flt = df[df['t']==c]
        # расчет accuracy для заданной метки класса
        temp_acc = accuracy_score(
            temp_data_flt['t'].values, 
            temp_data_flt['p'].values)
        # сохранение результата в словарь
        res[c] = temp_acc
    return res

def print_accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray):
    """
    Вывод метрики accuracy для каждого класса
    """
    accs = accuracy_score_for_classes(y_true, y_pred)
    if len(accs)>0:
        print('Метка \t Accuracy')
    for i in accs:
        print('{} \t {}'.format(i, accs[i]))

In [4]:
categories = ["comp.os.ms-windows.misc", "sci.crypt", "talk.religion.misc", "rec.autos"]
# Извлекаем из набора подмножество - для обучения
newsgroups = fetch_20newsgroups(subset='train', categories=categories)
data = newsgroups['data']

In [5]:
data

['From: pp@cbnewsl.cb.att.com (peter.peng)\nSubject: 1990 Integra LS for sale\nOrganization: AT&T Bell Laboratories\nDistribution: nj\nKeywords: for sale integra\nLines: 15\n\n\n\n********* 1990 Integra LS for Sale *********\n\n5 speed, sunroof, rear spoiler, new tires\n59.7K miles\n\n$ 7950 or best offer.\n\ncall 908-949-0878\n     908-938-4101\n\nemail att!hotsoup!peng\n\n*********************************************\n',
 'From: ricktait@bnr.co.uk (Rick Tait)\nSubject: Re: What the clipper nay-sayers sound like to me.\nNntp-Posting-Host: 47.20.192.158\nOrganization: Network Management Systems, Bell Northern Research.\nX-Newsreader: TIN [version 1.1 PL8]\nDistribution: na\nLines: 45\n\nNathaniel Sammons (ns111310@LANCE.ColoState.Edu) wrote on Mon, 19 Apr 1993 02:36:36 GMT: \n> If the gov establishes a cryptography standard that has to be used by\n> everyone, and everyone\'s personal key is divided into two segments\n> and stored at two separate, albeit easy to find places, and that ke

In [6]:
# CountVectorizer
vocabVect = CountVectorizer()
vocabVect.fit(data)
corpusVocab = vocabVect.vocabulary_
print('Количество сформированных признаков - {}'.format(len(corpusVocab)))

Количество сформированных признаков - 63878


In [7]:
for i in list(corpusVocab)[1:10]:
    print('{}={}'.format(i, corpusVocab[i]))

pp=46599
cbnewsl=19077
cb=19041
att=15577
com=20266
peter=45744
peng=45600
subject=54145
1990=2697


In [8]:
test_features = vocabVect.transform(data)
test_features

<2157x63878 sparse matrix of type '<class 'numpy.int64'>'
	with 381088 stored elements in Compressed Sparse Row format>

In [9]:
# Размер нулевой строки
len(test_features.todense()[0].getA1())

63878

In [1]:
# Непустые значения нулевой строки
# [i for i in test_features.todense()[0].getA1() if i>0]

In [11]:
vocabVect.get_feature_names()[100:120]

['00101100',
 '00101100b',
 '00101101',
 '00101101b',
 '00101110',
 '00101110b',
 '00101111',
 '00101111b',
 '0011',
 '00110000',
 '00110000b',
 '00110001',
 '00110001b',
 '00110010',
 '00110010b',
 '00110011',
 '00110011b',
 '00110100',
 '00110100b',
 '00110101']

In [12]:
def VectorizeAndClassify(vectorizers_list, classifiers_list):
    for v in vectorizers_list:
        for c in classifiers_list:
            pipeline1 = Pipeline([("vectorizer", v), ("classifier", c)])
            score = cross_val_score(pipeline1, newsgroups['data'], newsgroups['target'], scoring='accuracy', cv=3).mean()
            print('Векторизация - {}'.format(v))
            print('Модель для классификации - {}'.format(c))
            print('Accuracy = {}'.format(score))
            print('===========================')

In [13]:
vectorizers_list = [CountVectorizer(vocabulary = corpusVocab), TfidfVectorizer(vocabulary = corpusVocab)]
classifiers_list = [LogisticRegression(C=3.0), LinearSVC(), KNeighborsClassifier()]
VectorizeAndClassify(vectorizers_list, classifiers_list)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Векторизация - CountVectorizer(vocabulary={'00': 0, '000': 1, '00000000': 2, '00000000b': 3,
                            '00000001': 4, '00000001b': 5, '00000010': 6,
                            '00000010b': 7, '00000011': 8, '00000011b': 9,
                            '00000100': 10, '00000100b': 11, '00000101': 12,
                            '00000101b': 13, '00000110': 14, '00000110b': 15,
                            '00000111': 16, '00000111b': 17, '00001000': 18,
                            '00001000b': 19, '00001001': 20, '00001001b': 21,
                            '00001010': 22, '00001010b': 23, '00001011': 24,
                            '00001011b': 25, '00001100': 26, '00001100b': 27,
                            '00001101': 28, '00001101b': 29, ...})
Модель для классификации - LogisticRegression(C=3.0)
Accuracy = 0.956884561891516




Векторизация - CountVectorizer(vocabulary={'00': 0, '000': 1, '00000000': 2, '00000000b': 3,
                            '00000001': 4, '00000001b': 5, '00000010': 6,
                            '00000010b': 7, '00000011': 8, '00000011b': 9,
                            '00000100': 10, '00000100b': 11, '00000101': 12,
                            '00000101b': 13, '00000110': 14, '00000110b': 15,
                            '00000111': 16, '00000111b': 17, '00001000': 18,
                            '00001000b': 19, '00001001': 20, '00001001b': 21,
                            '00001010': 22, '00001010b': 23, '00001011': 24,
                            '00001011b': 25, '00001100': 26, '00001100b': 27,
                            '00001101': 28, '00001101b': 29, ...})
Модель для классификации - LinearSVC()
Accuracy = 0.9601298099211869
Векторизация - CountVectorizer(vocabulary={'00': 0, '000': 1, '00000000': 2, '00000000b': 3,
                            '00000001': 4, '00000001b': 5, '0000

In [17]:
# word2vec 
# Подготовим корпус
corpus = []
stop_words = stopwords.words('english')
tok = WordPunctTokenizer()
for line in newsgroups['data']:
    line1 = line.strip().lower()
    line1 = re.sub("[^a-zA-Z]"," ", line1)
    text_tok = tok.tokenize(line1)
    text_tok1 = [w for w in text_tok if not w in stop_words]
    corpus.append(text_tok1)

In [18]:
corpus[:5]

[['pp',
  'cbnewsl',
  'cb',
  'att',
  'com',
  'peter',
  'peng',
  'subject',
  'integra',
  'ls',
  'sale',
  'organization',
  'bell',
  'laboratories',
  'distribution',
  'nj',
  'keywords',
  'sale',
  'integra',
  'lines',
  'integra',
  'ls',
  'sale',
  'speed',
  'sunroof',
  'rear',
  'spoiler',
  'new',
  'tires',
  'k',
  'miles',
  'best',
  'offer',
  'call',
  'email',
  'att',
  'hotsoup',
  'peng'],
 ['ricktait',
  'bnr',
  'co',
  'uk',
  'rick',
  'tait',
  'subject',
  'clipper',
  'nay',
  'sayers',
  'sound',
  'like',
  'nntp',
  'posting',
  'host',
  'organization',
  'network',
  'management',
  'systems',
  'bell',
  'northern',
  'research',
  'x',
  'newsreader',
  'tin',
  'version',
  'pl',
  'distribution',
  'na',
  'lines',
  'nathaniel',
  'sammons',
  'ns',
  'lance',
  'colostate',
  'edu',
  'wrote',
  'mon',
  'apr',
  'gmt',
  'gov',
  'establishes',
  'cryptography',
  'standard',
  'used',
  'everyone',
  'everyone',
  'personal',
  'key',
 

In [23]:
%time model_data = word2vec.Word2Vec(corpus, workers=4, min_count=10, window=10, sample=1e-3)

Wall time: 4.46 s


In [24]:
def sentiment(v, c):
    model = Pipeline(
        [("vectorizer", v), 
         ("classifier", c)])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print_accuracy_score_for_classes(y_test, y_pred)

In [25]:
class EmbeddingVectorizer(object):
    '''
    Для текста усредним вектора входящих в него слов
    '''
    def __init__(self, model):
        self.model = model
        self.size = model.vector_size

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([np.mean(
            [self.model[w] for w in words if w in self.model] 
            or [np.zeros(self.size)], axis=0)
            for words in X])

In [26]:
# Обучающая и тестовая выборки
boundary = 700
X_train = corpus[:boundary] 
X_test = corpus[boundary:]
y_train = newsgroups['target'][:boundary]
y_test = newsgroups['target'][boundary:]

In [27]:
sentiment(EmbeddingVectorizer(model_data.wv), LogisticRegression(C=5.0))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Метка 	 Accuracy
0 	 0.9266503667481663
1 	 0.896551724137931
2 	 0.9425587467362925
3 	 0.861003861003861
