### Лабораторная работа №6: "Классификация текста"
#### ИУ5-21 Курганова Александра
#### Задание:
Для произвольного набора данных, предназначенного для классификации текстов, решите задачу классификации текста двумя способами:
* Способ 1. На основе CountVectorizer или TfidfVectorizer.
* Способ 2. На основе моделей word2vec, GloVe или fastText.
* Сравните качество полученных моделей.

In [None]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC, NuSVC, LinearSVC, OneClassSVM, SVR, NuSVR, LinearSVR
import seaborn as sns
import tensorflow as tf
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
import gensim
from gensim.models import word2vec

# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
# Apply seaborn's "ticks" plot style.
sns.set(style="ticks")

In [None]:
# Restrict the 20 Newsgroups training subset to four topics.
categories = [
    "talk.politics.guns",
    "alt.atheism",
    "sci.med",
    "rec.autos",
]
newsgroups = fetch_20newsgroups(categories=categories, subset='train')
# Raw document texts; later cells read the labels from newsgroups['target'].
data = newsgroups['data']

In [None]:
def accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray) -> Dict[int, float]:
    """
    Compute accuracy separately for every class label.

    y_true - ground-truth class labels
    y_pred - predicted class labels
    Returns a dict mapping each label found in y_true to the accuracy
    measured on the samples whose true label is that class.
    """
    # Pair the labels in a DataFrame so rows can be filtered by true class.
    paired = pd.DataFrame(data={'t': y_true, 'p': y_pred})
    per_class = dict()
    for label in np.unique(y_true):
        # Keep only the samples whose true label is the current class.
        rows = paired[paired['t'] == label]
        per_class[label] = accuracy_score(rows['t'].values, rows['p'].values)
    return per_class

def print_accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray):
    """
    Print a two-column table (label, accuracy) with one row per class.

    Nothing is printed when there are no classes to report.
    """
    per_class = accuracy_score_for_classes(y_true, y_pred)
    if not per_class:
        return
    print('Метка \t Accuracy')
    for label, acc in per_class.items():
        print('{} \t {}'.format(label, acc))

In [None]:
# CountVectorizer: learn the vocabulary from the whole corpus.
vocabVect = CountVectorizer().fit(data)
# Learned vocabulary mapping; its size is the number of features.
corpusVocab = vocabVect.vocabulary_
print('Количество сформированных признаков - {}'.format(len(corpusVocab)))

Количество сформированных признаков - 37176


In [None]:
# Preview nine entries of the vocabulary mapping (positions 1..9 in dict
# order; the entry at position 0 is skipped) as key=value pairs.
for term in list(corpusVocab)[1:10]:
    print('{}={}'.format(term, corpusVocab[term]))

thom=33375
morgan=23251
ucs=34360
mun=23527
ca=8754
thomas=33376
clancy=9784
subject=32210
re=28101


In [None]:
# Vectorize the same documents the vocabulary was fitted on; despite its
# name, `test_features` is not a held-out test split.
test_features = vocabVect.transform(data)
# Bare expression: the notebook displays a summary of the resulting matrix.
test_features

<2214x37176 sparse matrix of type '<class 'numpy.int64'>'
	with 375168 stored elements in Compressed Sparse Row format>

In [None]:
# Length of row 0, i.e. the number of feature columns. Read it from the
# matrix shape instead of densifying the entire sparse matrix to measure
# a single row.
test_features.shape[1]

37176

In [None]:
# Non-zero values of row 0. Only that one row is converted to a dense
# array; densifying the full matrix first would allocate every cell of it.
[i for i in test_features[0].toarray().ravel() if i>0]

[1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 6,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 2,
 6,
 1,
 1,
 1,
 1,
 2,
 1,
 3,
 1,
 1,
 1,
 6,
 1,
 1,
 1,
 1,
 4,
 1,
 5,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 3,
 1,
 1,
 3,
 1,
 5,
 1,
 1,
 1,
 1,
 2,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 3,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 6,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 6,
 11,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 6,
 10,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 3,
 2,
 3,
 1,
 3,
 2,
 1,
 3,
 1,
 10,
 3]

In [None]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back
# to the old one on older installations.
if hasattr(vocabVect, 'get_feature_names_out'):
    feature_names = vocabVect.get_feature_names_out()
else:
    feature_names = vocabVect.get_feature_names()
# list(...) keeps the displayed value a plain list with either API.
list(feature_names[100:120])

['025818u28037',
 '025924',
 '0278',
 '02908',
 '03',
 '030031',
 '030105',
 '030334',
 '0306',
 '030706',
 '030734',
 '031423',
 '0318',
 '0320',
 '032251',
 '032620',
 '032905',
 '033',
 '033446',
 '034']

In [None]:
def VectorizeAndClassify(vectorizers_list, classifiers_list):
    """
    Cross-validate every vectorizer/classifier combination and print the result.

    Each pair is wrapped in a two-step Pipeline and scored on the
    module-level `newsgroups` texts and targets with 3-fold cross-validation.
    The vectorizer, the classifier and the mean accuracy are printed.
    """
    for vectorizer in vectorizers_list:
        for classifier in classifiers_list:
            steps = [("vectorizer", vectorizer), ("classifier", classifier)]
            fold_scores = cross_val_score(
                Pipeline(steps),
                newsgroups['data'],
                newsgroups['target'],
                scoring='accuracy',
                cv=3,
            )
            print('Векторизация - {}'.format(vectorizer))
            print('Модель для классификации - {}'.format(classifier))
            print('Accuracy = {}'.format(fold_scores.mean()))
            print('===========================')

In [None]:
# Both vectorizers reuse the vocabulary learned by the CountVectorizer cell.
vectorizers_list = [CountVectorizer(vocabulary = corpusVocab), TfidfVectorizer(vocabulary = corpusVocab)]
# LogisticRegression stopped at the default iteration cap ("TOTAL NO. of
# ITERATIONS REACHED LIMIT"), so its score came from an unconverged model.
# Raise max_iter as the warning advises; increase it further if the warning persists.
classifiers_list = [LogisticRegression(C=3.0, max_iter=1000), LinearSVC(), KNeighborsClassifier()]
VectorizeAndClassify(vectorizers_list, classifiers_list)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Векторизация - CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None,
                vocabulary={'00': 0, '000': 1, '0000': 2, '0000001200': 3,
                            '00014': 4, '000152': 5, '000406': 6,
                            '0005111312': 7, '0005111312na3em': 8, '000601': 9,
                            '000710': 10, '000mi': 11, '000miles': 12,
                            '000s': 13, '001': 14, '0010': 15, '001004': 16,
                            '001125': 17, '001319': 18, '001642': 19, '002': 20,
                            '002142': 21, '002651': 22, '003': 23,
                            '003258u19250': 24, '0033': 25, '003522': 

In [None]:
# word2vec
# Prepare the corpus: one list of lowercase alphabetic tokens per document,
# with English stop words removed.
# NOTE(review): `stopwords` and `WordPunctTokenizer` are not imported anywhere
# in the visible notebook - presumably `from nltk.corpus import stopwords` and
# `from nltk.tokenize import WordPunctTokenizer`; confirm and add them to the
# import cell.
import re  # used below but missing from the import cell

corpus = []
stop_words = stopwords.words('english')
# Frozen-set copy for O(1) membership tests; `stop_words` itself stays a list.
stop_words_lookup = frozenset(stop_words)
# Compiled once: everything that is not a letter becomes a space.
non_letters = re.compile("[^a-zA-Z]")
tok = WordPunctTokenizer()
for line in newsgroups['data']:
    line1 = non_letters.sub(" ", line.strip().lower())
    text_tok = tok.tokenize(line1)
    text_tok1 = [w for w in text_tok if w not in stop_words_lookup]
    corpus.append(text_tok1)

In [None]:
# Display the token lists of the first five documents.
corpus[:5]

[['thom',
  'morgan',
  'ucs',
  'mun',
  'ca',
  'thomas',
  'clancy',
  'subject',
  'thrush',
  'good',
  'grief',
  'candida',
  'albicans',
  'organization',
  'memorial',
  'university',
  'newfoundland',
  'lines',
  'dyer',
  'spdcc',
  'com',
  'steve',
  'dyer',
  'writes',
  'article',
  'apr',
  'ucsvax',
  'sdsu',
  'edu',
  'mccurdy',
  'ucsvax',
  'sdsu',
  'edu',
  'mccurdy',
  'writes',
  'dyer',
  'beyond',
  'rude',
  'drink',
  'yeah',
  'yeah',
  'yeah',
  'threaten',
  'rip',
  'lips',
  'snort',
  'always',
  'people',
  'blinded',
  'knowledge',
  'unopen',
  'anything',
  'already',
  'established',
  'given',
  'medical',
  'community',
  'know',
  'surprised',
  'outlook',
  'duh',
  'nice',
  'see',
  'steve',
  'still',
  'high',
  'almighty',
  'intellectual',
  'prowess',
  'tact',
  'record',
  'several',
  'outbreaks',
  'thrush',
  'several',
  'past',
  'years',
  'indication',
  'immunosuppression',
  'nutritional',
  'deficiencies',
  'taken',
  'an

In [None]:
# Train word2vec embeddings on the tokenized corpus; %time reports the duration.
# NOTE(review): no `seed` is passed and workers=4, so the vectors presumably
# differ between runs - confirm if reproducibility matters.
%time model_data = word2vec.Word2Vec(corpus, workers=4, min_count=10, window=10, sample=1e-3)

CPU times: user 7.91 s, sys: 39.4 ms, total: 7.95 s
Wall time: 4.94 s


In [None]:
def sentiment(v, c):
    """
    Fit a vectorizer + classifier pipeline and print per-class accuracy.

    v - vectorizer step, c - classifier step. The pipeline is trained on the
    module-level X_train / y_train and evaluated on X_test / y_test.
    """
    steps = [("vectorizer", v), ("classifier", c)]
    model = Pipeline(steps)
    model.fit(X_train, y_train)
    print_accuracy_score_for_classes(y_test, model.predict(X_test))

In [None]:
class EmbeddingVectorizer(object):
    '''
    Represent each tokenized text by the mean of its words' embedding vectors.

    model - word-vector lookup that supports `word in model`, `model[word]`
    and exposes a `vector_size` attribute.
    '''
    def __init__(self, model):
        self.model = model
        self.size = model.vector_size

    def fit(self, X, y=None):
        '''
        Nothing is learned; `y` is optional so that fit(X) works as well
        as fit(X, y). Returns self.
        '''
        return self

    def transform(self, X):
        '''
        Return an array with one row of length `vector_size` per text in X.

        Words missing from the model are skipped; a text without any known
        word is represented by a zero vector.
        '''
        rows = [
            np.mean(
                [self.model[w] for w in words if w in self.model]
                or [np.zeros(self.size)],
                axis=0)
            for words in X
        ]
        if not rows:
            # np.array([]) would be 1-D; keep the 2-D (n_texts, size) shape.
            return np.zeros((0, self.size))
        return np.array(rows)

In [None]:
# Train/test split: the first `boundary` documents are used for training,
# the remainder for testing.
boundary = 700
X_train, X_test = corpus[:boundary], corpus[boundary:]
y_train, y_test = (newsgroups['target'][:boundary],
                   newsgroups['target'][boundary:])

In [None]:
# Classify the averaged word2vec document vectors. max_iter is raised because
# the default iteration cap was hit ("TOTAL NO. of ITERATIONS REACHED LIMIT"),
# so the reported accuracies came from an unconverged model; increase it
# further if the warning persists.
sentiment(EmbeddingVectorizer(model_data.wv), LogisticRegression(C=5.0, max_iter=1000))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Метка 	 Accuracy
0 	 0.9003021148036254
1 	 0.8833746898263027
2 	 0.837772397094431
3 	 0.8365122615803815
