# Классификация текста простыми методами

Загружаем необходимые данные для nltk.

In [None]:
import nltk
# Tokenizer models used by nltk.word_tokenize.
nltk.download('punkt')
# Tagger model used by nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')
# WordNet database used by WordNetLemmatizer.
nltk.download('wordnet')
# Stop-word lists read through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Мы будем использовать датасет 20 Newsgroups (загружается функцией fetch_20newsgroups). Он содержит сообщения из 20 различных новостных групп, но мы возьмём только 4 из них.

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Restrict the corpus to four "sci.*" newsgroups.
categories = [
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
]
# Load the train and test subsets for the selected groups.
newsgroups_train = fetch_20newsgroups(categories=categories, subset='train')
newsgroups_test = fetch_20newsgroups(categories=categories, subset='test')

In [None]:
# Distinct class labels present in the training targets.
set(newsgroups_train.target)

{0, 1, 2, 3}

Загружаем данные и таргеты

In [None]:
# Training documents and their class labels.
X_train = newsgroups_train.data
y_train = newsgroups_train.target

In [None]:
# Test documents and their class labels.
X_test = newsgroups_test.data
y_test = newsgroups_test.target

Смотрим на количество данных

In [None]:
# Sanity check: number of training documents vs. number of training labels.
len(X_train), len(y_train)

(2373, 2373)

In [None]:
# Sanity check: number of test documents vs. number of test labels.
len(X_test), len(y_test)

(1579, 1579)

TfidfVectorizer эквивалентен последовательному применению CountVectorizer и TfidfTransformer.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training documents only and convert them
# to a TF-IDF matrix in one step.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
# Displayed as (number of documents, number of features).
X_train_vec.shape

(2373, 38683)

Воспользуемся LogisticRegression.

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default hyper-parameters.
lr = LogisticRegression()

# Train on the TF-IDF features of the training documents.
lr.fit(X_train_vec, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Оценим модель.

In [None]:
# Vectorize the test documents with the vocabulary and IDF weights that were
# learned on the training set (transform only, no re-fitting).
# X_train_vec is deliberately left untouched so that it keeps holding the
# training features for any cell that reuses it.
X_test_vec = vectorizer.transform(X_test)
y_pred = lr.predict(X_test_vec)

In [None]:
from sklearn.metrics import classification_report

# Per-class metrics on the test set, labelled with the newsgroup names.
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=newsgroups_train.target_names))

                 precision    recall  f1-score   support

      sci.crypt       0.95      0.90      0.93       396
sci.electronics       0.80      0.94      0.86       393
        sci.med       0.95      0.87      0.91       396
      sci.space       0.97      0.93      0.95       394

       accuracy                           0.91      1579
      macro avg       0.92      0.91      0.91      1579
   weighted avg       0.92      0.91      0.91      1579



## Предобработка данных

До этого мы не применяли предобработку. Посмотрим, насколько она может нам помочь.

Рассмотрим сначала предобработку на одном примере.



Токенизируем.

In [None]:
# Split the example document into tokens.
# NOTE(review): `x` must already hold one raw document string at this point;
# its assignment is not visible in this part of the notebook — confirm the
# cell that defines it exists, otherwise this raises NameError.
x = nltk.word_tokenize(x)
print(x)

['From', ':', 'al', '@', 'escom.com', '(', 'Al', 'Donaldson', ')', 'Subject', ':', 'Re', ':', 'Once', 'tapped', ',', 'your', 'code', 'is', 'no', 'good', 'any', 'more', '.', 'Reply-To', ':', 'al', '@', 'escom.COM', '(', 'Al', 'Donaldson', ')', 'Organization', ':', 'ESCOM', 'Corp.', ',', 'Oakton', 'VA', '(', 'USA', ')', 'Distribution', ':', 'na', 'Lines', ':', '16', 'amolitor', '@', 'nmsu.edu', '(', 'Andrew', 'Molitor', ')', 'writes', ':', '>', 'Yes', ',', 'those', 'evil', 'guys', 'in', 'the', 'FBI', 'can', 'probably', ',', 'with', 'some', '>', 'effort', ',', 'abuse', 'the', 'system', '.', 'I', 'got', 'news', 'for', 'you', ',', 'if', 'the', 'evil', 'guys', 'in', '>', 'the', 'FBI', 'decide', 'they', 'want', 'to', 'persecute', 'you', ',', 'they', "'re", 'gon', 'na', ',', '...', 'And', 'if', 'Richard', 'Nixon', 'had', 'had', 'this', 'kind', 'of', 'toy', ',', 'he', 'would', "n't", 'have', 'had', 'to', 'send', 'people', 'into', 'the', 'Watergate', '.', 'But', 'that', "'s", 'not', 'really', 't

Удалим токены, которые состоят не только из букв и цифр (знаки препинания, а также токены, содержащие их).

In [None]:
# Keep only purely alphanumeric tokens; this drops punctuation marks as well
# as tokens that contain them.
x = list(filter(str.isalnum, x))
print(x)

['From', 'al', 'Al', 'Donaldson', 'Subject', 'Re', 'Once', 'tapped', 'your', 'code', 'is', 'no', 'good', 'any', 'more', 'al', 'Al', 'Donaldson', 'Organization', 'ESCOM', 'Oakton', 'VA', 'USA', 'Distribution', 'na', 'Lines', '16', 'amolitor', 'Andrew', 'Molitor', 'writes', 'Yes', 'those', 'evil', 'guys', 'in', 'the', 'FBI', 'can', 'probably', 'with', 'some', 'effort', 'abuse', 'the', 'system', 'I', 'got', 'news', 'for', 'you', 'if', 'the', 'evil', 'guys', 'in', 'the', 'FBI', 'decide', 'they', 'want', 'to', 'persecute', 'you', 'they', 'gon', 'na', 'And', 'if', 'Richard', 'Nixon', 'had', 'had', 'this', 'kind', 'of', 'toy', 'he', 'would', 'have', 'had', 'to', 'send', 'people', 'into', 'the', 'Watergate', 'But', 'that', 'not', 'really', 'the', 'issue', 'The', 'real', 'issue', 'is', 'whether', 'this', 'will', 'be', 'used', 'to', 'justify', 'a', 'ban', 'against', 'individuals', 'use', 'of', 'private', 'anything', 'else', 'encryption', 'methods', 'Unrelated', 'question', 'is', 'the', 'term', '

Лемматизируем.

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the WordNet category: "J" -> adjective,
    "V" -> verb, "R" -> adverb. Every other tag, including "N", maps to noun.
    """
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and any unrecognised tag both fall back to noun.
    return wordnet.NOUN


# Lemmatize every token, passing the part of speech guessed for that token.
lemmatizer = WordNetLemmatizer()
x = [
    lemmatizer.lemmatize(token, pos=get_wordnet_pos(token))
    for token in x
]
print(x)

['From', 'al', 'Al', 'Donaldson', 'Subject', 'Re', 'Once', 'tapped', 'your', 'code', 'be', 'no', 'good', 'any', 'more', 'al', 'Al', 'Donaldson', 'Organization', 'ESCOM', 'Oakton', 'VA', 'USA', 'Distribution', 'na', 'Lines', '16', 'amolitor', 'Andrew', 'Molitor', 'writes', 'Yes', 'those', 'evil', 'guy', 'in', 'the', 'FBI', 'can', 'probably', 'with', 'some', 'effort', 'abuse', 'the', 'system', 'I', 'get', 'news', 'for', 'you', 'if', 'the', 'evil', 'guy', 'in', 'the', 'FBI', 'decide', 'they', 'want', 'to', 'persecute', 'you', 'they', 'gon', 'na', 'And', 'if', 'Richard', 'Nixon', 'have', 'have', 'this', 'kind', 'of', 'toy', 'he', 'would', 'have', 'have', 'to', 'send', 'people', 'into', 'the', 'Watergate', 'But', 'that', 'not', 'really', 'the', 'issue', 'The', 'real', 'issue', 'be', 'whether', 'this', 'will', 'be', 'use', 'to', 'justify', 'a', 'ban', 'against', 'individual', 'use', 'of', 'private', 'anything', 'else', 'encryption', 'method', 'Unrelated', 'question', 'be', 'the', 'term', 'Cl

Удалим стоп-слова.

In [None]:
from nltk.corpus import stopwords
# Build the stop-word set once; it is reused by the preprocessing functions.
stop_words = set(stopwords.words("english"))
print(len(x))
# The NLTK English stop-word list is lower-case, while the tokens here still
# carry their original casing. Compare the lower-cased form so that
# capitalised stop words ("The", "And", "But", "I") are removed as well.
x = [word for word in x if word.lower() not in stop_words]
print(x)
print(len(x))

131
['From', 'al', 'Al', 'Donaldson', 'Subject', 'Re', 'Once', 'tapped', 'code', 'good', 'al', 'Al', 'Donaldson', 'Organization', 'ESCOM', 'Oakton', 'VA', 'USA', 'Distribution', 'na', 'Lines', '16', 'amolitor', 'Andrew', 'Molitor', 'writes', 'Yes', 'evil', 'guy', 'FBI', 'probably', 'effort', 'abuse', 'system', 'I', 'get', 'news', 'evil', 'guy', 'FBI', 'decide', 'want', 'persecute', 'gon', 'na', 'And', 'Richard', 'Nixon', 'kind', 'toy', 'would', 'send', 'people', 'Watergate', 'But', 'really', 'issue', 'The', 'real', 'issue', 'whether', 'use', 'justify', 'ban', 'individual', 'use', 'private', 'anything', 'else', 'encryption', 'method', 'Unrelated', 'question', 'term', 'Clipper', 'neat', 'already', 'take', 'Intergraph', 'Al']
80


In [None]:
from tqdm import tqdm

def preprocces(X):
    """Normalize raw documents before vectorization (lemmatization variant).

    Each document is lower-cased, tokenized with ``nltk.word_tokenize``,
    reduced to purely alphanumeric tokens, cleared of English stop words and
    finally lemmatized with ``WordNetLemmatizer`` (default part of speech).

    Stop words are removed *before* lemmatization: the lemmatizer rewrites
    some of them (e.g. "was" -> "wa", "has" -> "ha"), after which they would
    no longer match the stop-word list and would leak into the features.

    Reads the module-level ``stop_words`` set.

    Args:
        X: iterable of raw document strings.

    Returns:
        A list with one string per document: the surviving lemmas joined by
        single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    processed = []

    for doc in tqdm(X):
        tokens = nltk.word_tokenize(doc.lower())
        lemmas = [
            lemmatizer.lemmatize(token)
            for token in tokens
            if token.isalnum() and token not in stop_words
        ]
        processed.append(' '.join(lemmas))
    return processed


# Run the same preprocessing over the training and the test documents.
X_train_proc = preprocces(X_train)
X_test_proc = preprocces(X_test)

100%|██████████| 2373/2373 [00:09<00:00, 242.91it/s]
100%|██████████| 1579/1579 [00:05<00:00, 276.81it/s]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a fresh vectorizer on the preprocessed training text only, then reuse
# it unchanged for the preprocessed test text.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train_proc)
X_test_vec = vectorizer.transform(X_test_proc)
# Both matrices are expected to have the same number of columns.
print(X_train_vec.shape)
print(X_test_vec.shape)

(2373, 29333)
(1579, 29333)


In [None]:
# Train a fresh classifier on the current features and predict the test split.
lr = LogisticRegression()
y_pred = lr.fit(X_train_vec, y_train).predict(X_test_vec)

# Per-class metrics, labelled with the newsgroup names.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=newsgroups_train.target_names,
    )
)

                 precision    recall  f1-score   support

      sci.crypt       0.98      0.92      0.95       396
sci.electronics       0.85      0.97      0.90       393
        sci.med       0.96      0.92      0.94       396
      sci.space       0.98      0.95      0.97       394

       accuracy                           0.94      1579
      macro avg       0.94      0.94      0.94      1579
   weighted avg       0.94      0.94      0.94      1579



## Стемминг 

Воспользуемся стеммингом вместо лемматизации

In [None]:
from nltk.stem import PorterStemmer

def preprocces(X):
    """Normalize raw documents before vectorization (stemming variant).

    Each document is lower-cased, tokenized with ``nltk.word_tokenize``,
    reduced to purely alphanumeric tokens, cleared of English stop words and
    finally stemmed with ``PorterStemmer``.

    Stop words are removed *before* stemming: the stemmer rewrites many of
    them (e.g. "this" -> "thi", "was" -> "wa", "very" -> "veri"), after which
    they would no longer match the stop-word list and would leak into the
    features.

    Reads the module-level ``stop_words`` set.

    Args:
        X: iterable of raw document strings.

    Returns:
        A list with one string per document: the surviving stems joined by
        single spaces.
    """
    stemmer = PorterStemmer()
    processed = []

    for doc in tqdm(X):
        tokens = nltk.word_tokenize(doc.lower())
        stems = [
            stemmer.stem(token)
            for token in tokens
            if token.isalnum() and token not in stop_words
        ]
        processed.append(' '.join(stems))
    return processed


# Run the same preprocessing over the training and the test documents.
X_train_proc = preprocces(X_train)
X_test_proc = preprocces(X_test)

100%|██████████| 2373/2373 [00:20<00:00, 118.62it/s]
100%|██████████| 1579/1579 [00:11<00:00, 137.16it/s]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a fresh vectorizer on the preprocessed training text only, then reuse
# it unchanged for the preprocessed test text.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train_proc)
X_test_vec = vectorizer.transform(X_test_proc)
# Both matrices are expected to have the same number of columns.
print(X_train_vec.shape)
print(X_test_vec.shape)

(2373, 23978)
(1579, 23978)


In [None]:
# Train a fresh classifier on the current features and predict the test split.
lr = LogisticRegression()
y_pred = lr.fit(X_train_vec, y_train).predict(X_test_vec)

# Per-class metrics, labelled with the newsgroup names.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=newsgroups_train.target_names,
    )
)

                 precision    recall  f1-score   support

      sci.crypt       0.96      0.92      0.94       396
sci.electronics       0.84      0.95      0.89       393
        sci.med       0.95      0.91      0.93       396
      sci.space       0.99      0.94      0.96       394

       accuracy                           0.93      1579
      macro avg       0.94      0.93      0.93      1579
   weighted avg       0.94      0.93      0.93      1579

