In [161]:
import numpy as np
from sklearn import datasets
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression

In [2]:
# The four newsgroups (three comp.* and one sci.*) used as classification targets.
categories = ['comp.graphics', 'comp.sys.ibm.pc.hardware', 'comp.windows.x', 'sci.electronics']

In [3]:
# Load the train and test subsets restricted to `categories`; both are shuffled with a fixed seed.
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

In [163]:
# Baseline bag-of-words vectorizer with all settings left at their defaults.
count_vect = CountVectorizer()

In [164]:
# Learn the vocabulary from the training texts only, then reuse it to encode the test texts.
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test = count_vect.transform(twenty_test.data)

In [100]:
# Shared splitter for the grid searches: 20 stratified folds, shuffled with a fixed seed.
n_fold = 20
stratified_folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

In [165]:
def grid(model, parameter_grid):
    """Grid-search ``model`` over ``parameter_grid`` and print the best result.

    The search is scored with macro-averaged F1, uses the notebook-level
    ``stratified_folds`` splitter, and is fitted on the notebook-level
    ``X_train_counts`` matrix with ``twenty_train.target`` as labels, so the
    result depends on whatever those globals hold when the cell is run.

    Args:
        model: Estimator instance handed to ``GridSearchCV``.
        parameter_grid: Mapping of parameter names to lists of candidate values.

    Returns:
        None. The best score and the best parameter combination are printed.
    """
    search = GridSearchCV(
        model,
        param_grid=parameter_grid,
        scoring='f1_macro',
        cv=stratified_folds,
    )
    search.fit(X_train_counts, twenty_train.target)

    best_score = search.best_score_
    best_params = search.best_params_
    print('Best score: {}'.format(best_score))
    print('Best parameters: {}'.format(best_params))

MultinomialNB

In [166]:
# Untuned MultinomialNB used as the base estimator for the grid search.
multi_nb = MultinomialNB()

# Candidate values for the `alpha` parameter, spanning several orders of magnitude.
parameter_grid = {'alpha': [1e-06, 1e-04, 0.001, 0.1, 2, 10]}

In [167]:
grid(multi_nb, parameter_grid)

Best score: 0.9186216140596164
Best parameters: {'alpha': 0.1}


LogisticRegression

In [7]:
# Base LogisticRegression for the grid search; max_iter=1500 sets the solver's iteration limit.
log_reg = LogisticRegression(multi_class = 'auto', max_iter = 1500)
# Search over class weighting, solver, and the regularization parameter C.
parameter_grid = {'class_weight' : ['balanced', None],
                  'solver' : ['lbfgs', 'newton-cg'],
                  'C' : [0.001, 0.01, 0.08, 0.1, 0.15, 1.0, 10.0, 100.0]
                 }

In [11]:
grid(log_reg, parameter_grid)

Best score: 0.8806094269589704
Best parameters: {'C': 100.0, 'class_weight': 'balanced', 'solver': 'lbfgs'}


DecisionTreeClassifier


In [151]:
# Base decision tree for the grid search (no random_state is set here).
dtc = DecisionTreeClassifier()

# Search over the split strategy and the maximum tree depth.
parameter_grid = {'splitter' : ['best', 'random'],
                  'max_depth' : [10, 20, 30, 40]}

In [0]:
grid(dtc, parameter_grid)

Best score: 0.6994560036808652
Best parameters: {'max_depth': 40, 'splitter': 'best'}


Обучаем с оптимальными параметрами

In [8]:
def cross_val(model, n):
    """Print the mean macro-F1 of ``model`` under cross-validation.

    Scores are computed on the notebook-level ``X_train_counts`` matrix with
    ``twenty_train.target`` as labels, so the result depends on whatever those
    globals hold when the cell is run.

    Args:
        model: Estimator instance to evaluate.
        n: Value passed straight through as the ``cv`` argument of
            ``cross_val_score`` (the notebook passes the number of folds).

    Returns:
        None. The averaged score is printed with four decimals.
    """
    fold_scores = cross_val_score(
        model,
        X_train_counts,
        twenty_train.target,
        scoring='f1_macro',
        cv=n,
    )
    mean_f1 = np.mean(fold_scores)
    print('macro F1={0:1.4f}'.format(mean_f1))

In [9]:
def model_test(model):
    """Fit ``model`` on the training data and print its macro-F1 on the test set.

    Reads the notebook-level ``X_train_counts`` / ``X_test`` matrices and the
    ``twenty_train`` / ``twenty_test`` labels, so the result depends on whatever
    those globals hold when the cell is run. The passed estimator is fitted in
    place.

    Args:
        model: Estimator instance exposing ``fit`` and ``predict``.

    Returns:
        None. The score is printed with four decimals.
    """
    model.fit(X_train_counts, twenty_train.target)
    y_pred = model.predict(X_test)
    score = f1_score(twenty_test.target, y_pred, average='macro')
    print('macro F1={0:1.4f}'.format(score))

In [172]:
# Re-create the three models with the best hyperparameters printed by the grid searches.
multi_nb = MultinomialNB(alpha = 0.1)
log_reg = LogisticRegression(multi_class = 'auto', solver = 'lbfgs', max_iter = 1500, class_weight = 'balanced', C = 100)
dtc = DecisionTreeClassifier(max_depth = 40, splitter = 'best')

In [169]:
# Tuned MultinomialNB: 20-fold cross-validated macro-F1 vs. macro-F1 on the held-out test set.
# (The printed labels read "Cross-validation:" and "Test result:".)
print('Кросс-валидация:')
cross_val(multi_nb, 20)
print('Результат на тесте:')
model_test(multi_nb)

Кросс-валидация:
macro F1=0.9195
Результат на тесте:
macro F1=0.8311


In [173]:
# Tuned LogisticRegression: 20-fold cross-validated macro-F1 vs. macro-F1 on the held-out test set.
# (The printed labels read "Cross-validation:" and "Test result:".)
print('Кросс-валидация:')
cross_val(log_reg, 20)
print('Результат на тесте:')
model_test(log_reg)

Кросс-валидация:
macro F1=0.8801
Результат на тесте:
macro F1=0.8086


In [174]:
# Tuned DecisionTreeClassifier: 20-fold cross-validated macro-F1 vs. macro-F1 on the held-out test set.
# (The printed labels read "Cross-validation:" and "Test result:".)
print('Кросс-валидация:')
cross_val(dtc, 20)
print('Результат на тесте:')
model_test(dtc)

Кросс-валидация:
macro F1=0.7004
Результат на тесте:
macro F1=0.6504


Разница между macro F1 на кросс-валидации и на тесте:

multi_nb  0.088 (0.9195 − 0.8311)

log_reg  0.072 (0.8801 − 0.8086)

dtc  0.050 (0.7004 − 0.6504)

## Отбор признаков

In [158]:
# Invert the baseline vectorizer's vocabulary (token -> column index) into column index -> token.
# This mapping is tied to `count_vect`; it does not describe matrices built by any other vectorizer.
index_to_word = {v:k for k,v in count_vect.vocabulary_.items()}

In [159]:
def analyze_features(model, n, class_names=None, vocabulary=None):
    """Print the ``n`` highest-weighted features for each row of a model's weights.

    The weight matrix is taken from ``model.coef_`` when the attribute exists.
    Newer scikit-learn releases no longer expose ``coef_`` on ``MultinomialNB``,
    so the function falls back to ``model.feature_log_prob_`` in that case.

    Args:
        model: Fitted estimator exposing a 2-D ``coef_`` or ``feature_log_prob_``
            array with one row per class and one column per feature.
        n: Number of top features to print per row.
        class_names: Sequence of labels, one per weight row, in row order.
            Defaults to the notebook-level ``categories`` list.
            NOTE(review): this assumes ``categories`` is ordered like the
            model's classes -- confirm against ``twenty_train.target_names``.
        vocabulary: Mapping from column index to token. Defaults to the
            notebook-level ``index_to_word`` mapping, which must come from the
            same vectorizer that produced the training matrix.

    Returns:
        None. For every row a header line and up to ``n`` ``(weight, token)``
        tuples are printed, highest weight first.
    """
    # Resolve notebook globals lazily so explicit arguments work without them.
    if class_names is None:
        class_names = categories
    if vocabulary is None:
        vocabulary = index_to_word

    # Read the weights once; fall back for estimators that lack `coef_`.
    weights = getattr(model, 'coef_', None)
    if weights is None:
        weights = model.feature_log_prob_

    n_rows, n_features = weights.shape[0], weights.shape[1]
    for i in range(n_rows):
        print('\nClass {}'.format(class_names[i]))
        row = weights[i]
        feats = [(row[j], vocabulary[j]) for j in range(n_features)]
        # Tuples sort by weight first and by token on ties, as before.
        for f in sorted(feats, reverse=True)[:n]:
            print(f)

In [162]:
analyze_features(multi_nb, 20)


Class comp.graphics
(-3.396320433316946, 'the')
(-3.887921692166975, 'to')
(-4.00434983910181, 'of')
(-4.0406260231720905, 'and')
(-4.3483643128564085, 'is')
(-4.393332767312525, 'in')
(-4.439793051143201, 'for')
(-4.530103548226745, 'it')
(-4.737937860699033, 'from')
(-4.865453204438198, 'you')
(-4.872180118138575, 'edu')
(-4.9025270044362586, 'that')
(-5.008318318901138, 'on')
(-5.0804649816983165, 'this')
(-5.206879748758175, 'or')
(-5.231434249339488, 'be')
(-5.258024117009269, 'with')
(-5.317931013187596, 'have')
(-5.346958410196067, 'lines')
(-5.359437124611821, 'can')

Class comp.sys.ibm.pc.hardware
(-3.232958248022854, 'the')
(-3.93343775313184, 'to')
(-4.139901461116846, 'and')
(-4.287930433379346, 'of')
(-4.360012972415311, 'is')
(-4.435663397374319, 'it')
(-4.592289871652685, 'in')
(-4.651029534728877, 'for')
(-4.7685920250071545, 'that')
(-4.802229638032376, 'with')
(-4.812051934857639, 'from')
(-4.88153140891626, 'on')
(-4.908339791128955, 'edu')
(-4.98043495916883, 'you'

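Очень много стоп-слов: коэффициенты MultinomialNB — это логарифмы вероятностей слов в классе, поэтому наверху оказываются просто самые частотные слова.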

In [164]:
analyze_features(log_reg, 20)


Class comp.graphics
(0.6726981286221385, 'graphics')
(0.3914723091719991, '3d')
(0.3590483356841819, 'image')
(0.34134528750797677, 'images')
(0.3081967382489525, 'pov')
(0.30360072476896144, 'files')
(0.27861363396016914, 'vga')
(0.26636459206916746, 'tiff')
(0.2503519376943796, 'polygon')
(0.2415900178984446, 'gif')
(0.23881743712538553, 'points')
(0.23824765994625854, 'cview')
(0.23636365122516712, 'algorithm')
(0.23626752982215757, '3do')
(0.23358290199431517, 'file')
(0.22280017567490315, 'animation')
(0.2205709533454828, '24')
(0.21714438788391763, 'library')
(0.20196439713182668, '42')
(0.19948912901646196, 'format')

Class comp.sys.ibm.pc.hardware
(0.37837235451250084, 'drive')
(0.3398221130559374, 'monitor')
(0.3347698691192369, 'card')
(0.3253611263134171, 'memory')
(0.32273610278534304, 'disk')
(0.2909147430364155, 'gateway')
(0.29082496742541125, 'pc')
(0.285146221930094, 'scsi')
(0.27096077568678045, 'motherboard')
(0.2543378340068293, '486')
(0.2462709654561896, 'floppy'

Встречаются токены с цифрами (например, «3d», «24», «42», «486»)

In [121]:
def dtc_features(model, n, vocabulary=None):
    """Print the ``n`` most important features of a fitted tree-based model.

    Args:
        model: Fitted estimator exposing a 1-D ``feature_importances_`` array
            with one entry per feature column.
        n: Number of top features to print.
        vocabulary: Mapping from column index to token. Defaults to the
            notebook-level ``index_to_word`` mapping, which must come from the
            same vectorizer that produced the training matrix.

    Returns:
        None. Up to ``n`` ``(importance, token)`` tuples are printed, highest
        importance first.
    """
    # Resolve the notebook global lazily so an explicit mapping works without it.
    if vocabulary is None:
        vocabulary = index_to_word

    # Read the attribute once: in scikit-learn tree models it is a computed
    # property, so reading it inside the loop repeats the work per feature.
    importances = model.feature_importances_

    feats = [(importances[i], vocabulary[i]) for i in range(importances.shape[0])]
    # Tuples sort by importance first and by token on ties, as before.
    for f in sorted(feats, reverse=True)[:n]:
        print(f)

In [166]:
dtc_features(dtc, 30)

(0.0734276921852101, 'window')
(0.05848350949358611, 'graphics')
(0.04896244215508005, 'drive')
(0.04262174419224283, 'mit')
(0.034374208093933475, 'power')
(0.030867846878212424, 'card')
(0.029931955498954183, 'motif')
(0.029439662982855948, 'circuit')
(0.026440857341249152, 'bus')
(0.02281868553127265, 'x11r5')
(0.017779179892356833, 'image')
(0.014742973605610974, 'motherboard')
(0.013865964602255514, 'monitor')
(0.01370310406124126, 'electronics')
(0.013502518518889978, 'monitors')
(0.013335758925404553, 'widget')
(0.012611408715036989, 'port')
(0.01251514075890928, '3d')
(0.011276365423020462, 'dos')
(0.010303994102023472, 'tiff')
(0.01023103144250121, 'server')
(0.009725732716514113, 'use')
(0.007431242418916751, 'cview')
(0.007299596618019035, 'algorithm')
(0.00723710880126056, 'gateway')
(0.006940690795091781, 'gif')
(0.006431734039167531, 'might')
(0.006181382475031193, 'ide')
(0.006079854654526844, 'xlib')
(0.005997766793061355, 'points')


## Параметры для CountVectorizer

In [231]:
# Stricter vectorizer: drops English stop words, terms seen in fewer than 2 documents or in more
# than 90% of documents, and keeps only tokens made of two or more ASCII letters (so no digits).
new_count_vect = CountVectorizer(min_df=2, max_df=0.9, stop_words = 'english', token_pattern = '[a-zA-Z][a-zA-Z]+')

In [228]:
# Re-encode both splits with the stricter vectorizer.
# NOTE: this overwrites X_train_counts / X_test, which cross_val and model_test read as globals,
# so every evaluation run after this cell uses the new features. `index_to_word` was built from
# `count_vect` and no longer matches the columns of these matrices.
X_train_counts = new_count_vect.fit_transform(twenty_train.data)
X_test = new_count_vect.transform(twenty_test.data)

Обучим новые классификаторы

In [229]:
# Fresh, unfitted models with the same hyperparameters as before, to be evaluated on the new features.
multi_nb = MultinomialNB(alpha = 0.1)
log_reg = LogisticRegression(multi_class = 'auto', solver = 'lbfgs', max_iter = 1500, class_weight = 'balanced', C = 100)
dtc = DecisionTreeClassifier(max_depth = 40, splitter = 'best')

In [126]:
# MultinomialNB on the re-vectorized data: 20-fold cross-validated macro-F1 vs. test macro-F1.
# (The printed labels read "Cross-validation:" and "Test result:".)
print('Кросс-валидация:')
cross_val(multi_nb, 20)
print('Результат на тесте:')
model_test(multi_nb)

Кросс-валидация:
macro F1=0.9207
Результат на тесте:
macro F1=0.8338


In [225]:
# LogisticRegression on the re-vectorized data: 20-fold cross-validated macro-F1 vs. test macro-F1.
# (The printed labels read "Cross-validation:" and "Test result:".)
print('Кросс-валидация:')
cross_val(log_reg, 20)
print('Результат на тесте:')
model_test(log_reg)

Кросс-валидация:
macro F1=0.8874
Результат на тесте:
macro F1=0.8104


In [198]:
# DecisionTreeClassifier on the re-vectorized data: 20-fold cross-validated macro-F1 vs. test macro-F1.
# (The printed labels read "Cross-validation:" and "Test result:".)
print('Кросс-валидация:')
cross_val(dtc, 20)
print('Результат на тесте:')
model_test(dtc)

Кросс-валидация:
macro F1=0.6973
Результат на тесте:
macro F1=0.6653


При изменении параметров CountVectorizer (использование списка стоп-слов, исключение токенов с цифрами, изменение минимальной и максимальной документной частоты) получались разные результаты:

- при параметрах, приведённых выше, результат моделей на тесте немного улучшается, но разрыв с результатами кросс-валидации сохраняется (кроме дерева решений)

- можно было несколько уменьшить разрыв между тестом и кросс-валидацией, но при этом итоговый результат на тесте оказывался ниже исходного

- при некоторых параметрах у одних моделей переобучение уменьшалось, а у других наоборот росло