In [1]:
from sklearn.datasets import fetch_20newsgroups
# The newsgroups under study; later cells use a category's position in this
# list as its integer class label.
cts = [
    'comp.graphics',
    'comp.sys.mac.hardware',
    'sci.crypt',
    'sci.space',
]
# Fetch each category separately so the documents stay grouped by category.
train_data = [fetch_20newsgroups(subset='train', categories=[name]) for name in cts]
test_data = [fetch_20newsgroups(subset='test', categories=[name]) for name in cts]

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
# Flatten the per-category document lists into single corpora, keeping the
# category order so documents line up with the labels built from that order.
all_train_data = [text for bunch in train_data for text in bunch.data]
all_test_data = [text for bunch in test_data for text in bunch.data]

In [0]:
# One integer label per document: the label is the position of the document's
# category in train_data / test_data.
train_classes = [label for label, bunch in enumerate(train_data) for _doc in bunch.data]
test_classes = [label for label, bunch in enumerate(test_data) for _doc in bunch.data]

In [4]:
# Sanity check: each corpus and its label list should report the same length.
for sized in (all_train_data, train_classes, all_test_data, test_classes):
  print(len(sized))

2350
2350
1564
1564


In [0]:
from sklearn.feature_extraction.text import CountVectorizer
# Baseline bag-of-words features. Per the CountVectorizer parameters: drop terms
# seen in fewer than 3 documents or in more than 40% of them, then cap the
# vocabulary at 1000 terms.
cv = CountVectorizer(
    min_df=3,
    max_df=0.4,
    max_features=1000,
)
# The vocabulary is learned from the training text only and reused for the test text.
X_train = cv.fit_transform(all_train_data)
X_test = cv.transform(all_test_data)

In [0]:
from sklearn.model_selection import GridSearchCV
def grid_search_best(model, grid, folds, grid_train_data, grid_train_classes):
  """Run a macro-F1 grid search and report the winning configuration.

  Args:
    model: estimator to tune.
    grid: parameter grid passed to GridSearchCV as param_grid.
    folds: cross-validation setting passed to GridSearchCV as cv.
    grid_train_data: training features.
    grid_train_classes: training labels.

  Returns:
    Tuple (best_score_, best_params_, best_estimator_) of the fitted search.
  """
  searcher = GridSearchCV(model, scoring='f1_macro', cv=folds, param_grid=grid)
  searcher.fit(grid_train_data, grid_train_classes)
  winner = (searcher.best_score_, searcher.best_params_, searcher.best_estimator_)
  return winner

In [0]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

n_fold = 10
# Shuffled, seeded stratified folds shared by every grid search.
stratified_folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

# Hyper-parameter grids; grids[i] is searched for models[i].
nb_grid = {
    'alpha': [0.1, 0.5, 1, 1.5, 2],
    'fit_prior': [True, False],
}
tree_grid = {
    'max_features': [50, 100, 150],
    'random_state': [42],
}
forest_grid = {
    'n_estimators': [50, 100, 150],
    'max_features': [50, 100, 150],
    'random_state': [42],
}

models = [MultinomialNB(), DecisionTreeClassifier(), RandomForestClassifier()]
grids = [nb_grid, tree_grid, forest_grid]

In [0]:
def find_models(find_models, find_grids, find_folds, find_train, find_classes):
  """Grid-search every candidate model and collect the winners.

  Args:
    find_models: sequence of estimators to tune.
    find_grids: sequence of parameter grids; find_grids[i] belongs to
      find_models[i].
    find_folds: cross-validation setting forwarded to grid_search_best.
    find_train: training feature matrix. Anything exposing .toarray() is
      converted to a dense array once; other inputs are used unchanged.
    find_classes: one class label per training row.

  Returns:
    (best_models, train_scores): the best estimator and best score reported by
    grid_search_best for each model, in input order.

  Raises:
    IndexError: if find_grids has fewer entries than find_models.
  """
  if len(find_grids) < len(find_models):
    raise IndexError('find_grids must contain a parameter grid for every model')
  # Densify once up front instead of rebuilding the dense matrix for every model.
  dense_train = find_train.toarray() if hasattr(find_train, 'toarray') else find_train
  best_models = []
  train_scores = []
  for candidate, grid in zip(find_models, find_grids):
    score, params, estimator = grid_search_best(candidate, grid, find_folds, dense_train, find_classes)
    print('Best score: {}'.format(score))
    print('Best parameters: {}'.format(params))
    train_scores.append(score)
    best_models.append(estimator)
  return best_models, train_scores

In [9]:
# Tune all three candidate models on the baseline bag-of-words features.
best_models, train_scores = find_models(models, grids, stratified_folds, X_train, train_classes)

Best score: 0.9273203398561364
Best parameters: {'alpha': 0.1, 'fit_prior': True}
Best score: 0.833014943440908
Best parameters: {'max_features': 150, 'random_state': 42}
Best score: 0.9125963885220265
Best parameters: {'max_features': 50, 'n_estimators': 150, 'random_state': 42}


MultinomialNB: 0.9273203398561364, {'alpha': 0.1, 'fit_prior': True}  
DecisionTreeClassifier: 0.833014943440908, {'max_features': 150, 'random_state': 42}  
RandomForestClassifier: 0.9125963885220265, {'max_features': 50, 'n_estimators': 150, 'random_state': 42}

In [0]:
from sklearn.metrics import f1_score
def get_test_scores(models, get_test, get_classes):
  """Print and return the macro-averaged F1 of each fitted model.

  Args:
    models: iterable of fitted estimators exposing predict().
    get_test: feature matrix to predict on. Anything exposing .toarray() is
      converted to a dense array once; other inputs are used unchanged.
    get_classes: true class label for each row of get_test.

  Returns:
    List with one macro-F1 score per model, in input order.
  """
  # Densify once up front instead of rebuilding the dense matrix for every model.
  dense_test = get_test.toarray() if hasattr(get_test, 'toarray') else get_test
  test_scores = []
  for model in models:
    score = f1_score(get_classes, model.predict(dense_test), average='macro')
    test_scores.append(score)
    print(score)
  return test_scores

In [11]:
# Macro-F1 of each tuned model on the test split.
test_scores = get_test_scores(best_models, X_test, test_classes)

0.9049165648893678
0.7647810545335663
0.8957139068057778


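На тестовой выборке результаты первого и третьего классификаторов оказались лишь немного хуже, чем на кросс-валидации (F1-macro ниже примерно на 0.02); у второго классификатора падение заметно больше — примерно на 0.07.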

In [0]:
# Invert the vectorizer's word -> column mapping so that a feature column can
# be translated back into its word.
index_to_word = {column: word for word, column in cv.vocabulary_.items()}

In [0]:
def array_to_words(weights, n, index_to_word):
  """Print the n highest-weighted features as words, largest weight first.

  Args:
    weights: indexable sequence with one weight per feature index.
    n: maximum number of entries to print; nothing is printed when n < 1.
    index_to_word: mapping from feature index to the word it represents.
  """
  ranked = [(position, weights[position]) for position in range(len(weights))]
  # list.sort is stable, so features with equal weight keep ascending index order.
  ranked.sort(key=lambda pair: pair[1], reverse=True)
  for rank, (position, weight) in enumerate(ranked, start=1):
    if rank > n:
      break
    print("word: ", index_to_word[position], " \nweight: ", weight)

def analyze_features(model, n, index_to_word):
  """Print the n most heavily weighted words of a fitted model.

  Models exposing feature_importances_ get a single ranking. Otherwise one
  ranking is printed per row of coef_, or of feature_log_prob_ when coef_ is
  not available, each preceded by a "class <k>" header.

  Args:
    model: fitted estimator.
    n: number of words to print per ranking.
    index_to_word: mapping from feature index to word.

  Raises:
    AttributeError: if the model exposes none of the supported attributes.
  """
  # Only the attribute lookup may fall through to the next option; an
  # AttributeError raised while printing must not be mistaken for a missing
  # attribute.
  importances = getattr(model, 'feature_importances_', None)
  if importances is not None:
    array_to_words(importances, n, index_to_word)
    return

  per_class = getattr(model, 'coef_', None)
  if per_class is None:
    # Naive Bayes models in newer scikit-learn releases expose only
    # feature_log_prob_.
    per_class = getattr(model, 'feature_log_prob_', None)
  if per_class is None:
    raise AttributeError(
        '{} exposes none of feature_importances_, coef_, feature_log_prob_'.format(
            type(model).__name__))

  for class_number, class_weights in enumerate(per_class, start=1):
    print("class", class_number)
    array_to_words(class_weights, n, index_to_word)
    print()

In [20]:
# Print the five highest-weighted vocabulary entries for every tuned model.
for model in best_models:
  analyze_features(model, 5, index_to_word)
  print()

class 1
word:  graphics  
weight:  -4.4389539618363285
word:  image  
weight:  -4.5658622673787
word:  there  
weight:  -4.771725060760443
word:  an  
weight:  -4.836094274288771
word:  any  
weight:  -4.843509038857904

class 2
word:  mac  
weight:  -4.405462088601611
word:  my  
weight:  -4.454147715372693
word:  apple  
weight:  -4.533010383851667
word:  an  
weight:  -4.738442993074061
word:  what  
weight:  -4.84465834820678

class 3
word:  key  
weight:  -4.328001948671288
word:  they  
weight:  -4.478830388293348
word:  will  
weight:  -4.5022265518396445
word:  by  
weight:  -4.582024678751
word:  encryption  
weight:  -4.716860817758163

class 4
word:  space  
weight:  -3.882096018851964
word:  was  
weight:  -4.452970578448493
word:  nasa  
weight:  -4.523671392280601
word:  by  
weight:  -4.591008562112682
word:  they  
weight:  -4.725971416005442


word:  space  
weight:  0.10948150477193562
word:  encryption  
weight:  0.08669088865715406
word:  keys  
weight:  0.078340655

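В целом найденные слова хорошо описывают темы текстов, однако в список попали и общеупотребительные слова вроде 'by', 'an', 'was', 'my'. Для MultinomialNB это ожидаемо: его веса — логарифмы вероятностей слов внутри класса, поэтому частотные слова получают большой вес, даже если они не различают классы. Попробуем это исправить, изменив настройки векторизатора.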

In [0]:
# Baseline cross-validation scores that get_delta compares later runs against.
# Note: this binds the same list object as train_scores; it is not a copy.
initial = train_scores
def get_delta(train_scores, test_scores, baseline=None, max_drop=0.02):
  """Mean train-minus-test score gap, or 1 when any model regressed.

  Args:
    train_scores: cross-validation score of each model.
    test_scores: test score of each model, aligned with train_scores.
    baseline: reference score per model; defaults to the notebook-level
      `initial` list.
    max_drop: how far below its baseline a train score may fall before the
      whole configuration is rejected.

  Returns:
    1 as soon as some train score is more than max_drop below its baseline;
    otherwise the gap (train - test) averaged over all models. With no scores
    at all the result is 0.0.
  """
  if baseline is None:
    baseline = initial
  gap_total = 0
  for i, train_score in enumerate(train_scores):
    if train_score < baseline[i] - max_drop:
      return 1
    gap_total += train_score - test_scores[i]
  # Average over however many models were scored rather than assuming three.
  return gap_total / len(train_scores) if len(train_scores) else 0.0

In [16]:
stop_words = ['english', None]
ngram_ranges = [(1,1), (1,2), (1,3), (2,3)]
max_dfs = [0.975, 1.0]
max_features = [750, 1000, 1250]

# Every vectorizer configuration, enumerated in the same order as nested loops
# over stop_words -> ngram_ranges -> max_dfs -> max_features.
vectorizer_settings = [
    dict(stop_words=sw, ngram_range=nr, max_df=mad, max_features=mf)
    for sw in stop_words
    for nr in ngram_ranges
    for mad in max_dfs
    for mf in max_features
]

best_cv = None
new_best_models = []
# Start from the baseline's metric; a configuration is kept only if it scores
# strictly lower.
best_metric_score = get_delta(train_scores, test_scores)
for settings in vectorizer_settings:
  new_cv = CountVectorizer(**settings)
  new_X_train = new_cv.fit_transform(all_train_data)
  new_X_test = new_cv.transform(all_test_data)
  curr_best_models, curr_train_scores = find_models(models, grids, stratified_folds, new_X_train, train_classes)
  curr_test_scores = get_test_scores(curr_best_models, new_X_test, test_classes)
  curr_metric_score = get_delta(curr_train_scores, curr_test_scores)
  if curr_metric_score < best_metric_score:
    best_metric_score, new_best_models, best_cv = curr_metric_score, curr_best_models, new_cv

Best score: 0.9273932981356765
Best parameters: {'alpha': 0.1, 'fit_prior': True}
Best score: 0.8217130815455975
Best parameters: {'max_features': 100, 'random_state': 42}
Best score: 0.9105157207078907
Best parameters: {'max_features': 50, 'n_estimators': 100, 'random_state': 42}
0.895081119693
0.7709933551653921
0.8796731581930259
Best score: 0.9350708188182809
Best parameters: {'alpha': 0.1, 'fit_prior': True}
Best score: 0.8283655344834668
Best parameters: {'max_features': 100, 'random_state': 42}
Best score: 0.918605483128324
Best parameters: {'max_features': 50, 'n_estimators': 150, 'random_state': 42}
0.9096976339898506
0.7688309322160573
0.8939045370652265
Best score: 0.9362988720767869
Best parameters: {'alpha': 0.1, 'fit_prior': True}
Best score: 0.8255661607968424
Best parameters: {'max_features': 150, 'random_state': 42}
Best score: 0.9223204795324964
Best parameters: {'max_features': 50, 'n_estimators': 150, 'random_state': 42}
0.9185270349949075
0.7825624448775597
0.90548

In [22]:
print(new_best_models)
# best_cv stays None when the search found no improvement, so its vocabulary
# is only read after confirming that a better configuration exists.
if best_cv is not None and len(new_best_models) > 0:
  new_index_to_word = {v:k for k,v in best_cv.vocabulary_.items()}
  for model in new_best_models:
    analyze_features(model, 5, new_index_to_word)
    print()
else:
  new_index_to_word = {}
  print("Couldn't find better models")

[MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True), DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=150, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best'), RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=50,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
              

Как мы видим, те слова, что были выделены выше, исчезли, но теперь первый классификатор использует похожие слова для всех классов