In [1]:
from nltk import word_tokenize

from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split

# Grid search

In [2]:
dataset = fetch_20newsgroups(categories=('comp.graphics', 'sci.electronics', 'sci.space', 'rec.autos'))

In [52]:
def get_data(vectorizer, test_size):
    X = vectorizer.fit_transform(dataset['data'])
    y = dataset['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        stratify=y, random_state=42)
    
    return X_train, X_test, y_train, y_test

In [53]:
cv = CountVectorizer(ngram_range=(1, 3), min_df=5, max_df=0.75, tokenizer=word_tokenize)
X_train, X_test, y_train, y_test = get_data(cv, 0.2)

print(*[x.shape for x in (X_train, X_test, y_train, y_test)])

(1889, 40237) (473, 40237) (1889,) (473,)


## Random Forest

In [6]:
param_grid = {
    'n_estimators': [10, 100],
    'max_features': ['log2', 'sqrt', 1.0]
}

grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid,
                    scoring='f1_macro', cv=5, n_jobs=-1)
grid = grid.fit(X_train, y_train)

In [7]:
grid.best_params_

{'max_features': 'sqrt', 'n_estimators': 100}

In [8]:
rfc = RandomForestClassifier(n_estimators=100, max_features='sqrt')
rfc.fit(X_train, y_train)
preds = rfc.predict(X_test)

In [9]:
f1_score(y_test, preds, average='macro')

0.9120803283117466

In [10]:
f1_score(y_train, rfc.predict(X_train), average='macro')

1.0

## Logistic Regression

In [11]:
param_grid = {
    'C': [0.1, 0.5, 1.0],
    'fit_intercept': [False, True]
}

grid = GridSearchCV(LogisticRegression(), param_grid=param_grid,
                    scoring='f1_macro', cv=5, n_jobs=-1)
grid = grid.fit(X_train, y_train)



In [12]:
grid.best_params_

{'C': 0.5, 'fit_intercept': True}

In [13]:
logreg = LogisticRegression(C=0.5, fit_intercept=True)
logreg.fit(X_train, y_train)
preds = logreg.predict(X_test)

In [14]:
f1_score(y_test, preds, average='macro')

0.9346816976127321

In [15]:
f1_score(y_train, logreg.predict(X_train), average='macro')

1.0

## SVM

In [16]:
param_grid = {
    'C': [0.1, 0.5, 1.0],
    'fit_intercept': [False, True]
}

grid = GridSearchCV(LinearSVC(), param_grid=param_grid,
                    scoring='f1_macro', cv=5, n_jobs=-1)
grid = grid.fit(X_train, y_train)



In [17]:
grid.best_params_

{'C': 0.1, 'fit_intercept': False}

In [18]:
svc = LinearSVC(C=0.1, fit_intercept=False)
svc.fit(X_train, y_train)
preds = svc.predict(X_test)

In [19]:
f1_score(y_test, preds, average='macro')

0.9325200873386106

In [20]:
f1_score(y_train, svc.predict(X_train), average='macro')

1.0

We can see that all three classifiers memorized the training data perfectly, while on the test data their performance is slightly worse. Let's find out what features (words or n-grams) our classifiers focus on so that we can possibly eliminate the worst ones.

# Feature analysis

In [22]:
idx2word = {idx: word for word, idx in cv.vocabulary_.items()}

## Random Forest

In [46]:
[idx2word[idx] for idx, _ in sorted(enumerate(rfc.feature_importances_),
                                    key = lambda x: x[1], reverse=True)][:10]

['car',
 'space',
 'graphics',
 'cars',
 'image',
 'nasa',
 'orbit',
 'the car',
 'engine',
 'circuit']

For RFC, this is the best we can get.

## Logistic Regression

In [43]:
def find_best_features(model, n: int):
    for i, cls in enumerate(model.coef_):
        best_features = [idx2word[idx] for idx, _ in sorted(enumerate(cls),
                                                            key = lambda x: x[1],
                                                            reverse=True)][:n]
        print(f'Class {i}: {", ".join(best_features)}')

In [47]:
find_best_features(logreg, 10)

Class 0: graphics, image, file, program, format, 3d, files, animation, package, code
Class 1: car, cars, my, ford, engine, toyota, auto, list, bmw, saturn
Class 2: circuit, electronics, power, tv, used, chips, chip, design, an, motorola
Class 3: space, orbit, moon, planets, launch, nasa, funding, sci.space, spacecraft, rockets


## SVM

In [48]:
find_best_features(svc, 10)

Class 0: graphics, image, format, gif, program, package, file, 3d, library, code
Class 1: car, cars, my, ford, auto, saturn, bmw, list, parts, toyota
Class 2: circuit, power, used, electronics, chips, tv, design, motorola, an, scope
Class 3: space, orbit, planets, funding, moon, spacecraft, nasa, for a, dc-x, russian


# Model retraining

There doesn't seem to be much we could improve except remove some stopwords like articles and prepositions. Let's use a list of stopwords + corpus statistics for that.

In [54]:
cv = CountVectorizer(ngram_range=(1, 3), min_df=5, max_df=0.5,
                     tokenizer=word_tokenize, stop_words='english')
X_train, X_test, y_train, y_test = get_data(cv, 0.2)

print(*[x.shape for x in (X_train, X_test, y_train, y_test)])

(1889, 25050) (473, 25050) (1889,) (473,)


## Random Forest

In [55]:
rfc = RandomForestClassifier(n_estimators=100, max_features='sqrt')
rfc.fit(X_train, y_train)
preds = rfc.predict(X_test)

In [56]:
f1_score(y_test, preds, average='macro')

0.9217852739281013

In [57]:
f1_score(y_train, rfc.predict(X_train), average='macro')

1.0

## Logistic Regression

In [58]:
logreg = LogisticRegression(C=0.5, fit_intercept=True)
logreg.fit(X_train, y_train)
preds = logreg.predict(X_test)



In [59]:
f1_score(y_test, preds, average='macro')

0.9385479606343832

In [60]:
f1_score(y_train, logreg.predict(X_train), average='macro')

1.0

## SVM

In [61]:
svc = LinearSVC(C=0.1, fit_intercept=False)
svc.fit(X_train, y_train)
preds = svc.predict(X_test)



In [62]:
f1_score(y_test, preds, average='macro')

0.9301855093145587

In [63]:
f1_score(y_train, svc.predict(X_train), average='macro')

1.0

As a result, we managed to get a little better results for RFC and LogReg, while the metric for SVM actually diminished. I'm pretty sure this is about as good as we can expect from these simple models.