In [1]:
from nltk import word_tokenize

from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split

# Grid search

In [2]:
# Load four of the 20 Newsgroups categories. No `subset` argument is passed,
# so whichever subset the library uses by default is what gets loaded.
dataset = fetch_20newsgroups(
    categories=(
        'comp.graphics',
        'sci.electronics',
        'sci.space',
        'rec.autos',
    ),
)

In [3]:
# Bag-of-n-grams counts (unigrams up to trigrams), tokenised with NLTK's
# word_tokenize. min_df=5 / max_df=0.75 prune very rare and very common terms.
# NOTE(review): the vocabulary and the document-frequency cut-offs are fitted on
# every document, including the ones that are later held out as the test split,
# so the test set is not fully unseen -- consider splitting the raw text first.
cv = CountVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.75,
)
X = cv.fit_transform(dataset['data'])
X.shape

(2362, 40237)

In [4]:
# Class labels for the documents, taken from the dataset's 'target' entry.
y = dataset['target']
# Displayed so it can be compared against the number of rows in X.
y.shape

(2362,)

In [5]:
# Hold out 20% of the documents for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Random Forest

In [6]:
# Search space for the random forest:
#   n_estimators - number of trees in the forest
#   max_features - how many features each split may consider
#                  ('log2', 'sqrt', or 1.0 = all of them)
param_grid = {
    'n_estimators': [10, 100],
    'max_features': ['log2', 'sqrt', 1.0]
}

# The forest is seeded (same seed as the train/test split) so that bootstrap
# sampling and per-split feature sub-sampling are repeatable; without it the
# selected best_params_ can change from one run of the notebook to the next.
# Candidates are ranked by macro-averaged F1 over 5 cross-validation folds.
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=param_grid,
                    scoring='f1_macro', cv=5, n_jobs=-1)
grid = grid.fit(X_train, y_train)

In [7]:
# Parameter combination that scored best in the random-forest grid search
# (5-fold cross-validation, 'f1_macro' scoring).
grid.best_params_

{'max_features': 'sqrt', 'n_estimators': 100}

In [8]:
# Train a forest on the whole training split with the hyper-parameters reported
# by the grid search. random_state is fixed (same seed as the train/test split)
# so the scores computed from this model are reproducible across runs.
rfc = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)
rfc.fit(X_train, y_train)
# Held-out predictions, scored in the next cell.
preds = rfc.predict(X_test)

In [9]:
# Macro-averaged F1 of the random forest on the held-out test split.
f1_score(y_true=y_test, y_pred=preds, average='macro')

0.9120803283117466

In [10]:
# Same metric on the data the forest was trained on, for comparison with the
# test score.
f1_score(
    y_true=y_train,
    y_pred=rfc.predict(X_train),
    average='macro',
)

1.0

## Logistic Regression

In [11]:
# Search space for logistic regression: the regularisation parameter C crossed
# with whether an intercept term is fitted.
param_grid = dict(
    C=[0.1, 0.5, 1.0],
    fit_intercept=[False, True],
)

# Rank every combination by macro-averaged F1 over 5 cross-validation folds,
# using all available cores.
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)



In [12]:
# Parameter combination that scored best in the logistic-regression grid search
# (5-fold cross-validation, 'f1_macro' scoring).
grid.best_params_

{'C': 0.5, 'fit_intercept': True}

In [13]:
# Fit logistic regression on the whole training split with the values reported
# by the grid search, then predict the held-out documents.
logreg = LogisticRegression(fit_intercept=True, C=0.5).fit(X_train, y_train)
preds = logreg.predict(X_test)

In [14]:
# Macro-averaged F1 of logistic regression on the held-out test split.
f1_score(y_true=y_test, y_pred=preds, average='macro')

0.9346816976127321

In [15]:
# Same metric on the training split, for comparison with the test score.
f1_score(
    y_true=y_train,
    y_pred=logreg.predict(X_train),
    average='macro',
)

1.0

## SVM

In [16]:
# Search space for the linear SVM: the regularisation parameter C crossed with
# whether an intercept term is fitted.
param_grid = {
    'C': [0.1, 0.5, 1.0],
    'fit_intercept': [False, True]
}

# LinearSVC's dual coordinate-descent solver draws on a pseudo-random generator,
# so the estimator is seeded (same seed as the train/test split) to keep the
# cross-validation scores and the selected best_params_ repeatable.
# Candidates are ranked by macro-averaged F1 over 5 cross-validation folds.
grid = GridSearchCV(LinearSVC(random_state=42), param_grid=param_grid,
                    scoring='f1_macro', cv=5, n_jobs=-1)
grid = grid.fit(X_train, y_train)



In [17]:
# Parameter combination that scored best in the linear-SVM grid search
# (5-fold cross-validation, 'f1_macro' scoring).
grid.best_params_

{'C': 0.1, 'fit_intercept': False}

In [18]:
# Train a linear SVM on the whole training split with the hyper-parameters
# reported by the grid search. random_state is fixed (same seed as the
# train/test split) because the dual solver uses a pseudo-random generator;
# seeding keeps the scores computed from this model reproducible.
svc = LinearSVC(C=0.1, fit_intercept=False, random_state=42)
svc.fit(X_train, y_train)
# Held-out predictions, scored in the next cell.
preds = svc.predict(X_test)

In [19]:
# Macro-averaged F1 of the linear SVM on the held-out test split.
f1_score(y_true=y_test, y_pred=preds, average='macro')

0.9325200873386106

In [20]:
# Same metric on the training split, for comparison with the test score.
f1_score(
    y_true=y_train,
    y_pred=svc.predict(X_train),
    average='macro',
)

1.0

We can see that all three classifiers memorized the training data perfectly (macro-F1 of 1.0), while on the test data their performance is slightly worse (macro-F1 of roughly 0.91–0.93), i.e. they overfit to some degree. Let's find out which features (words or n-grams) our classifiers focus on so that we can possibly eliminate the worst ones.

# Feature analysis

# Model retraining