In [1]:
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import KFold
import pandas as pd
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings('ignore')
import numpy as np
# Load the 20 Newsgroups posts, restricted to the two topics this notebook
# separates: 'alt.atheism' and 'sci.space'.
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)

In [2]:
# Turn the raw post texts into a TF-IDF feature matrix. The fitted vectorizer
# is kept because later cells use it to map column indices back to words.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups['data'])
X

<1786x28382 sparse matrix of type '<class 'numpy.float64'>'
	with 303138 stored elements in Compressed Sparse Row format>

In [3]:
# Class labels for the posts, as provided by the dataset's 'target' field.
y = newsgroups.target
y

array([0, 0, 1, ..., 1, 1, 0])

In [4]:
# Candidate regularisation strengths: 10^-5, 10^-4, ..., 10^5.
grid = {'C': 10.0 ** np.arange(-5, 6)}

# Linear-kernel SVM to tune, and a shuffled 5-fold splitter; both are seeded
# with the same fixed value so repeated runs use the same configuration.
clf = SVC(kernel='linear', random_state=241)
cv = KFold(n_splits=5, shuffle=True, random_state=241)

# Exhaustive search over C, scored by accuracy; n_jobs=-1 requests all cores.
gs = GridSearchCV(
    estimator=clf,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
gs.fit(X, y)

# best_params_ is a dict keyed by parameter name (here just 'C').
c_best = gs.best_params_
c_best

{'C': 1.0}

In [5]:
# Train a fresh linear SVM on the full data set using the C value picked by
# the grid search. fit() returns the estimator itself, so it can be chained.
best_model = SVC(C=c_best['C'], kernel='linear', random_state=241).fit(X, y)
best_model

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=241, shrinking=True, tol=0.001,
    verbose=False)

In [6]:
# Magnitude of every feature's weight in the fitted linear SVM, flattened to
# a 1-D array. coef_ is a sparse matrix here, hence the dense conversion.
# NOTE: `coefs` deliberately stays the weight magnitudes; it must not be
# overwritten with argsort output, otherwise anything that later treats it
# as weights would silently receive index positions instead.
coefs = np.abs(best_model.coef_.toarray().ravel())

# argsort orders ascending, so the last ten entries are the column indices of
# the ten features with the largest absolute weight.
sorted_coefs = np.argsort(coefs)[-10:]
sorted_coefs

array([22936, 15606,  5776, 21850, 23673, 17802,  5093,  5088, 12871,
       24019])

In [7]:
# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out() (added in 1.0), so
# prefer the new method and fall back only on old installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per vocabulary term, labelled by the term itself; the single column
# carries the per-feature values held in `coefs`.
df = pd.DataFrame(coefs, index=np.asarray(feature_names), columns=["weigts"])

# Positional lookup of the selected feature indices -> the corresponding words.
words = df.iloc[sorted_coefs].index

In [8]:
# Show the selected words in alphabetical order.
sorted(words.tolist())

['atheism',
 'atheists',
 'bible',
 'god',
 'keith',
 'moon',
 'religion',
 'sci',
 'sky',
 'space']